fix: OCR editor gutter scroll sync + synthetic layout on ingest + CSS consolidation

This commit is contained in:
Sean McElwain 2026-04-06 12:07:26 -05:00
parent 431372438e
commit 87ce5cc2fb
3 changed files with 137 additions and 94 deletions

View File

@ -9,10 +9,9 @@ import subprocess
import tempfile
from difflib import SequenceMatcher
from pathlib import Path
from PIL import Image
from uuid import uuid4
from PIL import Image
from sqlalchemy import func
from sqlalchemy.orm import Session
@ -97,12 +96,13 @@ def extract_pdf_text(path: Path) -> str:
def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
reader = csv.DictReader(io.StringIO(tsv_text), delimiter=" ")
reader = csv.DictReader(io.StringIO(tsv_text), delimiter="\t")
grouped: dict[tuple[int, int, int, int], list[dict]] = {}
for row in reader:
if not row.get("text"):
continue
text = row["text"].strip()
if not text:
continue
@ -139,13 +139,14 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he
)
lines = []
for key, words in grouped.items():
for _, words in grouped.items():
words = sorted(words, key=lambda w: w["left"])
left = min(w["left"] for w in words)
top = min(w["top"] for w in words)
right = max(w["left"] + w["width"] for w in words)
bottom = max(w["top"] + w["height"] for w in words)
line_text = " ".join(w["text"] for w in words).strip()
avg_conf = None
valid_conf = [w["conf"] for w in words if w["conf"] is not None]
if valid_conf:
@ -168,6 +169,39 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he
}
def build_synthetic_layout_from_text(text: str) -> dict:
    """Build a single-page layout dict from plain text, without geometry.

    Every non-blank line of *text* (after stripping surrounding whitespace)
    becomes one line record flagged ``"synthetic": True`` with ``bbox`` and
    ``confidence`` set to ``None``. Blank lines are dropped and do not
    consume an index, so ``line_index`` numbers the kept lines 0, 1, 2, ...

    Args:
        text: Plain text to convert; split with ``str.splitlines()``.

    Returns:
        A dict with one entry under ``"pages"`` (page 1, no image
        dimensions, marked synthetic) whose ``"lines"`` list holds the
        line records in input order.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    # Filter first, then enumerate, so indices stay contiguous across
    # skipped blank lines.
    lines = [
        {
            "text": clean,
            "bbox": None,
            "confidence": None,
            "synthetic": True,
            "line_index": line_index,
        }
        for line_index, clean in enumerate(
            candidate for candidate in stripped_lines if candidate
        )
    ]
    return {
        "pages": [
            {
                "page": 1,
                "image_width": None,
                "image_height": None,
                "synthetic": True,
                "lines": lines,
            }
        ]
    }
def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
with Image.open(path) as img:
image_width, image_height = img.size
@ -222,9 +256,11 @@ def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
if suffix == ".pdf":
txt, layout = ocr_pdf_with_layout(path)
return txt.strip(), layout, "tesseract", tesseract_version
if suffix in {".jpg", ".jpeg", ".png"}:
txt, layout = ocr_image_with_layout(path)
return txt.strip(), layout, "tesseract", tesseract_version
return "", None, None, None
@ -234,7 +270,8 @@ def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None,
if suffix == ".pdf":
extracted = extract_pdf_text(path)
if len(extracted.strip()) >= 40:
return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest"
synthetic_layout = build_synthetic_layout_from_text(extracted)
return extracted, synthetic_layout, "pdftotext", get_pdftotext_version(), "initial_ingest"
ocr_text, layout, engine, version = run_ocr_only(path)
return ocr_text, layout, engine, version, "initial_ingest_fallback"
@ -251,6 +288,7 @@ def compute_quality_score(source_text: str, reviewed_text: str) -> float:
return 100.0
if not source_text:
return 0.0
ratio = SequenceMatcher(None, source_text, reviewed_text).ratio()
return round(ratio * 100, 2)
@ -424,7 +462,12 @@ def ingest_directory(
continue
try:
ingested.append(
ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type)
ingest_file(
db=db,
file_path=str(path),
source_system=source_system,
document_type=document_type,
)
)
except Exception:
continue

View File

@ -40,6 +40,7 @@ a {
color: var(--accent);
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
@ -395,19 +396,81 @@ textarea {
.editor-wrap {
display: grid;
grid-template-columns: 52px 1fr;
gap: 0.5rem;
align-items: start;
grid-template-columns: 4rem minmax(0, 1fr);
border: 1px solid var(--border);
border-radius: 0.85rem;
overflow: hidden;
background: #fff;
align-items: stretch;
}
.line-numbers {
font-family: var(--mono);
white-space: pre;
text-align: right;
margin: 0;
padding: 0.9rem 0.75rem;
background: var(--panel-muted);
border-right: 1px solid var(--border);
color: var(--text-muted);
text-align: right;
user-select: none;
padding-top: 0.75rem;
line-height: 1.45;
white-space: pre;
overflow-y: auto;
overflow-x: hidden;
height: 34rem;
min-height: 34rem;
font: 500 0.95rem/1.55 var(--mono);
scrollbar-width: none;
-ms-overflow-style: none;
}
/* Hide the line-number gutter's scrollbar in WebKit-based browsers; the
   .line-numbers rule sets scrollbar-width / -ms-overflow-style to "none"
   for other engines. */
.line-numbers::-webkit-scrollbar {
display: none;
}
.editor-wrap textarea {
margin: 0;
border: 0;
border-radius: 0;
resize: none;
height: 34rem;
min-height: 34rem;
padding: 0.9rem 1rem;
overflow: auto;
white-space: pre;
font: 500 0.95rem/1.55 var(--mono);
line-height: 1.55;
outline: none;
}
.detail-sticky-header {
position: sticky;
top: 0;
z-index: 30;
background: rgba(243,245,249,0.96);
backdrop-filter: blur(8px);
padding-bottom: 0.75rem;
margin-bottom: 1rem;
}
.workspace-grid {
display: grid;
grid-template-columns: minmax(0, 1.2fr) minmax(380px, 0.9fr);
gap: 1rem;
align-items: start;
margin-bottom: 1rem;
}
.preview-card {
position: sticky;
top: 7.5rem;
}
.right-pane-tabs {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.75rem;
}
pre.codeblock {
@ -444,15 +507,19 @@ pre.codeblock {
@media (max-width: 1100px) {
.grid-2,
.form-grid,
.meta-grid {
.meta-grid,
.workspace-grid {
grid-template-columns: 1fr;
}
.preview-frame {
height: 720px;
}
}
.preview-card {
position: static;
}
}
.doc-header-sticky {
position: sticky;

View File

@ -4,81 +4,6 @@
<meta charset="UTF-8">
<title>{{ document.document_id }}</title>
<link rel="stylesheet" href="/static/app.css">
<style>
.detail-sticky-header {
position: sticky;
top: 0;
z-index: 30;
background: rgba(243,245,249,0.96);
backdrop-filter: blur(8px);
padding-bottom: 0.75rem;
margin-bottom: 1rem;
}
.queue-nav-row {
display: flex;
flex-wrap: wrap;
gap: 0.65rem;
margin-top: 0.75rem;
}
.workspace-grid {
display: grid;
grid-template-columns: minmax(0, 1.2fr) minmax(380px, 0.9fr);
gap: 1rem;
align-items: start;
margin-bottom: 1rem;
}
.right-pane-tabs {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.75rem;
}
.tab-button {
appearance: none;
border: 1px solid var(--border);
background: var(--panel-muted);
color: var(--text);
border-radius: 999px;
padding: 0.45rem 0.85rem;
cursor: pointer;
font: inherit;
}
.tab-button.active {
background: var(--accent);
color: #fff;
border-color: var(--accent);
}
.tab-panel {
display: none;
}
.tab-panel.active {
display: block;
}
.preview-card {
position: sticky;
top: 7.5rem;
}
@media (max-width: 1100px) {
.workspace-grid {
grid-template-columns: 1fr;
}
.preview-card {
position: static;
}
}
</style>
</head>
<body>
<div class="app-shell" id="app-shell">
@ -208,9 +133,9 @@
<div class="form-field full">
<label for="reviewed_text">Edit reviewed OCR text (one line per OCR line)</label>
<div class="editor-wrap">
<div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
{% endfor %}</div>
<textarea id="reviewed_text" name="reviewed_text" rows="34">{{ review_text_value }}</textarea>
<pre class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
{% endfor %}</pre>
<textarea id="reviewed_text" name="reviewed_text" rows="34" spellcheck="false">{{ review_text_value }}</textarea>
</div>
</div>
@ -399,6 +324,10 @@
lineNumbersEl.textContent = nums;
}
// Copy the textarea's vertical scroll offset onto the line-number gutter so
// both elements show the same scroll position.
function syncScroll() {
  const { scrollTop } = textarea;
  lineNumbersEl.scrollTop = scrollTop;
}
function updateEditorState() {
const actual = countLines(textarea.value);
actualLinesEl.textContent = actual.toString();
@ -406,10 +335,14 @@
const mismatch = expectedLines > 0 && actual !== expectedLines;
warningEl.style.display = mismatch ? "inline" : "none";
saveBtn.disabled = mismatch;
syncScroll();
}
textarea.addEventListener("input", updateEditorState);
textarea.addEventListener("scroll", syncScroll);
updateEditorState();
syncScroll();
}
})();
</script>