fix: OCR editor gutter scroll sync + synthetic layout on ingest + CSS consolidation

This commit is contained in:
Sean McElwain 2026-04-06 12:07:26 -05:00
parent 431372438e
commit 87ce5cc2fb
3 changed files with 137 additions and 94 deletions

View File

@ -9,10 +9,9 @@ import subprocess
import tempfile import tempfile
from difflib import SequenceMatcher from difflib import SequenceMatcher
from pathlib import Path from pathlib import Path
from PIL import Image
from uuid import uuid4 from uuid import uuid4
from PIL import Image
from sqlalchemy import func from sqlalchemy import func
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
@ -97,12 +96,13 @@ def extract_pdf_text(path: Path) -> str:
def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict: def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
reader = csv.DictReader(io.StringIO(tsv_text), delimiter=" ") reader = csv.DictReader(io.StringIO(tsv_text), delimiter="\t")
grouped: dict[tuple[int, int, int, int], list[dict]] = {} grouped: dict[tuple[int, int, int, int], list[dict]] = {}
for row in reader: for row in reader:
if not row.get("text"): if not row.get("text"):
continue continue
text = row["text"].strip() text = row["text"].strip()
if not text: if not text:
continue continue
@ -139,13 +139,14 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he
) )
lines = [] lines = []
for key, words in grouped.items(): for _, words in grouped.items():
words = sorted(words, key=lambda w: w["left"]) words = sorted(words, key=lambda w: w["left"])
left = min(w["left"] for w in words) left = min(w["left"] for w in words)
top = min(w["top"] for w in words) top = min(w["top"] for w in words)
right = max(w["left"] + w["width"] for w in words) right = max(w["left"] + w["width"] for w in words)
bottom = max(w["top"] + w["height"] for w in words) bottom = max(w["top"] + w["height"] for w in words)
line_text = " ".join(w["text"] for w in words).strip() line_text = " ".join(w["text"] for w in words).strip()
avg_conf = None avg_conf = None
valid_conf = [w["conf"] for w in words if w["conf"] is not None] valid_conf = [w["conf"] for w in words if w["conf"] is not None]
if valid_conf: if valid_conf:
@ -168,6 +169,39 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he
} }
def build_synthetic_layout_from_text(text: str) -> dict:
    """Build a placeholder layout for plain text that has no OCR geometry.

    Each line of ``text`` is stripped of surrounding whitespace; lines that
    are empty after stripping are dropped. The surviving lines are numbered
    consecutively from 0 in their original order.

    Args:
        text: The plain text to convert.

    Returns:
        A dict with a single entry under ``"pages"``. That page is numbered
        1, has ``image_width`` and ``image_height`` set to ``None``, is
        flagged ``"synthetic": True``, and holds the line records. Every line
        record carries ``bbox=None`` and ``confidence=None`` and is likewise
        flagged synthetic.
    """
    # Strip first, then filter, so the index only counts lines that are kept.
    non_blank = (
        stripped
        for stripped in (raw_line.strip() for raw_line in text.splitlines())
        if stripped
    )
    lines = [
        {
            "text": clean,
            "bbox": None,
            "confidence": None,
            "synthetic": True,
            "line_index": line_index,
        }
        for line_index, clean in enumerate(non_blank)
    ]
    return {
        "pages": [
            {
                "page": 1,
                "image_width": None,
                "image_height": None,
                "synthetic": True,
                "lines": lines,
            }
        ]
    }
def ocr_image_with_layout(path: Path) -> tuple[str, dict]: def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
with Image.open(path) as img: with Image.open(path) as img:
image_width, image_height = img.size image_width, image_height = img.size
@ -222,9 +256,11 @@ def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
if suffix == ".pdf": if suffix == ".pdf":
txt, layout = ocr_pdf_with_layout(path) txt, layout = ocr_pdf_with_layout(path)
return txt.strip(), layout, "tesseract", tesseract_version return txt.strip(), layout, "tesseract", tesseract_version
if suffix in {".jpg", ".jpeg", ".png"}: if suffix in {".jpg", ".jpeg", ".png"}:
txt, layout = ocr_image_with_layout(path) txt, layout = ocr_image_with_layout(path)
return txt.strip(), layout, "tesseract", tesseract_version return txt.strip(), layout, "tesseract", tesseract_version
return "", None, None, None return "", None, None, None
@ -234,7 +270,8 @@ def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None,
if suffix == ".pdf": if suffix == ".pdf":
extracted = extract_pdf_text(path) extracted = extract_pdf_text(path)
if len(extracted.strip()) >= 40: if len(extracted.strip()) >= 40:
return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest" synthetic_layout = build_synthetic_layout_from_text(extracted)
return extracted, synthetic_layout, "pdftotext", get_pdftotext_version(), "initial_ingest"
ocr_text, layout, engine, version = run_ocr_only(path) ocr_text, layout, engine, version = run_ocr_only(path)
return ocr_text, layout, engine, version, "initial_ingest_fallback" return ocr_text, layout, engine, version, "initial_ingest_fallback"
@ -251,6 +288,7 @@ def compute_quality_score(source_text: str, reviewed_text: str) -> float:
return 100.0 return 100.0
if not source_text: if not source_text:
return 0.0 return 0.0
ratio = SequenceMatcher(None, source_text, reviewed_text).ratio() ratio = SequenceMatcher(None, source_text, reviewed_text).ratio()
return round(ratio * 100, 2) return round(ratio * 100, 2)
@ -424,7 +462,12 @@ def ingest_directory(
continue continue
try: try:
ingested.append( ingested.append(
ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type) ingest_file(
db=db,
file_path=str(path),
source_system=source_system,
document_type=document_type,
)
) )
except Exception: except Exception:
continue continue

View File

@ -40,6 +40,7 @@ a {
color: var(--accent); color: var(--accent);
text-decoration: none; text-decoration: none;
} }
a:hover { a:hover {
text-decoration: underline; text-decoration: underline;
} }
@ -395,19 +396,81 @@ textarea {
.editor-wrap { .editor-wrap {
display: grid; display: grid;
grid-template-columns: 52px 1fr; grid-template-columns: 4rem minmax(0, 1fr);
gap: 0.5rem; border: 1px solid var(--border);
align-items: start; border-radius: 0.85rem;
overflow: hidden;
background: #fff;
align-items: stretch;
} }
.line-numbers { .line-numbers {
font-family: var(--mono); margin: 0;
white-space: pre; padding: 0.9rem 0.75rem;
text-align: right; background: var(--panel-muted);
border-right: 1px solid var(--border);
color: var(--text-muted); color: var(--text-muted);
text-align: right;
user-select: none; user-select: none;
padding-top: 0.75rem; white-space: pre;
line-height: 1.45; overflow-y: auto;
overflow-x: hidden;
height: 34rem;
min-height: 34rem;
font: 500 0.95rem/1.55 var(--mono);
scrollbar-width: none;
-ms-overflow-style: none;
}
.line-numbers::-webkit-scrollbar {
display: none;
}
/* Editor textarea shown beside the line-number gutter.
   Height, vertical padding and font metrics mirror .line-numbers so that
   copying scrollTop from the textarea to the gutter keeps rows aligned. */
.editor-wrap textarea {
  margin: 0;
  border: 0;
  border-radius: 0;
  resize: none;
  height: 34rem;
  min-height: 34rem;
  padding: 0.9rem 1rem;
  overflow: auto;
  white-space: pre;
  /* The shorthand already sets line-height to 1.55; no separate declaration needed. */
  font: 500 0.95rem/1.55 var(--mono);
  outline: none;
}
.detail-sticky-header {
position: sticky;
top: 0;
z-index: 30;
background: rgba(243,245,249,0.96);
backdrop-filter: blur(8px);
padding-bottom: 0.75rem;
margin-bottom: 1rem;
}
.workspace-grid {
display: grid;
grid-template-columns: minmax(0, 1.2fr) minmax(380px, 0.9fr);
gap: 1rem;
align-items: start;
margin-bottom: 1rem;
}
.preview-card {
position: sticky;
top: 7.5rem;
}
.right-pane-tabs {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.75rem;
} }
pre.codeblock { pre.codeblock {
@ -444,15 +507,19 @@ pre.codeblock {
@media (max-width: 1100px) { @media (max-width: 1100px) {
.grid-2, .grid-2,
.form-grid, .form-grid,
.meta-grid { .meta-grid,
.workspace-grid {
grid-template-columns: 1fr; grid-template-columns: 1fr;
} }
.preview-frame { .preview-frame {
height: 720px; height: 720px;
} }
}
.preview-card {
position: static;
}
}
.doc-header-sticky { .doc-header-sticky {
position: sticky; position: sticky;

View File

@ -4,81 +4,6 @@
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>{{ document.document_id }}</title> <title>{{ document.document_id }}</title>
<link rel="stylesheet" href="/static/app.css"> <link rel="stylesheet" href="/static/app.css">
<style>
.detail-sticky-header {
position: sticky;
top: 0;
z-index: 30;
background: rgba(243,245,249,0.96);
backdrop-filter: blur(8px);
padding-bottom: 0.75rem;
margin-bottom: 1rem;
}
.queue-nav-row {
display: flex;
flex-wrap: wrap;
gap: 0.65rem;
margin-top: 0.75rem;
}
.workspace-grid {
display: grid;
grid-template-columns: minmax(0, 1.2fr) minmax(380px, 0.9fr);
gap: 1rem;
align-items: start;
margin-bottom: 1rem;
}
.right-pane-tabs {
display: flex;
flex-wrap: wrap;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.75rem;
}
.tab-button {
appearance: none;
border: 1px solid var(--border);
background: var(--panel-muted);
color: var(--text);
border-radius: 999px;
padding: 0.45rem 0.85rem;
cursor: pointer;
font: inherit;
}
.tab-button.active {
background: var(--accent);
color: #fff;
border-color: var(--accent);
}
.tab-panel {
display: none;
}
.tab-panel.active {
display: block;
}
.preview-card {
position: sticky;
top: 7.5rem;
}
@media (max-width: 1100px) {
.workspace-grid {
grid-template-columns: 1fr;
}
.preview-card {
position: static;
}
}
</style>
</head> </head>
<body> <body>
<div class="app-shell" id="app-shell"> <div class="app-shell" id="app-shell">
@ -208,9 +133,9 @@
<div class="form-field full"> <div class="form-field full">
<label for="reviewed_text">Edit reviewed OCR text (one line per OCR line)</label> <label for="reviewed_text">Edit reviewed OCR text (one line per OCR line)</label>
<div class="editor-wrap"> <div class="editor-wrap">
<div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }} <pre class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
{% endfor %}</div> {% endfor %}</pre>
<textarea id="reviewed_text" name="reviewed_text" rows="34">{{ review_text_value }}</textarea> <textarea id="reviewed_text" name="reviewed_text" rows="34" spellcheck="false">{{ review_text_value }}</textarea>
</div> </div>
</div> </div>
@ -399,6 +324,10 @@
lineNumbersEl.textContent = nums; lineNumbersEl.textContent = nums;
} }
// Mirror the textarea's vertical scroll offset onto the line-number gutter
// so the numbers stay level with the text they label.
function syncScroll() {
lineNumbersEl.scrollTop = textarea.scrollTop;
}
function updateEditorState() { function updateEditorState() {
const actual = countLines(textarea.value); const actual = countLines(textarea.value);
actualLinesEl.textContent = actual.toString(); actualLinesEl.textContent = actual.toString();
@ -406,10 +335,14 @@
const mismatch = expectedLines > 0 && actual !== expectedLines; const mismatch = expectedLines > 0 && actual !== expectedLines;
warningEl.style.display = mismatch ? "inline" : "none"; warningEl.style.display = mismatch ? "inline" : "none";
saveBtn.disabled = mismatch; saveBtn.disabled = mismatch;
syncScroll();
} }
textarea.addEventListener("input", updateEditorState); textarea.addEventListener("input", updateEditorState);
textarea.addEventListener("scroll", syncScroll);
updateEditorState(); updateEditorState();
syncScroll();
} }
})(); })();
</script> </script>