fix: OCR editor gutter scroll sync + synthetic layout on ingest + CSS consolidation
This commit is contained in:
parent
431372438e
commit
87ce5cc2fb
|
|
@ -9,10 +9,9 @@ import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from PIL import Image
|
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
from sqlalchemy import func
|
from sqlalchemy import func
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
|
@ -97,12 +96,13 @@ def extract_pdf_text(path: Path) -> str:
|
||||||
|
|
||||||
|
|
||||||
def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
|
def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
|
||||||
reader = csv.DictReader(io.StringIO(tsv_text), delimiter=" ")
|
reader = csv.DictReader(io.StringIO(tsv_text), delimiter="\t")
|
||||||
grouped: dict[tuple[int, int, int, int], list[dict]] = {}
|
grouped: dict[tuple[int, int, int, int], list[dict]] = {}
|
||||||
|
|
||||||
for row in reader:
|
for row in reader:
|
||||||
if not row.get("text"):
|
if not row.get("text"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
text = row["text"].strip()
|
text = row["text"].strip()
|
||||||
if not text:
|
if not text:
|
||||||
continue
|
continue
|
||||||
|
|
@ -139,13 +139,14 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he
|
||||||
)
|
)
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
for key, words in grouped.items():
|
for _, words in grouped.items():
|
||||||
words = sorted(words, key=lambda w: w["left"])
|
words = sorted(words, key=lambda w: w["left"])
|
||||||
left = min(w["left"] for w in words)
|
left = min(w["left"] for w in words)
|
||||||
top = min(w["top"] for w in words)
|
top = min(w["top"] for w in words)
|
||||||
right = max(w["left"] + w["width"] for w in words)
|
right = max(w["left"] + w["width"] for w in words)
|
||||||
bottom = max(w["top"] + w["height"] for w in words)
|
bottom = max(w["top"] + w["height"] for w in words)
|
||||||
line_text = " ".join(w["text"] for w in words).strip()
|
line_text = " ".join(w["text"] for w in words).strip()
|
||||||
|
|
||||||
avg_conf = None
|
avg_conf = None
|
||||||
valid_conf = [w["conf"] for w in words if w["conf"] is not None]
|
valid_conf = [w["conf"] for w in words if w["conf"] is not None]
|
||||||
if valid_conf:
|
if valid_conf:
|
||||||
|
|
@ -168,6 +169,39 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def build_synthetic_layout_from_text(text: str) -> dict:
    """Build a layout dict for plain text that has no OCR geometry.

    The text is split with ``str.splitlines()``; each line is stripped and
    blank lines are dropped. Every remaining line becomes one entry with no
    bounding box and no confidence, flagged ``"synthetic": True``.

    Args:
        text: Plain text to convert (for example, text extracted directly
            from a PDF rather than produced by OCR).

    Returns:
        A dict of the form ``{"pages": [page]}`` containing a single page
        numbered 1 with ``image_width`` and ``image_height`` set to ``None``.
        ``page["lines"]`` holds the line entries; ``line_index`` is
        zero-based and counts only the non-blank lines that were kept.
    """
    # Strip first, then filter, so whitespace-only lines are dropped and do
    # not consume a line_index.
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    kept_lines = (clean for clean in stripped_lines if clean)

    lines = [
        {
            "text": clean,
            "bbox": None,
            "confidence": None,
            "synthetic": True,
            "line_index": line_index,
        }
        for line_index, clean in enumerate(kept_lines)
    ]

    return {
        "pages": [
            {
                "page": 1,
                "image_width": None,
                "image_height": None,
                "synthetic": True,
                "lines": lines,
            }
        ]
    }
|
||||||
|
|
||||||
|
|
||||||
def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
|
def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
|
||||||
with Image.open(path) as img:
|
with Image.open(path) as img:
|
||||||
image_width, image_height = img.size
|
image_width, image_height = img.size
|
||||||
|
|
@ -222,9 +256,11 @@ def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
|
||||||
if suffix == ".pdf":
|
if suffix == ".pdf":
|
||||||
txt, layout = ocr_pdf_with_layout(path)
|
txt, layout = ocr_pdf_with_layout(path)
|
||||||
return txt.strip(), layout, "tesseract", tesseract_version
|
return txt.strip(), layout, "tesseract", tesseract_version
|
||||||
|
|
||||||
if suffix in {".jpg", ".jpeg", ".png"}:
|
if suffix in {".jpg", ".jpeg", ".png"}:
|
||||||
txt, layout = ocr_image_with_layout(path)
|
txt, layout = ocr_image_with_layout(path)
|
||||||
return txt.strip(), layout, "tesseract", tesseract_version
|
return txt.strip(), layout, "tesseract", tesseract_version
|
||||||
|
|
||||||
return "", None, None, None
|
return "", None, None, None
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -234,7 +270,8 @@ def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None,
|
||||||
if suffix == ".pdf":
|
if suffix == ".pdf":
|
||||||
extracted = extract_pdf_text(path)
|
extracted = extract_pdf_text(path)
|
||||||
if len(extracted.strip()) >= 40:
|
if len(extracted.strip()) >= 40:
|
||||||
return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest"
|
synthetic_layout = build_synthetic_layout_from_text(extracted)
|
||||||
|
return extracted, synthetic_layout, "pdftotext", get_pdftotext_version(), "initial_ingest"
|
||||||
|
|
||||||
ocr_text, layout, engine, version = run_ocr_only(path)
|
ocr_text, layout, engine, version = run_ocr_only(path)
|
||||||
return ocr_text, layout, engine, version, "initial_ingest_fallback"
|
return ocr_text, layout, engine, version, "initial_ingest_fallback"
|
||||||
|
|
@ -251,6 +288,7 @@ def compute_quality_score(source_text: str, reviewed_text: str) -> float:
|
||||||
return 100.0
|
return 100.0
|
||||||
if not source_text:
|
if not source_text:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
ratio = SequenceMatcher(None, source_text, reviewed_text).ratio()
|
ratio = SequenceMatcher(None, source_text, reviewed_text).ratio()
|
||||||
return round(ratio * 100, 2)
|
return round(ratio * 100, 2)
|
||||||
|
|
||||||
|
|
@ -424,7 +462,12 @@ def ingest_directory(
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
ingested.append(
|
ingested.append(
|
||||||
ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type)
|
ingest_file(
|
||||||
|
db=db,
|
||||||
|
file_path=str(path),
|
||||||
|
source_system=source_system,
|
||||||
|
document_type=document_type,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,7 @@ a {
|
||||||
color: var(--accent);
|
color: var(--accent);
|
||||||
text-decoration: none;
|
text-decoration: none;
|
||||||
}
|
}
|
||||||
|
|
||||||
a:hover {
|
a:hover {
|
||||||
text-decoration: underline;
|
text-decoration: underline;
|
||||||
}
|
}
|
||||||
|
|
@ -395,19 +396,81 @@ textarea {
|
||||||
|
|
||||||
.editor-wrap {
|
.editor-wrap {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: 52px 1fr;
|
grid-template-columns: 4rem minmax(0, 1fr);
|
||||||
gap: 0.5rem;
|
border: 1px solid var(--border);
|
||||||
align-items: start;
|
border-radius: 0.85rem;
|
||||||
|
overflow: hidden;
|
||||||
|
background: #fff;
|
||||||
|
align-items: stretch;
|
||||||
}
|
}
|
||||||
|
|
||||||
.line-numbers {
|
.line-numbers {
|
||||||
font-family: var(--mono);
|
margin: 0;
|
||||||
white-space: pre;
|
padding: 0.9rem 0.75rem;
|
||||||
text-align: right;
|
background: var(--panel-muted);
|
||||||
|
border-right: 1px solid var(--border);
|
||||||
color: var(--text-muted);
|
color: var(--text-muted);
|
||||||
|
text-align: right;
|
||||||
user-select: none;
|
user-select: none;
|
||||||
padding-top: 0.75rem;
|
white-space: pre;
|
||||||
line-height: 1.45;
|
overflow-y: auto;
|
||||||
|
overflow-x: hidden;
|
||||||
|
height: 34rem;
|
||||||
|
min-height: 34rem;
|
||||||
|
font: 500 0.95rem/1.55 var(--mono);
|
||||||
|
scrollbar-width: none;
|
||||||
|
-ms-overflow-style: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.line-numbers::-webkit-scrollbar {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.editor-wrap textarea {
|
||||||
|
margin: 0;
|
||||||
|
border: 0;
|
||||||
|
border-radius: 0;
|
||||||
|
resize: none;
|
||||||
|
height: 34rem;
|
||||||
|
min-height: 34rem;
|
||||||
|
padding: 0.9rem 1rem;
|
||||||
|
overflow: auto;
|
||||||
|
white-space: pre;
|
||||||
|
font: 500 0.95rem/1.55 var(--mono);
|
||||||
|
line-height: 1.55;
|
||||||
|
outline: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.detail-sticky-header {
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 30;
|
||||||
|
background: rgba(243,245,249,0.96);
|
||||||
|
backdrop-filter: blur(8px);
|
||||||
|
padding-bottom: 0.75rem;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.workspace-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: minmax(0, 1.2fr) minmax(380px, 0.9fr);
|
||||||
|
gap: 1rem;
|
||||||
|
align-items: start;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.preview-card {
|
||||||
|
position: sticky;
|
||||||
|
top: 7.5rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.right-pane-tabs {
|
||||||
|
display: flex;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
gap: 0.5rem;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
border-bottom: 1px solid var(--border);
|
||||||
|
padding-bottom: 0.75rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
pre.codeblock {
|
pre.codeblock {
|
||||||
|
|
@ -444,15 +507,19 @@ pre.codeblock {
|
||||||
@media (max-width: 1100px) {
|
@media (max-width: 1100px) {
|
||||||
.grid-2,
|
.grid-2,
|
||||||
.form-grid,
|
.form-grid,
|
||||||
.meta-grid {
|
.meta-grid,
|
||||||
|
.workspace-grid {
|
||||||
grid-template-columns: 1fr;
|
grid-template-columns: 1fr;
|
||||||
}
|
}
|
||||||
|
|
||||||
.preview-frame {
|
.preview-frame {
|
||||||
height: 720px;
|
height: 720px;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
.preview-card {
|
||||||
|
position: static;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
.doc-header-sticky {
|
.doc-header-sticky {
|
||||||
position: sticky;
|
position: sticky;
|
||||||
|
|
|
||||||
|
|
@ -4,81 +4,6 @@
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>{{ document.document_id }}</title>
|
<title>{{ document.document_id }}</title>
|
||||||
<link rel="stylesheet" href="/static/app.css">
|
<link rel="stylesheet" href="/static/app.css">
|
||||||
<style>
|
|
||||||
.detail-sticky-header {
|
|
||||||
position: sticky;
|
|
||||||
top: 0;
|
|
||||||
z-index: 30;
|
|
||||||
background: rgba(243,245,249,0.96);
|
|
||||||
backdrop-filter: blur(8px);
|
|
||||||
padding-bottom: 0.75rem;
|
|
||||||
margin-bottom: 1rem;
|
|
||||||
}
|
|
||||||
|
|
||||||
.queue-nav-row {
|
|
||||||
display: flex;
|
|
||||||
flex-wrap: wrap;
|
|
||||||
gap: 0.65rem;
|
|
||||||
margin-top: 0.75rem;
|
|
||||||
}
|
|
||||||
|
|
||||||
.workspace-grid {
|
|
||||||
display: grid;
|
|
||||||
grid-template-columns: minmax(0, 1.2fr) minmax(380px, 0.9fr);
|
|
||||||
gap: 1rem;
|
|
||||||
align-items: start;
|
|
||||||
margin-bottom: 1rem;
|
|
||||||
}
|
|
||||||
|
|
||||||
.right-pane-tabs {
|
|
||||||
display: flex;
|
|
||||||
flex-wrap: wrap;
|
|
||||||
gap: 0.5rem;
|
|
||||||
margin-bottom: 1rem;
|
|
||||||
border-bottom: 1px solid var(--border);
|
|
||||||
padding-bottom: 0.75rem;
|
|
||||||
}
|
|
||||||
|
|
||||||
.tab-button {
|
|
||||||
appearance: none;
|
|
||||||
border: 1px solid var(--border);
|
|
||||||
background: var(--panel-muted);
|
|
||||||
color: var(--text);
|
|
||||||
border-radius: 999px;
|
|
||||||
padding: 0.45rem 0.85rem;
|
|
||||||
cursor: pointer;
|
|
||||||
font: inherit;
|
|
||||||
}
|
|
||||||
|
|
||||||
.tab-button.active {
|
|
||||||
background: var(--accent);
|
|
||||||
color: #fff;
|
|
||||||
border-color: var(--accent);
|
|
||||||
}
|
|
||||||
|
|
||||||
.tab-panel {
|
|
||||||
display: none;
|
|
||||||
}
|
|
||||||
|
|
||||||
.tab-panel.active {
|
|
||||||
display: block;
|
|
||||||
}
|
|
||||||
|
|
||||||
.preview-card {
|
|
||||||
position: sticky;
|
|
||||||
top: 7.5rem;
|
|
||||||
}
|
|
||||||
|
|
||||||
@media (max-width: 1100px) {
|
|
||||||
.workspace-grid {
|
|
||||||
grid-template-columns: 1fr;
|
|
||||||
}
|
|
||||||
|
|
||||||
.preview-card {
|
|
||||||
position: static;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div class="app-shell" id="app-shell">
|
<div class="app-shell" id="app-shell">
|
||||||
|
|
@ -208,9 +133,9 @@
|
||||||
<div class="form-field full">
|
<div class="form-field full">
|
||||||
<label for="reviewed_text">Edit reviewed OCR text (one line per OCR line)</label>
|
<label for="reviewed_text">Edit reviewed OCR text (one line per OCR line)</label>
|
||||||
<div class="editor-wrap">
|
<div class="editor-wrap">
|
||||||
<div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
|
<pre class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
|
||||||
{% endfor %}</div>
|
{% endfor %}</pre>
|
||||||
<textarea id="reviewed_text" name="reviewed_text" rows="34">{{ review_text_value }}</textarea>
|
<textarea id="reviewed_text" name="reviewed_text" rows="34" spellcheck="false">{{ review_text_value }}</textarea>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
@ -399,6 +324,10 @@
|
||||||
lineNumbersEl.textContent = nums;
|
lineNumbersEl.textContent = nums;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function syncScroll() {
|
||||||
|
lineNumbersEl.scrollTop = textarea.scrollTop;
|
||||||
|
}
|
||||||
|
|
||||||
function updateEditorState() {
|
function updateEditorState() {
|
||||||
const actual = countLines(textarea.value);
|
const actual = countLines(textarea.value);
|
||||||
actualLinesEl.textContent = actual.toString();
|
actualLinesEl.textContent = actual.toString();
|
||||||
|
|
@ -406,10 +335,14 @@
|
||||||
const mismatch = expectedLines > 0 && actual !== expectedLines;
|
const mismatch = expectedLines > 0 && actual !== expectedLines;
|
||||||
warningEl.style.display = mismatch ? "inline" : "none";
|
warningEl.style.display = mismatch ? "inline" : "none";
|
||||||
saveBtn.disabled = mismatch;
|
saveBtn.disabled = mismatch;
|
||||||
|
syncScroll();
|
||||||
}
|
}
|
||||||
|
|
||||||
textarea.addEventListener("input", updateEditorState);
|
textarea.addEventListener("input", updateEditorState);
|
||||||
|
textarea.addEventListener("scroll", syncScroll);
|
||||||
|
|
||||||
updateEditorState();
|
updateEditorState();
|
||||||
|
syncScroll();
|
||||||
}
|
}
|
||||||
})();
|
})();
|
||||||
</script>
|
</script>
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue