diff --git a/app/logic/ingest.py b/app/logic/ingest.py index f70c2cc..bbd42ac 100644 --- a/app/logic/ingest.py +++ b/app/logic/ingest.py @@ -9,10 +9,9 @@ import subprocess import tempfile from difflib import SequenceMatcher from pathlib import Path - -from PIL import Image from uuid import uuid4 +from PIL import Image from sqlalchemy import func from sqlalchemy.orm import Session @@ -97,12 +96,13 @@ def extract_pdf_text(path: Path) -> str: def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict: - reader = csv.DictReader(io.StringIO(tsv_text), delimiter=" ") + reader = csv.DictReader(io.StringIO(tsv_text), delimiter="\t") grouped: dict[tuple[int, int, int, int], list[dict]] = {} for row in reader: if not row.get("text"): continue + text = row["text"].strip() if not text: continue @@ -139,13 +139,14 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he ) lines = [] - for key, words in grouped.items(): + for _, words in grouped.items(): words = sorted(words, key=lambda w: w["left"]) left = min(w["left"] for w in words) top = min(w["top"] for w in words) right = max(w["left"] + w["width"] for w in words) bottom = max(w["top"] + w["height"] for w in words) line_text = " ".join(w["text"] for w in words).strip() + avg_conf = None valid_conf = [w["conf"] for w in words if w["conf"] is not None] if valid_conf: @@ -168,6 +169,39 @@ def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_he } +def build_synthetic_layout_from_text(text: str) -> dict: + lines = [] + line_index = 0 + + for raw_line in text.splitlines(): + clean = raw_line.strip() + if not clean: + continue + + lines.append( + { + "text": clean, + "bbox": None, + "confidence": None, + "synthetic": True, + "line_index": line_index, + } + ) + line_index += 1 + + return { + "pages": [ + { + "page": 1, + "image_width": None, + "image_height": None, + "synthetic": True, + "lines": lines, + } + ] + } + + def ocr_image_with_layout(path: Path) -> tuple[str, dict]: with Image.open(path) as img: image_width, image_height = img.size @@ -222,9 +256,11 @@ def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]: if suffix == ".pdf": txt, layout = ocr_pdf_with_layout(path) return txt.strip(), layout, "tesseract", tesseract_version + if suffix in {".jpg", ".jpeg", ".png"}: txt, layout = ocr_image_with_layout(path) return txt.strip(), layout, "tesseract", tesseract_version + return "", None, None, None @@ -234,7 +270,8 @@ def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None, if suffix == ".pdf": extracted = extract_pdf_text(path) if len(extracted.strip()) >= 40: - return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest" + synthetic_layout = build_synthetic_layout_from_text(extracted) + return extracted, synthetic_layout, "pdftotext", get_pdftotext_version(), "initial_ingest" ocr_text, layout, engine, version = run_ocr_only(path) return ocr_text, layout, engine, version, "initial_ingest_fallback" @@ -251,6 +288,7 @@ def compute_quality_score(source_text: str, reviewed_text: str) -> float: return 100.0 if not source_text: return 0.0 + ratio = SequenceMatcher(None, source_text, reviewed_text).ratio() return round(ratio * 100, 2) @@ -424,7 +462,12 @@ def ingest_directory( continue try: ingested.append( - ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type) + ingest_file( + db=db, + file_path=str(path), + source_system=source_system, + document_type=document_type, + ) ) except Exception: continue diff --git a/app/static/app.css b/app/static/app.css index 3430237..3683068 100644 --- a/app/static/app.css +++ b/app/static/app.css @@ -40,6 +40,7 @@ a { color: var(--accent); text-decoration: none; } + a:hover { text-decoration: underline; } @@ -395,19 +396,81 @@ textarea { .editor-wrap { display: grid; - grid-template-columns: 52px 1fr; - gap: 0.5rem; - align-items: start; + grid-template-columns: 4rem minmax(0, 1fr); + border: 1px solid var(--border); + border-radius: 0.85rem; + overflow: hidden; + background: #fff; + align-items: stretch; } .line-numbers { - font-family: var(--mono); - white-space: pre; - text-align: right; + margin: 0; + padding: 0.9rem 0.75rem; + background: var(--panel-muted); + border-right: 1px solid var(--border); color: var(--text-muted); + text-align: right; user-select: none; - padding-top: 0.75rem; - line-height: 1.45; + white-space: pre; + overflow-y: auto; + overflow-x: hidden; + height: 34rem; + min-height: 34rem; + font: 500 0.95rem/1.55 var(--mono); + scrollbar-width: none; + -ms-overflow-style: none; +} + +.line-numbers::-webkit-scrollbar { + display: none; +} + +.editor-wrap textarea { + margin: 0; + border: 0; + border-radius: 0; + resize: none; + height: 34rem; + min-height: 34rem; + padding: 0.9rem 1rem; + overflow: auto; + white-space: pre; + font: 500 0.95rem/1.55 var(--mono); + line-height: 1.55; + outline: none; +} + +.detail-sticky-header { + position: sticky; + top: 0; + z-index: 30; + background: rgba(243,245,249,0.96); + backdrop-filter: blur(8px); + padding-bottom: 0.75rem; + margin-bottom: 1rem; +} + +.workspace-grid { + display: grid; + grid-template-columns: minmax(0, 1.2fr) minmax(380px, 0.9fr); + gap: 1rem; + align-items: start; + margin-bottom: 1rem; +} + +.preview-card { + position: sticky; + top: 7.5rem; +} + +.right-pane-tabs { + display: flex; + flex-wrap: wrap; + gap: 0.5rem; + margin-bottom: 1rem; + border-bottom: 1px solid var(--border); + padding-bottom: 0.75rem; } pre.codeblock { @@ -444,15 +507,19 @@ pre.codeblock { @media (max-width: 1100px) { .grid-2, .form-grid, - .meta-grid { + .meta-grid, + .workspace-grid { grid-template-columns: 1fr; } .preview-frame { height: 720px; } -} + .preview-card { + position: static; + } +} .doc-header-sticky { position: sticky; diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index 19d9fc5..b0c5e2a 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -4,81 +4,6 @@ {{ document.document_id }} -
@@ -208,9 +133,9 @@
-
{% for n in line_numbers %}{{ n }} -{% endfor %}
- +
{% for n in line_numbers %}{{ n }}
+{% endfor %}
+
@@ -399,6 +324,10 @@ lineNumbersEl.textContent = nums; } + function syncScroll() { + lineNumbersEl.scrollTop = textarea.scrollTop; + } + function updateEditorState() { const actual = countLines(textarea.value); actualLinesEl.textContent = actual.toString(); @@ -406,10 +335,14 @@ const mismatch = expectedLines > 0 && actual !== expectedLines; warningEl.style.display = mismatch ? "inline" : "none"; saveBtn.disabled = mismatch; + syncScroll(); } textarea.addEventListener("input", updateEditorState); + textarea.addEventListener("scroll", syncScroll); + updateEditorState(); + syncScroll(); } })();