feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation

2026-04-03 11:56:23 -05:00 · 2026-04-03 11:56:23 -05:00 · e67a67f80a
parent 0d70e6b7bb
commit e67a67f80a
7 changed files with 634 additions and 115 deletions
--- a/app/core/config.py
+++ b/app/core/config.py
@ -8,3 +8,5 @@ DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/documen
 DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
 INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
 UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
 OCR_CORRECTED_ROOT = os.getenv("OCR_CORRECTED_ROOT", "/mnt/storage/document-processor/outputs/ocr_corrected")
 FIELD_ENRICHED_ROOT = os.getenv("FIELD_ENRICHED_ROOT", "/mnt/storage/document-processor/outputs/field_enriched")
--- a/app/logic/document_outputs.py
+++ b/app/logic/document_outputs.py
@ -0,0 +1,243 @@
 from __future__ import annotations
 import hashlib
 import shutil
 import subprocess
 import tempfile
 from pathlib import Path
 from PIL import Image
 from pypdf import PdfReader
 from reportlab.lib.utils import ImageReader
 from reportlab.pdfbase.pdfmetrics import stringWidth
 from reportlab.pdfgen import canvas
 from sqlalchemy import func
 from sqlalchemy.orm import Session
 from app.core.config import FIELD_ENRICHED_ROOT, OCR_CORRECTED_ROOT
 from app.models.document import Document
 from app.models.document_version import DocumentVersion
 from app.models.text_version import TextVersion
 def sha256_for_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so that large files are never loaded
    into memory in one go.
    """
    digest = hashlib.sha256()
    chunk_size = 1024 * 1024
    with path.open("rb") as stream:
        # An empty read marks end-of-file and terminates the loop.
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()
 def get_next_document_version_number(db: Session, document_id: int) -> int:
    """Return the version number to give the next ``DocumentVersion`` row.

    Looks up the highest ``version_number`` stored for ``document_id`` and
    returns one more than that; when nothing is stored yet the result is 1.
    """
    highest_query = db.query(func.max(DocumentVersion.version_number)).filter(
        DocumentVersion.document_id == document_id
    )
    highest = highest_query.scalar()
    if not highest:
        # No rows yet (the aggregate yields None), so numbering starts at 1.
        return 1
    return highest + 1
 def _build_output_path(root: str, document: Document, version_type: str) -> Path:
    """Return ``<root>/<document_id>_<version_type><ext>`` for a generated file.

    ``<ext>`` is the lower-cased suffix of ``document.current_path``; when the
    document has no current path, or the path has no suffix, ``.pdf`` is used.
    """
    extension = Path(document.current_path or "").suffix
    extension = extension.lower() if extension else ".pdf"
    return Path(root).joinpath(f"{document.document_id}_{version_type}{extension}")
 def _latest_current_text_version(document: Document, version_type: str) -> TextVersion | None:
    """Return the newest current text version of ``version_type``, or ``None``.

    Only entries of ``document.text_versions`` flagged ``is_current`` with a
    matching ``version_type`` are considered; the winner is the one with the
    greatest ``(version_number, created_at)`` pair.
    """
    matching = (
        tv
        for tv in document.text_versions
        if tv.version_type == version_type and tv.is_current
    )
    return max(
        matching,
        key=lambda tv: (tv.version_number, tv.created_at),
        default=None,
    )
 def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
    """Rasterise ``pdf_path`` to PNG files inside ``tmpdir`` using ``pdftoppm``.

    Returns the generated ``page-*.png`` paths in sorted order. Because the
    command runs with ``check=True``, a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    command = ["pdftoppm", "-png", str(pdf_path), str(tmpdir / "page")]
    subprocess.run(command, capture_output=True, text=True, check=True)
    page_images = list(tmpdir.glob("page-*.png"))
    page_images.sort()
    return page_images
 def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
    """Choose a Helvetica font size so ``text`` roughly fills its line box.

    For empty text the size is 80% of the box height (never below 6.0).
    Otherwise the size starts at 88% of the box height and shrinks in steps
    of 0.25 until the rendered width fits within 98% of ``box_width``; the
    result is never smaller than ``max(6.0, 0.68 * box_height)``.
    """
    if not text:
        return max(6.0, box_height * 0.80)

    size = max(6.0, box_height * 0.88)
    while True:
        if size <= 4.5:
            break
        if stringWidth(text, "Helvetica", size) <= box_width * 0.98:
            break
        size -= 0.25

    # Clamp so that very long lines do not end up with a tiny font.
    lower_bound = max(6.0, box_height * 0.68)
    return max(lower_bound, size)
 def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
    """Flatten a layout's per-page line lists into one list of line records.

    Each record holds the owning page's ``"page"`` value, the line's
    ``"bbox"`` and its ``"text"`` (``""`` when the line has no text key).
    A missing or empty layout yields an empty list.
    """
    if not layout_json:
        return []
    return [
        {
            "page": page["page"],
            "bbox": entry["bbox"],
            "text": entry.get("text", ""),
        }
        for page in layout_json.get("pages", [])
        for entry in page.get("lines", [])
    ]
 def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Rebuild the current PDF as a searchable PDF and record it as a new version.

    Every page of the current PDF is rasterised with ``pdftoppm``, drawn as a
    full-page image, and overlaid with text in render mode 3 (invisible)
    positioned on the OCR line boxes. The text comes from the reviewed
    version's layout when it has one, otherwise from the raw OCR layout.

    Args:
        db: Session used to number, persist and commit the new version.
        document: Document whose ``current_path`` PDF is rebuilt. Its
            ``current_path``, ``canonical_filename`` and ``sha256_current``
            are updated to point at the generated file.

    Returns:
        The committed ``DocumentVersion`` with ``version_type="ocr_corrected"``.

    Raises:
        ValueError: If the document has no current path, lacks a current raw
            OCR or reviewed text version, is not a PDF, has no usable line
            layout, has a reviewed layout whose line count differs from the
            raw OCR layout, or no page could be rendered.
        FileNotFoundError: If the current file does not exist on disk.
        subprocess.CalledProcessError: If ``pdftoppm`` exits with an error.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")
    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")
    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")
    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")
    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)
    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")
    # A reviewed layout is only usable if it maps one-to-one onto the OCR boxes.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")
    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")
    out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    reader = PdfReader(str(current_file))
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        images = _render_pdf_page_images(current_file, tmpdir)
        overlay_pdf_path = tmpdir / "overlay.pdf"
        c = None
        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}
        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)
            # Only the pixel dimensions are needed; the context manager closes
            # the image file immediately instead of leaking one handle per page.
            with Image.open(img_path) as img:
                img_w, img_h = img.size
            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))
            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)
            page_layout = page_layouts.get(page_num, {"lines": []})
            # Line boxes are in the pixel space of the OCR image; fall back to
            # the freshly rendered image size when the layout does not record it.
            src_w = float(page_layout.get("image_width") or img_w)
            src_h = float(page_layout.get("image_height") or img_h)
            scale_x = page_w / src_w
            scale_y = page_h / src_h
            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue
                left, top, right, bottom = line["bbox"]
                pdf_x = left * scale_x
                # Image coordinates grow downwards, PDF coordinates upwards.
                pdf_y = page_h - (bottom * scale_y)
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)
                font_size = _fit_font_size(text_line, box_width, box_height)
                text_obj = c.beginText()
                # Render mode 3 draws nothing visible but keeps the text
                # selectable and searchable.
                text_obj.setTextRenderMode(3)
                text_obj.setFont("Helvetica", font_size)
                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
                text_obj.textLine(text_line)
                c.drawText(text_obj)
            c.showPage()
        if c is None:
            raise ValueError("Failed to build overlay PDF")
        c.save()
        # Copy out before the temporary directory is removed.
        shutil.copy2(overlay_pdf_path, out_path)
    file_hash = sha256_for_file(out_path)
    version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="ocr_corrected",
        file_path=str(out_path),
        sha256=file_hash,
        created_by="save_ocr_corrected_pdf",
        notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
    )
    db.add(version)
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.commit()
    db.refresh(version)
    return version
 def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Record a ``field_enriched`` version of the document's current file.

    This is a scaffold: the current file is copied unchanged into
    ``FIELD_ENRICHED_ROOT`` and registered as a new version; no extracted
    fields are embedded yet.

    Args:
        db: Session used to number, persist and commit the new version.
        document: Document whose current file is copied. Its ``current_path``,
            ``canonical_filename`` and ``sha256_current`` are updated to point
            at the copy.

    Returns:
        The committed ``DocumentVersion`` with ``version_type="field_enriched"``.

    Raises:
        ValueError: If the document has no current path.
        FileNotFoundError: If the current file does not exist on disk.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")
    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")
    out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        shutil.copy2(current_file, out_path)
    except shutil.SameFileError:
        # The output name is derived only from the document id, so after a
        # first run the current file already *is* the output file. Nothing
        # needs copying then; carry on and record the new version.
        pass
    file_hash = sha256_for_file(out_path)
    version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="field_enriched",
        file_path=str(out_path),
        sha256=file_hash,
        created_by="save_field_enriched_pdf",
        notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
    )
    db.add(version)
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash
    db.commit()
    db.refresh(version)
    return version
--- a/app/logic/ingest.py
+++ b/app/logic/ingest.py
@ -1,12 +1,16 @@
 from __future__ import annotations
 import csv
 import hashlib
 import io
 import mimetypes
 import shutil
 import subprocess
 import tempfile
 from difflib import SequenceMatcher
 from pathlib import Path
 from PIL import Image
 from uuid import uuid4
 from sqlalchemy import func
@ -61,8 +65,7 @@ def get_tesseract_version() -> str | None:
            text=True,
            check=True,
        )
-        line = result.stdout.splitlines()[0].strip()
+        return result.stdout.splitlines()[0].strip()
        return line
    except Exception:
        return None
@ -93,67 +96,154 @@ def extract_pdf_text(path: Path) -> str:
        return ""
-def ocr_image(path: Path) -> str:
+def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
    reader = csv.DictReader(io.StringIO(tsv_text), delimiter="	")
    grouped: dict[tuple[int, int, int, int], list[dict]] = {}
    for row in reader:
        if not row.get("text"):
            continue
        text = row["text"].strip()
        if not text:
            continue
        try:
-        result = subprocess.run(
+            level = int(row["level"])
            page_num = int(row["page_num"])
            block_num = int(row["block_num"])
            par_num = int(row["par_num"])
            line_num = int(row["line_num"])
            left = int(row["left"])
            top = int(row["top"])
            width = int(row["width"])
            height = int(row["height"])
            conf = float(row["conf"]) if row["conf"] not in ("-1", "", None) else None
        except Exception:
            continue
        if level != 5:
            continue
        if page_num != page_number:
            continue
        key = (page_num, block_num, par_num, line_num)
        grouped.setdefault(key, []).append(
            {
                "text": text,
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "conf": conf,
            }
        )
    lines = []
    for key, words in grouped.items():
        words = sorted(words, key=lambda w: w["left"])
        left = min(w["left"] for w in words)
        top = min(w["top"] for w in words)
        right = max(w["left"] + w["width"] for w in words)
        bottom = max(w["top"] + w["height"] for w in words)
        line_text = " ".join(w["text"] for w in words).strip()
        avg_conf = None
        valid_conf = [w["conf"] for w in words if w["conf"] is not None]
        if valid_conf:
            avg_conf = round(sum(valid_conf) / len(valid_conf), 2)
        lines.append(
            {
                "text": line_text,
                "bbox": [left, top, right, bottom],
                "confidence": avg_conf,
            }
        )
    lines.sort(key=lambda x: (x["bbox"][1], x["bbox"][0]))
    return {
        "page": page_number,
        "image_width": image_width,
        "image_height": image_height,
        "lines": lines,
    }
 def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
    with Image.open(path) as img:
        image_width, image_height = img.size
    txt = subprocess.run(
        ["tesseract", str(path), "stdout"],
        capture_output=True,
        text=True,
        check=True,
-        )
+    ).stdout.strip()
-        return result.stdout.strip()
+
-    except Exception:
+    tsv = subprocess.run(
-        return ""
+        ["tesseract", str(path), "stdout", "tsv"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout
    layout = {"pages": [_parse_tsv_lines(tsv, 1, image_width, image_height)]}
    return txt, layout
-def ocr_pdf(path: Path) -> str:
+def ocr_pdf_with_layout(path: Path) -> tuple[str, dict]:
    with tempfile.TemporaryDirectory() as tmpdir:
        output_prefix = Path(tmpdir) / "page"
        try:
        subprocess.run(
            ["pdftoppm", "-png", str(path), str(output_prefix)],
            capture_output=True,
            text=True,
            check=True,
        )
        except Exception:
            return ""
-        texts: list[str] = []
+        all_text = []
-        for img in sorted(Path(tmpdir).glob("page-*.png")):
+        pages = []
            text = ocr_image(img)
            if text:
                texts.append(text)
-        return "\n\n".join(texts).strip()
+        for idx, img in enumerate(sorted(Path(tmpdir).glob("page-*.png")), start=1):
            txt, layout = ocr_image_with_layout(img)
            if txt:
                all_text.append(txt)
            if layout.get("pages"):
                page_layout = layout["pages"][0]
                page_layout["page"] = idx
                pages.append(page_layout)
        return "\n\n".join(all_text).strip(), {"pages": pages}
-def run_ocr_only(path: Path) -> tuple[str, str | None, str | None]:
+def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
    suffix = path.suffix.lower()
    tesseract_version = get_tesseract_version()
    if suffix == ".pdf":
-        return ocr_pdf(path).strip(), "tesseract", tesseract_version
+        txt, layout = ocr_pdf_with_layout(path)
        return txt.strip(), layout, "tesseract", tesseract_version
    if suffix in {".jpg", ".jpeg", ".png"}:
-        return ocr_image(path).strip(), "tesseract", tesseract_version
+        txt, layout = ocr_image_with_layout(path)
-    return "", None, None
+        return txt.strip(), layout, "tesseract", tesseract_version
    return "", None, None, None
-def get_raw_text_for_document(path: Path) -> tuple[str, str | None, str | None, str | None]:
+def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None, str | None, str | None]:
    suffix = path.suffix.lower()
    if suffix == ".pdf":
        extracted = extract_pdf_text(path)
        if len(extracted.strip()) >= 40:
-            return extracted, "pdftotext", get_pdftotext_version(), "initial_ingest"
+            return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest"
-        ocr_text = ocr_pdf(path).strip()
+        ocr_text, layout, engine, version = run_ocr_only(path)
-        return ocr_text, "tesseract", get_tesseract_version(), "initial_ingest_fallback"
+        return ocr_text, layout, engine, version, "initial_ingest_fallback"
    if suffix in {".jpg", ".jpeg", ".png"}:
-        return ocr_image(path).strip(), "tesseract", get_tesseract_version(), "initial_ingest"
+        ocr_text, layout, engine, version = run_ocr_only(path)
        return ocr_text, layout, engine, version, "initial_ingest"
-    return "", None, None, None
+    return "", None, None, None, None
 def compute_quality_score(source_text: str, reviewed_text: str) -> float:
@ -173,7 +263,6 @@ def archive_document(
 ) -> Document:
    if not source.exists():
        raise FileNotFoundError(f"Source file not found: {source}")
    if not is_supported_file(source):
        raise ValueError(f"Unsupported file type: {source.suffix}")
@ -187,7 +276,7 @@ def archive_document(
    mime_type = guess_mime_type(current_path)
    sha256_current = sha256_for_file(current_path)
-    raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
+    raw_text, layout_json, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
    document = Document(
        document_id=document_id,
@ -230,6 +319,7 @@ def archive_document(
            rerun_source=rerun_source,
            quality_flags=[],
            quality_note=None,
            layout_json=layout_json,
        )
        db.add(text_version)
@ -246,7 +336,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")
-    raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
+    raw_text, layout_json, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
    if not raw_text:
        raise ValueError("OCR produced no text")
@ -278,6 +368,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
        quality_flags=[],
        quality_note=None,
        derived_from_version_id=previous_raw_id,
        layout_json=layout_json,
    )
    db.add(new_text)
@ -288,19 +379,9 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
    return new_text
-def ingest_file(
+def ingest_file(db: Session, file_path: str, source_system: str, document_type: str = "receipt") -> Document:
    db: Session,
    file_path: str,
    source_system: str,
    document_type: str = "receipt",
 ) -> Document:
    source = Path(file_path).expanduser().resolve()
-    return archive_document(
+    return archive_document(db=db, source=source, source_system=source_system, document_type=document_type)
        db=db,
        source=source,
        source_system=source_system,
        document_type=document_type,
    )
 def ingest_uploaded_file(
@ -321,12 +402,7 @@ def ingest_uploaded_file(
    staged_path = upload_root / staged_name
    staged_path.write_bytes(file_bytes)
-    return archive_document(
+    return archive_document(db=db, source=staged_path, source_system=source_system, document_type=document_type)
        db=db,
        source=staged_path,
        source_system=source_system,
        document_type=document_type,
    )
 def ingest_directory(
@ -337,7 +413,6 @@ def ingest_directory(
    document_type: str = "receipt",
 ) -> list[Document]:
    source_dir = Path(directory_path).expanduser().resolve()
    if not source_dir.exists() or not source_dir.is_dir():
        raise NotADirectoryError(f"Directory not found: {source_dir}")
@ -349,12 +424,7 @@ def ingest_directory(
            continue
        try:
            ingested.append(
-                ingest_file(
+                ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type)
                    db=db,
                    file_path=str(path),
                    source_system=source_system,
                    document_type=document_type,
                )
            )
        except Exception:
            continue
--- a/app/models/document.py
+++ b/app/models/document.py
@ -16,6 +16,7 @@ class Document(Base):
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
    current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
    share_path: Mapped[str | None] = mapped_column(Text, nullable=True)
    original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
    canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
--- a/app/models/text_version.py
+++ b/app/models/text_version.py
@ -16,7 +16,7 @@ class TextVersion(Base):
    )
    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
-    version_type: Mapped[str] = mapped_column(String(50), nullable=False)  # raw_ocr, reviewed
+    version_type: Mapped[str] = mapped_column(String(50), nullable=False)
    text_content: Mapped[str] = mapped_column(Text, nullable=False)
@ -36,6 +36,8 @@ class TextVersion(Base):
        nullable=True,
    )
    layout_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=datetime.utcnow, nullable=False
    )
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@ -1,3 +1,4 @@
 from copy import deepcopy
 from pathlib import Path
 from uuid import uuid4
@ -7,6 +8,10 @@ from fastapi.templating import Jinja2Templates
 from sqlalchemy.orm import Session, selectinload
 from app.db.deps import get_db
 from app.logic.document_outputs import (
    create_field_enriched_pdf_version,
    create_ocr_corrected_pdf_version,
 )
 from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
 from app.models.document import Document
 from app.models.document_version import DocumentVersion
@ -39,6 +44,68 @@ QUALITY_FLAG_OPTIONS = [
 ]
 def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
    """Return the document's newest current ``(raw_ocr, reviewed)`` text versions.

    Versions are ranked by ``(version_number, created_at)``, newest first.
    Either element of the pair is ``None`` when no current version of that
    type exists.
    """
    newest_first = sorted(
        document.text_versions,
        key=lambda tv: (tv.version_number, tv.created_at),
        reverse=True,
    )

    def first_current(kind: str):
        # The list is ordered newest first, so the first hit is the latest.
        for tv in newest_first:
            if tv.version_type == kind and tv.is_current:
                return tv
        return None

    return first_current("raw_ocr"), first_current("reviewed")
 def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
    """Return the stripped text of every layout line, page by page.

    Lines whose text is missing or ``None`` contribute an empty string; a
    missing or empty layout yields an empty list.
    """
    if not layout_json:
        return []
    return [
        (entry.get("text") or "").strip()
        for page in layout_json.get("pages", [])
        for entry in page.get("lines", [])
    ]
 def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
    """Return the text that pre-fills the review editor.

    The raw OCR version is used when present, otherwise the reviewed one.
    When the chosen version has a layout, its line texts joined by newlines
    are returned; otherwise its plain ``text_content``; otherwise ``""``.
    """
    # Prefer the current raw OCR in the editor so rerun OCR immediately refreshes
    # the editable line set. Reviewed text remains visible above as history/state.
    source = raw_ocr if raw_ocr else reviewed_ocr
    if not source:
        return ""
    if source.layout_json:
        line_texts = _extract_line_texts_from_layout(source.layout_json)
        return "\n".join(line_texts)
    if source.text_content:
        return source.text_content
    return ""
 def _line_count_from_layout(layout_json: dict | None) -> int:
    """Return how many lines the layout holds across all of its pages."""
    line_texts = _extract_line_texts_from_layout(layout_json)
    return len(line_texts)
 def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None:
    """Return a copy of ``base_layout`` whose line texts come from ``reviewed_text``.

    The reviewed text is split into lines and assigned to the layout lines in
    page order. Layout lines beyond the end of the reviewed text get ``""``.
    ``base_layout`` itself is left untouched; a missing or empty layout
    returns ``None``.
    """
    if not base_layout:
        return None
    updated = deepcopy(base_layout)
    replacements = iter(reviewed_text.splitlines())
    for page in updated.get("pages", []):
        for entry in page.get("lines", []):
            # Once the reviewed lines run out, remaining boxes are blanked.
            entry["text"] = next(replacements, "")
    return updated
@router.get("/", response_class=HTMLResponse)
 def list_documents(request: Request, db: Session = Depends(get_db)):
    documents = db.query(Document).order_by(Document.created_at.desc()).all()
@ -85,12 +152,7 @@ def test_ingest(db: Session = Depends(get_db)):
        document_id=document.id,
        version_number=1,
        version_type="raw_ocr",
-        text_content=(
+        text_content="CVS PHARMACY\nDate: 2026-04-01\nTotal: 12.34 USD\nHousehold supplies\n",
            "CVS PHARMACY\n"
            "Date: 2026-04-01\n"
            "Total: 12.34 USD\n"
            "Household supplies\n"
        ),
        created_by="system",
        is_current=True,
        ocr_engine="test_seed",
@ -116,8 +178,36 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    try:
        rerun_ocr_for_document(db, document)
    except Exception:
        return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
 def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
    """Generate the OCR-corrected PDF for a document, then redirect.

    Unknown documents redirect to the document list. Otherwise the browser is
    sent back to the document's detail page, with
    ``?error=save_ocr_corrected_failed`` appended when generation raised.
    """
    document = (
        db.query(Document)
        .options(selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)
    try:
        create_ocr_corrected_pdf_version(db, document)
    except Exception:
        # Any failure is surfaced to the user through the error query flag.
        target = f"/documents/{document.document_id}?error=save_ocr_corrected_failed"
    else:
        target = f"/documents/{document.document_id}"
    return RedirectResponse(url=target, status_code=303)
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
 def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
    """Generate the field-enriched PDF for a document, then redirect.

    Unknown documents redirect to the document list. Otherwise the browser is
    sent back to the document's detail page, with
    ``?error=save_field_enriched_failed`` appended when generation raised.
    """
    lookup = db.query(Document).filter(Document.document_id == document_id)
    document = lookup.first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)
    try:
        create_field_enriched_pdf_version(db, document)
    except Exception:
        # Any failure is surfaced to the user through the error query flag.
        target = f"/documents/{document.document_id}?error=save_field_enriched_failed"
    else:
        target = f"/documents/{document.document_id}"
    return RedirectResponse(url=target, status_code=303)
@ -139,15 +229,14 @@ def save_reviewed_text(
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)
-    sorted_text_versions = sorted(
+    raw_ocr, _ = _get_current_text_versions(document)
-        document.text_versions,
+    expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
-        key=lambda x: (x.version_number, x.created_at),
+    actual_line_count = len(reviewed_text.splitlines())
        reverse=True,
    )
-    current_raw = next(
+    if expected_line_count and actual_line_count != expected_line_count:
-        (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
+        return RedirectResponse(
-        None,
+            url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}",
            status_code=303,
        )
    existing_reviewed = [
@ -156,6 +245,11 @@ def save_reviewed_text(
    for tv in existing_reviewed:
        tv.is_current = False
    reviewed_layout = _apply_reviewed_lines_to_layout(
        raw_ocr.layout_json if raw_ocr else None,
        reviewed_text,
    )
    reviewed_version = TextVersion(
        document_id=document.id,
        version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
@ -163,14 +257,15 @@ def save_reviewed_text(
        text_content=reviewed_text,
        created_by="mcelwain",
        is_current=True,
-        derived_from_version_id=current_raw.id if current_raw else None,
+        derived_from_version_id=raw_ocr.id if raw_ocr else None,
        layout_json=reviewed_layout,
    )
    db.add(reviewed_version)
-    if current_raw:
+    if raw_ocr:
-        current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
+        raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
-        current_raw.quality_flags = quality_flags or []
+        raw_ocr.quality_flags = quality_flags or []
-        current_raw.quality_note = quality_note or None
+        raw_ocr.quality_note = quality_note or None
    document.review_status = "reviewed"
@ -196,27 +291,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)
-    sorted_text_versions = sorted(
+    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
-        document.text_versions,
+    review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)
        key=lambda x: (x.version_number, x.created_at),
        reverse=True,
    )
-    raw_ocr = next(
+    base_layout = (
-        (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
+        reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
-        None,
+        else raw_ocr.layout_json if raw_ocr else None
    )
    reviewed_ocr = next(
        (tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
        None,
    )
    review_text_value = (
        reviewed_ocr.text_content
        if reviewed_ocr is not None
        else raw_ocr.text_content if raw_ocr is not None else ""
    )
    expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
    actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0
    line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
    file_url = None
    if document.current_path:
@ -228,6 +312,11 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
        except Exception:
            file_url = None
    app_url = str(request.url_for("document_detail", document_id=document.document_id))
    error = request.query_params.get("error")
    error_expected = request.query_params.get("expected")
    error_actual = request.query_params.get("actual")
    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
@ -238,8 +327,15 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
            "reviewed_ocr": reviewed_ocr,
            "review_text_value": review_text_value,
            "file_url": file_url,
            "app_url": app_url,
            "quality_flag_options": QUALITY_FLAG_OPTIONS,
            "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
            "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
            "line_numbers": line_numbers,
            "expected_line_count": expected_line_count,
            "actual_line_count": actual_line_count,
            "error": error,
            "error_expected": error_expected,
            "error_actual": error_actual,
        },
    )
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@ -3,17 +3,67 @@
 <head>
    <meta charset="UTF-8">
    <title>{{ document.document_id }}</title>
    <style>
        body { font-family: sans-serif; }
        textarea { font-family: monospace; }
        .editor-wrap {
            display: flex;
            align-items: flex-start;
            gap: 0.5rem;
        }
        .line-numbers {
            font-family: monospace;
            white-space: pre;
            text-align: right;
            color: #666;
            user-select: none;
            padding-top: 2px;
            min-width: 3rem;
        }
        .line-warning {
            color: #8a5a00;
            font-weight: 600;
        }
        .error-box {
            background: #ffe8e8;
            color: #8b0000;
            padding: 0.75rem;
            border: 1px solid #cc9999;
            margin-bottom: 1rem;
        }
    </style>
 </head>
 <body>
    <p><a href="/documents/">Back to documents</a></p>
    <h1>{{ document.document_id }}</h1>
    {% if error == "line_count_mismatch" %}
        <div class="error-box">
            Could not save reviewed OCR because line count did not match OCR layout.
            Expected {{ error_expected }}, got {{ error_actual }}.
        </div>
    {% elif error == "save_ocr_corrected_failed" %}
        <div class="error-box">
            Could not save OCR-corrected PDF. Check that reviewed OCR line count matches raw OCR line count.
        </div>
    {% elif error == "rerun_ocr_failed" %}
        <div class="error-box">
            OCR rerun failed.
        </div>
    {% elif error == "save_field_enriched_failed" %}
        <div class="error-box">
            Could not save field-enriched PDF.
        </div>
    {% endif %}
    <h2>Document metadata</h2>
    <ul>
        <li>Type: {{ document.document_type }}</li>
        <li>Source path: {{ document.source_path }}</li>
        <li>Current path: {{ document.current_path }}</li>
        <li>Share path: {{ document.share_path or "" }}</li>
        <li>App URL: <a href="{{ app_url }}">{{ app_url }}</a></li>
        <li>Original filename: {{ document.original_filename }}</li>
        <li>Canonical filename: {{ document.canonical_filename }}</li>
        <li>MIME type: {{ document.mime_type }}</li>
@ -25,6 +75,14 @@
        <li>Updated at: {{ document.updated_at }}</li>
    </ul>
    <h2>Saved PDF scaffolds</h2>
    <form method="post" action="/documents/{{ document.document_id }}/save-ocr-corrected-pdf" style="display:inline;">
        <button type="submit">Save OCR-corrected PDF</button>
    </form>
    <form method="post" action="/documents/{{ document.document_id }}/save-field-enriched-pdf" style="display:inline; margin-left: 1rem;">
        <button type="submit">Save field-enriched PDF</button>
    </form>
    <h2>Document preview</h2>
    {% if file_url %}
        {% if document.mime_type == "application/pdf" %}
@ -47,6 +105,7 @@
                {{ version.version_type }} —
                {{ version.file_path }} —
                {{ version.created_at }}
                {% if version.notes %}<br><em>{{ version.notes }}</em>{% endif %}
            </li>
        {% endfor %}
        </ul>
@ -84,12 +143,23 @@
        <p>No reviewed OCR saved yet.</p>
    {% endif %}
    <p>
        Expected OCR lines: <span id="expected-lines">{{ expected_line_count }}</span><br>
        Current editor lines: <span id="actual-lines">{{ actual_line_count }}</span>
        <br><span id="line-warning" class="line-warning" {% if expected_line_count == actual_line_count %}style="display:none;"{% endif %}>
            Line count mismatch may affect corrected PDF layout.
        </span>
    </p>
    <form method="post" action="/documents/{{ document.document_id }}/review-text">
        <div>
-            <label for="reviewed_text">Edit reviewed OCR text:</label>
+            <label for="reviewed_text">Edit reviewed OCR text (one line per OCR line):</label>
        </div>
-        <div>
+
-            <textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
+        <div class="editor-wrap">
            <div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
 {% endfor %}</div>
            <textarea id="reviewed_text" name="reviewed_text" rows="{{ [actual_line_count + 2, 20]|max }}" cols="100">{{ review_text_value }}</textarea>
        </div>
        <h3>Quality flags</h3>
@ -113,8 +183,43 @@
        </div>
        <div style="margin-top: 1rem;">
-            <button type="submit">Save reviewed OCR</button>
+            <button type="submit" id="save-reviewed-btn">Save reviewed OCR</button>
        </div>
    </form>
    <script>
        // DOM handles for the reviewed-OCR editor and its line-count widgets.
        const textarea = document.getElementById("reviewed_text");
        // Expected count is read from the server-rendered #expected-lines span;
        // an empty span falls back to "0" before parsing.
        const expectedLines = parseInt(document.getElementById("expected-lines").textContent || "0", 10);
        const actualLinesEl = document.getElementById("actual-lines");
        const warningEl = document.getElementById("line-warning");
        const saveBtn = document.getElementById("save-reviewed-btn");
        const lineNumbersEl = document.getElementById("line-numbers");
        /**
         * Count the lines in the editor text.
         *
         * An empty string is zero lines. Otherwise the result is the number of
         * '\n'-separated segments, so a trailing newline contributes one extra
         * (empty) line.
         */
        function countLines(text) {
            const isEmpty = text.length === 0;
            return isEmpty ? 0 : text.split('\n').length;
        }
        /**
         * Rewrite the line-number column so it shows 1..lineCount, one number
         * per row, each followed by a newline. A count below 1 clears the column.
         */
        function rebuildLineNumbers(lineCount) {
            const rows = [];
            let current = 1;
            while (current <= lineCount) {
                rows.push(current + "\n");
                current += 1;
            }
            lineNumbersEl.textContent = rows.join("");
        }
        /**
         * Sync the line-count UI with the textarea contents.
         *
         * Updates the "current editor lines" counter, redraws the line-number
         * column to cover the larger of the current and expected counts, and
         * shows the warning / disables the save button when an expected count
         * is known (> 0) and the current count differs from it.
         */
        function updateEditorState() {
            const currentCount = countLines(textarea.value);
            const gutterRows = Math.max(currentCount, expectedLines);

            actualLinesEl.textContent = String(currentCount);
            rebuildLineNumbers(gutterRows);

            // With no expected count (0) there is nothing to compare against.
            let countsDiffer = false;
            if (expectedLines > 0) {
                countsDiffer = currentCount !== expectedLines;
            }

            if (countsDiffer) {
                warningEl.style.display = "inline";
            } else {
                warningEl.style.display = "none";
            }
            saveBtn.disabled = countsDiffer;
        }
        // Re-evaluate on every edit, and once immediately so the counter,
        // line-number column and save button reflect the initial textarea contents.
        textarea.addEventListener("input", updateEditorState);
        updateEditorState();
    </script>
 </body>
 </html>