feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation

2026-04-03 11:56:23 -05:00 · 2026-04-03 11:56:23 -05:00 · e67a67f80a
parent 0d70e6b7bb
commit e67a67f80a
7 changed files with 634 additions and 115 deletions
--- a/app/core/config.py
+++ b/app/core/config.py
@ -8,3 +8,5 @@ DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/documen
 DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
 INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
 UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
+OCR_CORRECTED_ROOT = os.getenv("OCR_CORRECTED_ROOT", "/mnt/storage/document-processor/outputs/ocr_corrected")
+FIELD_ENRICHED_ROOT = os.getenv("FIELD_ENRICHED_ROOT", "/mnt/storage/document-processor/outputs/field_enriched")
--- a/app/logic/document_outputs.py
+++ b/app/logic/document_outputs.py
@ -0,0 +1,243 @@
+from __future__ import annotations
+
+import hashlib
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+from PIL import Image
+from pypdf import PdfReader
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfbase.pdfmetrics import stringWidth
+from reportlab.pdfgen import canvas
+from sqlalchemy import func
+from sqlalchemy.orm import Session
+
+from app.core.config import FIELD_ENRICHED_ROOT, OCR_CORRECTED_ROOT
+from app.models.document import Document
+from app.models.document_version import DocumentVersion
+from app.models.text_version import TextVersion
+
+
+def sha256_for_file(path: Path) -> str:
+    """Return the hexadecimal SHA-256 digest of the file at *path*.
+
+    The file is read in 1 MiB chunks, so memory use does not grow with file size.
+    """
+    digest = hashlib.sha256()
+    chunk_size = 1024 * 1024
+    with path.open("rb") as handle:
+        while block := handle.read(chunk_size):
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def get_next_document_version_number(db: Session, document_id: int) -> int:
+    """Return the version number to give the document's next DocumentVersion.
+
+    Looks up the highest ``version_number`` stored for *document_id* and adds one;
+    when the query yields nothing (or zero) the numbering starts at 1.
+    """
+    highest_query = db.query(func.max(DocumentVersion.version_number)).filter(
+        DocumentVersion.document_id == document_id
+    )
+    highest = highest_query.scalar()
+    if not highest:
+        return 1
+    return highest + 1
+
+
+def _build_output_path(root: str, document: Document, version_type: str) -> Path:
+    """Return ``<root>/<document_id>_<version_type><suffix>`` for a generated output.
+
+    The suffix is the lower-cased extension of the document's current path, or
+    ``.pdf`` when the document has no current path or the path has no extension.
+    """
+    extension = Path(document.current_path or "").suffix.lower() or ".pdf"
+    return Path(root) / f"{document.document_id}_{version_type}{extension}"
+
+
+def _latest_current_text_version(document: Document, version_type: str) -> TextVersion | None:
+    """Return the newest current text version of *version_type*, or ``None``.
+
+    "Newest" means the highest ``(version_number, created_at)`` pair among the
+    document's text versions that are flagged ``is_current``.
+    """
+    matching = []
+    for tv in document.text_versions:
+        if tv.version_type == version_type and tv.is_current:
+            matching.append(tv)
+
+    if matching:
+        matching.sort(key=lambda tv: (tv.version_number, tv.created_at), reverse=True)
+        return matching[0]
+    return None
+
+
+def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
+    """Rasterize *pdf_path* to PNG files in *tmpdir* and return them sorted.
+
+    Runs ``pdftoppm -png`` with the output prefix ``<tmpdir>/page`` and collects
+    the resulting ``page-*.png`` files. Because ``check=True`` is used, a failing
+    ``pdftoppm`` raises ``subprocess.CalledProcessError``.
+    """
+    command = ["pdftoppm", "-png", str(pdf_path), str(tmpdir / "page")]
+    subprocess.run(command, capture_output=True, text=True, check=True)
+
+    page_images = list(tmpdir.glob("page-*.png"))
+    page_images.sort()
+    return page_images
+
+
+def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
+    """Choose a Helvetica font size for *text* inside a line box.
+
+    Empty text gets 80% of the box height. Otherwise the size starts at 88% of
+    the box height and shrinks in quarter-point steps while the measured string
+    is wider than 98% of the box. Every result is clamped to at least 6.0, and
+    for non-empty text also to at least 68% of the box height.
+    """
+    if not text:
+        return max(6.0, box_height * 0.80)
+
+    width_limit = box_width * 0.98
+    floor = max(6.0, box_height * 0.68)
+    size = max(6.0, box_height * 0.88)
+
+    # Shrink until the rendered string fits the available width.
+    while size > 4.5 and stringWidth(text, "Helvetica", size) > width_limit:
+        size -= 0.25
+
+    # The floor takes precedence over the shrunken size.
+    return max(floor, size)
+
+
+def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
+    """Flatten a page-structured layout into one list of line records.
+
+    Each record holds the owning page's ``page`` value plus the line's ``bbox``
+    and ``text`` (empty string when the line has no text). A missing or empty
+    layout yields an empty list.
+    """
+    if not layout_json:
+        return []
+
+    return [
+        {
+            "page": page["page"],
+            "bbox": line["bbox"],
+            "text": line.get("text", ""),
+        }
+        for page in layout_json.get("pages", [])
+        for line in page.get("lines", [])
+    ]
+
+
+def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion:
+    """Rebuild the current PDF with reviewed text placed on the OCR line boxes.
+
+    Every page of the current PDF is rasterized and drawn as a full-page image,
+    then each non-empty layout line is written over it in text render mode 3 at
+    the position of its bounding box. The result is stored under
+    ``OCR_CORRECTED_ROOT``, recorded as a new ``ocr_corrected`` DocumentVersion,
+    and made the document's current file. The session is committed before the
+    new version is returned.
+
+    Raises:
+        ValueError: if the document has no current path, is not a PDF, lacks a
+            current raw OCR or reviewed text version, has no usable layout data,
+            has a reviewed layout whose line count differs from the raw OCR
+            layout, or produces no rendered pages.
+        FileNotFoundError: if the current file is missing on disk.
+        subprocess.CalledProcessError: if ``pdftoppm`` fails.
+    """
+    if not document.current_path:
+        raise ValueError("Document has no current_path")
+
+    current_file = Path(document.current_path)
+    if not current_file.exists():
+        raise FileNotFoundError(f"Current file not found: {current_file}")
+
+    raw_ocr = _latest_current_text_version(document, "raw_ocr")
+    reviewed = _latest_current_text_version(document, "reviewed")
+
+    if raw_ocr is None:
+        raise ValueError("No current raw OCR version found")
+    if reviewed is None:
+        raise ValueError("No current reviewed text found")
+    if current_file.suffix.lower() != ".pdf":
+        raise ValueError("C1 corrected PDF generation currently supports PDFs only")
+
+    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
+    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)
+
+    if not raw_lines:
+        raise ValueError("No OCR line boxes found in raw OCR layout data")
+
+    # Reviewed text is only usable when it maps one-to-one onto the OCR boxes.
+    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
+        raise ValueError("Reviewed line layout does not match raw OCR line layout")
+
+    # Prefer the reviewed layout; fall back to raw OCR when it has none.
+    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
+    if not source_layout:
+        raise ValueError("No source layout found")
+
+    out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    reader = PdfReader(str(current_file))
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        tmpdir = Path(tmpdirname)
+        images = _render_pdf_page_images(current_file, tmpdir)
+
+        overlay_pdf_path = tmpdir / "overlay.pdf"
+        c = None
+
+        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}
+
+        for page_num, img_path in enumerate(images, start=1):
+            pdf_page = reader.pages[page_num - 1]
+            page_w = float(pdf_page.mediabox.width)
+            page_h = float(pdf_page.mediabox.height)
+
+            # Only the pixel dimensions are needed, so close the image right away
+            # instead of keeping one open file handle per page.
+            with Image.open(img_path) as img:
+                img_width, img_height = img.size
+
+            # The canvas is created lazily so its first page gets the right size.
+            if c is None:
+                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
+            else:
+                c.setPageSize((page_w, page_h))
+
+            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)
+
+            page_layout = page_layouts.get(page_num, {"lines": []})
+            # Layout boxes are in OCR image pixels; fall back to the rendered
+            # image size when the layout does not record its own dimensions.
+            src_w = float(page_layout.get("image_width") or img_width)
+            src_h = float(page_layout.get("image_height") or img_height)
+
+            scale_x = page_w / src_w
+            scale_y = page_h / src_h
+
+            for line in page_layout.get("lines", []):
+                text_line = (line.get("text") or "").strip()
+                if not text_line:
+                    continue
+
+                left, top, right, bottom = line["bbox"]
+
+                # Flip the vertical axis: the box bottom is measured down from
+                # the top of the image, the text origin up from the page bottom.
+                pdf_x = left * scale_x
+                pdf_y = page_h - (bottom * scale_y)
+                box_width = max(10.0, (right - left) * scale_x)
+                box_height = max(6.0, (bottom - top) * scale_y)
+
+                font_size = _fit_font_size(text_line, box_width, box_height)
+
+                text_obj = c.beginText()
+                # Render mode 3 draws no visible glyphs; the page image stays
+                # the visual content.
+                text_obj.setTextRenderMode(3)
+                text_obj.setFont("Helvetica", font_size)
+                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
+                text_obj.textLine(text_line)
+                c.drawText(text_obj)
+
+            c.showPage()
+
+        if c is None:
+            raise ValueError("Failed to build overlay PDF")
+
+        c.save()
+        # Copy out before the temporary directory is removed.
+        shutil.copy2(overlay_pdf_path, out_path)
+
+    file_hash = sha256_for_file(out_path)
+
+    version = DocumentVersion(
+        document_id=document.id,
+        version_number=get_next_document_version_number(db, document.id),
+        version_type="ocr_corrected",
+        file_path=str(out_path),
+        sha256=file_hash,
+        created_by="save_ocr_corrected_pdf",
+        notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
+    )
+    db.add(version)
+
+    # The generated file becomes the document's current file.
+    document.current_path = str(out_path)
+    document.canonical_filename = out_path.name
+    document.sha256_current = file_hash
+
+    db.commit()
+    db.refresh(version)
+    return version
+
+
+def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion:
+    """Record a ``field_enriched`` version of the document's current file.
+
+    This is a scaffold: the current file is copied unchanged to
+    ``FIELD_ENRICHED_ROOT``, a DocumentVersion row is added for the copy, and the
+    copy becomes the document's current file. The session is committed before
+    the new version is returned.
+
+    Raises:
+        ValueError: if the document has no current path.
+        FileNotFoundError: if the current file is missing on disk.
+    """
+    if not document.current_path:
+        raise ValueError("Document has no current_path")
+
+    current_file = Path(document.current_path)
+    if not current_file.exists():
+        raise FileNotFoundError(f"Current file not found: {current_file}")
+
+    out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # After a previous run current_path already equals out_path (it is updated
+    # below), and shutil.copy2 raises SameFileError when source and destination
+    # are the same file. Skip the copy in that case.
+    if current_file.resolve() != out_path.resolve():
+        shutil.copy2(current_file, out_path)
+    file_hash = sha256_for_file(out_path)
+
+    version = DocumentVersion(
+        document_id=document.id,
+        version_number=get_next_document_version_number(db, document.id),
+        version_type="field_enriched",
+        file_path=str(out_path),
+        sha256=file_hash,
+        created_by="save_field_enriched_pdf",
+        notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
+    )
+    db.add(version)
+
+    # The copy becomes the document's current file.
+    document.current_path = str(out_path)
+    document.canonical_filename = out_path.name
+    document.sha256_current = file_hash
+
+    db.commit()
+    db.refresh(version)
+    return version
--- a/app/logic/ingest.py
+++ b/app/logic/ingest.py
@ -1,12 +1,16 @@
 from __future__ import annotations

+import csv
 import hashlib
+import io
 import mimetypes
 import shutil
 import subprocess
 import tempfile
 from difflib import SequenceMatcher
 from pathlib import Path
+
+from PIL import Image
 from uuid import uuid4

 from sqlalchemy import func
@ -61,8 +65,7 @@ def get_tesseract_version() -> str | None:
            text=True,
            check=True,
        )
-        line = result.stdout.splitlines()[0].strip()
-        return line
+        return result.stdout.splitlines()[0].strip()
    except Exception:
        return None

@ -93,67 +96,154 @@ def extract_pdf_text(path: Path) -> str:
        return ""


-def ocr_image(path: Path) -> str:
-    try:
-        result = subprocess.run(
-            ["tesseract", str(path), "stdout"],
+def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
+    """Group Tesseract TSV word rows into line entries for one page.
+
+    Only level-5 rows with non-blank text whose ``page_num`` equals
+    *page_number* are used. Words sharing the same (page, block, paragraph,
+    line) numbers form one line: the words are joined left to right with single
+    spaces, the box is the union of the word boxes, and the confidence is the
+    mean of the word confidences that are present (``None`` if there are none).
+
+    Returns:
+        A dict with ``page``, ``image_width``, ``image_height`` and ``lines``.
+        Each line has ``text``, ``bbox`` as ``[left, top, right, bottom]`` and
+        ``confidence``; lines are ordered by top edge, then left edge.
+    """
+    # The OCR text column is written unquoted, so quote characters must be read
+    # literally. With the csv default, a word starting with '"' would open a
+    # quoted field and swallow the rows that follow it.
+    reader = csv.DictReader(
+        io.StringIO(tsv_text),
+        delimiter="\t",
+        quoting=csv.QUOTE_NONE,
+    )
+    grouped: dict[tuple[int, int, int, int], list[dict]] = {}
+
+    for row in reader:
+        if not row.get("text"):
+            continue
+        text = row["text"].strip()
+        if not text:
+            continue
+
+        try:
+            level = int(row["level"])
+            page_num = int(row["page_num"])
+            block_num = int(row["block_num"])
+            par_num = int(row["par_num"])
+            line_num = int(row["line_num"])
+            left = int(row["left"])
+            top = int(row["top"])
+            width = int(row["width"])
+            height = int(row["height"])
+            # "-1" and blank both mean that no confidence was reported.
+            conf = float(row["conf"]) if row["conf"] not in ("-1", "", None) else None
+        except (KeyError, TypeError, ValueError):
+            # Skip short or malformed rows; keep the rest of the page.
+            continue
+
+        if level != 5:
+            continue
+        if page_num != page_number:
+            continue
+
+        key = (page_num, block_num, par_num, line_num)
+        grouped.setdefault(key, []).append(
+            {
+                "text": text,
+                "left": left,
+                "top": top,
+                "width": width,
+                "height": height,
+                "conf": conf,
+            }
+        )
+
+    lines = []
+    for words in grouped.values():
+        words = sorted(words, key=lambda w: w["left"])
+        left = min(w["left"] for w in words)
+        top = min(w["top"] for w in words)
+        right = max(w["left"] + w["width"] for w in words)
+        bottom = max(w["top"] + w["height"] for w in words)
+        line_text = " ".join(w["text"] for w in words).strip()
+        avg_conf = None
+        valid_conf = [w["conf"] for w in words if w["conf"] is not None]
+        if valid_conf:
+            avg_conf = round(sum(valid_conf) / len(valid_conf), 2)
+
+        lines.append(
+            {
+                "text": line_text,
+                "bbox": [left, top, right, bottom],
+                "confidence": avg_conf,
+            }
+        )
+
+    # Reading order: top to bottom, then left to right.
+    lines.sort(key=lambda x: (x["bbox"][1], x["bbox"][0]))
+    return {
+        "page": page_number,
+        "image_width": image_width,
+        "image_height": image_height,
+        "lines": lines,
+    }
+
+
+def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
+    """OCR one image and return its plain text plus a single-page line layout.
+
+    Tesseract is invoked twice: once for the plain text (stripped) and once for
+    TSV output, which is parsed into line boxes for page 1 together with the
+    image's pixel dimensions. A failing invocation raises
+    ``subprocess.CalledProcessError``.
+    """
+
+    def _tesseract_stdout(*extra_args: str) -> str:
+        completed = subprocess.run(
+            ["tesseract", str(path), "stdout", *extra_args],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        return completed.stdout
+
+    with Image.open(path) as image:
+        width, height = image.size
+
+    plain_text = _tesseract_stdout().strip()
+    tsv_output = _tesseract_stdout("tsv")
+
+    page_layout = _parse_tsv_lines(tsv_output, 1, width, height)
+    return plain_text, {"pages": [page_layout]}
+
+
+def ocr_pdf_with_layout(path: Path) -> tuple[str, dict]:
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_prefix = Path(tmpdir) / "page"
+        subprocess.run(
+            ["pdftoppm", "-png", str(path), str(output_prefix)],
            capture_output=True,
            text=True,
            check=True,
        )
-        return result.stdout.strip()
-    except Exception:
-        return ""
+
+        all_text = []
+        pages = []
+
+        for idx, img in enumerate(sorted(Path(tmpdir).glob("page-*.png")), start=1):
+            txt, layout = ocr_image_with_layout(img)
+            if txt:
+                all_text.append(txt)
+            if layout.get("pages"):
+                page_layout = layout["pages"][0]
+                page_layout["page"] = idx
+                pages.append(page_layout)
+
+        return "\n\n".join(all_text).strip(), {"pages": pages}


-def ocr_pdf(path: Path) -> str:
-    with tempfile.TemporaryDirectory() as tmpdir:
-        output_prefix = Path(tmpdir) / "page"
-        try:
-            subprocess.run(
-                ["pdftoppm", "-png", str(path), str(output_prefix)],
-                capture_output=True,
-                text=True,
-                check=True,
-            )
-        except Exception:
-            return ""
-
-        texts: list[str] = []
-        for img in sorted(Path(tmpdir).glob("page-*.png")):
-            text = ocr_image(img)
-            if text:
-                texts.append(text)
-
-        return "\n\n".join(texts).strip()
-
-
-def run_ocr_only(path: Path) -> tuple[str, str | None, str | None]:
+def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
    suffix = path.suffix.lower()
    tesseract_version = get_tesseract_version()

    if suffix == ".pdf":
-        return ocr_pdf(path).strip(), "tesseract", tesseract_version
+        txt, layout = ocr_pdf_with_layout(path)
+        return txt.strip(), layout, "tesseract", tesseract_version
    if suffix in {".jpg", ".jpeg", ".png"}:
-        return ocr_image(path).strip(), "tesseract", tesseract_version
-    return "", None, None
+        txt, layout = ocr_image_with_layout(path)
+        return txt.strip(), layout, "tesseract", tesseract_version
+    return "", None, None, None


-def get_raw_text_for_document(path: Path) -> tuple[str, str | None, str | None, str | None]:
+def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None, str | None, str | None]:
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        extracted = extract_pdf_text(path)
        if len(extracted.strip()) >= 40:
-            return extracted, "pdftotext", get_pdftotext_version(), "initial_ingest"
+            return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest"

-        ocr_text = ocr_pdf(path).strip()
-        return ocr_text, "tesseract", get_tesseract_version(), "initial_ingest_fallback"
+        ocr_text, layout, engine, version = run_ocr_only(path)
+        return ocr_text, layout, engine, version, "initial_ingest_fallback"

    if suffix in {".jpg", ".jpeg", ".png"}:
-        return ocr_image(path).strip(), "tesseract", get_tesseract_version(), "initial_ingest"
+        ocr_text, layout, engine, version = run_ocr_only(path)
+        return ocr_text, layout, engine, version, "initial_ingest"

-    return "", None, None, None
+    return "", None, None, None, None


 def compute_quality_score(source_text: str, reviewed_text: str) -> float:
@ -173,7 +263,6 @@ def archive_document(
 ) -> Document:
    if not source.exists():
        raise FileNotFoundError(f"Source file not found: {source}")
-
    if not is_supported_file(source):
        raise ValueError(f"Unsupported file type: {source.suffix}")

@ -187,7 +276,7 @@ def archive_document(
    mime_type = guess_mime_type(current_path)
    sha256_current = sha256_for_file(current_path)

-    raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
+    raw_text, layout_json, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)

    document = Document(
        document_id=document_id,
@ -230,6 +319,7 @@ def archive_document(
            rerun_source=rerun_source,
            quality_flags=[],
            quality_note=None,
+            layout_json=layout_json,
        )
        db.add(text_version)

@ -246,7 +336,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

-    raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
+    raw_text, layout_json, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
    if not raw_text:
        raise ValueError("OCR produced no text")

@ -278,6 +368,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
        quality_flags=[],
        quality_note=None,
        derived_from_version_id=previous_raw_id,
+        layout_json=layout_json,
    )
    db.add(new_text)

@ -288,19 +379,9 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
    return new_text


-def ingest_file(
-    db: Session,
-    file_path: str,
-    source_system: str,
-    document_type: str = "receipt",
-) -> Document:
+def ingest_file(db: Session, file_path: str, source_system: str, document_type: str = "receipt") -> Document:
    source = Path(file_path).expanduser().resolve()
-    return archive_document(
-        db=db,
-        source=source,
-        source_system=source_system,
-        document_type=document_type,
-    )
+    return archive_document(db=db, source=source, source_system=source_system, document_type=document_type)


 def ingest_uploaded_file(
@ -321,12 +402,7 @@ def ingest_uploaded_file(
    staged_path = upload_root / staged_name
    staged_path.write_bytes(file_bytes)

-    return archive_document(
-        db=db,
-        source=staged_path,
-        source_system=source_system,
-        document_type=document_type,
-    )
+    return archive_document(db=db, source=staged_path, source_system=source_system, document_type=document_type)


 def ingest_directory(
@ -337,7 +413,6 @@ def ingest_directory(
    document_type: str = "receipt",
 ) -> list[Document]:
    source_dir = Path(directory_path).expanduser().resolve()
-
    if not source_dir.exists() or not source_dir.is_dir():
        raise NotADirectoryError(f"Directory not found: {source_dir}")

@ -349,12 +424,7 @@ def ingest_directory(
            continue
        try:
            ingested.append(
-                ingest_file(
-                    db=db,
-                    file_path=str(path),
-                    source_system=source_system,
-                    document_type=document_type,
-                )
+                ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type)
            )
        except Exception:
            continue
--- a/app/models/document.py
+++ b/app/models/document.py
@ -16,6 +16,7 @@ class Document(Base):
    source_path: Mapped[str] = mapped_column(Text, nullable=False)
    original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
    current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
+    share_path: Mapped[str | None] = mapped_column(Text, nullable=True)

    original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
    canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
--- a/app/models/text_version.py
+++ b/app/models/text_version.py
@ -16,7 +16,7 @@ class TextVersion(Base):
    )

    version_number: Mapped[int] = mapped_column(Integer, nullable=False)
-    version_type: Mapped[str] = mapped_column(String(50), nullable=False)  # raw_ocr, reviewed
+    version_type: Mapped[str] = mapped_column(String(50), nullable=False)

    text_content: Mapped[str] = mapped_column(Text, nullable=False)

@ -36,6 +36,8 @@ class TextVersion(Base):
        nullable=True,
    )

+    layout_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)
+
    created_at: Mapped[datetime] = mapped_column(
        DateTime, default=datetime.utcnow, nullable=False
    )
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@ -1,3 +1,4 @@
+from copy import deepcopy
 from pathlib import Path
 from uuid import uuid4

@ -7,6 +8,10 @@ from fastapi.templating import Jinja2Templates
 from sqlalchemy.orm import Session, selectinload

 from app.db.deps import get_db
+from app.logic.document_outputs import (
+    create_field_enriched_pdf_version,
+    create_ocr_corrected_pdf_version,
+)
 from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
 from app.models.document import Document
 from app.models.document_version import DocumentVersion
@ -39,6 +44,68 @@ QUALITY_FLAG_OPTIONS = [
 ]


+def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
+    """Return the document's newest current ``(raw_ocr, reviewed)`` text versions.
+
+    Versions are ranked by ``(version_number, created_at)`` descending; either
+    element is ``None`` when no current version of that type exists.
+    """
+    newest_first = sorted(
+        document.text_versions,
+        key=lambda tv: (tv.version_number, tv.created_at),
+        reverse=True,
+    )
+
+    def _first_current(version_type: str) -> TextVersion | None:
+        for tv in newest_first:
+            if tv.version_type == version_type and tv.is_current:
+                return tv
+        return None
+
+    return _first_current("raw_ocr"), _first_current("reviewed")
+
+
+def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
+    """Return the stripped text of every layout line, in page then line order.
+
+    Lines with missing or ``None`` text contribute an empty string; a missing or
+    empty layout yields an empty list.
+    """
+    if not layout_json:
+        return []
+
+    return [
+        (line.get("text") or "").strip()
+        for page in layout_json.get("pages", [])
+        for line in page.get("lines", [])
+    ]
+
+
+def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
+    """Return the text used to pre-fill the review editor.
+
+    The raw OCR version is preferred over the reviewed one. When the chosen
+    version has layout data, its line texts are joined with newlines; otherwise
+    its stored text content is used. An empty string is returned when there is
+    nothing to show.
+    """
+    # Raw OCR comes first so that an OCR rerun immediately refreshes the editable
+    # line set; the reviewed text stays visible elsewhere as history/state.
+    source = raw_ocr or reviewed_ocr
+    if not source:
+        return ""
+
+    if source.layout_json:
+        line_texts = _extract_line_texts_from_layout(source.layout_json)
+        return "\n".join(line_texts)
+
+    return source.text_content or ""
+
+
+def _line_count_from_layout(layout_json: dict | None) -> int:
+    """Return how many lines the layout contains across all pages (0 if none)."""
+    line_texts = _extract_line_texts_from_layout(layout_json)
+    return len(line_texts)
+
+
+def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None:
+    """Return a copy of *base_layout* whose line texts come from *reviewed_text*.
+
+    The reviewed text is split into lines and assigned to the layout lines in
+    page then line order. Layout lines beyond the end of the reviewed text get
+    an empty string. The input layout is not modified; ``None`` is returned when
+    there is no base layout.
+    """
+    if not base_layout:
+        return None
+
+    updated = deepcopy(base_layout)
+    remaining = iter(reviewed_text.splitlines())
+
+    for page in updated.get("pages", []):
+        for line in page.get("lines", []):
+            # Once the reviewed text runs out, the remaining boxes become blank.
+            line["text"] = next(remaining, "")
+
+    return updated
+
+
@router.get("/", response_class=HTMLResponse)
 def list_documents(request: Request, db: Session = Depends(get_db)):
    documents = db.query(Document).order_by(Document.created_at.desc()).all()
@ -85,12 +152,7 @@ def test_ingest(db: Session = Depends(get_db)):
        document_id=document.id,
        version_number=1,
        version_type="raw_ocr",
-        text_content=(
-            "CVS PHARMACY\n"
-            "Date: 2026-04-01\n"
-            "Total: 12.34 USD\n"
-            "Household supplies\n"
-        ),
+        text_content="CVS PHARMACY\nDate: 2026-04-01\nTotal: 12.34 USD\nHousehold supplies\n",
        created_by="system",
        is_current=True,
        ocr_engine="test_seed",
@ -116,7 +178,35 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
    try:
        rerun_ocr_for_document(db, document)
    except Exception:
-        return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
+        return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
+
+    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
+
+
+@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
+def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
+    """Generate the OCR-corrected PDF for a document and redirect to its page.
+
+    Unknown documents redirect to the document list. Any failure during
+    generation redirects to the detail page with
+    ``error=save_ocr_corrected_failed`` in the query string.
+    """
+    lookup = (
+        db.query(Document)
+        .options(selectinload(Document.text_versions))
+        .filter(Document.document_id == document_id)
+    )
+    document = lookup.first()
+    if document is None:
+        return RedirectResponse(url="/documents/", status_code=303)
+
+    try:
+        create_ocr_corrected_pdf_version(db, document)
+    except Exception:
+        target = f"/documents/{document.document_id}?error=save_ocr_corrected_failed"
+    else:
+        target = f"/documents/{document.document_id}"
+
+    return RedirectResponse(url=target, status_code=303)
+
+
+@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
+def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
+    document = db.query(Document).filter(Document.document_id == document_id).first()
+    if document is None:
+        return RedirectResponse(url="/documents/", status_code=303)
+
+    try:
+        create_field_enriched_pdf_version(db, document)
+    except Exception:
+        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303)

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)

@ -139,16 +229,15 @@ def save_reviewed_text(
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

-    sorted_text_versions = sorted(
-        document.text_versions,
-        key=lambda x: (x.version_number, x.created_at),
-        reverse=True,
-    )
+    raw_ocr, _ = _get_current_text_versions(document)
+    expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
+    actual_line_count = len(reviewed_text.splitlines())

-    current_raw = next(
-        (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
-        None,
-    )
+    if expected_line_count and actual_line_count != expected_line_count:
+        return RedirectResponse(
+            url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}",
+            status_code=303,
+        )

    existing_reviewed = [
        tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current
@ -156,6 +245,11 @@ def save_reviewed_text(
    for tv in existing_reviewed:
        tv.is_current = False

+    reviewed_layout = _apply_reviewed_lines_to_layout(
+        raw_ocr.layout_json if raw_ocr else None,
+        reviewed_text,
+    )
+
    reviewed_version = TextVersion(
        document_id=document.id,
        version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
@ -163,14 +257,15 @@ def save_reviewed_text(
        text_content=reviewed_text,
        created_by="mcelwain",
        is_current=True,
-        derived_from_version_id=current_raw.id if current_raw else None,
+        derived_from_version_id=raw_ocr.id if raw_ocr else None,
+        layout_json=reviewed_layout,
    )
    db.add(reviewed_version)

-    if current_raw:
-        current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
-        current_raw.quality_flags = quality_flags or []
-        current_raw.quality_note = quality_note or None
+    if raw_ocr:
+        raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
+        raw_ocr.quality_flags = quality_flags or []
+        raw_ocr.quality_note = quality_note or None

    document.review_status = "reviewed"

@ -196,27 +291,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
    if document is None:
        return HTMLResponse(content="Document not found", status_code=404)

-    sorted_text_versions = sorted(
-        document.text_versions,
-        key=lambda x: (x.version_number, x.created_at),
-        reverse=True,
-    )
+    raw_ocr, reviewed_ocr = _get_current_text_versions(document)
+    review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)

-    raw_ocr = next(
-        (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
-        None,
-    )
-
-    reviewed_ocr = next(
-        (tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
-        None,
-    )
-
-    review_text_value = (
-        reviewed_ocr.text_content
-        if reviewed_ocr is not None
-        else raw_ocr.text_content if raw_ocr is not None else ""
+    base_layout = (
+        reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
+        else raw_ocr.layout_json if raw_ocr else None
    )
+    expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
+    actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0
+    line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))

    file_url = None
    if document.current_path:
@ -228,6 +312,11 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
        except Exception:
            file_url = None

+    app_url = str(request.url_for("document_detail", document_id=document.document_id))
+    error = request.query_params.get("error")
+    error_expected = request.query_params.get("expected")
+    error_actual = request.query_params.get("actual")
+
    return templates.TemplateResponse(
        request=request,
        name="documents/detail.html",
@ -238,8 +327,15 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
            "reviewed_ocr": reviewed_ocr,
            "review_text_value": review_text_value,
            "file_url": file_url,
+            "app_url": app_url,
            "quality_flag_options": QUALITY_FLAG_OPTIONS,
            "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
            "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
+            "line_numbers": line_numbers,
+            "expected_line_count": expected_line_count,
+            "actual_line_count": actual_line_count,
+            "error": error,
+            "error_expected": error_expected,
+            "error_actual": error_actual,
        },
    )
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@ -3,17 +3,67 @@
 <head>
    <meta charset="UTF-8">
    <title>{{ document.document_id }}</title>
+    <style>
+        /* Page-wide default font. */
+        body { font-family: sans-serif; }
+        /* The OCR editor textarea uses monospace, the same family as the .line-numbers gutter. */
+        textarea { font-family: monospace; }
+        /* Flex row that places the line-number gutter and the textarea side by side. */
+        .editor-wrap {
+            display: flex;
+            align-items: flex-start;
+            gap: 0.5rem;
+        }
+        /* Gutter of line numbers; white-space: pre keeps the newline between numbers
+           so each number renders on its own row. Numbers are not selectable. */
+        .line-numbers {
+            font-family: monospace;
+            white-space: pre;
+            text-align: right;
+            color: #666;
+            user-select: none;
+            /* NOTE(review): presumably offsets the gutter to line up with the textarea's first row; confirm visually. */
+            padding-top: 2px;
+            min-width: 3rem;
+        }
+        /* Styling for the line-count mismatch warning shown under the line counters. */
+        .line-warning {
+            color: #8a5a00;
+            font-weight: 600;
+        }
+        /* Red banner used for the error messages rendered at the top of the page. */
+        .error-box {
+            background: #ffe8e8;
+            color: #8b0000;
+            padding: 0.75rem;
+            border: 1px solid #cc9999;
+            margin-bottom: 1rem;
+        }
+    </style>
 </head>
 <body>
    <p><a href="/documents/">Back to documents</a></p>

    <h1>{{ document.document_id }}</h1>

+    {% if error == "line_count_mismatch" %}
+        <div class="error-box">
+            Could not save reviewed OCR because line count did not match OCR layout.
+            Expected {{ error_expected }}, got {{ error_actual }}.
+        </div>
+    {% elif error == "save_ocr_corrected_failed" %}
+        <div class="error-box">
+            Could not save OCR-corrected PDF. Check that reviewed OCR line count matches raw OCR line count.
+        </div>
+    {% elif error == "rerun_ocr_failed" %}
+        <div class="error-box">
+            OCR rerun failed.
+        </div>
+    {% elif error == "save_field_enriched_failed" %}
+        <div class="error-box">
+            Could not save field-enriched PDF.
+        </div>
+    {% endif %}
+
    <h2>Document metadata</h2>
    <ul>
        <li>Type: {{ document.document_type }}</li>
        <li>Source path: {{ document.source_path }}</li>
        <li>Current path: {{ document.current_path }}</li>
+        <li>Share path: {{ document.share_path or "" }}</li>
+        <li>App URL: <a href="{{ app_url }}">{{ app_url }}</a></li>
        <li>Original filename: {{ document.original_filename }}</li>
        <li>Canonical filename: {{ document.canonical_filename }}</li>
        <li>MIME type: {{ document.mime_type }}</li>
@ -25,6 +75,14 @@
        <li>Updated at: {{ document.updated_at }}</li>
    </ul>

+    <h2>Saved PDF scaffolds</h2>
+    <form method="post" action="/documents/{{ document.document_id }}/save-ocr-corrected-pdf" style="display:inline;">
+        <button type="submit">Save OCR-corrected PDF</button>
+    </form>
+    <form method="post" action="/documents/{{ document.document_id }}/save-field-enriched-pdf" style="display:inline; margin-left: 1rem;">
+        <button type="submit">Save field-enriched PDF</button>
+    </form>
+
    <h2>Document preview</h2>
    {% if file_url %}
        {% if document.mime_type == "application/pdf" %}
@ -47,6 +105,7 @@
                {{ version.version_type }} —
                {{ version.file_path }} —
                {{ version.created_at }}
+                {% if version.notes %}<br><em>{{ version.notes }}</em>{% endif %}
            </li>
        {% endfor %}
        </ul>
@ -84,12 +143,23 @@
        <p>No reviewed OCR saved yet.</p>
    {% endif %}

+    <p>
+        Expected OCR lines: <span id="expected-lines">{{ expected_line_count }}</span><br>
+        Current editor lines: <span id="actual-lines">{{ actual_line_count }}</span>
+        <br><span id="line-warning" class="line-warning" {% if expected_line_count == actual_line_count %}style="display:none;"{% endif %}>
+            Line count mismatch may affect corrected PDF layout.
+        </span>
+    </p>
+
    <form method="post" action="/documents/{{ document.document_id }}/review-text">
        <div>
-            <label for="reviewed_text">Edit reviewed OCR text:</label>
+            <label for="reviewed_text">Edit reviewed OCR text (one line per OCR line):</label>
        </div>
-        <div>
-            <textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
+
+        <div class="editor-wrap">
+            <div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
+{% endfor %}</div>
+            <textarea id="reviewed_text" name="reviewed_text" rows="{{ [actual_line_count + 2, 20]|max }}" cols="100">{{ review_text_value }}</textarea>
        </div>

        <h3>Quality flags</h3>
@ -113,8 +183,43 @@
        </div>

        <div style="margin-top: 1rem;">
-            <button type="submit">Save reviewed OCR</button>
+            <button type="submit" id="save-reviewed-btn">Save reviewed OCR</button>
        </div>
    </form>
+
+    <script>
+        // Handles to the editor elements rendered by this template.
+        const textarea = document.getElementById("reviewed_text");
+        // Expected line count as rendered by the server into #expected-lines;
+        // an empty element falls back to "0" before parsing.
+        const expectedLines = parseInt(document.getElementById("expected-lines").textContent || "0", 10);
+        const actualLinesEl = document.getElementById("actual-lines");
+        const warningEl = document.getElementById("line-warning");
+        const saveBtn = document.getElementById("save-reviewed-btn");
+        const lineNumbersEl = document.getElementById("line-numbers");
+
+        /**
+         * Count the lines in the editor text the same way the server does.
+         *
+         * The page's initial "Current editor lines" value is rendered from
+         * Python's str.splitlines(), which does not count a trailing newline
+         * as an extra empty line. A plain split("\n") does, so text ending in
+         * a newline would report one line too many here and wrongly trigger
+         * the mismatch warning. One trailing newline is therefore dropped
+         * before splitting.
+         *
+         * NOTE(review): confirm the save endpoint counts lines the same way.
+         *
+         * @param {string} text - Current textarea value.
+         * @returns {number} 0 for empty text, otherwise the number of lines.
+         */
+        function countLines(text) {
+            if (text.length === 0) return 0;
+            // A final "\n" terminates the last line; it does not start a new one.
+            const body = text.endsWith("\n") ? text.slice(0, -1) : text;
+            return body.split("\n").length;
+        }
+
+        /**
+         * Re-render the line-number gutter with the numbers 1..lineCount,
+         * each followed by a newline.
+         *
+         * @param {number} lineCount - Highest line number to display.
+         */
+        function rebuildLineNumbers(lineCount) {
+            const labels = [];
+            let current = 1;
+            while (current <= lineCount) {
+                labels.push(current + "\n");
+                current += 1;
+            }
+            lineNumbersEl.textContent = labels.join("");
+        }
+
+        /**
+         * Sync the page with the current textarea content: update the
+         * displayed line count, rebuild the gutter, and show the warning and
+         * disable the save button when the count differs from the expected one.
+         */
+        function updateEditorState() {
+            const currentCount = countLines(textarea.value);
+            // A mismatch is only reported when an expected count is known (> 0).
+            const countsDiffer = expectedLines > 0 && currentCount !== expectedLines;
+
+            actualLinesEl.textContent = currentCount.toString();
+            // The gutter always covers the longer of the two counts.
+            rebuildLineNumbers(Math.max(currentCount, expectedLines));
+
+            if (countsDiffer) {
+                warningEl.style.display = "inline";
+            } else {
+                warningEl.style.display = "none";
+            }
+            saveBtn.disabled = countsDiffer;
+        }
+
+        // Recompute the counters on every edit, and once immediately so the
+        // page reflects the textarea's initial content.
+        textarea.addEventListener("input", updateEditorState);
+        updateEditorState();
+    </script>
 </body>
 </html>