From e67a67f80afe41149655a3e70b3ceccd5bb4f763 Mon Sep 17 00:00:00 2001 From: McElwain Date: Fri, 3 Apr 2026 11:56:23 -0500 Subject: [PATCH] feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation --- app/core/config.py | 2 + app/logic/document_outputs.py | 243 ++++++++++++++++++++++++++++ app/logic/ingest.py | 208 ++++++++++++++++-------- app/models/document.py | 3 +- app/models/text_version.py | 4 +- app/routes/documents.py | 176 +++++++++++++++----- app/templates/documents/detail.html | 113 ++++++++++++- 7 files changed, 634 insertions(+), 115 deletions(-) create mode 100644 app/logic/document_outputs.py diff --git a/app/core/config.py b/app/core/config.py index 0ed83ae..b90d31b 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -8,3 +8,5 @@ DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/documen DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current") INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox") UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads") +OCR_CORRECTED_ROOT = os.getenv("OCR_CORRECTED_ROOT", "/mnt/storage/document-processor/outputs/ocr_corrected") +FIELD_ENRICHED_ROOT = os.getenv("FIELD_ENRICHED_ROOT", "/mnt/storage/document-processor/outputs/field_enriched") diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py new file mode 100644 index 0000000..84af546 --- /dev/null +++ b/app/logic/document_outputs.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +import hashlib +import shutil +import subprocess +import tempfile +from pathlib import Path + +from PIL import Image +from pypdf import PdfReader +from reportlab.lib.utils import ImageReader +from reportlab.pdfbase.pdfmetrics import stringWidth +from reportlab.pdfgen import canvas +from sqlalchemy import func +from sqlalchemy.orm import Session + +from app.core.config import FIELD_ENRICHED_ROOT, OCR_CORRECTED_ROOT +from app.models.document import Document +from app.models.document_version import DocumentVersion +from app.models.text_version import TextVersion + + +def sha256_for_file(path: Path) -> str: + hasher = hashlib.sha256() + with path.open("rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + hasher.update(chunk) + return hasher.hexdigest() + + +def get_next_document_version_number(db: Session, document_id: int) -> int: + max_version = ( + db.query(func.max(DocumentVersion.version_number)) + .filter(DocumentVersion.document_id == document_id) + .scalar() + ) + return (max_version or 0) + 1 + + +def _build_output_path(root: str, document: Document, version_type: str) -> Path: + source = Path(document.current_path or "") + suffix = source.suffix.lower() if source.suffix else ".pdf" + filename = f"{document.document_id}_{version_type}{suffix}" + return Path(root) / filename + + +def _latest_current_text_version(document: Document, version_type: str) -> TextVersion | None: + candidates = [tv for tv in document.text_versions if tv.version_type == version_type and tv.is_current] + if not candidates: + return None + return sorted(candidates, key=lambda x: (x.version_number, x.created_at), reverse=True)[0] + + +def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]: + prefix = tmpdir / "page" + subprocess.run( + ["pdftoppm", "-png", str(pdf_path), str(prefix)], + capture_output=True, + text=True, + check=True, + ) + return sorted(tmpdir.glob("page-*.png")) + + +def _fit_font_size(text: str, box_width: float, box_height: float) -> float: + if not text: + return max(6.0, box_height * 0.80) + + font_size = max(6.0, box_height * 0.88) + + while font_size > 4.5 and stringWidth(text, "Helvetica", font_size) > box_width * 0.98: + font_size -= 0.25 + + min_reasonable = max(6.0, box_height * 0.68) + return max(min_reasonable, font_size) + + +def _flatten_layout_lines(layout_json: dict | None) -> list[dict]: + if not layout_json: + return [] + + flattened = [] + for page in layout_json.get("pages", []): + for line in page.get("lines", []): + flattened.append( + { + "page": page["page"], + "bbox": line["bbox"], + "text": line.get("text", ""), + } + ) + return flattened + + +def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion: + if not document.current_path: + raise ValueError("Document has no current_path") + + current_file = Path(document.current_path) + if not current_file.exists(): + raise FileNotFoundError(f"Current file not found: {current_file}") + + raw_ocr = _latest_current_text_version(document, "raw_ocr") + reviewed = _latest_current_text_version(document, "reviewed") + + if raw_ocr is None: + raise ValueError("No current raw OCR version found") + if reviewed is None: + raise ValueError("No current reviewed text found") + if current_file.suffix.lower() != ".pdf": + raise ValueError("C1 corrected PDF generation currently supports PDFs only") + + raw_lines = _flatten_layout_lines(raw_ocr.layout_json) + reviewed_lines = _flatten_layout_lines(reviewed.layout_json) + + if not raw_lines: + raise ValueError("No OCR line boxes found in raw OCR layout data") + + if reviewed_lines and len(reviewed_lines) != len(raw_lines): + raise ValueError("Reviewed line layout does not match raw OCR line layout") + + source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json + if not source_layout: + raise ValueError("No source layout found") + + out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected") + out_path.parent.mkdir(parents=True, exist_ok=True) + + reader = PdfReader(str(current_file)) + + with tempfile.TemporaryDirectory() as tmpdirname: + tmpdir = Path(tmpdirname) + images = _render_pdf_page_images(current_file, tmpdir) + + overlay_pdf_path = tmpdir / "overlay.pdf" + c = None + + page_layouts = {page["page"]: page for page in source_layout.get("pages", [])} + + for page_num, img_path in enumerate(images, start=1): + pdf_page = reader.pages[page_num - 1] + page_w = float(pdf_page.mediabox.width) + page_h = float(pdf_page.mediabox.height) + + img = Image.open(img_path) + + if c is None: + c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h)) + else: + c.setPageSize((page_w, page_h)) + + c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h) + + page_layout = page_layouts.get(page_num, {"lines": []}) + src_w = float(page_layout.get("image_width") or img.size[0]) + src_h = float(page_layout.get("image_height") or img.size[1]) + + scale_x = page_w / src_w + scale_y = page_h / src_h + + for line in page_layout.get("lines", []): + text_line = (line.get("text") or "").strip() + if not text_line: + continue + + left, top, right, bottom = line["bbox"] + + pdf_x = left * scale_x + pdf_y = page_h - (bottom * scale_y) + box_width = max(10.0, (right - left) * scale_x) + box_height = max(6.0, (bottom - top) * scale_y) + + font_size = _fit_font_size(text_line, box_width, box_height) + + text_obj = c.beginText() + text_obj.setTextRenderMode(3) + text_obj.setFont("Helvetica", font_size) + text_obj.setTextOrigin(pdf_x, pdf_y + 1) + text_obj.textLine(text_line) + c.drawText(text_obj) + + c.showPage() + + if c is None: + raise ValueError("Failed to build overlay PDF") + + c.save() + shutil.copy2(overlay_pdf_path, out_path) + + file_hash = sha256_for_file(out_path) + + version = DocumentVersion( + document_id=document.id, + version_number=get_next_document_version_number(db, document.id), + version_type="ocr_corrected", + file_path=str(out_path), + sha256=file_hash, + created_by="save_ocr_corrected_pdf", + notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.", + ) + db.add(version) + + document.current_path = str(out_path) + document.canonical_filename = out_path.name + document.sha256_current = file_hash + + db.commit() + db.refresh(version) + return version + + +def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion: + if not document.current_path: + raise ValueError("Document has no current_path") + + current_file = Path(document.current_path) + if not current_file.exists(): + raise FileNotFoundError(f"Current file not found: {current_file}") + + out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched") + out_path.parent.mkdir(parents=True, exist_ok=True) + + shutil.copy2(current_file, out_path) + file_hash = sha256_for_file(out_path) + + version = DocumentVersion( + document_id=document.id, + version_number=get_next_document_version_number(db, document.id), + version_type="field_enriched", + file_path=str(out_path), + sha256=file_hash, + created_by="save_field_enriched_pdf", + notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.", + ) + db.add(version) + + document.current_path = str(out_path) + document.canonical_filename = out_path.name + document.sha256_current = file_hash + + db.commit() + db.refresh(version) + return version diff --git a/app/logic/ingest.py b/app/logic/ingest.py index 2791a2d..f70c2cc 100644 --- a/app/logic/ingest.py +++ b/app/logic/ingest.py @@ -1,12 +1,16 @@ from __future__ import annotations +import csv import hashlib +import io import mimetypes import shutil import subprocess import tempfile from difflib import SequenceMatcher from pathlib import Path + +from PIL import Image from uuid import uuid4 from sqlalchemy import func @@ -61,8 +65,7 @@ def get_tesseract_version() -> str | None: text=True, check=True, ) - line = result.stdout.splitlines()[0].strip() - return line + return result.stdout.splitlines()[0].strip() except Exception: return None @@ -93,67 +96,154 @@ def extract_pdf_text(path: Path) -> str: return "" -def ocr_image(path: Path) -> str: - try: - result = subprocess.run( - ["tesseract", str(path), "stdout"], +def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict: + reader = csv.DictReader(io.StringIO(tsv_text), delimiter=" ") + grouped: dict[tuple[int, int, int, int], list[dict]] = {} + + for row in reader: + if not row.get("text"): + continue + text = row["text"].strip() + if not text: + continue + + try: + level = int(row["level"]) + page_num = int(row["page_num"]) + block_num = int(row["block_num"]) + par_num = int(row["par_num"]) + line_num = int(row["line_num"]) + left = int(row["left"]) + top = int(row["top"]) + width = int(row["width"]) + height = int(row["height"]) + conf = float(row["conf"]) if row["conf"] not in ("-1", "", None) else None + except Exception: + continue + + if level != 5: + continue + if page_num != page_number: + continue + + key = (page_num, block_num, par_num, line_num) + grouped.setdefault(key, []).append( + { + "text": text, + "left": left, + "top": top, + "width": width, + "height": height, + "conf": conf, + } + ) + + lines = [] + for key, words in grouped.items(): + words = sorted(words, key=lambda w: w["left"]) + left = min(w["left"] for w in words) + top = min(w["top"] for w in words) + right = max(w["left"] + w["width"] for w in words) + bottom = max(w["top"] + w["height"] for w in words) + line_text = " ".join(w["text"] for w in words).strip() + avg_conf = None + valid_conf = [w["conf"] for w in words if w["conf"] is not None] + if valid_conf: + avg_conf = round(sum(valid_conf) / len(valid_conf), 2) + + lines.append( + { + "text": line_text, + "bbox": [left, top, right, bottom], + "confidence": avg_conf, + } + ) + + lines.sort(key=lambda x: (x["bbox"][1], x["bbox"][0])) + return { + "page": page_number, + "image_width": image_width, + "image_height": image_height, + "lines": lines, + } + + +def ocr_image_with_layout(path: Path) -> tuple[str, dict]: + with Image.open(path) as img: + image_width, image_height = img.size + + txt = subprocess.run( + ["tesseract", str(path), "stdout"], + capture_output=True, + text=True, + check=True, + ).stdout.strip() + + tsv = subprocess.run( + ["tesseract", str(path), "stdout", "tsv"], + capture_output=True, + text=True, + check=True, + ).stdout + + layout = {"pages": [_parse_tsv_lines(tsv, 1, image_width, image_height)]} + return txt, layout + + +def ocr_pdf_with_layout(path: Path) -> tuple[str, dict]: + with tempfile.TemporaryDirectory() as tmpdir: + output_prefix = Path(tmpdir) / "page" + subprocess.run( + ["pdftoppm", "-png", str(path), str(output_prefix)], capture_output=True, text=True, check=True, ) - return result.stdout.strip() - except Exception: - return "" + + all_text = [] + pages = [] + + for idx, img in enumerate(sorted(Path(tmpdir).glob("page-*.png")), start=1): + txt, layout = ocr_image_with_layout(img) + if txt: + all_text.append(txt) + if layout.get("pages"): + page_layout = layout["pages"][0] + page_layout["page"] = idx + pages.append(page_layout) + + return "\n\n".join(all_text).strip(), {"pages": pages} -def ocr_pdf(path: Path) -> str: - with tempfile.TemporaryDirectory() as tmpdir: - output_prefix = Path(tmpdir) / "page" - try: - subprocess.run( - ["pdftoppm", "-png", str(path), str(output_prefix)], - capture_output=True, - text=True, - check=True, - ) - except Exception: - return "" - - texts: list[str] = [] - for img in sorted(Path(tmpdir).glob("page-*.png")): - text = ocr_image(img) - if text: - texts.append(text) - - return "\n\n".join(texts).strip() - - -def run_ocr_only(path: Path) -> tuple[str, str | None, str | None]: +def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]: suffix = path.suffix.lower() tesseract_version = get_tesseract_version() if suffix == ".pdf": - return ocr_pdf(path).strip(), "tesseract", tesseract_version + txt, layout = ocr_pdf_with_layout(path) + return txt.strip(), layout, "tesseract", tesseract_version if suffix in {".jpg", ".jpeg", ".png"}: - return ocr_image(path).strip(), "tesseract", tesseract_version - return "", None, None + txt, layout = ocr_image_with_layout(path) + return txt.strip(), layout, "tesseract", tesseract_version + return "", None, None, None -def get_raw_text_for_document(path: Path) -> tuple[str, str | None, str | None, str | None]: +def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None, str | None, str | None]: suffix = path.suffix.lower() if suffix == ".pdf": extracted = extract_pdf_text(path) if len(extracted.strip()) >= 40: - return extracted, "pdftotext", get_pdftotext_version(), "initial_ingest" + return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest" - ocr_text = ocr_pdf(path).strip() - return ocr_text, "tesseract", get_tesseract_version(), "initial_ingest_fallback" + ocr_text, layout, engine, version = run_ocr_only(path) + return ocr_text, layout, engine, version, "initial_ingest_fallback" if suffix in {".jpg", ".jpeg", ".png"}: - return ocr_image(path).strip(), "tesseract", get_tesseract_version(), "initial_ingest" + ocr_text, layout, engine, version = run_ocr_only(path) + return ocr_text, layout, engine, version, "initial_ingest" - return "", None, None, None + return "", None, None, None, None def compute_quality_score(source_text: str, reviewed_text: str) -> float: @@ -173,7 +263,6 @@ def archive_document( ) -> Document: if not source.exists(): raise FileNotFoundError(f"Source file not found: {source}") - if not is_supported_file(source): raise ValueError(f"Unsupported file type: {source.suffix}") @@ -187,7 +276,7 @@ def archive_document( mime_type = guess_mime_type(current_path) sha256_current = sha256_for_file(current_path) - raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path) + raw_text, layout_json, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path) document = Document( document_id=document_id, @@ -230,6 +319,7 @@ def archive_document( rerun_source=rerun_source, quality_flags=[], quality_note=None, + layout_json=layout_json, ) db.add(text_version) @@ -246,7 +336,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion: if not current_file.exists(): raise FileNotFoundError(f"Current file not found: {current_file}") - raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file) + raw_text, layout_json, ocr_engine, ocr_engine_version = run_ocr_only(current_file) if not raw_text: raise ValueError("OCR produced no text") @@ -278,6 +368,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion: quality_flags=[], quality_note=None, derived_from_version_id=previous_raw_id, + layout_json=layout_json, ) db.add(new_text) @@ -288,19 +379,9 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion: return new_text -def ingest_file( - db: Session, - file_path: str, - source_system: str, - document_type: str = "receipt", -) -> Document: +def ingest_file(db: Session, file_path: str, source_system: str, document_type: str = "receipt") -> Document: source = Path(file_path).expanduser().resolve() - return archive_document( - db=db, - source=source, - source_system=source_system, - document_type=document_type, - ) + return archive_document(db=db, source=source, source_system=source_system, document_type=document_type) def ingest_uploaded_file( @@ -321,12 +402,7 @@ def ingest_uploaded_file( staged_path = upload_root / staged_name staged_path.write_bytes(file_bytes) - return archive_document( - db=db, - source=staged_path, - source_system=source_system, - document_type=document_type, - ) + return archive_document(db=db, source=staged_path, source_system=source_system, document_type=document_type) def ingest_directory( @@ -337,7 +413,6 @@ def ingest_directory( document_type: str = "receipt", ) -> list[Document]: source_dir = Path(directory_path).expanduser().resolve() - if not source_dir.exists() or not source_dir.is_dir(): raise NotADirectoryError(f"Directory not found: {source_dir}") @@ -349,12 +424,7 @@ def ingest_directory( continue try: ingested.append( - ingest_file( - db=db, - file_path=str(path), - source_system=source_system, - document_type=document_type, - ) + ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type) ) except Exception: continue diff --git a/app/models/document.py b/app/models/document.py index 323354b..e0422ae 100644 --- a/app/models/document.py +++ b/app/models/document.py @@ -16,6 +16,7 @@ class Document(Base): source_path: Mapped[str] = mapped_column(Text, nullable=False) original_path: Mapped[str | None] = mapped_column(Text, nullable=True) current_path: Mapped[str | None] = mapped_column(Text, nullable=True) + share_path: Mapped[str | None] = mapped_column(Text, nullable=True) original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True) canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True) @@ -48,4 +49,4 @@ class Document(Base): layer1_candidates: Mapped[list["Layer1Candidate"]] = relationship( back_populates="document", cascade="all, delete-orphan", - ) \ No newline at end of file + ) diff --git a/app/models/text_version.py b/app/models/text_version.py index d55973f..50db5b6 100644 --- a/app/models/text_version.py +++ b/app/models/text_version.py @@ -16,7 +16,7 @@ class TextVersion(Base): ) version_number: Mapped[int] = mapped_column(Integer, nullable=False) - version_type: Mapped[str] = mapped_column(String(50), nullable=False) # raw_ocr, reviewed + version_type: Mapped[str] = mapped_column(String(50), nullable=False) text_content: Mapped[str] = mapped_column(Text, nullable=False) @@ -36,6 +36,8 @@ class TextVersion(Base): nullable=True, ) + layout_json: Mapped[dict | None] = mapped_column(JSON, nullable=True) + created_at: Mapped[datetime] = mapped_column( DateTime, default=datetime.utcnow, nullable=False ) diff --git a/app/routes/documents.py b/app/routes/documents.py index 5d62aff..bc6d6ad 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -1,3 +1,4 @@ +from copy import deepcopy from pathlib import Path from uuid import uuid4 @@ -7,6 +8,10 @@ from fastapi.templating import Jinja2Templates from sqlalchemy.orm import Session, selectinload from app.db.deps import get_db +from app.logic.document_outputs import ( + create_field_enriched_pdf_version, + create_ocr_corrected_pdf_version, +) from app.logic.ingest import compute_quality_score, rerun_ocr_for_document from app.models.document import Document from app.models.document_version import DocumentVersion @@ -39,6 +44,68 @@ QUALITY_FLAG_OPTIONS = [ ] +def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]: + sorted_text_versions = sorted( + document.text_versions, + key=lambda x: (x.version_number, x.created_at), + reverse=True, + ) + + raw_ocr = next( + (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current), + None, + ) + + reviewed_ocr = next( + (tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current), + None, + ) + + return raw_ocr, reviewed_ocr + + +def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]: + if not layout_json: + return [] + + lines: list[str] = [] + for page in layout_json.get("pages", []): + for line in page.get("lines", []): + lines.append((line.get("text") or "").strip()) + return lines + + +def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str: + # Prefer the current raw OCR in the editor so rerun OCR immediately refreshes + # the editable line set. Reviewed text remains visible above as history/state. + source = raw_ocr or reviewed_ocr + if source and source.layout_json: + return "\n".join(_extract_line_texts_from_layout(source.layout_json)) + if source and source.text_content: + return source.text_content + return "" + + +def _line_count_from_layout(layout_json: dict | None) -> int: + return len(_extract_line_texts_from_layout(layout_json)) + + +def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None: + if not base_layout: + return None + + reviewed_lines = reviewed_text.splitlines() + new_layout = deepcopy(base_layout) + + idx = 0 + for page in new_layout.get("pages", []): + for line in page.get("lines", []): + line["text"] = reviewed_lines[idx] if idx < len(reviewed_lines) else "" + idx += 1 + + return new_layout + + @router.get("/", response_class=HTMLResponse) def list_documents(request: Request, db: Session = Depends(get_db)): documents = db.query(Document).order_by(Document.created_at.desc()).all() @@ -85,12 +152,7 @@ def test_ingest(db: Session = Depends(get_db)): document_id=document.id, version_number=1, version_type="raw_ocr", - text_content=( - "CVS PHARMACY\n" - "Date: 2026-04-01\n" - "Total: 12.34 USD\n" - "Household supplies\n" - ), + text_content="CVS PHARMACY\nDate: 2026-04-01\nTotal: 12.34 USD\nHousehold supplies\n", created_by="system", is_current=True, ocr_engine="test_seed", @@ -116,7 +178,35 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)): try: rerun_ocr_for_document(db, document) except Exception: - return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) + return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303) + + return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) + + +@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse) +def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)): + document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first() + if document is None: + return RedirectResponse(url="/documents/", status_code=303) + + try: + create_ocr_corrected_pdf_version(db, document) + except Exception: + return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303) + + return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) + + +@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse) +def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)): + document = db.query(Document).filter(Document.document_id == document_id).first() + if document is None: + return RedirectResponse(url="/documents/", status_code=303) + + try: + create_field_enriched_pdf_version(db, document) + except Exception: + return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303) return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303) @@ -139,16 +229,15 @@ def save_reviewed_text( if document is None: return RedirectResponse(url="/documents/", status_code=303) - sorted_text_versions = sorted( - document.text_versions, - key=lambda x: (x.version_number, x.created_at), - reverse=True, - ) + raw_ocr, _ = _get_current_text_versions(document) + expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None) + actual_line_count = len(reviewed_text.splitlines()) - current_raw = next( - (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current), - None, - ) + if expected_line_count and actual_line_count != expected_line_count: + return RedirectResponse( + url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}", + status_code=303, + ) existing_reviewed = [ tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current @@ -156,6 +245,11 @@ def save_reviewed_text( for tv in existing_reviewed: tv.is_current = False + reviewed_layout = _apply_reviewed_lines_to_layout( + raw_ocr.layout_json if raw_ocr else None, + reviewed_text, + ) + reviewed_version = TextVersion( document_id=document.id, version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1, @@ -163,14 +257,15 @@ def save_reviewed_text( text_content=reviewed_text, created_by="mcelwain", is_current=True, - derived_from_version_id=current_raw.id if current_raw else None, + derived_from_version_id=raw_ocr.id if raw_ocr else None, + layout_json=reviewed_layout, ) db.add(reviewed_version) - if current_raw: - current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text) - current_raw.quality_flags = quality_flags or [] - current_raw.quality_note = quality_note or None + if raw_ocr: + raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text) + raw_ocr.quality_flags = quality_flags or [] + raw_ocr.quality_note = quality_note or None document.review_status = "reviewed" @@ -196,27 +291,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge if document is None: return HTMLResponse(content="Document not found", status_code=404) - sorted_text_versions = sorted( - document.text_versions, - key=lambda x: (x.version_number, x.created_at), - reverse=True, - ) + raw_ocr, reviewed_ocr = _get_current_text_versions(document) + review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr) - raw_ocr = next( - (tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current), - None, - ) - - reviewed_ocr = next( - (tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current), - None, - ) - - review_text_value = ( - reviewed_ocr.text_content - if reviewed_ocr is not None - else raw_ocr.text_content if raw_ocr is not None else "" + base_layout = ( + reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json + else raw_ocr.layout_json if raw_ocr else None ) + expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None) + actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0 + line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1)) file_url = None if document.current_path: @@ -228,6 +312,11 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge except Exception: file_url = None + app_url = str(request.url_for("document_detail", document_id=document.document_id)) + error = request.query_params.get("error") + error_expected = request.query_params.get("expected") + error_actual = request.query_params.get("actual") + return templates.TemplateResponse( request=request, name="documents/detail.html", @@ -238,8 +327,15 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge "reviewed_ocr": reviewed_ocr, "review_text_value": review_text_value, "file_url": file_url, + "app_url": app_url, "quality_flag_options": QUALITY_FLAG_OPTIONS, "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [], "current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "", + "line_numbers": line_numbers, + "expected_line_count": expected_line_count, + "actual_line_count": actual_line_count, + "error": error, + "error_expected": error_expected, + "error_actual": error_actual, }, ) diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index 74cda7b..ffd5e5b 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -3,17 +3,67 @@ {{ document.document_id }} +

Back to documents

{{ document.document_id }}

+ {% if error == "line_count_mismatch" %} +
+ Could not save reviewed OCR because line count did not match OCR layout. + Expected {{ error_expected }}, got {{ error_actual }}. +
+ {% elif error == "save_ocr_corrected_failed" %} +
+ Could not save OCR-corrected PDF. Check that reviewed OCR line count matches raw OCR line count. +
+ {% elif error == "rerun_ocr_failed" %} +
+ OCR rerun failed. +
+ {% elif error == "save_field_enriched_failed" %} +
+ Could not save field-enriched PDF. +
+ {% endif %} +

Document metadata

+

Saved PDF scaffolds

+
+ +
+
+ +
+

Document preview

{% if file_url %} {% if document.mime_type == "application/pdf" %} @@ -47,6 +105,7 @@ {{ version.version_type }} — {{ version.file_path }} — {{ version.created_at }} + {% if version.notes %}
{{ version.notes }}{% endif %} {% endfor %} @@ -84,12 +143,23 @@

No reviewed OCR saved yet.

{% endif %} +

+ Expected OCR lines: {{ expected_line_count }}
+ Current editor lines: {{ actual_line_count }} +
+ Line count mismatch may affect corrected PDF layout. + +

+
- +
-
- + +
+
{% for n in line_numbers %}{{ n }} +{% endfor %}
+

Quality flags

@@ -113,8 +183,43 @@
- +
+ +