feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation

This commit is contained in:
Sean McElwain 2026-04-03 11:56:23 -05:00
parent 0d70e6b7bb
commit e67a67f80a
7 changed files with 634 additions and 115 deletions

View File

@ -8,3 +8,5 @@ DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/documen
DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
OCR_CORRECTED_ROOT = os.getenv("OCR_CORRECTED_ROOT", "/mnt/storage/document-processor/outputs/ocr_corrected")
FIELD_ENRICHED_ROOT = os.getenv("FIELD_ENRICHED_ROOT", "/mnt/storage/document-processor/outputs/field_enriched")

View File

@ -0,0 +1,243 @@
from __future__ import annotations
import hashlib
import shutil
import subprocess
import tempfile
from pathlib import Path
from PIL import Image
from pypdf import PdfReader
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.pdfgen import canvas
from sqlalchemy import func
from sqlalchemy.orm import Session
from app.core.config import FIELD_ENRICHED_ROOT, OCR_CORRECTED_ROOT
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
def sha256_for_file(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces so it is never loaded into memory whole.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(chunk_size)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()
def get_next_document_version_number(db: Session, document_id: int) -> int:
    """Return the version number to give a document's next file version.

    Queries the largest ``version_number`` stored for ``document_id`` and adds
    one; when the query yields nothing (or zero) the result is 1.
    """
    highest_query = db.query(func.max(DocumentVersion.version_number)).filter(
        DocumentVersion.document_id == document_id
    )
    highest = highest_query.scalar()
    if not highest:
        return 1
    return highest + 1
def _build_output_path(root: str, document: Document, version_type: str) -> Path:
source = Path(document.current_path or "")
suffix = source.suffix.lower() if source.suffix else ".pdf"
filename = f"{document.document_id}_{version_type}{suffix}"
return Path(root) / filename
def _latest_current_text_version(document: Document, version_type: str) -> TextVersion | None:
candidates = [tv for tv in document.text_versions if tv.version_type == version_type and tv.is_current]
if not candidates:
return None
return sorted(candidates, key=lambda x: (x.version_number, x.created_at), reverse=True)[0]
def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
    """Rasterize the pages of ``pdf_path`` to PNG files inside ``tmpdir``.

    Runs ``pdftoppm -png`` with the output prefix ``<tmpdir>/page`` and
    returns the resulting ``page-*.png`` paths in sorted order.

    Raises:
        subprocess.CalledProcessError: if ``pdftoppm`` exits non-zero.
    """
    command = ["pdftoppm", "-png", str(pdf_path), str(tmpdir / "page")]
    subprocess.run(command, capture_output=True, text=True, check=True)
    rendered = list(tmpdir.glob("page-*.png"))
    rendered.sort()
    return rendered
def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
if not text:
return max(6.0, box_height * 0.80)
font_size = max(6.0, box_height * 0.88)
while font_size > 4.5 and stringWidth(text, "Helvetica", font_size) > box_width * 0.98:
font_size -= 0.25
min_reasonable = max(6.0, box_height * 0.68)
return max(min_reasonable, font_size)
def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
if not layout_json:
return []
flattened = []
for page in layout_json.get("pages", []):
for line in page.get("lines", []):
flattened.append(
{
"page": page["page"],
"bbox": line["bbox"],
"text": line.get("text", ""),
}
)
return flattened
def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Rebuild the document's PDF with an invisible, reviewed text layer.

    Every page of the current PDF is rasterized, drawn as a full-page image on
    a fresh PDF page, and overlaid with the line texts of the reviewed layout
    (or the raw OCR layout when the reviewed version has none) in invisible
    text render mode, positioned at the OCR line boxes.  The result is written
    under ``OCR_CORRECTED_ROOT``, recorded as a new ``DocumentVersion`` of type
    ``"ocr_corrected"``, and made the document's current file.

    Args:
        db: Session used to allocate the version number and commit the result.
        document: Document whose ``current_path`` PDF is rebuilt; its
            ``text_versions`` are read to find the current raw and reviewed text.

    Returns:
        The newly committed and refreshed ``DocumentVersion``.

    Raises:
        ValueError: if the document has no current path, lacks a current raw
            OCR or reviewed text version, is not a PDF, has no usable layout,
            has a reviewed layout whose line count differs from the raw one,
            or yields no rendered pages.
        FileNotFoundError: if the current file does not exist on disk.
        subprocess.CalledProcessError: if page rasterization fails.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")
    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")
    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)
    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")
    # A reviewed layout is only usable when it maps one-to-one onto the raw boxes.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    reader = PdfReader(str(current_file))

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        images = _render_pdf_page_images(current_file, tmpdir)
        overlay_pdf_path = tmpdir / "overlay.pdf"
        c = None
        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}

        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)

            # Only the pixel size is needed; close the handle right away so a
            # long document does not accumulate one open file per page.
            with Image.open(img_path) as img:
                img_w, img_h = img.size

            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))
            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = page_layouts.get(page_num, {"lines": []})
            # Line boxes are in the pixel space of the OCR'd image; fall back to
            # the freshly rendered image size when the layout did not record it.
            src_w = float(page_layout.get("image_width") or img_w)
            src_h = float(page_layout.get("image_height") or img_h)
            scale_x = page_w / src_w
            scale_y = page_h / src_h

            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue
                left, top, right, bottom = line["bbox"]
                pdf_x = left * scale_x
                # Boxes are measured from the top edge, PDF coordinates from the
                # bottom edge, so the vertical axis is flipped.
                pdf_y = page_h - (bottom * scale_y)
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)
                font_size = _fit_font_size(text_line, box_width, box_height)

                text_obj = c.beginText()
                text_obj.setTextRenderMode(3)  # 3 = invisible: searchable, not painted
                text_obj.setFont("Helvetica", font_size)
                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
                text_obj.textLine(text_line)
                c.drawText(text_obj)

            c.showPage()

        if c is None:
            raise ValueError("Failed to build overlay PDF")
        c.save()
        # Copy out before the temporary directory is removed.
        shutil.copy2(overlay_pdf_path, out_path)

    file_hash = sha256_for_file(out_path)
    version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="ocr_corrected",
        file_path=str(out_path),
        sha256=file_hash,
        created_by="save_ocr_corrected_pdf",
        notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
    )
    db.add(version)

    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash

    db.commit()
    db.refresh(version)
    return version
def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Record a "field_enriched" version of the document's current file.

    This is a scaffold: the current file is copied unchanged under
    ``FIELD_ENRICHED_ROOT`` (no extracted fields are embedded yet), a new
    ``DocumentVersion`` of type ``"field_enriched"`` is stored for it, and the
    document is pointed at the copy.

    Args:
        db: Session used to allocate the version number and commit the result.
        document: Document whose ``current_path`` file is copied.

    Returns:
        The newly committed and refreshed ``DocumentVersion``.

    Raises:
        ValueError: if the document has no current path.
        FileNotFoundError: if the current file does not exist on disk.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")
    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # After a previous run the document's current file already IS the enriched
    # output; shutil.copy2 raises SameFileError when source and destination
    # are the same file, so the copy is skipped in that case.
    already_in_place = out_path.exists() and current_file.samefile(out_path)
    if not already_in_place:
        shutil.copy2(current_file, out_path)

    file_hash = sha256_for_file(out_path)
    version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="field_enriched",
        file_path=str(out_path),
        sha256=file_hash,
        created_by="save_field_enriched_pdf",
        notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
    )
    db.add(version)

    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash

    db.commit()
    db.refresh(version)
    return version

View File

@ -1,12 +1,16 @@
from __future__ import annotations
import csv
import hashlib
import io
import mimetypes
import shutil
import subprocess
import tempfile
from difflib import SequenceMatcher
from pathlib import Path
from PIL import Image
from uuid import uuid4
from sqlalchemy import func
@ -61,8 +65,7 @@ def get_tesseract_version() -> str | None:
text=True,
check=True,
)
line = result.stdout.splitlines()[0].strip()
return line
return result.stdout.splitlines()[0].strip()
except Exception:
return None
@ -93,67 +96,154 @@ def extract_pdf_text(path: Path) -> str:
return ""
def ocr_image(path: Path) -> str:
try:
result = subprocess.run(
["tesseract", str(path), "stdout"],
def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
reader = csv.DictReader(io.StringIO(tsv_text), delimiter=" ")
grouped: dict[tuple[int, int, int, int], list[dict]] = {}
for row in reader:
if not row.get("text"):
continue
text = row["text"].strip()
if not text:
continue
try:
level = int(row["level"])
page_num = int(row["page_num"])
block_num = int(row["block_num"])
par_num = int(row["par_num"])
line_num = int(row["line_num"])
left = int(row["left"])
top = int(row["top"])
width = int(row["width"])
height = int(row["height"])
conf = float(row["conf"]) if row["conf"] not in ("-1", "", None) else None
except Exception:
continue
if level != 5:
continue
if page_num != page_number:
continue
key = (page_num, block_num, par_num, line_num)
grouped.setdefault(key, []).append(
{
"text": text,
"left": left,
"top": top,
"width": width,
"height": height,
"conf": conf,
}
)
lines = []
for key, words in grouped.items():
words = sorted(words, key=lambda w: w["left"])
left = min(w["left"] for w in words)
top = min(w["top"] for w in words)
right = max(w["left"] + w["width"] for w in words)
bottom = max(w["top"] + w["height"] for w in words)
line_text = " ".join(w["text"] for w in words).strip()
avg_conf = None
valid_conf = [w["conf"] for w in words if w["conf"] is not None]
if valid_conf:
avg_conf = round(sum(valid_conf) / len(valid_conf), 2)
lines.append(
{
"text": line_text,
"bbox": [left, top, right, bottom],
"confidence": avg_conf,
}
)
lines.sort(key=lambda x: (x["bbox"][1], x["bbox"][0]))
return {
"page": page_number,
"image_width": image_width,
"image_height": image_height,
"lines": lines,
}
def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
    """Run Tesseract on one image and return its text plus line layout.

    Tesseract is invoked twice: once for plain text and once for TSV output,
    which is converted into a single-page layout (page number 1) carrying the
    image's pixel dimensions.

    Returns:
        ``(text, {"pages": [page_layout]})`` with ``text`` stripped.

    Raises:
        subprocess.CalledProcessError: if a Tesseract run exits non-zero.
    """

    def run_tesseract(*extra_args):
        completed = subprocess.run(
            ["tesseract", str(path), "stdout", *extra_args],
            capture_output=True,
            text=True,
            check=True,
        )
        return completed.stdout

    with Image.open(path) as img:
        image_width, image_height = img.size

    plain_text = run_tesseract().strip()
    tsv_output = run_tesseract("tsv")
    page_layout = _parse_tsv_lines(tsv_output, 1, image_width, image_height)
    return plain_text, {"pages": [page_layout]}
def ocr_pdf_with_layout(path: Path) -> tuple[str, dict]:
with tempfile.TemporaryDirectory() as tmpdir:
output_prefix = Path(tmpdir) / "page"
subprocess.run(
["pdftoppm", "-png", str(path), str(output_prefix)],
capture_output=True,
text=True,
check=True,
)
return result.stdout.strip()
except Exception:
return ""
all_text = []
pages = []
for idx, img in enumerate(sorted(Path(tmpdir).glob("page-*.png")), start=1):
txt, layout = ocr_image_with_layout(img)
if txt:
all_text.append(txt)
if layout.get("pages"):
page_layout = layout["pages"][0]
page_layout["page"] = idx
pages.append(page_layout)
return "\n\n".join(all_text).strip(), {"pages": pages}
def ocr_pdf(path: Path) -> str:
with tempfile.TemporaryDirectory() as tmpdir:
output_prefix = Path(tmpdir) / "page"
try:
subprocess.run(
["pdftoppm", "-png", str(path), str(output_prefix)],
capture_output=True,
text=True,
check=True,
)
except Exception:
return ""
texts: list[str] = []
for img in sorted(Path(tmpdir).glob("page-*.png")):
text = ocr_image(img)
if text:
texts.append(text)
return "\n\n".join(texts).strip()
def run_ocr_only(path: Path) -> tuple[str, str | None, str | None]:
def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
suffix = path.suffix.lower()
tesseract_version = get_tesseract_version()
if suffix == ".pdf":
return ocr_pdf(path).strip(), "tesseract", tesseract_version
txt, layout = ocr_pdf_with_layout(path)
return txt.strip(), layout, "tesseract", tesseract_version
if suffix in {".jpg", ".jpeg", ".png"}:
return ocr_image(path).strip(), "tesseract", tesseract_version
return "", None, None
txt, layout = ocr_image_with_layout(path)
return txt.strip(), layout, "tesseract", tesseract_version
return "", None, None, None
def get_raw_text_for_document(path: Path) -> tuple[str, str | None, str | None, str | None]:
def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None, str | None, str | None]:
suffix = path.suffix.lower()
if suffix == ".pdf":
extracted = extract_pdf_text(path)
if len(extracted.strip()) >= 40:
return extracted, "pdftotext", get_pdftotext_version(), "initial_ingest"
return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest"
ocr_text = ocr_pdf(path).strip()
return ocr_text, "tesseract", get_tesseract_version(), "initial_ingest_fallback"
ocr_text, layout, engine, version = run_ocr_only(path)
return ocr_text, layout, engine, version, "initial_ingest_fallback"
if suffix in {".jpg", ".jpeg", ".png"}:
return ocr_image(path).strip(), "tesseract", get_tesseract_version(), "initial_ingest"
ocr_text, layout, engine, version = run_ocr_only(path)
return ocr_text, layout, engine, version, "initial_ingest"
return "", None, None, None
return "", None, None, None, None
def compute_quality_score(source_text: str, reviewed_text: str) -> float:
@ -173,7 +263,6 @@ def archive_document(
) -> Document:
if not source.exists():
raise FileNotFoundError(f"Source file not found: {source}")
if not is_supported_file(source):
raise ValueError(f"Unsupported file type: {source.suffix}")
@ -187,7 +276,7 @@ def archive_document(
mime_type = guess_mime_type(current_path)
sha256_current = sha256_for_file(current_path)
raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
raw_text, layout_json, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
document = Document(
document_id=document_id,
@ -230,6 +319,7 @@ def archive_document(
rerun_source=rerun_source,
quality_flags=[],
quality_note=None,
layout_json=layout_json,
)
db.add(text_version)
@ -246,7 +336,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
if not current_file.exists():
raise FileNotFoundError(f"Current file not found: {current_file}")
raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
raw_text, layout_json, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
if not raw_text:
raise ValueError("OCR produced no text")
@ -278,6 +368,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
quality_flags=[],
quality_note=None,
derived_from_version_id=previous_raw_id,
layout_json=layout_json,
)
db.add(new_text)
@ -288,19 +379,9 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
return new_text
def ingest_file(
db: Session,
file_path: str,
source_system: str,
document_type: str = "receipt",
) -> Document:
def ingest_file(db: Session, file_path: str, source_system: str, document_type: str = "receipt") -> Document:
source = Path(file_path).expanduser().resolve()
return archive_document(
db=db,
source=source,
source_system=source_system,
document_type=document_type,
)
return archive_document(db=db, source=source, source_system=source_system, document_type=document_type)
def ingest_uploaded_file(
@ -321,12 +402,7 @@ def ingest_uploaded_file(
staged_path = upload_root / staged_name
staged_path.write_bytes(file_bytes)
return archive_document(
db=db,
source=staged_path,
source_system=source_system,
document_type=document_type,
)
return archive_document(db=db, source=staged_path, source_system=source_system, document_type=document_type)
def ingest_directory(
@ -337,7 +413,6 @@ def ingest_directory(
document_type: str = "receipt",
) -> list[Document]:
source_dir = Path(directory_path).expanduser().resolve()
if not source_dir.exists() or not source_dir.is_dir():
raise NotADirectoryError(f"Directory not found: {source_dir}")
@ -349,12 +424,7 @@ def ingest_directory(
continue
try:
ingested.append(
ingest_file(
db=db,
file_path=str(path),
source_system=source_system,
document_type=document_type,
)
ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type)
)
except Exception:
continue

View File

@ -16,6 +16,7 @@ class Document(Base):
source_path: Mapped[str] = mapped_column(Text, nullable=False)
original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
share_path: Mapped[str | None] = mapped_column(Text, nullable=True)
original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)

View File

@ -16,7 +16,7 @@ class TextVersion(Base):
)
version_number: Mapped[int] = mapped_column(Integer, nullable=False)
version_type: Mapped[str] = mapped_column(String(50), nullable=False) # raw_ocr, reviewed
version_type: Mapped[str] = mapped_column(String(50), nullable=False)
text_content: Mapped[str] = mapped_column(Text, nullable=False)
@ -36,6 +36,8 @@ class TextVersion(Base):
nullable=True,
)
layout_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)
created_at: Mapped[datetime] = mapped_column(
DateTime, default=datetime.utcnow, nullable=False
)

View File

@ -1,3 +1,4 @@
from copy import deepcopy
from pathlib import Path
from uuid import uuid4
@ -7,6 +8,10 @@ from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session, selectinload
from app.db.deps import get_db
from app.logic.document_outputs import (
create_field_enriched_pdf_version,
create_ocr_corrected_pdf_version,
)
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
from app.models.document import Document
from app.models.document_version import DocumentVersion
@ -39,6 +44,68 @@ QUALITY_FLAG_OPTIONS = [
]
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
sorted_text_versions = sorted(
document.text_versions,
key=lambda x: (x.version_number, x.created_at),
reverse=True,
)
raw_ocr = next(
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
None,
)
reviewed_ocr = next(
(tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
None,
)
return raw_ocr, reviewed_ocr
def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
if not layout_json:
return []
lines: list[str] = []
for page in layout_json.get("pages", []):
for line in page.get("lines", []):
lines.append((line.get("text") or "").strip())
return lines
def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
# Prefer the current raw OCR in the editor so rerun OCR immediately refreshes
# the editable line set. Reviewed text remains visible above as history/state.
source = raw_ocr or reviewed_ocr
if source and source.layout_json:
return "\n".join(_extract_line_texts_from_layout(source.layout_json))
if source and source.text_content:
return source.text_content
return ""
def _line_count_from_layout(layout_json: dict | None) -> int:
    """Count the lines across all pages of a layout (0 for a missing layout)."""
    line_texts = _extract_line_texts_from_layout(layout_json)
    return len(line_texts)
def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None:
if not base_layout:
return None
reviewed_lines = reviewed_text.splitlines()
new_layout = deepcopy(base_layout)
idx = 0
for page in new_layout.get("pages", []):
for line in page.get("lines", []):
line["text"] = reviewed_lines[idx] if idx < len(reviewed_lines) else ""
idx += 1
return new_layout
@router.get("/", response_class=HTMLResponse)
def list_documents(request: Request, db: Session = Depends(get_db)):
documents = db.query(Document).order_by(Document.created_at.desc()).all()
@ -85,12 +152,7 @@ def test_ingest(db: Session = Depends(get_db)):
document_id=document.id,
version_number=1,
version_type="raw_ocr",
text_content=(
"CVS PHARMACY\n"
"Date: 2026-04-01\n"
"Total: 12.34 USD\n"
"Household supplies\n"
),
text_content="CVS PHARMACY\nDate: 2026-04-01\nTotal: 12.34 USD\nHousehold supplies\n",
created_by="system",
is_current=True,
ocr_engine="test_seed",
@ -116,7 +178,35 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
try:
rerun_ocr_for_document(db, document)
except Exception:
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
    """Generate the OCR-corrected PDF for a document, then redirect to its page.

    Redirects to the document list when the document does not exist.  When
    generation fails, the failure is logged and the redirect carries
    ``?error=save_ocr_corrected_failed`` so the detail page can show a message.
    """
    # Function-scope import keeps this handler self-contained.
    import logging

    document = (
        db.query(Document)
        .options(selectinload(Document.text_versions))
        .filter(Document.document_id == document_id)
        .first()
    )
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        create_ocr_corrected_pdf_version(db, document)
    except Exception:
        # The user only sees a generic error code; keep the real cause in the log.
        logging.getLogger(__name__).exception(
            "Saving OCR-corrected PDF failed for document %s", document_id
        )
        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303)

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
    """Generate the field-enriched PDF for a document, then redirect to its page.

    Redirects to the document list when the document does not exist.  When
    generation fails, the failure is logged and the redirect carries
    ``?error=save_field_enriched_failed`` so the detail page can show a message.
    """
    # Function-scope import keeps this handler self-contained.
    import logging

    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        create_field_enriched_pdf_version(db, document)
    except Exception:
        # The user only sees a generic error code; keep the real cause in the log.
        logging.getLogger(__name__).exception(
            "Saving field-enriched PDF failed for document %s", document_id
        )
        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303)

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
@ -139,16 +229,15 @@ def save_reviewed_text(
if document is None:
return RedirectResponse(url="/documents/", status_code=303)
sorted_text_versions = sorted(
document.text_versions,
key=lambda x: (x.version_number, x.created_at),
reverse=True,
)
raw_ocr, _ = _get_current_text_versions(document)
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
actual_line_count = len(reviewed_text.splitlines())
current_raw = next(
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
None,
)
if expected_line_count and actual_line_count != expected_line_count:
return RedirectResponse(
url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}",
status_code=303,
)
existing_reviewed = [
tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current
@ -156,6 +245,11 @@ def save_reviewed_text(
for tv in existing_reviewed:
tv.is_current = False
reviewed_layout = _apply_reviewed_lines_to_layout(
raw_ocr.layout_json if raw_ocr else None,
reviewed_text,
)
reviewed_version = TextVersion(
document_id=document.id,
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
@ -163,14 +257,15 @@ def save_reviewed_text(
text_content=reviewed_text,
created_by="mcelwain",
is_current=True,
derived_from_version_id=current_raw.id if current_raw else None,
derived_from_version_id=raw_ocr.id if raw_ocr else None,
layout_json=reviewed_layout,
)
db.add(reviewed_version)
if current_raw:
current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
current_raw.quality_flags = quality_flags or []
current_raw.quality_note = quality_note or None
if raw_ocr:
raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
raw_ocr.quality_flags = quality_flags or []
raw_ocr.quality_note = quality_note or None
document.review_status = "reviewed"
@ -196,27 +291,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
if document is None:
return HTMLResponse(content="Document not found", status_code=404)
sorted_text_versions = sorted(
document.text_versions,
key=lambda x: (x.version_number, x.created_at),
reverse=True,
)
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)
raw_ocr = next(
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
None,
)
reviewed_ocr = next(
(tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
None,
)
review_text_value = (
reviewed_ocr.text_content
if reviewed_ocr is not None
else raw_ocr.text_content if raw_ocr is not None else ""
base_layout = (
reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
else raw_ocr.layout_json if raw_ocr else None
)
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
file_url = None
if document.current_path:
@ -228,6 +312,11 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
except Exception:
file_url = None
app_url = str(request.url_for("document_detail", document_id=document.document_id))
error = request.query_params.get("error")
error_expected = request.query_params.get("expected")
error_actual = request.query_params.get("actual")
return templates.TemplateResponse(
request=request,
name="documents/detail.html",
@ -238,8 +327,15 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
"reviewed_ocr": reviewed_ocr,
"review_text_value": review_text_value,
"file_url": file_url,
"app_url": app_url,
"quality_flag_options": QUALITY_FLAG_OPTIONS,
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
"line_numbers": line_numbers,
"expected_line_count": expected_line_count,
"actual_line_count": actual_line_count,
"error": error,
"error_expected": error_expected,
"error_actual": error_actual,
},
)

View File

@ -3,17 +3,67 @@
<head>
<meta charset="UTF-8">
<title>{{ document.document_id }}</title>
<style>
body { font-family: sans-serif; }
textarea { font-family: monospace; }
.editor-wrap {
display: flex;
align-items: flex-start;
gap: 0.5rem;
}
.line-numbers {
font-family: monospace;
white-space: pre;
text-align: right;
color: #666;
user-select: none;
padding-top: 2px;
min-width: 3rem;
}
.line-warning {
color: #8a5a00;
font-weight: 600;
}
.error-box {
background: #ffe8e8;
color: #8b0000;
padding: 0.75rem;
border: 1px solid #cc9999;
margin-bottom: 1rem;
}
</style>
</head>
<body>
<p><a href="/documents/">Back to documents</a></p>
<h1>{{ document.document_id }}</h1>
{% if error == "line_count_mismatch" %}
<div class="error-box">
Could not save reviewed OCR because line count did not match OCR layout.
Expected {{ error_expected }}, got {{ error_actual }}.
</div>
{% elif error == "save_ocr_corrected_failed" %}
<div class="error-box">
Could not save OCR-corrected PDF. Check that reviewed OCR line count matches raw OCR line count.
</div>
{% elif error == "rerun_ocr_failed" %}
<div class="error-box">
OCR rerun failed.
</div>
{% elif error == "save_field_enriched_failed" %}
<div class="error-box">
Could not save field-enriched PDF.
</div>
{% endif %}
<h2>Document metadata</h2>
<ul>
<li>Type: {{ document.document_type }}</li>
<li>Source path: {{ document.source_path }}</li>
<li>Current path: {{ document.current_path }}</li>
<li>Share path: {{ document.share_path or "" }}</li>
<li>App URL: <a href="{{ app_url }}">{{ app_url }}</a></li>
<li>Original filename: {{ document.original_filename }}</li>
<li>Canonical filename: {{ document.canonical_filename }}</li>
<li>MIME type: {{ document.mime_type }}</li>
@ -25,6 +75,14 @@
<li>Updated at: {{ document.updated_at }}</li>
</ul>
<h2>Saved PDF scaffolds</h2>
<form method="post" action="/documents/{{ document.document_id }}/save-ocr-corrected-pdf" style="display:inline;">
<button type="submit">Save OCR-corrected PDF</button>
</form>
<form method="post" action="/documents/{{ document.document_id }}/save-field-enriched-pdf" style="display:inline; margin-left: 1rem;">
<button type="submit">Save field-enriched PDF</button>
</form>
<h2>Document preview</h2>
{% if file_url %}
{% if document.mime_type == "application/pdf" %}
@ -47,6 +105,7 @@
{{ version.version_type }} —
{{ version.file_path }} —
{{ version.created_at }}
{% if version.notes %}<br><em>{{ version.notes }}</em>{% endif %}
</li>
{% endfor %}
</ul>
@ -84,12 +143,23 @@
<p>No reviewed OCR saved yet.</p>
{% endif %}
<p>
Expected OCR lines: <span id="expected-lines">{{ expected_line_count }}</span><br>
Current editor lines: <span id="actual-lines">{{ actual_line_count }}</span>
<br><span id="line-warning" class="line-warning" {% if expected_line_count == actual_line_count %}style="display:none;"{% endif %}>
Line count mismatch may affect corrected PDF layout.
</span>
</p>
<form method="post" action="/documents/{{ document.document_id }}/review-text">
<div>
<label for="reviewed_text">Edit reviewed OCR text:</label>
<label for="reviewed_text">Edit reviewed OCR text (one line per OCR line):</label>
</div>
<div>
<textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
<div class="editor-wrap">
<div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
{% endfor %}</div>
<textarea id="reviewed_text" name="reviewed_text" rows="{{ [actual_line_count + 2, 20]|max }}" cols="100">{{ review_text_value }}</textarea>
</div>
<h3>Quality flags</h3>
@ -113,8 +183,43 @@
</div>
<div style="margin-top: 1rem;">
<button type="submit">Save reviewed OCR</button>
<button type="submit" id="save-reviewed-btn">Save reviewed OCR</button>
</div>
</form>
<script>
const textarea = document.getElementById("reviewed_text");
const expectedLines = parseInt(document.getElementById("expected-lines").textContent || "0", 10);
const actualLinesEl = document.getElementById("actual-lines");
const warningEl = document.getElementById("line-warning");
const saveBtn = document.getElementById("save-reviewed-btn");
const lineNumbersEl = document.getElementById("line-numbers");
function countLines(text) {
  // An empty editor has zero lines; otherwise each "\n" starts another line.
  return text.length === 0 ? 0 : text.split('\n').length;
}
function rebuildLineNumbers(lineCount) {
  // Render "1\n2\n...\n<lineCount>\n" into the line-number gutter.
  const rows = [];
  for (let n = 1; n <= lineCount; n += 1) {
    rows.push(n + "\n");
  }
  lineNumbersEl.textContent = rows.join("");
}
function updateEditorState() {
  // Refresh the live line counter, the gutter, the mismatch warning and the
  // save button from the editor's current content.
  const currentCount = countLines(textarea.value);
  const hasMismatch = expectedLines > 0 && currentCount !== expectedLines;

  actualLinesEl.textContent = String(currentCount);
  rebuildLineNumbers(Math.max(currentCount, expectedLines));
  warningEl.style.display = hasMismatch ? "inline" : "none";
  // Saving is blocked while the editor's line count differs from the OCR layout.
  saveBtn.disabled = hasMismatch;
}
textarea.addEventListener("input", updateEditorState);
updateEditorState();
</script>
</body>
</html>