feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation
This commit is contained in:
parent
0d70e6b7bb
commit
e67a67f80a
|
|
@ -8,3 +8,5 @@ DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/documen
|
|||
DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
|
||||
INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
|
||||
UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
|
||||
OCR_CORRECTED_ROOT = os.getenv("OCR_CORRECTED_ROOT", "/mnt/storage/document-processor/outputs/ocr_corrected")
|
||||
FIELD_ENRICHED_ROOT = os.getenv("FIELD_ENRICHED_ROOT", "/mnt/storage/document-processor/outputs/field_enriched")
|
||||
|
|
|
|||
|
|
@ -0,0 +1,243 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader
|
||||
from reportlab.lib.utils import ImageReader
|
||||
from reportlab.pdfbase.pdfmetrics import stringWidth
|
||||
from reportlab.pdfgen import canvas
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.core.config import FIELD_ENRICHED_ROOT, OCR_CORRECTED_ROOT
|
||||
from app.models.document import Document
|
||||
from app.models.document_version import DocumentVersion
|
||||
from app.models.text_version import TextVersion
|
||||
|
||||
|
||||
def sha256_for_file(path: Path) -> str:
    """Return the SHA-256 hex digest of *path*, streaming in 1 MiB chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(1024 * 1024)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def get_next_document_version_number(db: Session, document_id: int) -> int:
    """Return the next free version number for a document (1 when it has none)."""
    current_max = (
        db.query(func.max(DocumentVersion.version_number))
        .filter(DocumentVersion.document_id == document_id)
        .scalar()
    )
    if current_max is None:
        return 1
    return current_max + 1
|
||||
|
||||
|
||||
def _build_output_path(root: str, document: Document, version_type: str) -> Path:
|
||||
source = Path(document.current_path or "")
|
||||
suffix = source.suffix.lower() if source.suffix else ".pdf"
|
||||
filename = f"{document.document_id}_{version_type}{suffix}"
|
||||
return Path(root) / filename
|
||||
|
||||
|
||||
def _latest_current_text_version(document: Document, version_type: str) -> TextVersion | None:
|
||||
candidates = [tv for tv in document.text_versions if tv.version_type == version_type and tv.is_current]
|
||||
if not candidates:
|
||||
return None
|
||||
return sorted(candidates, key=lambda x: (x.version_number, x.created_at), reverse=True)[0]
|
||||
|
||||
|
||||
def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
    """Rasterize each page of *pdf_path* to a PNG in *tmpdir* via pdftoppm.

    Returns the generated images sorted by name, which matches page order for
    pdftoppm's ``page-N.png`` naming. Raises CalledProcessError on failure.
    """
    command = ["pdftoppm", "-png", str(pdf_path), str(tmpdir / "page")]
    subprocess.run(command, capture_output=True, text=True, check=True)
    return sorted(tmpdir.glob("page-*.png"))
|
||||
|
||||
|
||||
def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
    """Choose a Helvetica font size that approximately fills an OCR line box.

    Starts just under the box height and shrinks in 0.25pt steps until the
    rendered width fits, but never drops below a floor proportional to the
    box height (so very long lines stay legible rather than shrinking forever).
    """
    if not text:
        return max(6.0, box_height * 0.80)

    size = max(6.0, box_height * 0.88)
    width_limit = box_width * 0.98
    while size > 4.5 and stringWidth(text, "Helvetica", size) > width_limit:
        size -= 0.25

    floor = max(6.0, box_height * 0.68)
    return max(floor, size)
|
||||
|
||||
|
||||
def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
|
||||
if not layout_json:
|
||||
return []
|
||||
|
||||
flattened = []
|
||||
for page in layout_json.get("pages", []):
|
||||
for line in page.get("lines", []):
|
||||
flattened.append(
|
||||
{
|
||||
"page": page["page"],
|
||||
"bbox": line["bbox"],
|
||||
"text": line.get("text", ""),
|
||||
}
|
||||
)
|
||||
return flattened
|
||||
|
||||
|
||||
def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Rebuild a searchable PDF whose hidden text layer uses the reviewed OCR text.

    Renders each page of the current PDF to an image, draws the image onto a new
    PDF, then overlays invisible text (render mode 3) positioned at each OCR
    line's bounding box. Records the result as a new ``ocr_corrected``
    DocumentVersion and points the document's current path at it.

    Raises:
        ValueError: missing current_path, missing raw/reviewed text versions,
            non-PDF input, missing line boxes, or raw/reviewed line-count mismatch.
        FileNotFoundError: the document's current file does not exist.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")

    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)

    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")

    # Reviewed text must map 1:1 onto the raw OCR line boxes to be positioned.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    # Prefer the reviewed layout (corrected text); fall back to raw OCR layout.
    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Used only to read each page's mediabox dimensions for the overlay canvas.
    reader = PdfReader(str(current_file))

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        images = _render_pdf_page_images(current_file, tmpdir)

        overlay_pdf_path = tmpdir / "overlay.pdf"
        # Canvas is created lazily on the first page so its initial pagesize
        # matches that page; later pages call setPageSize instead.
        c = None

        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}

        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)

            img = Image.open(img_path)

            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))

            # Draw the rasterized page as the visible content.
            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = page_layouts.get(page_num, {"lines": []})
            # OCR bboxes are in rendered-image pixel space; fall back to the
            # actual image size when the layout lacks recorded dimensions.
            src_w = float(page_layout.get("image_width") or img.size[0])
            src_h = float(page_layout.get("image_height") or img.size[1])

            scale_x = page_w / src_w
            scale_y = page_h / src_h

            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue

                left, top, right, bottom = line["bbox"]

                # Image origin is top-left; PDF origin is bottom-left, so flip Y.
                pdf_x = left * scale_x
                pdf_y = page_h - (bottom * scale_y)
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)

                font_size = _fit_font_size(text_line, box_width, box_height)

                text_obj = c.beginText()
                # Render mode 3 = invisible text: searchable/selectable but not drawn.
                text_obj.setTextRenderMode(3)
                text_obj.setFont("Helvetica", font_size)
                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
                text_obj.textLine(text_line)
                c.drawText(text_obj)

            # Finalize this page before moving to the next.
            c.showPage()

        if c is None:
            # No page images were rendered, so no canvas was ever created.
            raise ValueError("Failed to build overlay PDF")

        c.save()
        # Copy out before the TemporaryDirectory (and overlay.pdf) is deleted.
        shutil.copy2(overlay_pdf_path, out_path)

    file_hash = sha256_for_file(out_path)

    version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="ocr_corrected",
        file_path=str(out_path),
        sha256=file_hash,
        created_by="save_ocr_corrected_pdf",
        notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
    )
    db.add(version)

    # The corrected PDF becomes the document's current file.
    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash

    db.commit()
    db.refresh(version)
    return version
|
||||
|
||||
|
||||
def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Create a ``field_enriched`` output version for the document.

    Currently a scaffold: copies the current file to the field-enriched output
    root, records a new DocumentVersion for the copy, and makes that copy the
    document's current file.

    Raises:
        ValueError: the document has no current_path.
        FileNotFoundError: the current file is missing on disk.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    source_file = Path(document.current_path)
    if not source_file.exists():
        raise FileNotFoundError(f"Current file not found: {source_file}")

    target = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
    target.parent.mkdir(parents=True, exist_ok=True)

    shutil.copy2(source_file, target)
    digest = sha256_for_file(target)

    new_version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="field_enriched",
        file_path=str(target),
        sha256=digest,
        created_by="save_field_enriched_pdf",
        notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
    )
    db.add(new_version)

    # The copy becomes the document's current file.
    document.current_path = str(target)
    document.canonical_filename = target.name
    document.sha256_current = digest

    db.commit()
    db.refresh(new_version)
    return new_version
|
||||
|
|
@ -1,12 +1,16 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import hashlib
|
||||
import io
|
||||
import mimetypes
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from difflib import SequenceMatcher
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
from uuid import uuid4
|
||||
|
||||
from sqlalchemy import func
|
||||
|
|
@ -61,8 +65,7 @@ def get_tesseract_version() -> str | None:
|
|||
text=True,
|
||||
check=True,
|
||||
)
|
||||
line = result.stdout.splitlines()[0].strip()
|
||||
return line
|
||||
return result.stdout.splitlines()[0].strip()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
|
@ -93,67 +96,154 @@ def extract_pdf_text(path: Path) -> str:
|
|||
return ""
|
||||
|
||||
|
||||
def ocr_image(path: Path) -> str:
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["tesseract", str(path), "stdout"],
|
||||
def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
|
||||
reader = csv.DictReader(io.StringIO(tsv_text), delimiter=" ")
|
||||
grouped: dict[tuple[int, int, int, int], list[dict]] = {}
|
||||
|
||||
for row in reader:
|
||||
if not row.get("text"):
|
||||
continue
|
||||
text = row["text"].strip()
|
||||
if not text:
|
||||
continue
|
||||
|
||||
try:
|
||||
level = int(row["level"])
|
||||
page_num = int(row["page_num"])
|
||||
block_num = int(row["block_num"])
|
||||
par_num = int(row["par_num"])
|
||||
line_num = int(row["line_num"])
|
||||
left = int(row["left"])
|
||||
top = int(row["top"])
|
||||
width = int(row["width"])
|
||||
height = int(row["height"])
|
||||
conf = float(row["conf"]) if row["conf"] not in ("-1", "", None) else None
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if level != 5:
|
||||
continue
|
||||
if page_num != page_number:
|
||||
continue
|
||||
|
||||
key = (page_num, block_num, par_num, line_num)
|
||||
grouped.setdefault(key, []).append(
|
||||
{
|
||||
"text": text,
|
||||
"left": left,
|
||||
"top": top,
|
||||
"width": width,
|
||||
"height": height,
|
||||
"conf": conf,
|
||||
}
|
||||
)
|
||||
|
||||
lines = []
|
||||
for key, words in grouped.items():
|
||||
words = sorted(words, key=lambda w: w["left"])
|
||||
left = min(w["left"] for w in words)
|
||||
top = min(w["top"] for w in words)
|
||||
right = max(w["left"] + w["width"] for w in words)
|
||||
bottom = max(w["top"] + w["height"] for w in words)
|
||||
line_text = " ".join(w["text"] for w in words).strip()
|
||||
avg_conf = None
|
||||
valid_conf = [w["conf"] for w in words if w["conf"] is not None]
|
||||
if valid_conf:
|
||||
avg_conf = round(sum(valid_conf) / len(valid_conf), 2)
|
||||
|
||||
lines.append(
|
||||
{
|
||||
"text": line_text,
|
||||
"bbox": [left, top, right, bottom],
|
||||
"confidence": avg_conf,
|
||||
}
|
||||
)
|
||||
|
||||
lines.sort(key=lambda x: (x["bbox"][1], x["bbox"][0]))
|
||||
return {
|
||||
"page": page_number,
|
||||
"image_width": image_width,
|
||||
"image_height": image_height,
|
||||
"lines": lines,
|
||||
}
|
||||
|
||||
|
||||
def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
    """OCR one image with tesseract, returning (plain text, layout dict).

    Runs tesseract twice: once for plain text and once in TSV mode for
    word-level boxes, which are parsed into a single-page layout.
    """
    with Image.open(path) as img:
        image_width, image_height = img.size

    def run_tesseract(*extra_args: str) -> str:
        completed = subprocess.run(
            ["tesseract", str(path), "stdout", *extra_args],
            capture_output=True,
            text=True,
            check=True,
        )
        return completed.stdout

    txt = run_tesseract().strip()
    tsv = run_tesseract("tsv")

    layout = {"pages": [_parse_tsv_lines(tsv, 1, image_width, image_height)]}
    return txt, layout
|
||||
|
||||
|
||||
def ocr_pdf_with_layout(path: Path) -> tuple[str, dict]:
    """Rasterize a PDF and OCR each page, returning combined text and layout.

    Pages are rendered to PNG with pdftoppm, OCR'd individually via
    ocr_image_with_layout, and the per-page layouts renumbered to their
    position in the PDF. Page texts are joined with blank lines.

    Raises:
        subprocess.CalledProcessError: if pdftoppm fails.
    """
    # Fix: removed stray diff remnants (`return result.stdout.strip()` /
    # orphan `except`) left over from the deleted old ocr_image body — they
    # referenced an undefined name and made the function unparseable.
    with tempfile.TemporaryDirectory() as tmpdir:
        output_prefix = Path(tmpdir) / "page"
        subprocess.run(
            ["pdftoppm", "-png", str(path), str(output_prefix)],
            capture_output=True,
            text=True,
            check=True,
        )

        all_text = []
        pages = []

        for idx, img in enumerate(sorted(Path(tmpdir).glob("page-*.png")), start=1):
            txt, layout = ocr_image_with_layout(img)
            if txt:
                all_text.append(txt)
            if layout.get("pages"):
                page_layout = layout["pages"][0]
                # Single-image OCR always reports page 1; renumber to PDF order.
                page_layout["page"] = idx
                pages.append(page_layout)

        return "\n\n".join(all_text).strip(), {"pages": pages}
|
||||
|
||||
|
||||
def ocr_pdf(path: Path) -> str:
    """OCR every page of a PDF and join the per-page texts with blank lines.

    Best-effort: if pdftoppm fails to rasterize the PDF, returns "" instead
    of raising.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        prefix = Path(tmpdir) / "page"
        try:
            subprocess.run(
                ["pdftoppm", "-png", str(path), str(prefix)],
                capture_output=True,
                text=True,
                check=True,
            )
        except Exception:
            return ""

        page_texts = (ocr_image(image) for image in sorted(Path(tmpdir).glob("page-*.png")))
        return "\n\n".join(text for text in page_texts if text).strip()
|
||||
|
||||
|
||||
def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
    """OCR a file (PDF or image) and return (text, layout, engine, engine_version).

    Unsupported extensions yield ("", None, None, None). Engine is always
    "tesseract" for supported types.
    """
    # Fix: collapsed the diff-contaminated body (old 3-tuple and new 4-tuple
    # versions interleaved) into the new layout-aware 4-tuple implementation.
    suffix = path.suffix.lower()
    tesseract_version = get_tesseract_version()

    if suffix == ".pdf":
        txt, layout = ocr_pdf_with_layout(path)
        return txt.strip(), layout, "tesseract", tesseract_version
    if suffix in {".jpg", ".jpeg", ".png"}:
        txt, layout = ocr_image_with_layout(path)
        return txt.strip(), layout, "tesseract", tesseract_version
    return "", None, None, None
|
||||
|
||||
|
||||
def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None, str | None, str | None]:
    """Extract raw text for a newly ingested file.

    Returns (text, layout, engine, engine_version, rerun_source).

    PDFs first try embedded-text extraction (pdftotext); if that yields fewer
    than 40 non-whitespace-trimmed characters the PDF is assumed to be a scan
    and falls back to OCR. Images always go through OCR. Unsupported types
    return empty results.
    """
    # Fix: collapsed the diff-contaminated body (old 4-tuple and new 5-tuple
    # versions interleaved) into the new layout-aware 5-tuple implementation.
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        extracted = extract_pdf_text(path)
        if len(extracted.strip()) >= 40:
            # Embedded text is good enough; no layout is produced by pdftotext.
            return extracted, None, "pdftotext", get_pdftotext_version(), "initial_ingest"

        ocr_text, layout, engine, version = run_ocr_only(path)
        return ocr_text, layout, engine, version, "initial_ingest_fallback"

    if suffix in {".jpg", ".jpeg", ".png"}:
        ocr_text, layout, engine, version = run_ocr_only(path)
        return ocr_text, layout, engine, version, "initial_ingest"

    return "", None, None, None, None
|
||||
|
||||
|
||||
def compute_quality_score(source_text: str, reviewed_text: str) -> float:
|
||||
|
|
@ -173,7 +263,6 @@ def archive_document(
|
|||
) -> Document:
|
||||
if not source.exists():
|
||||
raise FileNotFoundError(f"Source file not found: {source}")
|
||||
|
||||
if not is_supported_file(source):
|
||||
raise ValueError(f"Unsupported file type: {source.suffix}")
|
||||
|
||||
|
|
@ -187,7 +276,7 @@ def archive_document(
|
|||
mime_type = guess_mime_type(current_path)
|
||||
sha256_current = sha256_for_file(current_path)
|
||||
|
||||
raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
|
||||
raw_text, layout_json, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
|
||||
|
||||
document = Document(
|
||||
document_id=document_id,
|
||||
|
|
@ -230,6 +319,7 @@ def archive_document(
|
|||
rerun_source=rerun_source,
|
||||
quality_flags=[],
|
||||
quality_note=None,
|
||||
layout_json=layout_json,
|
||||
)
|
||||
db.add(text_version)
|
||||
|
||||
|
|
@ -246,7 +336,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
|
|||
if not current_file.exists():
|
||||
raise FileNotFoundError(f"Current file not found: {current_file}")
|
||||
|
||||
raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
|
||||
raw_text, layout_json, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
|
||||
if not raw_text:
|
||||
raise ValueError("OCR produced no text")
|
||||
|
||||
|
|
@ -278,6 +368,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
|
|||
quality_flags=[],
|
||||
quality_note=None,
|
||||
derived_from_version_id=previous_raw_id,
|
||||
layout_json=layout_json,
|
||||
)
|
||||
db.add(new_text)
|
||||
|
||||
|
|
@ -288,19 +379,9 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
|
|||
return new_text
|
||||
|
||||
|
||||
def ingest_file(db: Session, file_path: str, source_system: str, document_type: str = "receipt") -> Document:
    """Ingest a single file from disk by archiving it as a new Document.

    Expands ~ and resolves the path before delegating to archive_document.
    """
    # Fix: removed the duplicated pre-refactor definition left interleaved by
    # the diff; only the compact new-side implementation remains.
    source = Path(file_path).expanduser().resolve()
    return archive_document(db=db, source=source, source_system=source_system, document_type=document_type)
|
||||
|
||||
|
||||
def ingest_uploaded_file(
|
||||
|
|
@ -321,12 +402,7 @@ def ingest_uploaded_file(
|
|||
staged_path = upload_root / staged_name
|
||||
staged_path.write_bytes(file_bytes)
|
||||
|
||||
return archive_document(
|
||||
db=db,
|
||||
source=staged_path,
|
||||
source_system=source_system,
|
||||
document_type=document_type,
|
||||
)
|
||||
return archive_document(db=db, source=staged_path, source_system=source_system, document_type=document_type)
|
||||
|
||||
|
||||
def ingest_directory(
|
||||
|
|
@ -337,7 +413,6 @@ def ingest_directory(
|
|||
document_type: str = "receipt",
|
||||
) -> list[Document]:
|
||||
source_dir = Path(directory_path).expanduser().resolve()
|
||||
|
||||
if not source_dir.exists() or not source_dir.is_dir():
|
||||
raise NotADirectoryError(f"Directory not found: {source_dir}")
|
||||
|
||||
|
|
@ -349,12 +424,7 @@ def ingest_directory(
|
|||
continue
|
||||
try:
|
||||
ingested.append(
|
||||
ingest_file(
|
||||
db=db,
|
||||
file_path=str(path),
|
||||
source_system=source_system,
|
||||
document_type=document_type,
|
||||
)
|
||||
ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type)
|
||||
)
|
||||
except Exception:
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ class Document(Base):
|
|||
source_path: Mapped[str] = mapped_column(Text, nullable=False)
|
||||
original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
share_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
||||
original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
|
|
@ -48,4 +49,4 @@ class Document(Base):
|
|||
layer1_candidates: Mapped[list["Layer1Candidate"]] = relationship(
|
||||
back_populates="document",
|
||||
cascade="all, delete-orphan",
|
||||
)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ class TextVersion(Base):
|
|||
)
|
||||
|
||||
version_number: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||
version_type: Mapped[str] = mapped_column(String(50), nullable=False) # raw_ocr, reviewed
|
||||
version_type: Mapped[str] = mapped_column(String(50), nullable=False)
|
||||
|
||||
text_content: Mapped[str] = mapped_column(Text, nullable=False)
|
||||
|
||||
|
|
@ -36,6 +36,8 @@ class TextVersion(Base):
|
|||
nullable=True,
|
||||
)
|
||||
|
||||
layout_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)
|
||||
|
||||
created_at: Mapped[datetime] = mapped_column(
|
||||
DateTime, default=datetime.utcnow, nullable=False
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
from uuid import uuid4
|
||||
|
||||
|
|
@ -7,6 +8,10 @@ from fastapi.templating import Jinja2Templates
|
|||
from sqlalchemy.orm import Session, selectinload
|
||||
|
||||
from app.db.deps import get_db
|
||||
from app.logic.document_outputs import (
|
||||
create_field_enriched_pdf_version,
|
||||
create_ocr_corrected_pdf_version,
|
||||
)
|
||||
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
||||
from app.models.document import Document
|
||||
from app.models.document_version import DocumentVersion
|
||||
|
|
@ -39,6 +44,68 @@ QUALITY_FLAG_OPTIONS = [
|
|||
]
|
||||
|
||||
|
||||
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
|
||||
sorted_text_versions = sorted(
|
||||
document.text_versions,
|
||||
key=lambda x: (x.version_number, x.created_at),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
raw_ocr = next(
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
||||
None,
|
||||
)
|
||||
|
||||
reviewed_ocr = next(
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
|
||||
None,
|
||||
)
|
||||
|
||||
return raw_ocr, reviewed_ocr
|
||||
|
||||
|
||||
def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
|
||||
if not layout_json:
|
||||
return []
|
||||
|
||||
lines: list[str] = []
|
||||
for page in layout_json.get("pages", []):
|
||||
for line in page.get("lines", []):
|
||||
lines.append((line.get("text") or "").strip())
|
||||
return lines
|
||||
|
||||
|
||||
def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
    """Build the initial contents of the review editor.

    Prefers the current raw OCR so a rerun immediately refreshes the editable
    line set (reviewed text stays visible elsewhere as history). Line texts
    come from the layout when present, otherwise the plain text content.
    """
    source = raw_ocr if raw_ocr is not None else reviewed_ocr
    if source is None:
        return ""
    if source.layout_json:
        return "\n".join(_extract_line_texts_from_layout(source.layout_json))
    return source.text_content or ""
|
||||
|
||||
|
||||
def _line_count_from_layout(layout_json: dict | None) -> int:
    """Number of OCR lines in the layout; 0 when there is no layout."""
    texts = _extract_line_texts_from_layout(layout_json)
    return len(texts)
|
||||
|
||||
|
||||
def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None:
|
||||
if not base_layout:
|
||||
return None
|
||||
|
||||
reviewed_lines = reviewed_text.splitlines()
|
||||
new_layout = deepcopy(base_layout)
|
||||
|
||||
idx = 0
|
||||
for page in new_layout.get("pages", []):
|
||||
for line in page.get("lines", []):
|
||||
line["text"] = reviewed_lines[idx] if idx < len(reviewed_lines) else ""
|
||||
idx += 1
|
||||
|
||||
return new_layout
|
||||
|
||||
|
||||
@router.get("/", response_class=HTMLResponse)
|
||||
def list_documents(request: Request, db: Session = Depends(get_db)):
|
||||
documents = db.query(Document).order_by(Document.created_at.desc()).all()
|
||||
|
|
@ -85,12 +152,7 @@ def test_ingest(db: Session = Depends(get_db)):
|
|||
document_id=document.id,
|
||||
version_number=1,
|
||||
version_type="raw_ocr",
|
||||
text_content=(
|
||||
"CVS PHARMACY\n"
|
||||
"Date: 2026-04-01\n"
|
||||
"Total: 12.34 USD\n"
|
||||
"Household supplies\n"
|
||||
),
|
||||
text_content="CVS PHARMACY\nDate: 2026-04-01\nTotal: 12.34 USD\nHousehold supplies\n",
|
||||
created_by="system",
|
||||
is_current=True,
|
||||
ocr_engine="test_seed",
|
||||
|
|
@ -116,7 +178,35 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
|
|||
try:
|
||||
rerun_ocr_for_document(db, document)
|
||||
except Exception:
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
|
||||
|
||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||
|
||||
|
||||
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
    """Generate the C1 OCR-corrected searchable PDF for a document.

    Redirects back to the document detail page; on failure redirects with
    ``?error=save_ocr_corrected_failed`` rather than surfacing a 500.
    Unknown document ids redirect to the listing.
    """
    # Eager-load text versions: the generator reads raw_ocr and reviewed versions.
    document = db.query(Document).options(selectinload(Document.text_versions)).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        create_ocr_corrected_pdf_version(db, document)
    except Exception:
        # Best-effort UI flow: report the failure via query param.
        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303)

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||
|
||||
|
||||
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
    """Create the (scaffold) field-enriched PDF version for a document.

    Redirects back to the document detail page; on failure redirects with
    ``?error=save_field_enriched_failed`` rather than surfacing a 500.
    Unknown document ids redirect to the listing.
    """
    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        create_field_enriched_pdf_version(db, document)
    except Exception:
        # Best-effort UI flow: report the failure via query param.
        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_field_enriched_failed", status_code=303)

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||
|
||||
|
|
@ -139,16 +229,15 @@ def save_reviewed_text(
|
|||
if document is None:
|
||||
return RedirectResponse(url="/documents/", status_code=303)
|
||||
|
||||
sorted_text_versions = sorted(
|
||||
document.text_versions,
|
||||
key=lambda x: (x.version_number, x.created_at),
|
||||
reverse=True,
|
||||
)
|
||||
raw_ocr, _ = _get_current_text_versions(document)
|
||||
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
|
||||
actual_line_count = len(reviewed_text.splitlines())
|
||||
|
||||
current_raw = next(
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
||||
None,
|
||||
)
|
||||
if expected_line_count and actual_line_count != expected_line_count:
|
||||
return RedirectResponse(
|
||||
url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}",
|
||||
status_code=303,
|
||||
)
|
||||
|
||||
existing_reviewed = [
|
||||
tv for tv in document.text_versions if tv.version_type == "reviewed" and tv.is_current
|
||||
|
|
@ -156,6 +245,11 @@ def save_reviewed_text(
|
|||
for tv in existing_reviewed:
|
||||
tv.is_current = False
|
||||
|
||||
reviewed_layout = _apply_reviewed_lines_to_layout(
|
||||
raw_ocr.layout_json if raw_ocr else None,
|
||||
reviewed_text,
|
||||
)
|
||||
|
||||
reviewed_version = TextVersion(
|
||||
document_id=document.id,
|
||||
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
|
||||
|
|
@ -163,14 +257,15 @@ def save_reviewed_text(
|
|||
text_content=reviewed_text,
|
||||
created_by="mcelwain",
|
||||
is_current=True,
|
||||
derived_from_version_id=current_raw.id if current_raw else None,
|
||||
derived_from_version_id=raw_ocr.id if raw_ocr else None,
|
||||
layout_json=reviewed_layout,
|
||||
)
|
||||
db.add(reviewed_version)
|
||||
|
||||
if current_raw:
|
||||
current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
|
||||
current_raw.quality_flags = quality_flags or []
|
||||
current_raw.quality_note = quality_note or None
|
||||
if raw_ocr:
|
||||
raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
|
||||
raw_ocr.quality_flags = quality_flags or []
|
||||
raw_ocr.quality_note = quality_note or None
|
||||
|
||||
document.review_status = "reviewed"
|
||||
|
||||
|
|
@ -196,27 +291,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
|||
if document is None:
|
||||
return HTMLResponse(content="Document not found", status_code=404)
|
||||
|
||||
sorted_text_versions = sorted(
|
||||
document.text_versions,
|
||||
key=lambda x: (x.version_number, x.created_at),
|
||||
reverse=True,
|
||||
)
|
||||
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
||||
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)
|
||||
|
||||
raw_ocr = next(
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
||||
None,
|
||||
)
|
||||
|
||||
reviewed_ocr = next(
|
||||
(tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
|
||||
None,
|
||||
)
|
||||
|
||||
review_text_value = (
|
||||
reviewed_ocr.text_content
|
||||
if reviewed_ocr is not None
|
||||
else raw_ocr.text_content if raw_ocr is not None else ""
|
||||
base_layout = (
|
||||
reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
|
||||
else raw_ocr.layout_json if raw_ocr else None
|
||||
)
|
||||
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
|
||||
actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0
|
||||
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
|
||||
|
||||
file_url = None
|
||||
if document.current_path:
|
||||
|
|
@ -228,6 +312,11 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
|||
except Exception:
|
||||
file_url = None
|
||||
|
||||
app_url = str(request.url_for("document_detail", document_id=document.document_id))
|
||||
error = request.query_params.get("error")
|
||||
error_expected = request.query_params.get("expected")
|
||||
error_actual = request.query_params.get("actual")
|
||||
|
||||
return templates.TemplateResponse(
|
||||
request=request,
|
||||
name="documents/detail.html",
|
||||
|
|
@ -238,8 +327,15 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
|||
"reviewed_ocr": reviewed_ocr,
|
||||
"review_text_value": review_text_value,
|
||||
"file_url": file_url,
|
||||
"app_url": app_url,
|
||||
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
||||
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
||||
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
|
||||
"line_numbers": line_numbers,
|
||||
"expected_line_count": expected_line_count,
|
||||
"actual_line_count": actual_line_count,
|
||||
"error": error,
|
||||
"error_expected": error_expected,
|
||||
"error_actual": error_actual,
|
||||
},
|
||||
)
|
||||
|
|
|
|||
|
|
@ -3,17 +3,67 @@
|
|||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>{{ document.document_id }}</title>
|
||||
<style>
|
||||
body { font-family: sans-serif; }
|
||||
textarea { font-family: monospace; }
|
||||
.editor-wrap {
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
.line-numbers {
|
||||
font-family: monospace;
|
||||
white-space: pre;
|
||||
text-align: right;
|
||||
color: #666;
|
||||
user-select: none;
|
||||
padding-top: 2px;
|
||||
min-width: 3rem;
|
||||
}
|
||||
.line-warning {
|
||||
color: #8a5a00;
|
||||
font-weight: 600;
|
||||
}
|
||||
.error-box {
|
||||
background: #ffe8e8;
|
||||
color: #8b0000;
|
||||
padding: 0.75rem;
|
||||
border: 1px solid #cc9999;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="/documents/">Back to documents</a></p>
|
||||
|
||||
<h1>{{ document.document_id }}</h1>
|
||||
|
||||
{% if error == "line_count_mismatch" %}
|
||||
<div class="error-box">
|
||||
Could not save reviewed OCR because line count did not match OCR layout.
|
||||
Expected {{ error_expected }}, got {{ error_actual }}.
|
||||
</div>
|
||||
{% elif error == "save_ocr_corrected_failed" %}
|
||||
<div class="error-box">
|
||||
Could not save OCR-corrected PDF. Check that reviewed OCR line count matches raw OCR line count.
|
||||
</div>
|
||||
{% elif error == "rerun_ocr_failed" %}
|
||||
<div class="error-box">
|
||||
OCR rerun failed.
|
||||
</div>
|
||||
{% elif error == "save_field_enriched_failed" %}
|
||||
<div class="error-box">
|
||||
Could not save field-enriched PDF.
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<h2>Document metadata</h2>
|
||||
<ul>
|
||||
<li>Type: {{ document.document_type }}</li>
|
||||
<li>Source path: {{ document.source_path }}</li>
|
||||
<li>Current path: {{ document.current_path }}</li>
|
||||
<li>Share path: {{ document.share_path or "" }}</li>
|
||||
<li>App URL: <a href="{{ app_url }}">{{ app_url }}</a></li>
|
||||
<li>Original filename: {{ document.original_filename }}</li>
|
||||
<li>Canonical filename: {{ document.canonical_filename }}</li>
|
||||
<li>MIME type: {{ document.mime_type }}</li>
|
||||
|
|
@ -25,6 +75,14 @@
|
|||
<li>Updated at: {{ document.updated_at }}</li>
|
||||
</ul>
|
||||
|
||||
<h2>Saved PDF scaffolds</h2>
|
||||
<form method="post" action="/documents/{{ document.document_id }}/save-ocr-corrected-pdf" style="display:inline;">
|
||||
<button type="submit">Save OCR-corrected PDF</button>
|
||||
</form>
|
||||
<form method="post" action="/documents/{{ document.document_id }}/save-field-enriched-pdf" style="display:inline; margin-left: 1rem;">
|
||||
<button type="submit">Save field-enriched PDF</button>
|
||||
</form>
|
||||
|
||||
<h2>Document preview</h2>
|
||||
{% if file_url %}
|
||||
{% if document.mime_type == "application/pdf" %}
|
||||
|
|
@ -47,6 +105,7 @@
|
|||
{{ version.version_type }} —
|
||||
{{ version.file_path }} —
|
||||
{{ version.created_at }}
|
||||
{% if version.notes %}<br><em>{{ version.notes }}</em>{% endif %}
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
|
@ -84,12 +143,23 @@
|
|||
<p>No reviewed OCR saved yet.</p>
|
||||
{% endif %}
|
||||
|
||||
<p>
|
||||
Expected OCR lines: <span id="expected-lines">{{ expected_line_count }}</span><br>
|
||||
Current editor lines: <span id="actual-lines">{{ actual_line_count }}</span>
|
||||
<br><span id="line-warning" class="line-warning" {% if expected_line_count == actual_line_count %}style="display:none;"{% endif %}>
|
||||
Line count mismatch may affect corrected PDF layout.
|
||||
</span>
|
||||
</p>
|
||||
|
||||
<form method="post" action="/documents/{{ document.document_id }}/review-text">
|
||||
<div>
|
||||
<label for="reviewed_text">Edit reviewed OCR text:</label>
|
||||
<label for="reviewed_text">Edit reviewed OCR text (one line per OCR line):</label>
|
||||
</div>
|
||||
<div>
|
||||
<textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
|
||||
|
||||
<div class="editor-wrap">
|
||||
<div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
|
||||
{% endfor %}</div>
|
||||
<textarea id="reviewed_text" name="reviewed_text" rows="{{ [actual_line_count + 2, 20]|max }}" cols="100">{{ review_text_value }}</textarea>
|
||||
</div>
|
||||
|
||||
<h3>Quality flags</h3>
|
||||
|
|
@ -113,8 +183,43 @@
|
|||
</div>
|
||||
|
||||
<div style="margin-top: 1rem;">
|
||||
<button type="submit">Save reviewed OCR</button>
|
||||
<button type="submit" id="save-reviewed-btn">Save reviewed OCR</button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<script>
|
||||
const textarea = document.getElementById("reviewed_text");
|
||||
const expectedLines = parseInt(document.getElementById("expected-lines").textContent || "0", 10);
|
||||
const actualLinesEl = document.getElementById("actual-lines");
|
||||
const warningEl = document.getElementById("line-warning");
|
||||
const saveBtn = document.getElementById("save-reviewed-btn");
|
||||
const lineNumbersEl = document.getElementById("line-numbers");
|
||||
|
||||
function countLines(text) {
|
||||
if (text.length === 0) return 0;
|
||||
return text.split('\n').length;
|
||||
}
|
||||
|
||||
function rebuildLineNumbers(lineCount) {
|
||||
let nums = "";
|
||||
for (let i = 1; i <= lineCount; i++) {
|
||||
nums += i + "\n";
|
||||
}
|
||||
lineNumbersEl.textContent = nums;
|
||||
}
|
||||
|
||||
function updateEditorState() {
|
||||
const actual = countLines(textarea.value);
|
||||
actualLinesEl.textContent = actual.toString();
|
||||
rebuildLineNumbers(Math.max(actual, expectedLines));
|
||||
|
||||
const mismatch = expectedLines > 0 && actual !== expectedLines;
|
||||
warningEl.style.display = mismatch ? "inline" : "none";
|
||||
saveBtn.disabled = mismatch;
|
||||
}
|
||||
|
||||
textarea.addEventListener("input", updateEditorState);
|
||||
updateEditorState();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
Loading…
Reference in New Issue