feat: Phase 3.5 add line-preserving OCR review and corrected searchable PDF generation
This commit is contained in:
parent
0d70e6b7bb
commit
e67a67f80a
|
|
@ -8,3 +8,5 @@ DOCUMENT_STORAGE_ROOT = os.getenv("DOCUMENT_STORAGE_ROOT", "/mnt/storage/documen
|
||||||
DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
|
DOCUMENT_ARCHIVE_ROOT = os.getenv("DOCUMENT_ARCHIVE_ROOT", "/mnt/storage/document-processor/archive/current")
|
||||||
INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
|
INBOX_ROOT = os.getenv("INBOX_ROOT", "/mnt/storage/document-processor/incoming/inbox")
|
||||||
UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
|
UPLOAD_ROOT = os.getenv("UPLOAD_ROOT", "/mnt/storage/document-processor/incoming/uploads")
|
||||||
|
OCR_CORRECTED_ROOT = os.getenv("OCR_CORRECTED_ROOT", "/mnt/storage/document-processor/outputs/ocr_corrected")
|
||||||
|
FIELD_ENRICHED_ROOT = os.getenv("FIELD_ENRICHED_ROOT", "/mnt/storage/document-processor/outputs/field_enriched")
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,243 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from reportlab.lib.utils import ImageReader
|
||||||
|
from reportlab.pdfbase.pdfmetrics import stringWidth
|
||||||
|
from reportlab.pdfgen import canvas
|
||||||
|
from sqlalchemy import func
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.core.config import FIELD_ENRICHED_ROOT, OCR_CORRECTED_ROOT
|
||||||
|
from app.models.document import Document
|
||||||
|
from app.models.document_version import DocumentVersion
|
||||||
|
from app.models.text_version import TextVersion
|
||||||
|
|
||||||
|
|
||||||
|
def sha256_for_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is consumed in 1 MiB pieces so it is never loaded into memory whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(1024 * 1024)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_next_document_version_number(db: Session, document_id: int) -> int:
    """Return the version number to give the next DocumentVersion of a document.

    Looks up the highest ``version_number`` stored for ``document_id`` and adds one;
    when the query yields nothing (or zero) the result is 1.
    """
    highest_query = db.query(func.max(DocumentVersion.version_number)).filter(
        DocumentVersion.document_id == document_id
    )
    highest = highest_query.scalar()
    if not highest:
        return 1
    return highest + 1
|
||||||
|
|
||||||
|
|
||||||
|
def _build_output_path(root: str, document: Document, version_type: str) -> Path:
    """Return ``<root>/<document_id>_<version_type><ext>`` for a generated output file.

    ``<ext>`` is the lower-cased suffix of the document's current path, or ``.pdf``
    when the document has no current path or the path has no suffix.
    """
    extension = Path(document.current_path or "").suffix.lower() or ".pdf"
    return Path(root) / f"{document.document_id}_{version_type}{extension}"
|
||||||
|
|
||||||
|
|
||||||
|
def _latest_current_text_version(document: Document, version_type: str) -> TextVersion | None:
    """Return the newest current text version of *version_type*, or None if there is none.

    "Newest" means the greatest ``(version_number, created_at)`` pair among the
    document's text versions that are flagged ``is_current``.
    """
    matching = [
        tv
        for tv in document.text_versions
        if tv.version_type == version_type and tv.is_current
    ]
    return max(matching, key=lambda tv: (tv.version_number, tv.created_at), default=None)
|
||||||
|
|
||||||
|
|
||||||
|
def _render_pdf_page_images(pdf_path: Path, tmpdir: Path) -> list[Path]:
    """Rasterise *pdf_path* to PNG files in *tmpdir* with ``pdftoppm``.

    Returns the ``page-*.png`` files found in *tmpdir* afterwards, sorted by path.

    Raises:
        subprocess.CalledProcessError: ``pdftoppm`` exited with a non-zero status.
    """
    command = ["pdftoppm", "-png", str(pdf_path), str(tmpdir / "page")]
    subprocess.run(command, capture_output=True, text=True, check=True)

    page_images = list(tmpdir.glob("page-*.png"))
    page_images.sort()
    return page_images
|
||||||
|
|
||||||
|
|
||||||
|
def _fit_font_size(text: str, box_width: float, box_height: float) -> float:
    """Choose a Helvetica font size for *text* inside a box of the given size.

    Empty text gets 80% of the box height (at least 6.0). Otherwise the size starts
    at 88% of the box height (at least 6.0) and shrinks in 0.25 steps while the
    rendered width exceeds 98% of the box width, stopping at 4.5. The result is
    never smaller than 68% of the box height or 6.0, whichever is larger.
    """
    if not text:
        return max(6.0, box_height * 0.80)

    width_limit = box_width * 0.98
    size = max(6.0, box_height * 0.88)
    while size > 4.5 and stringWidth(text, "Helvetica", size) > width_limit:
        size -= 0.25

    floor = max(6.0, box_height * 0.68)
    return max(floor, size)
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_layout_lines(layout_json: dict | None) -> list[dict]:
    """Return every line of a layout as one flat list, in page order.

    Each entry has the owning page's ``page`` value, the line's ``bbox`` and its
    ``text`` (``""`` when the line has no text key). An empty or missing layout
    yields an empty list.
    """
    if not layout_json:
        return []

    return [
        {"page": page["page"], "bbox": line["bbox"], "text": line.get("text", "")}
        for page in layout_json.get("pages", [])
        for line in page.get("lines", [])
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def create_ocr_corrected_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Build a searchable PDF whose hidden text layer carries the reviewed OCR text.

    Every page of the current PDF is rasterised and drawn as a full-page image; each
    non-empty layout line is then written on top of it as invisible text (text render
    mode 3) at the position of its OCR bounding box. The result is stored under
    OCR_CORRECTED_ROOT, recorded as a new ``ocr_corrected`` DocumentVersion, and
    becomes the document's current file.

    Args:
        db: Session used to create the version row and update the document.
        document: Document whose current file and text versions are used.

    Returns:
        The newly committed DocumentVersion.

    Raises:
        ValueError: the document has no current path, lacks a current raw OCR or
            reviewed text version, is not a PDF, has no raw OCR line boxes, has a
            reviewed layout whose line count differs from the raw one, has no
            layout at all, or produced no page images.
        FileNotFoundError: the current file does not exist on disk.
        subprocess.CalledProcessError: page rasterisation with ``pdftoppm`` failed.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")

    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("C1 corrected PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)

    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")

    # The reviewed text reuses the raw OCR boxes line by line, so the counts must agree.
    if reviewed_lines and len(reviewed_lines) != len(raw_lines):
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    source_layout = reviewed.layout_json if reviewed.layout_json else raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    out_path = _build_output_path(OCR_CORRECTED_ROOT, document, "ocr_corrected")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    reader = PdfReader(str(current_file))

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmpdir = Path(tmpdirname)
        images = _render_pdf_page_images(current_file, tmpdir)

        overlay_pdf_path = tmpdir / "overlay.pdf"
        c = None  # created lazily so the first page can set the initial page size

        page_layouts = {page["page"]: page for page in source_layout.get("pages", [])}

        for page_num, img_path in enumerate(images, start=1):
            pdf_page = reader.pages[page_num - 1]
            page_w = float(pdf_page.mediabox.width)
            page_h = float(pdf_page.mediabox.height)

            # Only the pixel size is needed (as a fallback scale reference), so read it
            # and close the image at once instead of leaking one open handle per page.
            with Image.open(img_path) as img:
                img_w, img_h = img.size

            if c is None:
                c = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                c.setPageSize((page_w, page_h))

            c.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = page_layouts.get(page_num, {"lines": []})
            src_w = float(page_layout.get("image_width") or img_w)
            src_h = float(page_layout.get("image_height") or img_h)

            # Map layout (image pixel) coordinates onto PDF page units.
            scale_x = page_w / src_w
            scale_y = page_h / src_h

            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue

                left, top, right, bottom = line["bbox"]

                pdf_x = left * scale_x
                # bbox values are measured from the top edge; the canvas y axis is
                # measured from the bottom, so flip around the page height.
                pdf_y = page_h - (bottom * scale_y)
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)

                font_size = _fit_font_size(text_line, box_width, box_height)

                text_obj = c.beginText()
                text_obj.setTextRenderMode(3)  # invisible: selectable/searchable only
                text_obj.setFont("Helvetica", font_size)
                text_obj.setTextOrigin(pdf_x, pdf_y + 1)
                text_obj.textLine(text_line)
                c.drawText(text_obj)

            c.showPage()

        if c is None:
            raise ValueError("Failed to build overlay PDF")

        c.save()
        shutil.copy2(overlay_pdf_path, out_path)

    file_hash = sha256_for_file(out_path)

    version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="ocr_corrected",
        file_path=str(out_path),
        sha256=file_hash,
        created_by="save_ocr_corrected_pdf",
        notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
    )
    db.add(version)

    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash

    db.commit()
    db.refresh(version)
    return version
|
||||||
|
|
||||||
|
|
||||||
|
def create_field_enriched_pdf_version(db: Session, document: Document) -> DocumentVersion:
    """Record a ``field_enriched`` version of the document's current file.

    Scaffold implementation: the current file is copied unchanged to
    FIELD_ENRICHED_ROOT, registered as a new DocumentVersion, and made the
    document's current file. Extracted fields are not embedded yet.

    Args:
        db: Session used to create the version row and update the document.
        document: Document whose current file is copied.

    Returns:
        The newly committed DocumentVersion.

    Raises:
        ValueError: the document has no current path.
        FileNotFoundError: the current file does not exist on disk.
    """
    if not document.current_path:
        raise ValueError("Document has no current_path")

    current_file = Path(document.current_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    out_path = _build_output_path(FIELD_ENRICHED_ROOT, document, "field_enriched")
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # After an earlier run current_path already points at out_path. Copying a file
    # onto itself makes shutil.copy2 raise SameFileError, so skip the copy then.
    if current_file.resolve() != out_path.resolve():
        shutil.copy2(current_file, out_path)
    file_hash = sha256_for_file(out_path)

    version = DocumentVersion(
        document_id=document.id,
        version_number=get_next_document_version_number(db, document.id),
        version_type="field_enriched",
        file_path=str(out_path),
        sha256=file_hash,
        created_by="save_field_enriched_pdf",
        notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
    )
    db.add(version)

    document.current_path = str(out_path)
    document.canonical_filename = out_path.name
    document.sha256_current = file_hash

    db.commit()
    db.refresh(version)
    return version
|
||||||
|
|
@ -1,12 +1,16 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import io
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
from sqlalchemy import func
|
from sqlalchemy import func
|
||||||
|
|
@ -61,8 +65,7 @@ def get_tesseract_version() -> str | None:
|
||||||
text=True,
|
text=True,
|
||||||
check=True,
|
check=True,
|
||||||
)
|
)
|
||||||
line = result.stdout.splitlines()[0].strip()
|
return result.stdout.splitlines()[0].strip()
|
||||||
return line
|
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -93,67 +96,154 @@ def extract_pdf_text(path: Path) -> str:
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def ocr_image(path: Path) -> str:
|
def _parse_tsv_lines(tsv_text: str, page_number: int, image_width: int, image_height: int) -> dict:
    """Group Tesseract TSV word rows into text lines with bounding boxes.

    Only word-level rows (``level == 5``) whose ``page_num`` equals *page_number*
    and whose text is non-blank are used. Words sharing the same
    ``(page_num, block_num, par_num, line_num)`` form one line; within a line the
    words are ordered by their left edge and joined with single spaces.

    Args:
        tsv_text: TSV output of Tesseract, including its header row.
        page_number: Page whose rows are kept; also stored as ``page`` in the result.
        image_width: Stored unchanged as ``image_width`` in the result.
        image_height: Stored unchanged as ``image_height`` in the result.

    Returns:
        ``{"page", "image_width", "image_height", "lines"}`` where each line is
        ``{"text", "bbox": [left, top, right, bottom], "confidence"}``.
        ``confidence`` is the mean of the available word confidences rounded to two
        decimals, or None when no word has one. Lines are sorted by (top, left).
    """
    # Tesseract's TSV is tab-delimited and unquoted. QUOTE_NONE keeps a literal '"'
    # inside recognised text from being read as a CSV quote and swallowing rows.
    reader = csv.DictReader(io.StringIO(tsv_text), delimiter="\t", quoting=csv.QUOTE_NONE)
    grouped: dict[tuple[int, int, int, int], list[dict]] = {}

    for row in reader:
        raw_text = row.get("text")
        if not raw_text:
            continue
        text = raw_text.strip()
        if not text:
            continue

        try:
            level = int(row["level"])
            page_num = int(row["page_num"])
            block_num = int(row["block_num"])
            par_num = int(row["par_num"])
            line_num = int(row["line_num"])
            left = int(row["left"])
            top = int(row["top"])
            width = int(row["width"])
            height = int(row["height"])
            # "-1" is treated as "no confidence available" for the row.
            conf = float(row["conf"]) if row["conf"] not in ("-1", "", None) else None
        except (KeyError, TypeError, ValueError):
            # Malformed or truncated row: skip it rather than fail the whole page.
            continue

        if level != 5:
            continue
        if page_num != page_number:
            continue

        key = (page_num, block_num, par_num, line_num)
        grouped.setdefault(key, []).append(
            {
                "text": text,
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "conf": conf,
            }
        )

    lines = []
    for words in grouped.values():
        words = sorted(words, key=lambda w: w["left"])
        left = min(w["left"] for w in words)
        top = min(w["top"] for w in words)
        right = max(w["left"] + w["width"] for w in words)
        bottom = max(w["top"] + w["height"] for w in words)
        line_text = " ".join(w["text"] for w in words).strip()

        avg_conf = None
        valid_conf = [w["conf"] for w in words if w["conf"] is not None]
        if valid_conf:
            avg_conf = round(sum(valid_conf) / len(valid_conf), 2)

        lines.append(
            {
                "text": line_text,
                "bbox": [left, top, right, bottom],
                "confidence": avg_conf,
            }
        )

    lines.sort(key=lambda x: (x["bbox"][1], x["bbox"][0]))
    return {
        "page": page_number,
        "image_width": image_width,
        "image_height": image_height,
        "lines": lines,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def ocr_image_with_layout(path: Path) -> tuple[str, dict]:
    """Run Tesseract on one image and return its text plus a one-page layout.

    Tesseract is invoked twice: once for plain text (returned stripped) and once
    for TSV output, which is parsed into line boxes for page 1. The image's pixel
    size is recorded in the layout.

    Raises:
        subprocess.CalledProcessError: a Tesseract invocation exited non-zero.
    """
    with Image.open(path) as picture:
        image_width, image_height = picture.size

    base_command = ["tesseract", str(path), "stdout"]

    text_run = subprocess.run(base_command, capture_output=True, text=True, check=True)
    plain_text = text_run.stdout.strip()

    tsv_run = subprocess.run(base_command + ["tsv"], capture_output=True, text=True, check=True)
    page_layout = _parse_tsv_lines(tsv_run.stdout, 1, image_width, image_height)

    return plain_text, {"pages": [page_layout]}
|
||||||
|
|
||||||
|
|
||||||
def ocr_pdf(path: Path) -> str:
|
def ocr_pdf_with_layout(path: Path) -> tuple[str, dict]:
    """OCR every page of a PDF and return the combined text and per-page layout.

    The PDF is rasterised to PNG files with ``pdftoppm`` in a temporary directory;
    each page image is then OCR'd. Non-empty page texts are joined with blank
    lines, and each page layout is renumbered with its 1-based position in the
    sorted list of rendered images.

    Raises:
        subprocess.CalledProcessError: ``pdftoppm`` (or Tesseract, via the per-page
            OCR call) exited non-zero.
    """
    page_texts = []
    page_layouts = []

    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        render_command = ["pdftoppm", "-png", str(path), str(workdir / "page")]
        subprocess.run(render_command, capture_output=True, text=True, check=True)

        rendered = sorted(workdir.glob("page-*.png"))
        for page_index, image_path in enumerate(rendered, start=1):
            txt, layout = ocr_image_with_layout(image_path)
            if txt:
                page_texts.append(txt)
            if layout.get("pages"):
                first_page = layout["pages"][0]
                first_page["page"] = page_index
                page_layouts.append(first_page)

    return "\n\n".join(page_texts).strip(), {"pages": page_layouts}
|
||||||
|
|
||||||
|
|
||||||
def run_ocr_only(path: Path) -> tuple[str, dict | None, str | None, str | None]:
    """OCR a PDF or image file, bypassing any embedded-text extraction.

    Returns ``(text, layout, engine, engine_version)``. For ``.pdf``, ``.jpg``,
    ``.jpeg`` and ``.png`` files the engine is ``"tesseract"``; any other suffix
    yields ``("", None, None, None)``.
    """
    extension = path.suffix.lower()
    engine_version = get_tesseract_version()

    if extension == ".pdf":
        ocr = ocr_pdf_with_layout
    elif extension in {".jpg", ".jpeg", ".png"}:
        ocr = ocr_image_with_layout
    else:
        return "", None, None, None

    text, layout = ocr(path)
    return text.strip(), layout, "tesseract", engine_version
|
||||||
|
|
||||||
|
|
||||||
def get_raw_text_for_document(path: Path) -> tuple[str, dict | None, str | None, str | None, str | None]:
    """Obtain the initial raw text for a newly ingested file.

    Returns ``(text, layout, engine, engine_version, rerun_source)``:

    * PDF with at least 40 characters of embedded text (after stripping): that
      text, no layout, engine ``"pdftotext"``, source ``"initial_ingest"``.
    * Any other PDF: the OCR result, source ``"initial_ingest_fallback"``.
    * ``.jpg`` / ``.jpeg`` / ``.png``: the OCR result, source ``"initial_ingest"``.
    * Anything else: ``("", None, None, None, None)``.
    """
    extension = path.suffix.lower()
    is_pdf = extension == ".pdf"

    if not is_pdf and extension not in {".jpg", ".jpeg", ".png"}:
        return "", None, None, None, None

    if is_pdf:
        embedded = extract_pdf_text(path)
        if len(embedded.strip()) >= 40:
            return embedded, None, "pdftotext", get_pdftotext_version(), "initial_ingest"
        rerun_source = "initial_ingest_fallback"
    else:
        rerun_source = "initial_ingest"

    text, layout, engine, engine_version = run_ocr_only(path)
    return text, layout, engine, engine_version, rerun_source
|
||||||
|
|
||||||
|
|
||||||
def compute_quality_score(source_text: str, reviewed_text: str) -> float:
|
def compute_quality_score(source_text: str, reviewed_text: str) -> float:
|
||||||
|
|
@ -173,7 +263,6 @@ def archive_document(
|
||||||
) -> Document:
|
) -> Document:
|
||||||
if not source.exists():
|
if not source.exists():
|
||||||
raise FileNotFoundError(f"Source file not found: {source}")
|
raise FileNotFoundError(f"Source file not found: {source}")
|
||||||
|
|
||||||
if not is_supported_file(source):
|
if not is_supported_file(source):
|
||||||
raise ValueError(f"Unsupported file type: {source.suffix}")
|
raise ValueError(f"Unsupported file type: {source.suffix}")
|
||||||
|
|
||||||
|
|
@ -187,7 +276,7 @@ def archive_document(
|
||||||
mime_type = guess_mime_type(current_path)
|
mime_type = guess_mime_type(current_path)
|
||||||
sha256_current = sha256_for_file(current_path)
|
sha256_current = sha256_for_file(current_path)
|
||||||
|
|
||||||
raw_text, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
|
raw_text, layout_json, ocr_engine, ocr_engine_version, rerun_source = get_raw_text_for_document(current_path)
|
||||||
|
|
||||||
document = Document(
|
document = Document(
|
||||||
document_id=document_id,
|
document_id=document_id,
|
||||||
|
|
@ -230,6 +319,7 @@ def archive_document(
|
||||||
rerun_source=rerun_source,
|
rerun_source=rerun_source,
|
||||||
quality_flags=[],
|
quality_flags=[],
|
||||||
quality_note=None,
|
quality_note=None,
|
||||||
|
layout_json=layout_json,
|
||||||
)
|
)
|
||||||
db.add(text_version)
|
db.add(text_version)
|
||||||
|
|
||||||
|
|
@ -246,7 +336,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
|
||||||
if not current_file.exists():
|
if not current_file.exists():
|
||||||
raise FileNotFoundError(f"Current file not found: {current_file}")
|
raise FileNotFoundError(f"Current file not found: {current_file}")
|
||||||
|
|
||||||
raw_text, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
|
raw_text, layout_json, ocr_engine, ocr_engine_version = run_ocr_only(current_file)
|
||||||
if not raw_text:
|
if not raw_text:
|
||||||
raise ValueError("OCR produced no text")
|
raise ValueError("OCR produced no text")
|
||||||
|
|
||||||
|
|
@ -278,6 +368,7 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
|
||||||
quality_flags=[],
|
quality_flags=[],
|
||||||
quality_note=None,
|
quality_note=None,
|
||||||
derived_from_version_id=previous_raw_id,
|
derived_from_version_id=previous_raw_id,
|
||||||
|
layout_json=layout_json,
|
||||||
)
|
)
|
||||||
db.add(new_text)
|
db.add(new_text)
|
||||||
|
|
||||||
|
|
@ -288,19 +379,9 @@ def rerun_ocr_for_document(db: Session, document: Document) -> TextVersion:
|
||||||
return new_text
|
return new_text
|
||||||
|
|
||||||
|
|
||||||
def ingest_file(db: Session, file_path: str, source_system: str, document_type: str = "receipt") -> Document:
    """Archive the file at *file_path* and return the resulting Document.

    The path is user-expanded and resolved before it is handed to
    ``archive_document`` together with the remaining arguments.
    """
    resolved_source = Path(file_path).expanduser().resolve()
    document = archive_document(
        db=db,
        source=resolved_source,
        source_system=source_system,
        document_type=document_type,
    )
    return document
|
|
||||||
|
|
||||||
|
|
||||||
def ingest_uploaded_file(
|
def ingest_uploaded_file(
|
||||||
|
|
@ -321,12 +402,7 @@ def ingest_uploaded_file(
|
||||||
staged_path = upload_root / staged_name
|
staged_path = upload_root / staged_name
|
||||||
staged_path.write_bytes(file_bytes)
|
staged_path.write_bytes(file_bytes)
|
||||||
|
|
||||||
return archive_document(
|
return archive_document(db=db, source=staged_path, source_system=source_system, document_type=document_type)
|
||||||
db=db,
|
|
||||||
source=staged_path,
|
|
||||||
source_system=source_system,
|
|
||||||
document_type=document_type,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def ingest_directory(
|
def ingest_directory(
|
||||||
|
|
@ -337,7 +413,6 @@ def ingest_directory(
|
||||||
document_type: str = "receipt",
|
document_type: str = "receipt",
|
||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
source_dir = Path(directory_path).expanduser().resolve()
|
source_dir = Path(directory_path).expanduser().resolve()
|
||||||
|
|
||||||
if not source_dir.exists() or not source_dir.is_dir():
|
if not source_dir.exists() or not source_dir.is_dir():
|
||||||
raise NotADirectoryError(f"Directory not found: {source_dir}")
|
raise NotADirectoryError(f"Directory not found: {source_dir}")
|
||||||
|
|
||||||
|
|
@ -349,12 +424,7 @@ def ingest_directory(
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
ingested.append(
|
ingested.append(
|
||||||
ingest_file(
|
ingest_file(db=db, file_path=str(path), source_system=source_system, document_type=document_type)
|
||||||
db=db,
|
|
||||||
file_path=str(path),
|
|
||||||
source_system=source_system,
|
|
||||||
document_type=document_type,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ class Document(Base):
|
||||||
source_path: Mapped[str] = mapped_column(Text, nullable=False)
|
source_path: Mapped[str] = mapped_column(Text, nullable=False)
|
||||||
original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
original_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
current_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
|
share_path: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
|
|
||||||
original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
original_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||||
canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
canonical_filename: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ class TextVersion(Base):
|
||||||
)
|
)
|
||||||
|
|
||||||
version_number: Mapped[int] = mapped_column(Integer, nullable=False)
|
version_number: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||||
version_type: Mapped[str] = mapped_column(String(50), nullable=False) # raw_ocr, reviewed
|
version_type: Mapped[str] = mapped_column(String(50), nullable=False)
|
||||||
|
|
||||||
text_content: Mapped[str] = mapped_column(Text, nullable=False)
|
text_content: Mapped[str] = mapped_column(Text, nullable=False)
|
||||||
|
|
||||||
|
|
@ -36,6 +36,8 @@ class TextVersion(Base):
|
||||||
nullable=True,
|
nullable=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
layout_json: Mapped[dict | None] = mapped_column(JSON, nullable=True)
|
||||||
|
|
||||||
created_at: Mapped[datetime] = mapped_column(
|
created_at: Mapped[datetime] = mapped_column(
|
||||||
DateTime, default=datetime.utcnow, nullable=False
|
DateTime, default=datetime.utcnow, nullable=False
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
from copy import deepcopy
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
|
|
||||||
|
|
@ -7,6 +8,10 @@ from fastapi.templating import Jinja2Templates
|
||||||
from sqlalchemy.orm import Session, selectinload
|
from sqlalchemy.orm import Session, selectinload
|
||||||
|
|
||||||
from app.db.deps import get_db
|
from app.db.deps import get_db
|
||||||
|
from app.logic.document_outputs import (
|
||||||
|
create_field_enriched_pdf_version,
|
||||||
|
create_ocr_corrected_pdf_version,
|
||||||
|
)
|
||||||
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
from app.logic.ingest import compute_quality_score, rerun_ocr_for_document
|
||||||
from app.models.document import Document
|
from app.models.document import Document
|
||||||
from app.models.document_version import DocumentVersion
|
from app.models.document_version import DocumentVersion
|
||||||
|
|
@ -39,6 +44,68 @@ QUALITY_FLAG_OPTIONS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_current_text_versions(document: Document) -> tuple[TextVersion | None, TextVersion | None]:
    """Return the document's newest current ``(raw_ocr, reviewed)`` text versions.

    Versions are ranked by ``(version_number, created_at)``, highest first; either
    element is None when no current version of that type exists.
    """
    newest_first = sorted(
        document.text_versions,
        key=lambda tv: (tv.version_number, tv.created_at),
        reverse=True,
    )

    def first_current(kind: str) -> TextVersion | None:
        for tv in newest_first:
            if tv.version_type == kind and tv.is_current:
                return tv
        return None

    return first_current("raw_ocr"), first_current("reviewed")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_line_texts_from_layout(layout_json: dict | None) -> list[str]:
    """Return the stripped text of every layout line, in page then line order.

    A line with missing or empty text contributes ``""``; an empty or missing
    layout yields an empty list.
    """
    if not layout_json:
        return []

    return [
        (line.get("text") or "").strip()
        for page in layout_json.get("pages", [])
        for line in page.get("lines", [])
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_review_text_value(raw_ocr: TextVersion | None, reviewed_ocr: TextVersion | None) -> str:
    """Return the text used to pre-fill the review editor.

    The current raw OCR version is preferred over the reviewed one so that a fresh
    OCR rerun immediately refreshes the editable line set. When the chosen version
    has layout data, its line texts are joined one per line; otherwise its plain
    text content is used. Returns ``""`` when nothing is available.
    """
    source = raw_ocr or reviewed_ocr
    if not source:
        return ""

    if source.layout_json:
        line_texts = _extract_line_texts_from_layout(source.layout_json)
        return "\n".join(line_texts)

    return source.text_content or ""
|
||||||
|
|
||||||
|
|
||||||
|
def _line_count_from_layout(layout_json: dict | None) -> int:
    """Return how many lines the layout contains across all of its pages."""
    line_texts = _extract_line_texts_from_layout(layout_json)
    return len(line_texts)
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_reviewed_lines_to_layout(base_layout: dict | None, reviewed_text: str) -> dict | None:
    """Return a copy of *base_layout* whose line texts come from *reviewed_text*.

    The reviewed text is split into lines and assigned to the layout lines in page
    then line order; layout lines beyond the end of the reviewed text get ``""``.
    The input layout is left untouched. Returns None for an empty or missing layout.
    """
    if not base_layout:
        return None

    replacements = iter(reviewed_text.splitlines())
    updated = deepcopy(base_layout)

    for page in updated.get("pages", []):
        for line in page.get("lines", []):
            line["text"] = next(replacements, "")

    return updated
|
||||||
|
|
||||||
|
|
||||||
@router.get("/", response_class=HTMLResponse)
|
@router.get("/", response_class=HTMLResponse)
|
||||||
def list_documents(request: Request, db: Session = Depends(get_db)):
|
def list_documents(request: Request, db: Session = Depends(get_db)):
|
||||||
documents = db.query(Document).order_by(Document.created_at.desc()).all()
|
documents = db.query(Document).order_by(Document.created_at.desc()).all()
|
||||||
|
|
@ -85,12 +152,7 @@ def test_ingest(db: Session = Depends(get_db)):
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
version_number=1,
|
version_number=1,
|
||||||
version_type="raw_ocr",
|
version_type="raw_ocr",
|
||||||
text_content=(
|
text_content="CVS PHARMACY\nDate: 2026-04-01\nTotal: 12.34 USD\nHousehold supplies\n",
|
||||||
"CVS PHARMACY\n"
|
|
||||||
"Date: 2026-04-01\n"
|
|
||||||
"Total: 12.34 USD\n"
|
|
||||||
"Household supplies\n"
|
|
||||||
),
|
|
||||||
created_by="system",
|
created_by="system",
|
||||||
is_current=True,
|
is_current=True,
|
||||||
ocr_engine="test_seed",
|
ocr_engine="test_seed",
|
||||||
|
|
@ -116,8 +178,36 @@ def rerun_ocr(document_id: str, db: Session = Depends(get_db)):
|
||||||
try:
|
try:
|
||||||
rerun_ocr_for_document(db, document)
|
rerun_ocr_for_document(db, document)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
return RedirectResponse(url=f"/documents/{document.document_id}?error=rerun_ocr_failed", status_code=303)
|
||||||
|
|
||||||
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{document_id}/save-ocr-corrected-pdf", response_class=RedirectResponse)
def save_ocr_corrected_pdf(document_id: str, db: Session = Depends(get_db)):
    """Generate the OCR-corrected PDF for a document, then redirect to its page.

    Redirects to the document list when the document is unknown, and to the
    document page with ``error=save_ocr_corrected_failed`` when generation raises.
    """
    lookup = db.query(Document).options(selectinload(Document.text_versions))
    document = lookup.filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        create_ocr_corrected_pdf_version(db, document)
    except Exception:
        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_ocr_corrected_failed", status_code=303)
    else:
        return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
    """Handle the "Save field-enriched PDF" form post for one document.

    The document is looked up by its ``document_id`` and handed to
    ``create_field_enriched_pdf_version``.

    Args:
        document_id: Identifier taken from the URL path.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A 303 redirect: to ``/documents/`` when no such document exists, to the
        detail page with ``?error=save_field_enriched_failed`` when the creation
        call raises, and to the plain detail page on success.
    """
    import logging  # function-scope import keeps this block self-contained

    document = db.query(Document).filter(Document.document_id == document_id).first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        create_field_enriched_pdf_version(db, document)
    except Exception:
        # Request boundary: the user only sees a generic banner, so record the
        # traceback instead of discarding it.
        logging.getLogger(__name__).exception(
            "Saving field-enriched PDF failed for document %s", document_id
        )
        return RedirectResponse(
            url=f"/documents/{document.document_id}?error=save_field_enriched_failed",
            status_code=303,
        )

    return RedirectResponse(url=f"/documents/{document.document_id}", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -139,15 +229,14 @@ def save_reviewed_text(
|
||||||
if document is None:
|
if document is None:
|
||||||
return RedirectResponse(url="/documents/", status_code=303)
|
return RedirectResponse(url="/documents/", status_code=303)
|
||||||
|
|
||||||
sorted_text_versions = sorted(
|
raw_ocr, _ = _get_current_text_versions(document)
|
||||||
document.text_versions,
|
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
|
||||||
key=lambda x: (x.version_number, x.created_at),
|
actual_line_count = len(reviewed_text.splitlines())
|
||||||
reverse=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
current_raw = next(
|
if expected_line_count and actual_line_count != expected_line_count:
|
||||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
return RedirectResponse(
|
||||||
None,
|
url=f"/documents/{document.document_id}?error=line_count_mismatch&expected={expected_line_count}&actual={actual_line_count}",
|
||||||
|
status_code=303,
|
||||||
)
|
)
|
||||||
|
|
||||||
existing_reviewed = [
|
existing_reviewed = [
|
||||||
|
|
@ -156,6 +245,11 @@ def save_reviewed_text(
|
||||||
for tv in existing_reviewed:
|
for tv in existing_reviewed:
|
||||||
tv.is_current = False
|
tv.is_current = False
|
||||||
|
|
||||||
|
reviewed_layout = _apply_reviewed_lines_to_layout(
|
||||||
|
raw_ocr.layout_json if raw_ocr else None,
|
||||||
|
reviewed_text,
|
||||||
|
)
|
||||||
|
|
||||||
reviewed_version = TextVersion(
|
reviewed_version = TextVersion(
|
||||||
document_id=document.id,
|
document_id=document.id,
|
||||||
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
|
version_number=max(tv.version_number for tv in document.text_versions) + 1 if document.text_versions else 1,
|
||||||
|
|
@ -163,14 +257,15 @@ def save_reviewed_text(
|
||||||
text_content=reviewed_text,
|
text_content=reviewed_text,
|
||||||
created_by="mcelwain",
|
created_by="mcelwain",
|
||||||
is_current=True,
|
is_current=True,
|
||||||
derived_from_version_id=current_raw.id if current_raw else None,
|
derived_from_version_id=raw_ocr.id if raw_ocr else None,
|
||||||
|
layout_json=reviewed_layout,
|
||||||
)
|
)
|
||||||
db.add(reviewed_version)
|
db.add(reviewed_version)
|
||||||
|
|
||||||
if current_raw:
|
if raw_ocr:
|
||||||
current_raw.quality_score = compute_quality_score(current_raw.text_content, reviewed_text)
|
raw_ocr.quality_score = compute_quality_score(raw_ocr.text_content, reviewed_text)
|
||||||
current_raw.quality_flags = quality_flags or []
|
raw_ocr.quality_flags = quality_flags or []
|
||||||
current_raw.quality_note = quality_note or None
|
raw_ocr.quality_note = quality_note or None
|
||||||
|
|
||||||
document.review_status = "reviewed"
|
document.review_status = "reviewed"
|
||||||
|
|
||||||
|
|
@ -196,27 +291,16 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
||||||
if document is None:
|
if document is None:
|
||||||
return HTMLResponse(content="Document not found", status_code=404)
|
return HTMLResponse(content="Document not found", status_code=404)
|
||||||
|
|
||||||
sorted_text_versions = sorted(
|
raw_ocr, reviewed_ocr = _get_current_text_versions(document)
|
||||||
document.text_versions,
|
review_text_value = _build_review_text_value(raw_ocr, reviewed_ocr)
|
||||||
key=lambda x: (x.version_number, x.created_at),
|
|
||||||
reverse=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
raw_ocr = next(
|
base_layout = (
|
||||||
(tv for tv in sorted_text_versions if tv.version_type == "raw_ocr" and tv.is_current),
|
reviewed_ocr.layout_json if reviewed_ocr and reviewed_ocr.layout_json
|
||||||
None,
|
else raw_ocr.layout_json if raw_ocr else None
|
||||||
)
|
|
||||||
|
|
||||||
reviewed_ocr = next(
|
|
||||||
(tv for tv in sorted_text_versions if tv.version_type == "reviewed" and tv.is_current),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
|
|
||||||
review_text_value = (
|
|
||||||
reviewed_ocr.text_content
|
|
||||||
if reviewed_ocr is not None
|
|
||||||
else raw_ocr.text_content if raw_ocr is not None else ""
|
|
||||||
)
|
)
|
||||||
|
expected_line_count = _line_count_from_layout(raw_ocr.layout_json if raw_ocr else None)
|
||||||
|
actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0
|
||||||
|
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
|
||||||
|
|
||||||
file_url = None
|
file_url = None
|
||||||
if document.current_path:
|
if document.current_path:
|
||||||
|
|
@ -228,6 +312,11 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
||||||
except Exception:
|
except Exception:
|
||||||
file_url = None
|
file_url = None
|
||||||
|
|
||||||
|
app_url = str(request.url_for("document_detail", document_id=document.document_id))
|
||||||
|
error = request.query_params.get("error")
|
||||||
|
error_expected = request.query_params.get("expected")
|
||||||
|
error_actual = request.query_params.get("actual")
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
request=request,
|
request=request,
|
||||||
name="documents/detail.html",
|
name="documents/detail.html",
|
||||||
|
|
@ -238,8 +327,15 @@ def document_detail(document_id: str, request: Request, db: Session = Depends(ge
|
||||||
"reviewed_ocr": reviewed_ocr,
|
"reviewed_ocr": reviewed_ocr,
|
||||||
"review_text_value": review_text_value,
|
"review_text_value": review_text_value,
|
||||||
"file_url": file_url,
|
"file_url": file_url,
|
||||||
|
"app_url": app_url,
|
||||||
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
||||||
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
||||||
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
|
"current_quality_note": raw_ocr.quality_note if raw_ocr and raw_ocr.quality_note else "",
|
||||||
|
"line_numbers": line_numbers,
|
||||||
|
"expected_line_count": expected_line_count,
|
||||||
|
"actual_line_count": actual_line_count,
|
||||||
|
"error": error,
|
||||||
|
"error_expected": error_expected,
|
||||||
|
"error_actual": error_actual,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -3,17 +3,67 @@
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>{{ document.document_id }}</title>
|
<title>{{ document.document_id }}</title>
|
||||||
|
<style>
|
||||||
|
body { font-family: sans-serif; }
|
||||||
|
textarea { font-family: monospace; }
|
||||||
|
.editor-wrap {
|
||||||
|
display: flex;
|
||||||
|
align-items: flex-start;
|
||||||
|
gap: 0.5rem;
|
||||||
|
}
|
||||||
|
.line-numbers {
|
||||||
|
font-family: monospace;
|
||||||
|
white-space: pre;
|
||||||
|
text-align: right;
|
||||||
|
color: #666;
|
||||||
|
user-select: none;
|
||||||
|
padding-top: 2px;
|
||||||
|
min-width: 3rem;
|
||||||
|
}
|
||||||
|
.line-warning {
|
||||||
|
color: #8a5a00;
|
||||||
|
font-weight: 600;
|
||||||
|
}
|
||||||
|
.error-box {
|
||||||
|
background: #ffe8e8;
|
||||||
|
color: #8b0000;
|
||||||
|
padding: 0.75rem;
|
||||||
|
border: 1px solid #cc9999;
|
||||||
|
margin-bottom: 1rem;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<p><a href="/documents/">Back to documents</a></p>
|
<p><a href="/documents/">Back to documents</a></p>
|
||||||
|
|
||||||
<h1>{{ document.document_id }}</h1>
|
<h1>{{ document.document_id }}</h1>
|
||||||
|
|
||||||
|
{% if error == "line_count_mismatch" %}
|
||||||
|
<div class="error-box">
|
||||||
|
Could not save reviewed OCR because line count did not match OCR layout.
|
||||||
|
Expected {{ error_expected }}, got {{ error_actual }}.
|
||||||
|
</div>
|
||||||
|
{% elif error == "save_ocr_corrected_failed" %}
|
||||||
|
<div class="error-box">
|
||||||
|
Could not save OCR-corrected PDF. Check that reviewed OCR line count matches raw OCR line count.
|
||||||
|
</div>
|
||||||
|
{% elif error == "rerun_ocr_failed" %}
|
||||||
|
<div class="error-box">
|
||||||
|
OCR rerun failed.
|
||||||
|
</div>
|
||||||
|
{% elif error == "save_field_enriched_failed" %}
|
||||||
|
<div class="error-box">
|
||||||
|
Could not save field-enriched PDF.
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<h2>Document metadata</h2>
|
<h2>Document metadata</h2>
|
||||||
<ul>
|
<ul>
|
||||||
<li>Type: {{ document.document_type }}</li>
|
<li>Type: {{ document.document_type }}</li>
|
||||||
<li>Source path: {{ document.source_path }}</li>
|
<li>Source path: {{ document.source_path }}</li>
|
||||||
<li>Current path: {{ document.current_path }}</li>
|
<li>Current path: {{ document.current_path }}</li>
|
||||||
|
<li>Share path: {{ document.share_path or "" }}</li>
|
||||||
|
<li>App URL: <a href="{{ app_url }}">{{ app_url }}</a></li>
|
||||||
<li>Original filename: {{ document.original_filename }}</li>
|
<li>Original filename: {{ document.original_filename }}</li>
|
||||||
<li>Canonical filename: {{ document.canonical_filename }}</li>
|
<li>Canonical filename: {{ document.canonical_filename }}</li>
|
||||||
<li>MIME type: {{ document.mime_type }}</li>
|
<li>MIME type: {{ document.mime_type }}</li>
|
||||||
|
|
@ -25,6 +75,14 @@
|
||||||
<li>Updated at: {{ document.updated_at }}</li>
|
<li>Updated at: {{ document.updated_at }}</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
<h2>Saved PDF scaffolds</h2>
|
||||||
|
<form method="post" action="/documents/{{ document.document_id }}/save-ocr-corrected-pdf" style="display:inline;">
|
||||||
|
<button type="submit">Save OCR-corrected PDF</button>
|
||||||
|
</form>
|
||||||
|
<form method="post" action="/documents/{{ document.document_id }}/save-field-enriched-pdf" style="display:inline; margin-left: 1rem;">
|
||||||
|
<button type="submit">Save field-enriched PDF</button>
|
||||||
|
</form>
|
||||||
|
|
||||||
<h2>Document preview</h2>
|
<h2>Document preview</h2>
|
||||||
{% if file_url %}
|
{% if file_url %}
|
||||||
{% if document.mime_type == "application/pdf" %}
|
{% if document.mime_type == "application/pdf" %}
|
||||||
|
|
@ -47,6 +105,7 @@
|
||||||
{{ version.version_type }} —
|
{{ version.version_type }} —
|
||||||
{{ version.file_path }} —
|
{{ version.file_path }} —
|
||||||
{{ version.created_at }}
|
{{ version.created_at }}
|
||||||
|
{% if version.notes %}<br><em>{{ version.notes }}</em>{% endif %}
|
||||||
</li>
|
</li>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</ul>
|
</ul>
|
||||||
|
|
@ -84,12 +143,23 @@
|
||||||
<p>No reviewed OCR saved yet.</p>
|
<p>No reviewed OCR saved yet.</p>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Expected OCR lines: <span id="expected-lines">{{ expected_line_count }}</span><br>
|
||||||
|
Current editor lines: <span id="actual-lines">{{ actual_line_count }}</span>
|
||||||
|
<br><span id="line-warning" class="line-warning" {% if expected_line_count == actual_line_count %}style="display:none;"{% endif %}>
|
||||||
|
Line count mismatch may affect corrected PDF layout.
|
||||||
|
</span>
|
||||||
|
</p>
|
||||||
|
|
||||||
<form method="post" action="/documents/{{ document.document_id }}/review-text">
|
<form method="post" action="/documents/{{ document.document_id }}/review-text">
|
||||||
<div>
|
<div>
|
||||||
<label for="reviewed_text">Edit reviewed OCR text:</label>
|
<label for="reviewed_text">Edit reviewed OCR text (one line per OCR line):</label>
|
||||||
</div>
|
</div>
|
||||||
<div>
|
|
||||||
<textarea id="reviewed_text" name="reviewed_text" rows="20" cols="100">{{ review_text_value }}</textarea>
|
<div class="editor-wrap">
|
||||||
|
<div class="line-numbers" id="line-numbers">{% for n in line_numbers %}{{ n }}
|
||||||
|
{% endfor %}</div>
|
||||||
|
<textarea id="reviewed_text" name="reviewed_text" rows="{{ [actual_line_count + 2, 20]|max }}" cols="100">{{ review_text_value }}</textarea>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<h3>Quality flags</h3>
|
<h3>Quality flags</h3>
|
||||||
|
|
@ -113,8 +183,43 @@
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div style="margin-top: 1rem;">
|
<div style="margin-top: 1rem;">
|
||||||
<button type="submit">Save reviewed OCR</button>
|
<button type="submit" id="save-reviewed-btn">Save reviewed OCR</button>
|
||||||
</div>
|
</div>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
const textarea = document.getElementById("reviewed_text");
|
||||||
|
const expectedLines = parseInt(document.getElementById("expected-lines").textContent || "0", 10);
|
||||||
|
const actualLinesEl = document.getElementById("actual-lines");
|
||||||
|
const warningEl = document.getElementById("line-warning");
|
||||||
|
const saveBtn = document.getElementById("save-reviewed-btn");
|
||||||
|
const lineNumbersEl = document.getElementById("line-numbers");
|
||||||
|
|
||||||
|
/**
 * Count the lines in the editor text the same way the server does.
 *
 * The save endpoint validates with Python's `str.splitlines()`, where a
 * trailing line terminator does not start an extra line. A plain
 * `split('\n').length` reports one line too many for text ending in a newline,
 * which would disable the save button for input the server accepts.
 *
 * @param {string} text - Current textarea content.
 * @returns {number} Number of lines; 0 for an empty string.
 */
function countLines(text) {
  if (text.length === 0) return 0;
  // Accept \r\n and bare \r as well, in case the value was not normalised to \n.
  const lines = text.split(/\r\n|\r|\n/);
  // A terminator at the very end leaves an empty last piece that is not a line.
  if (lines[lines.length - 1] === "") {
    lines.pop();
  }
  return lines.length;
}
|
||||||
|
|
||||||
|
/**
 * Refill the line-number gutter with "1\n2\n…lineCount\n".
 *
 * @param {number} lineCount - How many numbered rows the gutter should show.
 */
function rebuildLineNumbers(lineCount) {
  // One "<n>\n" entry per row, joined without any extra separator.
  const rows = Array.from({ length: lineCount }, (_, index) => (index + 1) + "\n");
  lineNumbersEl.textContent = rows.join("");
}
|
||||||
|
|
||||||
|
/**
 * Sync the line counter, the gutter, the mismatch warning and the save button
 * with the current textarea content.
 *
 * A mismatch is only reported when an expected line count is known
 * (`expectedLines > 0`) and the editor's count differs from it.
 */
function updateEditorState() {
  const currentCount = countLines(textarea.value);
  const hasMismatch = expectedLines > 0 && currentCount !== expectedLines;

  actualLinesEl.textContent = String(currentCount);
  // The gutter always covers whichever is longer: the text or the expected layout.
  rebuildLineNumbers(Math.max(currentCount, expectedLines));

  if (hasMismatch) {
    warningEl.style.display = "inline";
  } else {
    warningEl.style.display = "none";
  }
  saveBtn.disabled = hasMismatch;
}
|
||||||
|
|
||||||
|
textarea.addEventListener("input", updateEditorState);
|
||||||
|
updateEditorState();
|
||||||
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue