feat: embed PDF metadata for saved versions and validate available versions by metadata match

This commit is contained in:
Sean McElwain 2026-04-11 17:51:41 -05:00
parent c9fdf953e7
commit f26f7ddc03
4 changed files with 247 additions and 8 deletions

View File

@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
from datetime import datetime
import hashlib import hashlib
import os import os
import re import re
@ -61,7 +62,7 @@ import tempfile
from pathlib import Path from pathlib import Path
from PIL import Image from PIL import Image
from pypdf import PdfReader from pypdf import PdfReader, PdfWriter
from reportlab.lib.utils import ImageReader from reportlab.lib.utils import ImageReader
from reportlab.pdfbase.pdfmetrics import stringWidth from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.pdfgen import canvas from reportlab.pdfgen import canvas
@ -106,6 +107,155 @@ def _prune_old_saved_files(db: Session, document: Document, keep_paths: set[str]
except Exception: except Exception:
pass pass
def _build_pdf_keywords(document) -> str:
"""
Currently returns location-only keywords.
Easy to extend later.
"""
additional = document.additional_fields[0] if getattr(document, "additional_fields", None) else None
parts = []
if additional:
# adjust field names if needed
for field in ["location_city", "location_area", "location_name"]:
value = getattr(additional, field, None)
if value:
parts.append(str(value).strip().lower())
# dedupe while preserving order
seen = set()
clean = []
for p in parts:
if p and p not in seen:
seen.add(p)
clean.append(p)
return ", ".join(clean)
def _latest_additional(document):
rows = getattr(document, "additional_fields", None) or []
return rows[0] if rows else None
def _latest_extracted(document):
rows = getattr(document, "extracted_fields", None) or []
return rows[0] if rows else None
def _humanize_filename(path_obj: Path) -> str:
stem = path_obj.stem.replace("_", " ").replace("-", " ").strip()
stem = re.sub(r"\s+", " ", stem)
return stem.title()
def _build_pdf_title(document, out_path: Path) -> str:
    """
    Return the PDF title for a saved version.

    The title is derived only from the output file name; *document* is
    accepted but not consulted here.
    """
    title = _humanize_filename(out_path)
    return title
def _build_pdf_author(document) -> str:
    """
    Build the PDF author string from the owner fields of the document's
    first additional-fields row.

    Reads ``owner_primary`` then ``owner_secondary``, strips each value and
    drops case-insensitive duplicates (the first spelling wins). Owners are
    joined with "; ". Returns "" when there is no row or no usable owner.
    """
    additional = _latest_additional(document)
    if not additional:
        return ""

    # Keyed by the lower-cased name so duplicates differing only in case
    # collapse; dict insertion order keeps primary before secondary.
    owners_by_key = {}
    for field in ("owner_primary", "owner_secondary"):
        value = getattr(additional, field, None)
        if not value:
            continue
        owner = str(value).strip()
        # A whitespace-only value is truthy before stripping; skip it so the
        # joined result never carries an empty entry (e.g. "; Bob").
        if owner:
            owners_by_key.setdefault(owner.lower(), owner)
    return "; ".join(owners_by_key.values())
def _build_pdf_subject(document) -> str:
value = getattr(document, "document_type", None)
return str(value).replace("_", " ").title() if value else ""
def _build_pdf_keywords(document) -> str:
    """
    Currently returns location-only keywords.
    Easy to extend later.

    The ``location`` of the first extracted-fields row is split on commas,
    semicolons, slashes, pipes and hyphens; each piece is stripped and
    lower-cased, and duplicates are dropped in first-seen order. Returns ""
    when there is no row or no location.
    """
    extracted = _latest_extracted(document)
    location = getattr(extracted, "location", None) if extracted else None
    if not location:
        return ""

    pieces = (
        piece.strip().lower()
        for piece in re.split(r"[,;/|-]+", str(location))
    )
    # dict keys keep insertion order, which gives an order-preserving dedupe
    unique_terms = dict.fromkeys(piece for piece in pieces if piece)
    return ", ".join(unique_terms)
def _source_timestamp(document) -> datetime | None:
for attr in ["source_path", "original_path", "current_path"]:
value = getattr(document, attr, None)
if not value:
continue
try:
p = Path(value)
if p.exists():
return datetime.fromtimestamp(p.stat().st_mtime)
except Exception:
pass
return None
def _pdf_date(dt: datetime | None) -> str:
if not dt:
dt = datetime.now()
return dt.strftime("D:%Y%m%d%H%M%S")
def _write_pdf_metadata(path_obj: Path, document, version_number: int, version_type: str) -> None:
    """
    Rewrite the PDF at *path_obj* in place with document-info metadata.

    The pages of the existing file are copied into a fresh writer, the
    metadata is attached, and the result is written to a sibling
    ``<name>.meta.tmp`` file that then replaces the original.

    Besides the standard info keys, three custom keys are stored:
    ``/DocumentID``, ``/VersionNumber`` and ``/VersionType``.

    Args:
        path_obj: PDF file to rewrite.
        document: Source of title/author/subject/keyword values and of
            ``document_id``.
        version_number: Stored as a string under ``/VersionNumber``.
        version_type: Stored as a string under ``/VersionType``.

    Raises:
        Whatever the PDF reader/writer or the filesystem raises. The
        temporary file is removed before the error propagates.
    """
    reader = PdfReader(str(path_obj))
    writer = PdfWriter()

    for page in reader.pages:
        writer.add_page(page)

    now = datetime.now()
    source_dt = _source_timestamp(document)

    metadata = {
        "/Title": _build_pdf_title(document, path_obj),
        "/Author": _build_pdf_author(document),
        "/Subject": _build_pdf_subject(document),
        "/Keywords": _build_pdf_keywords(document),
        "/Creator": "Document Processor",
        "/Producer": "Document Processor",
        # Falls back to the current time when no source file is found.
        "/CreationDate": _pdf_date(source_dt),
        "/ModDate": _pdf_date(now),
        "/DocumentID": str(getattr(document, "document_id", "") or ""),
        "/VersionNumber": str(version_number),
        "/VersionType": str(version_type),
    }

    writer.add_metadata({k: v for k, v in metadata.items() if v is not None})

    # Write next to the target and swap, so a failed write never leaves a
    # half-written file at path_obj.
    tmp_path = path_obj.with_suffix(path_obj.suffix + ".meta.tmp")
    try:
        with tmp_path.open("wb") as f:
            writer.write(f)
        tmp_path.replace(path_obj)
    except BaseException:
        # Do not leave a stale *.meta.tmp beside the saved version.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass  # cleanup is best effort; the original error matters more
        raise
def sha256_for_file(path: Path) -> str: def sha256_for_file(path: Path) -> str:
hasher = hashlib.sha256() hasher = hashlib.sha256()
with path.open("rb") as f: with path.open("rb") as f:
@ -317,6 +467,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
shutil.copy2(overlay_pdf_path, out_path) shutil.copy2(overlay_pdf_path, out_path)
compress_pdf_with_ghostscript(out_path) compress_pdf_with_ghostscript(out_path)
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
file_hash = sha256_for_file(out_path) file_hash = sha256_for_file(out_path)
try: try:
@ -386,6 +537,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
if current_file.resolve() != out_path.resolve(): if current_file.resolve() != out_path.resolve():
shutil.copy2(current_file, out_path) shutil.copy2(current_file, out_path)
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
file_hash = sha256_for_file(out_path) file_hash = sha256_for_file(out_path)
try: try:

View File

@ -1,3 +1,4 @@
from pathlib import Path
from decimal import Decimal from decimal import Decimal
from fastapi import FastAPI, Request from fastapi import FastAPI, Request
@ -20,7 +21,12 @@ from app.routes.trash import router as trash_router
app = FastAPI(title="document-processor") app = FastAPI(title="document-processor")
app.mount("/static", StaticFiles(directory="app/static"), name="static") app.mount("/static", StaticFiles(directory="app/static"), name="static")
app.mount("/files", StaticFiles(directory="/mnt/svr-01/storage"), name="files") storage_dir = Path("/mnt/svr-01/storage")
# Serve /files only when the storage directory is present at startup.
# is_dir() is used instead of exists() so that a path which exists but is not
# a directory also disables /files instead of failing inside StaticFiles.
if storage_dir.is_dir():
    app.mount("/files", StaticFiles(directory=str(storage_dir)), name="files")
else:
    print("WARNING: storage mount not available, /files disabled")
app.include_router(health_router) app.include_router(health_router)
app.include_router(documents_router) app.include_router(documents_router)

View File

@ -2,6 +2,8 @@ from copy import deepcopy
from datetime import datetime from datetime import datetime
from decimal import Decimal, InvalidOperation from decimal import Decimal, InvalidOperation
import re import re
import os
import hashlib
from pathlib import Path from pathlib import Path
from fastapi import APIRouter, Depends, Form, Query, Request from fastapi import APIRouter, Depends, Form, Query, Request
@ -9,6 +11,7 @@ from fastapi.responses import HTMLResponse, RedirectResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from sqlalchemy import distinct from sqlalchemy import distinct
from sqlalchemy.orm import Session, selectinload from sqlalchemy.orm import Session, selectinload
from pypdf import PdfReader
from app.core.storage_settings import get_default_save_root from app.core.storage_settings import get_default_save_root
from app.db.deps import get_db from app.db.deps import get_db
@ -30,6 +33,54 @@ from app.models.text_version import TextVersion
router = APIRouter(prefix="/documents", tags=["documents"]) router = APIRouter(prefix="/documents", tags=["documents"])
def _storage_available() -> bool:
storage_root = Path("/mnt/svr-01/storage")
try:
return storage_root.exists() and storage_root.is_mount() and storage_root.is_dir() and os.access(storage_root, os.R_OK | os.X_OK)
except Exception:
return False
def _sha256_for_file(path_obj: Path) -> str:
hasher = hashlib.sha256()
with path_obj.open("rb") as f:
for chunk in iter(lambda: f.read(1024 * 1024), b""):
hasher.update(chunk)
return hasher.hexdigest()
def _version_file_available(version, expected_document_id: str) -> bool:
file_path = getattr(version, "file_path", None)
if not file_path:
return False
try:
path_obj = Path(file_path)
if not path_obj.exists() or not path_obj.is_file():
return False
reader = PdfReader(str(path_obj))
meta = reader.metadata or {}
if str(meta.get("/DocumentID", "")).strip() != str(expected_document_id):
return False
if str(meta.get("/VersionNumber", "")).strip() != str(version.version_number):
return False
if str(meta.get("/VersionType", "")).strip() != str(version.version_type):
return False
expected_sha = getattr(version, "sha256", None)
if expected_sha:
actual_sha = _sha256_for_file(path_obj)
if actual_sha != expected_sha:
return False
return True
except Exception:
return False
BASE_DIR = Path(__file__).resolve().parent.parent BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
@ -560,6 +611,11 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)):
@router.post("/{document_id}/save-pdf", response_class=RedirectResponse) @router.post("/{document_id}/save-pdf", response_class=RedirectResponse)
def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)): def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
if not _storage_available():
return RedirectResponse(
url=f"/documents/{document_id}?error=storage_unavailable",
status_code=303,
)
document = ( document = (
db.query(Document) db.query(Document)
.options( .options(
@ -848,7 +904,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1)) line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
file_url = None file_url = None
if document.current_path: storage_available = _storage_available()
if storage_available and document.current_path:
storage_root = Path("/mnt/svr-01/storage") storage_root = Path("/mnt/svr-01/storage")
current_path = Path(document.current_path) current_path = Path(document.current_path)
try: try:
@ -891,6 +948,11 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
) )
) )
version_rows = []
for version in sorted(getattr(document, "versions", []), key=lambda v: v.version_number, reverse=True):
file_exists = _version_file_available(version, document.document_id)
version_rows.append((version, file_exists))
active_tab = request.query_params.get("tab", "ocr-review") active_tab = request.query_params.get("tab", "ocr-review")
if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}: if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}:
active_tab = "ocr-review" active_tab = "ocr-review"
@ -911,6 +973,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
"reviewed_ocr": reviewed_ocr, "reviewed_ocr": reviewed_ocr,
"review_text_value": review_text_value, "review_text_value": review_text_value,
"file_url": file_url, "file_url": file_url,
"storage_available": storage_available,
"version_rows": version_rows,
"app_url": app_url, "app_url": app_url,
"quality_flag_options": QUALITY_FLAG_OPTIONS, "quality_flag_options": QUALITY_FLAG_OPTIONS,
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [], "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],

View File

@ -99,11 +99,20 @@
</div> </div>
</div> </div>
<div class="workspace-grid">
{% if error == "storage_unavailable" %}
<div style="background:#ffe4e6; border:1px solid #fecdd3; color:#7f1d1d; padding:0.75rem 1rem; border-radius:10px; margin-bottom:1rem;">
Storage mount unavailable. Please retry in a moment.
</div>
{% endif %}
<div class="workspace-grid">
<section> <section>
<div class="card preview-card"> <div class="card preview-card">
<h2 class="card-title">Document preview</h2> <h2 class="card-title">Document preview</h2>
{% if file_url %} {% if not storage_available %}
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
{% elif file_url %}
{% if document.mime_type == "application/pdf" %} {% if document.mime_type == "application/pdf" %}
<iframe class="preview-frame" src="{{ file_url }}"></iframe> <iframe class="preview-frame" src="{{ file_url }}"></iframe>
{% elif document.mime_type in ["image/jpeg", "image/png"] %} {% elif document.mime_type in ["image/jpeg", "image/png"] %}
@ -308,7 +317,7 @@
<div class="tab-panel{% if active_tab == 'versions' %} active{% endif %}" data-panel="versions"> <div class="tab-panel{% if active_tab == 'versions' %} active{% endif %}" data-panel="versions">
<h2 class="card-title">Document versions</h2> <h2 class="card-title">Document versions</h2>
{% if document.versions %} {% if version_rows %}
<div class="table-wrap"> <div class="table-wrap">
<table> <table>
<thead> <thead>
@ -321,11 +330,18 @@
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% for version in document.versions %} {% for version, file_exists in version_rows %}
<tr> <tr>
<td>v{{ version.version_number }}</td> <td>v{{ version.version_number }}</td>
<td>{{ version.version_type }}</td> <td>{{ version.version_type }}</td>
<td>{{ version.file_path }}</td> <td>
{{ version.file_path }}
<div style="margin-top:0.25rem;">
{% if file_exists %}
<span class="badge">Available</span>
{% endif %}
</div>
</td>
<td>{{ version.created_at }}</td> <td>{{ version.created_at }}</td>
<td>{{ version.notes or "" }}</td> <td>{{ version.notes or "" }}</td>
</tr> </tr>