feat: embed PDF metadata for saved versions and validate available versions by metadata match
This commit is contained in:
parent
c9fdf953e7
commit
f26f7ddc03
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
|
|
@ -61,7 +62,7 @@ import tempfile
|
|||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
from reportlab.lib.utils import ImageReader
|
||||
from reportlab.pdfbase.pdfmetrics import stringWidth
|
||||
from reportlab.pdfgen import canvas
|
||||
|
|
@ -106,6 +107,155 @@ def _prune_old_saved_files(db: Session, document: Document, keep_paths: set[str]
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _build_pdf_keywords(document) -> str:
|
||||
"""
|
||||
Currently returns location-only keywords.
|
||||
Easy to extend later.
|
||||
"""
|
||||
additional = document.additional_fields[0] if getattr(document, "additional_fields", None) else None
|
||||
|
||||
parts = []
|
||||
|
||||
if additional:
|
||||
# adjust field names if needed
|
||||
for field in ["location_city", "location_area", "location_name"]:
|
||||
value = getattr(additional, field, None)
|
||||
if value:
|
||||
parts.append(str(value).strip().lower())
|
||||
|
||||
# dedupe while preserving order
|
||||
seen = set()
|
||||
clean = []
|
||||
for p in parts:
|
||||
if p and p not in seen:
|
||||
seen.add(p)
|
||||
clean.append(p)
|
||||
|
||||
return ", ".join(clean)
|
||||
|
||||
|
||||
|
||||
def _latest_additional(document):
|
||||
rows = getattr(document, "additional_fields", None) or []
|
||||
return rows[0] if rows else None
|
||||
|
||||
|
||||
def _latest_extracted(document):
|
||||
rows = getattr(document, "extracted_fields", None) or []
|
||||
return rows[0] if rows else None
|
||||
|
||||
|
||||
def _humanize_filename(path_obj: Path) -> str:
|
||||
stem = path_obj.stem.replace("_", " ").replace("-", " ").strip()
|
||||
stem = re.sub(r"\s+", " ", stem)
|
||||
return stem.title()
|
||||
|
||||
|
||||
def _build_pdf_title(document, out_path: Path) -> str:
|
||||
return _humanize_filename(out_path)
|
||||
|
||||
|
||||
def _build_pdf_author(document) -> str:
|
||||
additional = _latest_additional(document)
|
||||
owners = []
|
||||
if additional:
|
||||
for field in ["owner_primary", "owner_secondary"]:
|
||||
value = getattr(additional, field, None)
|
||||
if value:
|
||||
owners.append(str(value).strip())
|
||||
seen = set()
|
||||
clean = []
|
||||
for owner in owners:
|
||||
key = owner.lower()
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
clean.append(owner)
|
||||
return "; ".join(clean)
|
||||
|
||||
|
||||
def _build_pdf_subject(document) -> str:
|
||||
value = getattr(document, "document_type", None)
|
||||
return str(value).replace("_", " ").title() if value else ""
|
||||
|
||||
|
||||
def _build_pdf_keywords(document) -> str:
|
||||
"""
|
||||
Currently returns location-only keywords.
|
||||
Easy to extend later.
|
||||
"""
|
||||
parts = []
|
||||
|
||||
extracted = _latest_extracted(document)
|
||||
if extracted:
|
||||
location = getattr(extracted, "location", None)
|
||||
if location:
|
||||
for chunk in re.split(r"[,;/|-]+", str(location)):
|
||||
chunk = chunk.strip().lower()
|
||||
if chunk:
|
||||
parts.append(chunk)
|
||||
|
||||
seen = set()
|
||||
clean = []
|
||||
for p in parts:
|
||||
if p and p not in seen:
|
||||
seen.add(p)
|
||||
clean.append(p)
|
||||
|
||||
return ", ".join(clean)
|
||||
|
||||
|
||||
def _source_timestamp(document) -> datetime | None:
|
||||
for attr in ["source_path", "original_path", "current_path"]:
|
||||
value = getattr(document, attr, None)
|
||||
if not value:
|
||||
continue
|
||||
try:
|
||||
p = Path(value)
|
||||
if p.exists():
|
||||
return datetime.fromtimestamp(p.stat().st_mtime)
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _pdf_date(dt: datetime | None) -> str:
|
||||
if not dt:
|
||||
dt = datetime.now()
|
||||
return dt.strftime("D:%Y%m%d%H%M%S")
|
||||
|
||||
|
||||
def _write_pdf_metadata(path_obj: Path, document, version_number: int, version_type: str) -> None:
    """Rewrite the PDF at *path_obj* in place with embedded /Info metadata.

    Copies every page into a fresh writer, attaches standard /Info keys
    (Title/Author/Subject/Keywords/Creator/Producer/dates) plus custom keys
    (/DocumentID, /VersionNumber, /VersionType) used later to validate that a
    saved version file still matches its database record, then atomically
    replaces the original file via a temp file + rename.

    NOTE(review): page-by-page copying carries only the pages — outlines,
    attachments, and other document-level structures are presumably dropped;
    confirm this is acceptable for these generated versions.
    """
    reader = PdfReader(str(path_obj))
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)

    now = datetime.now()
    # CreationDate reflects the source file's mtime when available; ModDate is "now".
    source_dt = _source_timestamp(document)

    metadata = {
        "/Title": _build_pdf_title(document, path_obj),
        "/Author": _build_pdf_author(document),
        "/Subject": _build_pdf_subject(document),
        "/Keywords": _build_pdf_keywords(document),
        "/Creator": "Document Processor",
        "/Producer": "Document Processor",
        "/CreationDate": _pdf_date(source_dt),
        "/ModDate": _pdf_date(now),
        "/DocumentID": str(getattr(document, "document_id", "") or ""),
        "/VersionNumber": str(version_number),
        "/VersionType": str(version_type),
    }

    # Builders return "" (not None) for missing values, so empty strings are
    # still written; the None filter only guards against future None values.
    writer.add_metadata({k: v for k, v in metadata.items() if v is not None})

    # Write to a sibling temp file, then rename over the original so readers
    # never observe a half-written PDF.
    tmp_path = path_obj.with_suffix(path_obj.suffix + ".meta.tmp")
    with tmp_path.open("wb") as f:
        writer.write(f)
    tmp_path.replace(path_obj)
|
||||
|
||||
|
||||
def sha256_for_file(path: Path) -> str:
|
||||
hasher = hashlib.sha256()
|
||||
with path.open("rb") as f:
|
||||
|
|
@ -317,6 +467,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
|
|||
shutil.copy2(overlay_pdf_path, out_path)
|
||||
|
||||
compress_pdf_with_ghostscript(out_path)
|
||||
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
|
||||
|
||||
file_hash = sha256_for_file(out_path)
|
||||
try:
|
||||
|
|
@ -386,6 +537,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
|||
|
||||
if current_file.resolve() != out_path.resolve():
|
||||
shutil.copy2(current_file, out_path)
|
||||
|
||||
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
|
||||
file_hash = sha256_for_file(out_path)
|
||||
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
from pathlib import Path
|
||||
from decimal import Decimal
|
||||
|
||||
from fastapi import FastAPI, Request
|
||||
|
|
@ -20,7 +21,12 @@ from app.routes.trash import router as trash_router
|
|||
|
||||
app = FastAPI(title="document-processor")
|
||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||
app.mount("/files", StaticFiles(directory="/mnt/svr-01/storage"), name="files")
|
||||
storage_dir = Path("/mnt/svr-01/storage")
|
||||
|
||||
if storage_dir.exists():
|
||||
app.mount("/files", StaticFiles(directory=str(storage_dir)), name="files")
|
||||
else:
|
||||
print("WARNING: storage mount not available, /files disabled")
|
||||
|
||||
app.include_router(health_router)
|
||||
app.include_router(documents_router)
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@ from copy import deepcopy
|
|||
from datetime import datetime
|
||||
from decimal import Decimal, InvalidOperation
|
||||
import re
|
||||
import os
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, Form, Query, Request
|
||||
|
|
@ -9,6 +11,7 @@ from fastapi.responses import HTMLResponse, RedirectResponse
|
|||
from fastapi.templating import Jinja2Templates
|
||||
from sqlalchemy import distinct
|
||||
from sqlalchemy.orm import Session, selectinload
|
||||
from pypdf import PdfReader
|
||||
|
||||
from app.core.storage_settings import get_default_save_root
|
||||
from app.db.deps import get_db
|
||||
|
|
@ -30,6 +33,54 @@ from app.models.text_version import TextVersion
|
|||
|
||||
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||
|
||||
|
||||
def _storage_available() -> bool:
|
||||
storage_root = Path("/mnt/svr-01/storage")
|
||||
try:
|
||||
return storage_root.exists() and storage_root.is_mount() and storage_root.is_dir() and os.access(storage_root, os.R_OK | os.X_OK)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
|
||||
def _sha256_for_file(path_obj: Path) -> str:
|
||||
hasher = hashlib.sha256()
|
||||
with path_obj.open("rb") as f:
|
||||
for chunk in iter(lambda: f.read(1024 * 1024), b""):
|
||||
hasher.update(chunk)
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def _version_file_available(version, expected_document_id: str) -> bool:
    """Check that a saved version's file exists AND still matches its record.

    A version counts as available only if its file_path points at an existing
    regular file whose embedded PDF /Info metadata (/DocumentID,
    /VersionNumber, /VersionType — written when the version was saved)
    matches the database row, and — when the row carries a sha256 — whose
    content hash matches too. Any parse or I/O failure yields False, so a
    corrupt or foreign PDF is simply reported as unavailable.
    """
    file_path = getattr(version, "file_path", None)
    if not file_path:
        return False

    try:
        path_obj = Path(file_path)
        if not path_obj.exists() or not path_obj.is_file():
            return False

        reader = PdfReader(str(path_obj))
        meta = reader.metadata or {}

        # Compare everything as stripped strings: metadata values come back
        # as pypdf text objects, and the DB columns may be int/str.
        if str(meta.get("/DocumentID", "")).strip() != str(expected_document_id):
            return False
        if str(meta.get("/VersionNumber", "")).strip() != str(version.version_number):
            return False
        if str(meta.get("/VersionType", "")).strip() != str(version.version_type):
            return False

        # Optional content check — assumes the version row exposes a
        # `sha256` column; TODO confirm against the version model.
        expected_sha = getattr(version, "sha256", None)
        if expected_sha:
            actual_sha = _sha256_for_file(path_obj)
            if actual_sha != expected_sha:
                return False

        return True
    except Exception:
        # Deliberate best-effort: unreadable/corrupt files are "unavailable",
        # never an error surfaced to the detail page.
        return False
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||
|
||||
|
|
@ -560,6 +611,11 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)):
|
|||
|
||||
@router.post("/{document_id}/save-pdf", response_class=RedirectResponse)
|
||||
def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
|
||||
if not _storage_available():
|
||||
return RedirectResponse(
|
||||
url=f"/documents/{document_id}?error=storage_unavailable",
|
||||
status_code=303,
|
||||
)
|
||||
document = (
|
||||
db.query(Document)
|
||||
.options(
|
||||
|
|
@ -848,7 +904,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
|
||||
|
||||
file_url = None
|
||||
if document.current_path:
|
||||
storage_available = _storage_available()
|
||||
if storage_available and document.current_path:
|
||||
storage_root = Path("/mnt/svr-01/storage")
|
||||
current_path = Path(document.current_path)
|
||||
try:
|
||||
|
|
@ -891,6 +948,11 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
)
|
||||
)
|
||||
|
||||
version_rows = []
|
||||
for version in sorted(getattr(document, "versions", []), key=lambda v: v.version_number, reverse=True):
|
||||
file_exists = _version_file_available(version, document.document_id)
|
||||
version_rows.append((version, file_exists))
|
||||
|
||||
active_tab = request.query_params.get("tab", "ocr-review")
|
||||
if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}:
|
||||
active_tab = "ocr-review"
|
||||
|
|
@ -911,6 +973,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
"reviewed_ocr": reviewed_ocr,
|
||||
"review_text_value": review_text_value,
|
||||
"file_url": file_url,
|
||||
"storage_available": storage_available,
|
||||
"version_rows": version_rows,
|
||||
"app_url": app_url,
|
||||
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
||||
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
||||
|
|
|
|||
|
|
@ -99,11 +99,20 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
{% if error == "storage_unavailable" %}
|
||||
<div style="background:#ffe4e6; border:1px solid #fecdd3; color:#7f1d1d; padding:0.75rem 1rem; border-radius:10px; margin-bottom:1rem;">
|
||||
Storage mount unavailable. Please retry in a moment.
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="workspace-grid">
|
||||
<section>
|
||||
<div class="card preview-card">
|
||||
<h2 class="card-title">Document preview</h2>
|
||||
{% if file_url %}
|
||||
{% if not storage_available %}
|
||||
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
|
||||
{% elif file_url %}
|
||||
{% if document.mime_type == "application/pdf" %}
|
||||
<iframe class="preview-frame" src="{{ file_url }}"></iframe>
|
||||
{% elif document.mime_type in ["image/jpeg", "image/png"] %}
|
||||
|
|
@ -308,7 +317,7 @@
|
|||
|
||||
<div class="tab-panel{% if active_tab == 'versions' %} active{% endif %}" data-panel="versions">
|
||||
<h2 class="card-title">Document versions</h2>
|
||||
{% if document.versions %}
|
||||
{% if version_rows %}
|
||||
<div class="table-wrap">
|
||||
<table>
|
||||
<thead>
|
||||
|
|
@ -321,11 +330,18 @@
|
|||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for version in document.versions %}
|
||||
{% for version, file_exists in version_rows %}
|
||||
<tr>
|
||||
<td>v{{ version.version_number }}</td>
|
||||
<td>{{ version.version_type }}</td>
|
||||
<td>{{ version.file_path }}</td>
|
||||
<td>
|
||||
{{ version.file_path }}
|
||||
<div style="margin-top:0.25rem;">
|
||||
{% if file_exists %}
|
||||
<span class="badge">Available</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</td>
|
||||
<td>{{ version.created_at }}</td>
|
||||
<td>{{ version.notes or "" }}</td>
|
||||
</tr>
|
||||
|
|
|
|||
Loading…
Reference in New Issue