feat: embed PDF metadata for saved versions and validate available versions by metadata match
This commit is contained in:
parent
c9fdf953e7
commit
f26f7ddc03
|
|
@ -1,5 +1,6 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
@ -61,7 +62,7 @@ import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader, PdfWriter
|
||||||
from reportlab.lib.utils import ImageReader
|
from reportlab.lib.utils import ImageReader
|
||||||
from reportlab.pdfbase.pdfmetrics import stringWidth
|
from reportlab.pdfbase.pdfmetrics import stringWidth
|
||||||
from reportlab.pdfgen import canvas
|
from reportlab.pdfgen import canvas
|
||||||
|
|
@ -106,6 +107,155 @@ def _prune_old_saved_files(db: Session, document: Document, keep_paths: set[str]
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pdf_keywords(document) -> str:
    """
    Build a comma-separated keyword string from the document's location fields.

    Reads ``location_city``, ``location_area`` and ``location_name`` from the
    first entry of ``document.additional_fields`` (when there is one), trims
    and lower-cases each value, and drops blanks and repeats while keeping
    first-seen order.  Returns ``""`` when nothing usable is found.

    NOTE(review): this module defines ``_build_pdf_keywords`` a second time
    further down.  That later definition replaces this one at import time,
    so this version is never the one that gets called.
    """
    rows = getattr(document, "additional_fields", None)
    latest = rows[0] if rows else None

    keywords = []
    if latest:
        # adjust field names if needed
        for name in ("location_city", "location_area", "location_name"):
            raw = getattr(latest, name, None)
            if raw:
                keywords.append(str(raw).strip().lower())

    # dict.fromkeys discards repeats while preserving insertion order.
    unique = dict.fromkeys(word for word in keywords if word)
    return ", ".join(unique)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _latest_additional(document):
    """
    Return the first row of ``document.additional_fields``.

    Yields ``None`` when the attribute is missing, ``None`` or empty.
    Presumably the collection is ordered newest-first, which is what makes
    index 0 the "latest" row -- verify against the model definition.
    """
    entries = getattr(document, "additional_fields", None)
    if not entries:
        return None
    return entries[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _latest_extracted(document):
    """
    Return the first row of ``document.extracted_fields``.

    Yields ``None`` when the attribute is missing, ``None`` or empty.
    Presumably the collection is ordered newest-first, which is what makes
    index 0 the "latest" row -- verify against the model definition.
    """
    entries = getattr(document, "extracted_fields", None)
    if not entries:
        return None
    return entries[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _humanize_filename(path_obj: Path) -> str:
    """
    Turn a file path into a human-readable title.

    Takes the file name without its final suffix, treats every run of
    underscores, hyphens and whitespace as a single space, trims the ends and
    title-cases the result (``my_scan-file.pdf`` -> ``My Scan File``).
    """
    # One pass handles separator replacement and whitespace collapsing together.
    words = re.sub(r"[\s_-]+", " ", path_obj.stem).strip()
    return words.title()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pdf_title(document, out_path: Path) -> str:
    """
    Return the PDF ``/Title`` value for a saved version.

    The title is derived purely from the output file name via
    ``_humanize_filename``; ``document`` is accepted but not consulted.
    """
    title = _humanize_filename(out_path)
    return title
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pdf_author(document) -> str:
    """
    Build the PDF ``/Author`` value from the document's owner fields.

    Reads ``owner_primary`` and ``owner_secondary`` from the row returned by
    ``_latest_additional(document)``, trims each value, and joins the distinct
    owners with ``"; "``.  Owners are compared case-insensitively; the first
    spelling encountered is the one kept.

    Returns ``""`` when there is no additional-fields row or no usable owner.
    """
    additional = _latest_additional(document)
    if not additional:
        return ""

    # Keyed by lower-cased name so "Ann" and "ann" collapse to one entry;
    # dict insertion order keeps primary before secondary.
    unique = {}
    for field in ("owner_primary", "owner_secondary"):
        value = getattr(additional, field, None)
        if not value:
            continue
        owner = str(value).strip()
        # A whitespace-only value is truthy but empty once trimmed; skipping
        # it avoids output such as "; Bob".
        if owner:
            unique.setdefault(owner.lower(), owner)

    return "; ".join(unique.values())
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pdf_subject(document) -> str:
    """
    Return the PDF ``/Subject`` value derived from ``document.document_type``.

    The type is stringified, underscores become spaces and the result is
    title-cased (``bank_statement`` -> ``Bank Statement``).  A missing or
    falsy type yields ``""``.
    """
    doc_type = getattr(document, "document_type", None)
    if not doc_type:
        return ""
    return str(doc_type).replace("_", " ").title()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pdf_keywords(document) -> str:
    """
    Build the PDF ``/Keywords`` value from the extracted location.

    Takes ``location`` from the row returned by ``_latest_extracted(document)``,
    splits it on commas, semicolons, slashes, pipes and hyphens, trims and
    lower-cases each piece, and joins the distinct non-empty pieces with
    ``", "`` in first-seen order.  Returns ``""`` when there is no row or no
    location.

    Currently location-only; easy to extend later.

    NOTE(review): hyphens are separators here, so a hyphenated place name
    (e.g. "Stratford-upon-Avon") becomes several keywords -- confirm that is
    intended.
    """
    extracted = _latest_extracted(document)
    location = getattr(extracted, "location", None) if extracted else None
    if not location:
        return ""

    pieces = (piece.strip().lower() for piece in re.split(r"[,;/|-]+", str(location)))
    # dict.fromkeys discards repeats while preserving insertion order.
    return ", ".join(dict.fromkeys(piece for piece in pieces if piece))
|
||||||
|
|
||||||
|
|
||||||
|
def _source_timestamp(document) -> datetime | None:
    """
    Return the modification time of the document's source file, if any.

    Tries ``document.source_path``, ``original_path`` and ``current_path`` in
    that order and returns the mtime of the first one that can be stat'ed, as
    a naive local-time ``datetime``.  Returns ``None`` when no attribute is
    set or none of the paths is usable.
    """
    for attr in ("source_path", "original_path", "current_path"):
        value = getattr(document, attr, None)
        if not value:
            continue
        try:
            # stat() directly instead of exists()+stat(): one syscall, and no
            # window in which the file can vanish between the two calls.
            return datetime.fromtimestamp(Path(value).stat().st_mtime)
        except (OSError, TypeError, ValueError, OverflowError):
            # Missing/unreadable file, non-path value, or an mtime that
            # cannot be represented: fall through to the next candidate.
            continue
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_date(dt: datetime | None) -> str:
    """
    Format ``dt`` as a PDF date string, ``D:YYYYMMDDHHMMSS``.

    When ``dt`` is ``None`` the current local time is used instead.  No
    timezone offset is appended to the result.
    """
    moment = dt or datetime.now()
    return moment.strftime("D:%Y%m%d%H%M%S")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_pdf_metadata(path_obj: Path, document, version_number: int, version_type: str) -> None:
    """
    Rewrite the PDF at ``path_obj`` in place with embedded document metadata.

    The pages of the existing file are copied into a fresh ``PdfWriter``, the
    info dictionary is filled in (title, author, subject, keywords, creator,
    producer, creation/modification dates, plus the custom ``/DocumentID``,
    ``/VersionNumber`` and ``/VersionType`` entries), and the result is
    written to a sibling ``*.meta.tmp`` file that then replaces the original.

    Args:
        path_obj: PDF file to rewrite; it is replaced on success.
        document: Source of the metadata values (owner, type, location,
            ``document_id`` and source-file paths are read via the
            ``_build_pdf_*`` / ``_source_timestamp`` helpers).
        version_number: Stored as ``/VersionNumber``.
        version_type: Stored as ``/VersionType``.

    Raises:
        Whatever ``pypdf`` or the filesystem raises while reading, writing or
        replacing the file.  The temporary file is removed in that case.
    """
    reader = PdfReader(str(path_obj))
    writer = PdfWriter()
    # NOTE(review): only the page objects are carried over; confirm that
    # document-level structures (outlines, forms, prior metadata) are not
    # needed in saved versions.
    for page in reader.pages:
        writer.add_page(page)

    now = datetime.now()
    source_dt = _source_timestamp(document)

    metadata = {
        "/Title": _build_pdf_title(document, path_obj),
        "/Author": _build_pdf_author(document),
        "/Subject": _build_pdf_subject(document),
        "/Keywords": _build_pdf_keywords(document),
        "/Creator": "Document Processor",
        "/Producer": "Document Processor",
        # _pdf_date falls back to the current time when no source file is found.
        "/CreationDate": _pdf_date(source_dt),
        "/ModDate": _pdf_date(now),
        "/DocumentID": str(getattr(document, "document_id", "") or ""),
        "/VersionNumber": str(version_number),
        "/VersionType": str(version_type),
    }

    writer.add_metadata({k: v for k, v in metadata.items() if v is not None})

    tmp_path = path_obj.with_suffix(path_obj.suffix + ".meta.tmp")
    try:
        with tmp_path.open("wb") as f:
            writer.write(f)
        tmp_path.replace(path_obj)
    finally:
        # After a successful replace() the temp name no longer exists, so this
        # only removes the leftover from a failed write or replace.
        tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
def sha256_for_file(path: Path) -> str:
|
def sha256_for_file(path: Path) -> str:
|
||||||
hasher = hashlib.sha256()
|
hasher = hashlib.sha256()
|
||||||
with path.open("rb") as f:
|
with path.open("rb") as f:
|
||||||
|
|
@ -317,6 +467,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
|
||||||
shutil.copy2(overlay_pdf_path, out_path)
|
shutil.copy2(overlay_pdf_path, out_path)
|
||||||
|
|
||||||
compress_pdf_with_ghostscript(out_path)
|
compress_pdf_with_ghostscript(out_path)
|
||||||
|
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
|
||||||
|
|
||||||
file_hash = sha256_for_file(out_path)
|
file_hash = sha256_for_file(out_path)
|
||||||
try:
|
try:
|
||||||
|
|
@ -386,6 +537,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
||||||
|
|
||||||
if current_file.resolve() != out_path.resolve():
|
if current_file.resolve() != out_path.resolve():
|
||||||
shutil.copy2(current_file, out_path)
|
shutil.copy2(current_file, out_path)
|
||||||
|
|
||||||
|
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
|
||||||
file_hash = sha256_for_file(out_path)
|
file_hash = sha256_for_file(out_path)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
from pathlib import Path
|
||||||
from decimal import Decimal
|
from decimal import Decimal
|
||||||
|
|
||||||
from fastapi import FastAPI, Request
|
from fastapi import FastAPI, Request
|
||||||
|
|
@ -20,7 +21,12 @@ from app.routes.trash import router as trash_router
|
||||||
|
|
||||||
app = FastAPI(title="document-processor")
|
app = FastAPI(title="document-processor")
|
||||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||||
app.mount("/files", StaticFiles(directory="/mnt/svr-01/storage"), name="files")
|
# Serve stored documents under /files only when the storage directory is
# present at startup; otherwise start without the route rather than failing.
storage_dir = Path("/mnt/svr-01/storage")

# is_dir() rather than exists(): StaticFiles rejects a path that exists but is
# not a directory, which would abort application start-up.
if storage_dir.is_dir():
    app.mount("/files", StaticFiles(directory=str(storage_dir)), name="files")
else:
    print("WARNING: storage mount not available, /files disabled")
|
||||||
|
|
||||||
app.include_router(health_router)
|
app.include_router(health_router)
|
||||||
app.include_router(documents_router)
|
app.include_router(documents_router)
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,8 @@ from copy import deepcopy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from decimal import Decimal, InvalidOperation
|
from decimal import Decimal, InvalidOperation
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, Form, Query, Request
|
from fastapi import APIRouter, Depends, Form, Query, Request
|
||||||
|
|
@ -9,6 +11,7 @@ from fastapi.responses import HTMLResponse, RedirectResponse
|
||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
from sqlalchemy import distinct
|
from sqlalchemy import distinct
|
||||||
from sqlalchemy.orm import Session, selectinload
|
from sqlalchemy.orm import Session, selectinload
|
||||||
|
from pypdf import PdfReader
|
||||||
|
|
||||||
from app.core.storage_settings import get_default_save_root
|
from app.core.storage_settings import get_default_save_root
|
||||||
from app.db.deps import get_db
|
from app.db.deps import get_db
|
||||||
|
|
@ -30,6 +33,54 @@ from app.models.text_version import TextVersion
|
||||||
|
|
||||||
router = APIRouter(prefix="/documents", tags=["documents"])
|
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||||
|
|
||||||
|
|
||||||
|
def _storage_available(storage_root: Path = Path("/mnt/svr-01/storage")) -> bool:
    """
    Report whether the storage mount is present and usable.

    Args:
        storage_root: Directory to probe.  Defaults to the application's
            storage mount, so existing zero-argument calls are unaffected.

    Returns:
        ``True`` only when ``storage_root`` is a directory, is a mount point,
        and is readable and traversable by the current process.  Any error
        raised while probing is treated as "not available".
    """
    try:
        return (
            storage_root.is_dir()
            and storage_root.is_mount()
            and os.access(storage_root, os.R_OK | os.X_OK)
        )
    except Exception:
        # Deliberately broad: this is a best-effort health probe and must
        # never raise into a request handler.
        return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _sha256_for_file(path_obj: Path) -> str:
    """
    Return the hex-encoded SHA-256 digest of the file at ``path_obj``.

    The file is read in 1 MiB pieces so large files are never held in memory
    all at once.
    """
    digest = hashlib.sha256()
    with path_obj.open("rb") as stream:
        while piece := stream.read(1024 * 1024):
            digest.update(piece)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _version_file_available(version, expected_document_id: str) -> bool:
    """
    Check that a version's PDF exists on disk and really belongs to it.

    The file at ``version.file_path`` must exist, be a regular file, and carry
    embedded ``/DocumentID``, ``/VersionNumber`` and ``/VersionType`` metadata
    equal to ``expected_document_id``, ``version.version_number`` and
    ``version.version_type``.  When the version records a ``sha256``, the
    file's digest must match it as well.

    Returns ``False`` for any mismatch and for any error raised while
    inspecting the file (unreadable path, unparsable PDF, ...).
    """
    stored_path = getattr(version, "file_path", None)
    if not stored_path:
        return False

    try:
        candidate = Path(stored_path)
        if not (candidate.exists() and candidate.is_file()):
            return False

        info = PdfReader(str(candidate)).metadata or {}

        def embedded(key: str) -> str:
            # Missing keys compare as the empty string.
            return str(info.get(key, "")).strip()

        if embedded("/DocumentID") != str(expected_document_id):
            return False
        if embedded("/VersionNumber") != str(version.version_number):
            return False
        if embedded("/VersionType") != str(version.version_type):
            return False

        # The digest check is skipped when the version has no recorded hash.
        recorded_digest = getattr(version, "sha256", None)
        if recorded_digest and _sha256_for_file(candidate) != recorded_digest:
            return False

        return True
    except Exception:
        return False
|
||||||
|
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||||
|
|
||||||
|
|
@ -560,6 +611,11 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)):
|
||||||
|
|
||||||
@router.post("/{document_id}/save-pdf", response_class=RedirectResponse)
|
@router.post("/{document_id}/save-pdf", response_class=RedirectResponse)
|
||||||
def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
|
def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
|
||||||
|
if not _storage_available():
|
||||||
|
return RedirectResponse(
|
||||||
|
url=f"/documents/{document_id}?error=storage_unavailable",
|
||||||
|
status_code=303,
|
||||||
|
)
|
||||||
document = (
|
document = (
|
||||||
db.query(Document)
|
db.query(Document)
|
||||||
.options(
|
.options(
|
||||||
|
|
@ -848,7 +904,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
|
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
|
||||||
|
|
||||||
file_url = None
|
file_url = None
|
||||||
if document.current_path:
|
storage_available = _storage_available()
|
||||||
|
if storage_available and document.current_path:
|
||||||
storage_root = Path("/mnt/svr-01/storage")
|
storage_root = Path("/mnt/svr-01/storage")
|
||||||
current_path = Path(document.current_path)
|
current_path = Path(document.current_path)
|
||||||
try:
|
try:
|
||||||
|
|
@ -891,6 +948,11 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
version_rows = []
|
||||||
|
for version in sorted(getattr(document, "versions", []), key=lambda v: v.version_number, reverse=True):
|
||||||
|
file_exists = _version_file_available(version, document.document_id)
|
||||||
|
version_rows.append((version, file_exists))
|
||||||
|
|
||||||
active_tab = request.query_params.get("tab", "ocr-review")
|
active_tab = request.query_params.get("tab", "ocr-review")
|
||||||
if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}:
|
if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}:
|
||||||
active_tab = "ocr-review"
|
active_tab = "ocr-review"
|
||||||
|
|
@ -911,6 +973,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
||||||
"reviewed_ocr": reviewed_ocr,
|
"reviewed_ocr": reviewed_ocr,
|
||||||
"review_text_value": review_text_value,
|
"review_text_value": review_text_value,
|
||||||
"file_url": file_url,
|
"file_url": file_url,
|
||||||
|
"storage_available": storage_available,
|
||||||
|
"version_rows": version_rows,
|
||||||
"app_url": app_url,
|
"app_url": app_url,
|
||||||
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
"quality_flag_options": QUALITY_FLAG_OPTIONS,
|
||||||
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
"current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
|
||||||
|
|
|
||||||
|
|
@ -99,11 +99,20 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
{% if error == "storage_unavailable" %}
|
||||||
|
<div style="background:#ffe4e6; border:1px solid #fecdd3; color:#7f1d1d; padding:0.75rem 1rem; border-radius:10px; margin-bottom:1rem;">
|
||||||
|
Storage mount unavailable. Please retry in a moment.
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<div class="workspace-grid">
|
<div class="workspace-grid">
|
||||||
<section>
|
<section>
|
||||||
<div class="card preview-card">
|
<div class="card preview-card">
|
||||||
<h2 class="card-title">Document preview</h2>
|
<h2 class="card-title">Document preview</h2>
|
||||||
{% if file_url %}
|
{% if not storage_available %}
|
||||||
|
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
|
||||||
|
{% elif file_url %}
|
||||||
{% if document.mime_type == "application/pdf" %}
|
{% if document.mime_type == "application/pdf" %}
|
||||||
<iframe class="preview-frame" src="{{ file_url }}"></iframe>
|
<iframe class="preview-frame" src="{{ file_url }}"></iframe>
|
||||||
{% elif document.mime_type in ["image/jpeg", "image/png"] %}
|
{% elif document.mime_type in ["image/jpeg", "image/png"] %}
|
||||||
|
|
@ -308,7 +317,7 @@
|
||||||
|
|
||||||
<div class="tab-panel{% if active_tab == 'versions' %} active{% endif %}" data-panel="versions">
|
<div class="tab-panel{% if active_tab == 'versions' %} active{% endif %}" data-panel="versions">
|
||||||
<h2 class="card-title">Document versions</h2>
|
<h2 class="card-title">Document versions</h2>
|
||||||
{% if document.versions %}
|
{% if version_rows %}
|
||||||
<div class="table-wrap">
|
<div class="table-wrap">
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
|
|
@ -321,11 +330,18 @@
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for version in document.versions %}
|
{% for version, file_exists in version_rows %}
|
||||||
<tr>
|
<tr>
|
||||||
<td>v{{ version.version_number }}</td>
|
<td>v{{ version.version_number }}</td>
|
||||||
<td>{{ version.version_type }}</td>
|
<td>{{ version.version_type }}</td>
|
||||||
<td>{{ version.file_path }}</td>
|
<td>
|
||||||
|
{{ version.file_path }}
|
||||||
|
<div style="margin-top:0.25rem;">
|
||||||
|
{% if file_exists %}
|
||||||
|
<span class="badge">Available</span>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</td>
|
||||||
<td>{{ version.created_at }}</td>
|
<td>{{ version.created_at }}</td>
|
||||||
<td>{{ version.notes or "" }}</td>
|
<td>{{ version.notes or "" }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue