diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py
index 9612239..bdf2505 100644
--- a/app/logic/document_outputs.py
+++ b/app/logic/document_outputs.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from datetime import datetime
 import hashlib
 import os
 import re
@@ -61,7 +62,7 @@ import tempfile
 from pathlib import Path
 
 from PIL import Image
-from pypdf import PdfReader
+from pypdf import PdfReader, PdfWriter
 from reportlab.lib.utils import ImageReader
 from reportlab.pdfbase.pdfmetrics import stringWidth
 from reportlab.pdfgen import canvas
@@ -106,6 +107,146 @@ def _prune_old_saved_files(db: Session, document: Document, keep_paths: set[str]
         except Exception:
             pass
 
+
+def _latest_additional(document):
+    """Return the first ``additional_fields`` row of the document, or None."""
+    rows = getattr(document, "additional_fields", None) or []
+    return rows[0] if rows else None
+
+
+def _latest_extracted(document):
+    """Return the first ``extracted_fields`` row of the document, or None."""
+    rows = getattr(document, "extracted_fields", None) or []
+    return rows[0] if rows else None
+
+
+def _humanize_filename(path_obj: Path) -> str:
+    """Turn a file stem into a title: "_" and "-" become spaces, words are title-cased."""
+    stem = path_obj.stem.replace("_", " ").replace("-", " ").strip()
+    stem = re.sub(r"\s+", " ", stem)
+    return stem.title()
+
+
+def _build_pdf_title(document, out_path: Path) -> str:
+    """Return the PDF title, derived from the output file name."""
+    return _humanize_filename(out_path)
+
+
+def _build_pdf_author(document) -> str:
+    """Return the owner names joined by "; ", de-duplicated case-insensitively."""
+    additional = _latest_additional(document)
+    owners = []
+    if additional:
+        for field in ["owner_primary", "owner_secondary"]:
+            value = getattr(additional, field, None)
+            if value:
+                owners.append(str(value).strip())
+    seen = set()
+    clean = []
+    for owner in owners:
+        key = owner.lower()
+        if key not in seen:
+            seen.add(key)
+            clean.append(owner)
+    return "; ".join(clean)
+
+
+def _build_pdf_subject(document) -> str:
+    """Return ``document.document_type`` as a title-cased phrase, or "" if unset."""
+    value = getattr(document, "document_type", None)
+    return str(value).replace("_", " ").title() if value else ""
+
+
+def _build_pdf_keywords(document) -> str:
+    """
+    Return comma-separated, lower-cased keywords split out of the extracted location.
+    Easy to extend later.
+    """
+    parts = []
+
+    extracted = _latest_extracted(document)
+    if extracted:
+        location = getattr(extracted, "location", None)
+        if location:
+            for chunk in re.split(r"[,;/|-]+", str(location)):
+                chunk = chunk.strip().lower()
+                if chunk:
+                    parts.append(chunk)
+
+    seen = set()
+    clean = []
+    for p in parts:
+        if p and p not in seen:
+            seen.add(p)
+            clean.append(p)
+
+    return ", ".join(clean)
+
+
+def _source_timestamp(document) -> datetime | None:
+    """Return the mtime of the first existing source/original/current path, or None."""
+    for attr in ["source_path", "original_path", "current_path"]:
+        value = getattr(document, attr, None)
+        if not value:
+            continue
+        try:
+            p = Path(value)
+            if p.exists():
+                return datetime.fromtimestamp(p.stat().st_mtime)
+        except Exception:
+            pass
+    return None
+
+
+def _pdf_date(dt: datetime | None) -> str:
+    """Format ``dt`` (default: now) as a PDF date string ``D:YYYYMMDDHHMMSS``."""
+    if not dt:
+        dt = datetime.now()
+    return dt.strftime("D:%Y%m%d%H%M%S")
+
+
+def _write_pdf_metadata(path_obj: Path, document, version_number: int, version_type: str) -> None:
+    """Rewrite the PDF at ``path_obj`` in place with document metadata attached.
+
+    The pages are copied into a new writer, the info dictionary is filled in,
+    and the result is written to a sibling temp file that then replaces
+    ``path_obj``. The temp file is removed if writing or replacing fails.
+    """
+    reader = PdfReader(str(path_obj))
+    writer = PdfWriter()
+    for page in reader.pages:
+        writer.add_page(page)
+
+    now = datetime.now()
+    source_dt = _source_timestamp(document)
+
+    metadata = {
+        "/Title": _build_pdf_title(document, path_obj),
+        "/Author": _build_pdf_author(document),
+        "/Subject": _build_pdf_subject(document),
+        "/Keywords": _build_pdf_keywords(document),
+        "/Creator": "Document Processor",
+        "/Producer": "Document Processor",
+        "/CreationDate": _pdf_date(source_dt),
+        "/ModDate": _pdf_date(now),
+        "/DocumentID": str(getattr(document, "document_id", "") or ""),
+        "/VersionNumber": str(version_number),
+        "/VersionType": str(version_type),
+    }
+
+    writer.add_metadata({k: v for k, v in metadata.items() if v is not None})
+
+    tmp_path = path_obj.with_suffix(path_obj.suffix + ".meta.tmp")
+    try:
+        with tmp_path.open("wb") as f:
+            writer.write(f)
+        tmp_path.replace(path_obj)
+    except Exception:
+        # Do not leave a stray *.meta.tmp file next to the output on failure.
+        tmp_path.unlink(missing_ok=True)
+        raise
+
+
 def sha256_for_file(path: Path) -> str:
     hasher = hashlib.sha256()
     with path.open("rb") as f:
@@ -317,6 +458,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
     shutil.copy2(overlay_pdf_path, out_path)
 
     compress_pdf_with_ghostscript(out_path)
+    _write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
     file_hash = sha256_for_file(out_path)
 
     try:
@@ -386,6 +528,8 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
 
     if current_file.resolve() != out_path.resolve():
         shutil.copy2(current_file, out_path)
+
+    _write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
     file_hash = sha256_for_file(out_path)
 
     try:
diff --git a/app/main.py b/app/main.py
index b4364be..6a3f86a 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from decimal import Decimal
 
 from fastapi import FastAPI, Request
@@ -20,7 +21,12 @@ from app.routes.trash import router as trash_router
 
 app = FastAPI(title="document-processor")
 app.mount("/static", StaticFiles(directory="app/static"), name="static")
-app.mount("/files", StaticFiles(directory="/mnt/svr-01/storage"), name="files")
+storage_dir = Path("/mnt/svr-01/storage")
+
+if storage_dir.exists():
+    app.mount("/files", StaticFiles(directory=str(storage_dir)), name="files")
+else:
+    print("WARNING: storage mount not available, /files disabled")
 
 app.include_router(health_router)
 app.include_router(documents_router)
diff --git a/app/routes/documents.py b/app/routes/documents.py
index 8706951..353f29f 100644
--- a/app/routes/documents.py
+++ b/app/routes/documents.py
@@ -2,6 +2,8 @@ from copy import deepcopy
 from datetime import datetime
 from decimal import Decimal, InvalidOperation
 import re
+import os
+import hashlib
 from pathlib import Path
 
 from fastapi import APIRouter, Depends, Form, Query, Request
@@ -9,6 +11,7 @@ from fastapi.responses import HTMLResponse, RedirectResponse
 from fastapi.templating import Jinja2Templates
 from sqlalchemy import distinct
 from sqlalchemy.orm import Session, selectinload
+from pypdf import PdfReader
 
 from app.core.storage_settings import get_default_save_root
 from app.db.deps import get_db
@@ -30,6 +33,57 @@ from app.models.text_version import TextVersion
 
 router = APIRouter(prefix="/documents", tags=["documents"])
 
+
+def _storage_available() -> bool:
+    """Return True when the storage root exists, is a mount point and a readable directory."""
+    storage_root = Path("/mnt/svr-01/storage")
+    try:
+        return storage_root.exists() and storage_root.is_mount() and storage_root.is_dir() and os.access(storage_root, os.R_OK | os.X_OK)
+    except Exception:
+        return False
+
+
+
+def _sha256_for_file(path_obj: Path) -> str:
+    """Return the hex SHA-256 digest of the file, read in 1 MiB chunks."""
+    hasher = hashlib.sha256()
+    with path_obj.open("rb") as f:
+        for chunk in iter(lambda: f.read(1024 * 1024), b""):
+            hasher.update(chunk)
+    return hasher.hexdigest()
+
+
+def _version_file_available(version, expected_document_id: str) -> bool:
+    """Return True if the version's PDF exists and its metadata (and stored SHA-256, if any) match."""
+    file_path = getattr(version, "file_path", None)
+    if not file_path:
+        return False
+
+    try:
+        path_obj = Path(file_path)
+        if not path_obj.exists() or not path_obj.is_file():
+            return False
+
+        reader = PdfReader(str(path_obj))
+        meta = reader.metadata or {}
+
+        if str(meta.get("/DocumentID", "")).strip() != str(expected_document_id):
+            return False
+        if str(meta.get("/VersionNumber", "")).strip() != str(version.version_number):
+            return False
+        if str(meta.get("/VersionType", "")).strip() != str(version.version_type):
+            return False
+
+        expected_sha = getattr(version, "sha256", None)
+        if expected_sha:
+            actual_sha = _sha256_for_file(path_obj)
+            if actual_sha != expected_sha:
+                return False
+
+        return True
+    except Exception:
+        return False
+
 BASE_DIR = Path(__file__).resolve().parent.parent
 templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
 
@@ -560,6 +614,11 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)):
 
 @router.post("/{document_id}/save-pdf", response_class=RedirectResponse)
 def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
+    if not _storage_available():
+        return RedirectResponse(
+            url=f"/documents/{document_id}?error=storage_unavailable",
+            status_code=303,
+        )
     document = (
         db.query(Document)
         .options(
@@ -848,7 +907,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
     line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
 
     file_url = None
-    if document.current_path:
+    storage_available = _storage_available()
+    if storage_available and document.current_path:
         storage_root = Path("/mnt/svr-01/storage")
         current_path = Path(document.current_path)
         try:
@@ -891,6 +951,11 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
             )
         )
 
+    version_rows = []
+    for version in sorted(getattr(document, "versions", []), key=lambda v: v.version_number, reverse=True):
+        file_exists = _version_file_available(version, document.document_id)
+        version_rows.append((version, file_exists))
+
     active_tab = request.query_params.get("tab", "ocr-review")
     if active_tab not in {"ocr-review", "extracted-fields", "additional-fields", "versions", "raw-ocr"}:
         active_tab = "ocr-review"
@@ -911,6 +976,8 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
             "reviewed_ocr": reviewed_ocr,
             "review_text_value": review_text_value,
             "file_url": file_url,
+            "storage_available": storage_available,
+            "version_rows": version_rows,
             "app_url": app_url,
             "quality_flag_options": QUALITY_FLAG_OPTIONS,
             "current_quality_flags": raw_ocr.quality_flags if raw_ocr and raw_ocr.quality_flags else [],
diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html
index 939f5c1..8b6afb3 100644
--- a/app/templates/documents/detail.html
+++ b/app/templates/documents/detail.html
@@ -99,11 +99,20 @@
-
+ +{% if error == "storage_unavailable" %} +
+ Storage mount unavailable. Please retry in a moment. +
+{% endif %} + +

Document preview

- {% if file_url %} + {% if not storage_available %} +

Storage mount unavailable. Preview is temporarily unavailable.

+ {% elif file_url %} {% if document.mime_type == "application/pdf" %} {% elif document.mime_type in ["image/jpeg", "image/png"] %} @@ -308,7 +317,7 @@

Document versions

- {% if document.versions %} + {% if version_rows %}
@@ -321,11 +330,18 @@ - {% for version in document.versions %} + {% for version, file_exists in version_rows %} - +
v{{ version.version_number }} {{ version.version_type }}{{ version.file_path }} + {{ version.file_path }} +
+ {% if file_exists %} + Available + {% endif %} +
+
{{ version.created_at }} {{ version.notes or "" }}