diff --git a/app/logic/document_outputs.py b/app/logic/document_outputs.py index bdf2505..fdfbdba 100644 --- a/app/logic/document_outputs.py +++ b/app/logic/document_outputs.py @@ -469,6 +469,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat compress_pdf_with_ghostscript(out_path) _write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected") + file_size = out_path.stat().st_size file_hash = sha256_for_file(out_path) try: mirror_path = _mirror_to_secondary_owner(document, out_path) @@ -496,6 +497,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat version_type="ocr_corrected", file_path=str(out_path), sha256=file_hash, + file_size_bytes=file_size, created_by="save_ocr_corrected_pdf", notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.", ) @@ -539,6 +541,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa shutil.copy2(current_file, out_path) _write_pdf_metadata(out_path, document, next_version_number, "field_enriched") + file_size = out_path.stat().st_size file_hash = sha256_for_file(out_path) try: @@ -556,6 +559,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa version_type="field_enriched", file_path=str(out_path), sha256=file_hash, + file_size_bytes=file_size, created_by="save_field_enriched_pdf", notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.", ) diff --git a/app/models/document_version.py b/app/models/document_version.py index 163b4db..53fc785 100644 --- a/app/models/document_version.py +++ b/app/models/document_version.py @@ -1,5 +1,5 @@ from datetime import datetime -from sqlalchemy import String, DateTime, ForeignKey, Text, Integer +from sqlalchemy import String, DateTime, ForeignKey, Text, Integer, BigInteger from sqlalchemy.orm import Mapped, mapped_column, relationship from app.db.base import Base @@ -20,6 +20,7 @@ class DocumentVersion(Base): file_path: Mapped[str] = mapped_column(Text, nullable=False) sha256: Mapped[str | None] = mapped_column(String(64), nullable=True) + file_size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True) created_by: Mapped[str | None] = mapped_column(String(100), nullable=True) notes: Mapped[str | None] = mapped_column(Text, nullable=True) diff --git a/app/routes/documents.py b/app/routes/documents.py index 164bf9b..22fb901 100644 --- a/app/routes/documents.py +++ b/app/routes/documents.py @@ -2,6 +2,7 @@ from copy import deepcopy from datetime import datetime from decimal import Decimal, InvalidOperation import re +import traceback import os import hashlib import json @@ -32,6 +33,7 @@ from app.models.document import Document from app.models.document_additional_field import DocumentAdditionalField from app.models.document_preset import DocumentPreset from app.models.text_version import TextVersion +from app.utils.filesize import human_size router = APIRouter(prefix="/documents", tags=["documents"]) @@ -174,6 +176,7 @@ def _document_export_payload(document) -> dict: BASE_DIR = Path(__file__).resolve().parent.parent templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) +templates.env.globals["human_size"] = human_size QUALITY_FLAG_OPTIONS = [ "bad_embedded_text", @@ -764,7 +767,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend create_field_enriched_pdf_version(db, document, output_path=output_path_obj) else: create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj) - except Exception: + except Exception as e: + print("save_pdf failed:", repr(e), flush=True) + traceback.print_exc() return RedirectResponse( url=f"/documents/{document.document_id}?error=save_pdf_failed", status_code=303, diff --git a/app/templates/documents/detail.html b/app/templates/documents/detail.html index 8b6afb3..6b3743f 100644 --- a/app/templates/documents/detail.html +++ b/app/templates/documents/detail.html @@ -325,6 +325,7 @@ Version Type Path + Size Created Notes @@ -342,7 +343,8 @@ {% endif %} - {{ version.created_at }} + {{ human_size(version.file_size_bytes) }} +{{ version.created_at }} {{ version.notes or "" }} {% endfor %} diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/utils/filesize.py b/app/utils/filesize.py new file mode 100644 index 0000000..4383a0f --- /dev/null +++ b/app/utils/filesize.py @@ -0,0 +1,14 @@ +def human_size(num_bytes: int | None) -> str: + if not num_bytes: + return "" + + units = ["B", "KB", "MB", "GB", "TB", "PB"] + size = float(num_bytes) + + for unit in units: + if size < 1024 or unit == units[-1]: + s = f"{size:.3f}".rstrip("0").rstrip(".") + if "." not in s: + s += ".0" + return f"{s} {unit}" + size /= 1024