feat: add file size tracking and formatting

- store file_size_bytes on versions
- add human_size() utility
- display normalized sizes in UI

enables size vs fidelity analysis
This commit is contained in:
Sean McElwain 2026-04-11 19:28:48 -05:00
parent f1d896a9ed
commit 1e37a80894
6 changed files with 29 additions and 3 deletions

View File

@ -469,6 +469,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
compress_pdf_with_ghostscript(out_path) compress_pdf_with_ghostscript(out_path)
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected") _write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
file_size = out_path.stat().st_size
file_hash = sha256_for_file(out_path) file_hash = sha256_for_file(out_path)
try: try:
mirror_path = _mirror_to_secondary_owner(document, out_path) mirror_path = _mirror_to_secondary_owner(document, out_path)
@ -496,6 +497,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
version_type="ocr_corrected", version_type="ocr_corrected",
file_path=str(out_path), file_path=str(out_path),
sha256=file_hash, sha256=file_hash,
file_size_bytes=file_size,
created_by="save_ocr_corrected_pdf", created_by="save_ocr_corrected_pdf",
notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.", notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
) )
@ -539,6 +541,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
shutil.copy2(current_file, out_path) shutil.copy2(current_file, out_path)
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched") _write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
file_size = out_path.stat().st_size
file_hash = sha256_for_file(out_path) file_hash = sha256_for_file(out_path)
try: try:
@ -556,6 +559,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
version_type="field_enriched", version_type="field_enriched",
file_path=str(out_path), file_path=str(out_path),
sha256=file_hash, sha256=file_hash,
file_size_bytes=file_size,
created_by="save_field_enriched_pdf", created_by="save_field_enriched_pdf",
notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.", notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
) )

View File

@ -1,5 +1,5 @@
from datetime import datetime from datetime import datetime
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer from sqlalchemy import String, DateTime, ForeignKey, Text, Integer, BigInteger
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base from app.db.base import Base
@ -20,6 +20,7 @@ class DocumentVersion(Base):
file_path: Mapped[str] = mapped_column(Text, nullable=False) file_path: Mapped[str] = mapped_column(Text, nullable=False)
sha256: Mapped[str | None] = mapped_column(String(64), nullable=True) sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
file_size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True) created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
notes: Mapped[str | None] = mapped_column(Text, nullable=True) notes: Mapped[str | None] = mapped_column(Text, nullable=True)

View File

@ -2,6 +2,7 @@ from copy import deepcopy
from datetime import datetime from datetime import datetime
from decimal import Decimal, InvalidOperation from decimal import Decimal, InvalidOperation
import re import re
import traceback
import os import os
import hashlib import hashlib
import json import json
@ -32,6 +33,7 @@ from app.models.document import Document
from app.models.document_additional_field import DocumentAdditionalField from app.models.document_additional_field import DocumentAdditionalField
from app.models.document_preset import DocumentPreset from app.models.document_preset import DocumentPreset
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
from app.utils.filesize import human_size
router = APIRouter(prefix="/documents", tags=["documents"]) router = APIRouter(prefix="/documents", tags=["documents"])
@ -174,6 +176,7 @@ def _document_export_payload(document) -> dict:
BASE_DIR = Path(__file__).resolve().parent.parent BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates")) templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
templates.env.globals["human_size"] = human_size
QUALITY_FLAG_OPTIONS = [ QUALITY_FLAG_OPTIONS = [
"bad_embedded_text", "bad_embedded_text",
@ -764,7 +767,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
create_field_enriched_pdf_version(db, document, output_path=output_path_obj) create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
else: else:
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj) create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
except Exception: except Exception as e:
print("save_pdf failed:", repr(e), flush=True)
traceback.print_exc()
return RedirectResponse( return RedirectResponse(
url=f"/documents/{document.document_id}?error=save_pdf_failed", url=f"/documents/{document.document_id}?error=save_pdf_failed",
status_code=303, status_code=303,

View File

@ -325,6 +325,7 @@
<th>Version</th> <th>Version</th>
<th>Type</th> <th>Type</th>
<th>Path</th> <th>Path</th>
<th>Size</th>
<th>Created</th> <th>Created</th>
<th>Notes</th> <th>Notes</th>
</tr> </tr>
@ -342,7 +343,8 @@
{% endif %} {% endif %}
</div> </div>
</td> </td>
<td>{{ version.created_at }}</td> <td>{{ human_size(version.file_size_bytes) }}</td>
<td>{{ version.created_at }}</td>
<td>{{ version.notes or "" }}</td> <td>{{ version.notes or "" }}</td>
</tr> </tr>
{% endfor %} {% endfor %}

0
app/utils/__init__.py Normal file
View File

14
app/utils/filesize.py Normal file
View File

@ -0,0 +1,14 @@
def human_size(num_bytes: int | None) -> str:
if not num_bytes:
return ""
units = ["B", "KB", "MB", "GB", "TB", "PB"]
size = float(num_bytes)
for unit in units:
if size < 1024 or unit == units[-1]:
s = f"{size:.3f}".rstrip("0").rstrip(".")
if "." not in s:
s += ".0"
return f"{s} {unit}"
size /= 1024