feat: add file size tracking and formatting

- store file_size_bytes on versions
- add human_size() utility
- display normalized sizes in UI

enables size vs fidelity analysis
This commit is contained in:
Sean McElwain 2026-04-11 19:28:48 -05:00
parent f1d896a9ed
commit 1e37a80894
6 changed files with 29 additions and 3 deletions

View File

@ -469,6 +469,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
compress_pdf_with_ghostscript(out_path)
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
file_size = out_path.stat().st_size
file_hash = sha256_for_file(out_path)
try:
mirror_path = _mirror_to_secondary_owner(document, out_path)
@ -496,6 +497,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
version_type="ocr_corrected",
file_path=str(out_path),
sha256=file_hash,
file_size_bytes=file_size,
created_by="save_ocr_corrected_pdf",
notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
)
@ -539,6 +541,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
shutil.copy2(current_file, out_path)
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
file_size = out_path.stat().st_size
file_hash = sha256_for_file(out_path)
try:
@ -556,6 +559,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
version_type="field_enriched",
file_path=str(out_path),
sha256=file_hash,
file_size_bytes=file_size,
created_by="save_field_enriched_pdf",
notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
)

View File

@ -1,5 +1,5 @@
from datetime import datetime
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer, BigInteger
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.db.base import Base
@ -20,6 +20,7 @@ class DocumentVersion(Base):
file_path: Mapped[str] = mapped_column(Text, nullable=False)
sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
file_size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
notes: Mapped[str | None] = mapped_column(Text, nullable=True)

View File

@ -2,6 +2,7 @@ from copy import deepcopy
from datetime import datetime
from decimal import Decimal, InvalidOperation
import re
import traceback
import os
import hashlib
import json
@ -32,6 +33,7 @@ from app.models.document import Document
from app.models.document_additional_field import DocumentAdditionalField
from app.models.document_preset import DocumentPreset
from app.models.text_version import TextVersion
from app.utils.filesize import human_size
router = APIRouter(prefix="/documents", tags=["documents"])
@ -174,6 +176,7 @@ def _document_export_payload(document) -> dict:
BASE_DIR = Path(__file__).resolve().parent.parent
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
templates.env.globals["human_size"] = human_size
QUALITY_FLAG_OPTIONS = [
"bad_embedded_text",
@ -764,7 +767,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
else:
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
except Exception:
except Exception as e:
print("save_pdf failed:", repr(e), flush=True)
traceback.print_exc()
return RedirectResponse(
url=f"/documents/{document.document_id}?error=save_pdf_failed",
status_code=303,

View File

@ -325,6 +325,7 @@
<th>Version</th>
<th>Type</th>
<th>Path</th>
<th>Size</th>
<th>Created</th>
<th>Notes</th>
</tr>
@ -342,7 +343,8 @@
{% endif %}
</div>
</td>
<td>{{ version.created_at }}</td>
<td>{{ human_size(version.file_size_bytes) }}</td>
<td>{{ version.created_at }}</td>
<td>{{ version.notes or "" }}</td>
</tr>
{% endfor %}

0
app/utils/__init__.py Normal file
View File

14
app/utils/filesize.py Normal file
View File

@ -0,0 +1,14 @@
def human_size(num_bytes: int | None) -> str:
if not num_bytes:
return ""
units = ["B", "KB", "MB", "GB", "TB", "PB"]
size = float(num_bytes)
for unit in units:
if size < 1024 or unit == units[-1]:
s = f"{size:.3f}".rstrip("0").rstrip(".")
if "." not in s:
s += ".0"
return f"{s} {unit}"
size /= 1024