feat: add file size tracking and formatting
- store file_size_bytes on versions - add human_size() utility - display normalized sizes in UI. Enables size vs. fidelity analysis.
This commit is contained in:
parent
f1d896a9ed
commit
1e37a80894
|
|
@ -469,6 +469,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
|
|||
compress_pdf_with_ghostscript(out_path)
|
||||
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
|
||||
|
||||
file_size = out_path.stat().st_size
|
||||
file_hash = sha256_for_file(out_path)
|
||||
try:
|
||||
mirror_path = _mirror_to_secondary_owner(document, out_path)
|
||||
|
|
@ -496,6 +497,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
|
|||
version_type="ocr_corrected",
|
||||
file_path=str(out_path),
|
||||
sha256=file_hash,
|
||||
file_size_bytes=file_size,
|
||||
created_by="save_ocr_corrected_pdf",
|
||||
notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
|
||||
)
|
||||
|
|
@ -539,6 +541,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
|||
shutil.copy2(current_file, out_path)
|
||||
|
||||
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
|
||||
file_size = out_path.stat().st_size
|
||||
file_hash = sha256_for_file(out_path)
|
||||
|
||||
try:
|
||||
|
|
@ -556,6 +559,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
|||
version_type="field_enriched",
|
||||
file_path=str(out_path),
|
||||
sha256=file_hash,
|
||||
file_size_bytes=file_size,
|
||||
created_by="save_field_enriched_pdf",
|
||||
notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from datetime import datetime
|
||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer
|
||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer, BigInteger
|
||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||
|
||||
from app.db.base import Base
|
||||
|
|
@ -20,6 +20,7 @@ class DocumentVersion(Base):
|
|||
|
||||
file_path: Mapped[str] = mapped_column(Text, nullable=False)
|
||||
sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
|
||||
file_size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||
|
||||
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||
notes: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ from copy import deepcopy
|
|||
from datetime import datetime
|
||||
from decimal import Decimal, InvalidOperation
|
||||
import re
|
||||
import traceback
|
||||
import os
|
||||
import hashlib
|
||||
import json
|
||||
|
|
@ -32,6 +33,7 @@ from app.models.document import Document
|
|||
from app.models.document_additional_field import DocumentAdditionalField
|
||||
from app.models.document_preset import DocumentPreset
|
||||
from app.models.text_version import TextVersion
|
||||
from app.utils.filesize import human_size
|
||||
|
||||
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||
|
||||
|
|
@ -174,6 +176,7 @@ def _document_export_payload(document) -> dict:
|
|||
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||
templates.env.globals["human_size"] = human_size
|
||||
|
||||
QUALITY_FLAG_OPTIONS = [
|
||||
"bad_embedded_text",
|
||||
|
|
@ -764,7 +767,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
|
|||
create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
|
||||
else:
|
||||
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
print("save_pdf failed:", repr(e), flush=True)
|
||||
traceback.print_exc()
|
||||
return RedirectResponse(
|
||||
url=f"/documents/{document.document_id}?error=save_pdf_failed",
|
||||
status_code=303,
|
||||
|
|
|
|||
|
|
@ -325,6 +325,7 @@
|
|||
<th>Version</th>
|
||||
<th>Type</th>
|
||||
<th>Path</th>
|
||||
<th>Size</th>
|
||||
<th>Created</th>
|
||||
<th>Notes</th>
|
||||
</tr>
|
||||
|
|
@ -342,6 +343,7 @@
|
|||
{% endif %}
|
||||
</div>
|
||||
</td>
|
||||
<td>{{ human_size(version.file_size_bytes) }}</td>
|
||||
<td>{{ version.created_at }}</td>
|
||||
<td>{{ version.notes or "" }}</td>
|
||||
</tr>
|
||||
|
|
|
|||
|
|
@ -0,0 +1,14 @@
|
|||
def human_size(num_bytes: int | None) -> str:
|
||||
if not num_bytes:
|
||||
return ""
|
||||
|
||||
units = ["B", "KB", "MB", "GB", "TB", "PB"]
|
||||
size = float(num_bytes)
|
||||
|
||||
for unit in units:
|
||||
if size < 1024 or unit == units[-1]:
|
||||
s = f"{size:.3f}".rstrip("0").rstrip(".")
|
||||
if "." not in s:
|
||||
s += ".0"
|
||||
return f"{s} {unit}"
|
||||
size /= 1024
|
||||
Loading…
Reference in New Issue