feat: add file size tracking and formatting
- store file_size_bytes on versions - add human_size() utility - display normalized sizes in UI. Enables size vs. fidelity analysis.
This commit is contained in:
parent
f1d896a9ed
commit
1e37a80894
|
|
@ -469,6 +469,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
|
||||||
compress_pdf_with_ghostscript(out_path)
|
compress_pdf_with_ghostscript(out_path)
|
||||||
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
|
_write_pdf_metadata(out_path, document, next_version_number, "ocr_corrected")
|
||||||
|
|
||||||
|
file_size = out_path.stat().st_size
|
||||||
file_hash = sha256_for_file(out_path)
|
file_hash = sha256_for_file(out_path)
|
||||||
try:
|
try:
|
||||||
mirror_path = _mirror_to_secondary_owner(document, out_path)
|
mirror_path = _mirror_to_secondary_owner(document, out_path)
|
||||||
|
|
@ -496,6 +497,7 @@ def create_ocr_corrected_pdf_version(db: Session, document: Document, output_pat
|
||||||
version_type="ocr_corrected",
|
version_type="ocr_corrected",
|
||||||
file_path=str(out_path),
|
file_path=str(out_path),
|
||||||
sha256=file_hash,
|
sha256=file_hash,
|
||||||
|
file_size_bytes=file_size,
|
||||||
created_by="save_ocr_corrected_pdf",
|
created_by="save_ocr_corrected_pdf",
|
||||||
notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
|
notes="C1 output: rebuilt searchable PDF using reviewed text aligned to OCR line boxes.",
|
||||||
)
|
)
|
||||||
|
|
@ -539,6 +541,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
||||||
shutil.copy2(current_file, out_path)
|
shutil.copy2(current_file, out_path)
|
||||||
|
|
||||||
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
|
_write_pdf_metadata(out_path, document, next_version_number, "field_enriched")
|
||||||
|
file_size = out_path.stat().st_size
|
||||||
file_hash = sha256_for_file(out_path)
|
file_hash = sha256_for_file(out_path)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -556,6 +559,7 @@ def create_field_enriched_pdf_version(db: Session, document: Document, output_pa
|
||||||
version_type="field_enriched",
|
version_type="field_enriched",
|
||||||
file_path=str(out_path),
|
file_path=str(out_path),
|
||||||
sha256=file_hash,
|
sha256=file_hash,
|
||||||
|
file_size_bytes=file_size,
|
||||||
created_by="save_field_enriched_pdf",
|
created_by="save_field_enriched_pdf",
|
||||||
notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
|
notes="Scaffold output: copied current file; extracted fields not yet embedded into PDF.",
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer
|
from sqlalchemy import String, DateTime, ForeignKey, Text, Integer, BigInteger
|
||||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
|
|
||||||
from app.db.base import Base
|
from app.db.base import Base
|
||||||
|
|
@ -20,6 +20,7 @@ class DocumentVersion(Base):
|
||||||
|
|
||||||
file_path: Mapped[str] = mapped_column(Text, nullable=False)
|
file_path: Mapped[str] = mapped_column(Text, nullable=False)
|
||||||
sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
|
sha256: Mapped[str | None] = mapped_column(String(64), nullable=True)
|
||||||
|
file_size_bytes: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
|
||||||
|
|
||||||
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
created_by: Mapped[str | None] = mapped_column(String(100), nullable=True)
|
||||||
notes: Mapped[str | None] = mapped_column(Text, nullable=True)
|
notes: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ from copy import deepcopy
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from decimal import Decimal, InvalidOperation
|
from decimal import Decimal, InvalidOperation
|
||||||
import re
|
import re
|
||||||
|
import traceback
|
||||||
import os
|
import os
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
|
@ -32,6 +33,7 @@ from app.models.document import Document
|
||||||
from app.models.document_additional_field import DocumentAdditionalField
|
from app.models.document_additional_field import DocumentAdditionalField
|
||||||
from app.models.document_preset import DocumentPreset
|
from app.models.document_preset import DocumentPreset
|
||||||
from app.models.text_version import TextVersion
|
from app.models.text_version import TextVersion
|
||||||
|
from app.utils.filesize import human_size
|
||||||
|
|
||||||
router = APIRouter(prefix="/documents", tags=["documents"])
|
router = APIRouter(prefix="/documents", tags=["documents"])
|
||||||
|
|
||||||
|
|
@ -174,6 +176,7 @@ def _document_export_payload(document) -> dict:
|
||||||
|
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
templates = Jinja2Templates(directory=str(BASE_DIR / "templates"))
|
||||||
|
templates.env.globals["human_size"] = human_size
|
||||||
|
|
||||||
QUALITY_FLAG_OPTIONS = [
|
QUALITY_FLAG_OPTIONS = [
|
||||||
"bad_embedded_text",
|
"bad_embedded_text",
|
||||||
|
|
@ -764,7 +767,9 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
|
||||||
create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
|
create_field_enriched_pdf_version(db, document, output_path=output_path_obj)
|
||||||
else:
|
else:
|
||||||
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
|
create_ocr_corrected_pdf_version(db, document, output_path=output_path_obj)
|
||||||
except Exception:
|
except Exception as e:
|
||||||
|
print("save_pdf failed:", repr(e), flush=True)
|
||||||
|
traceback.print_exc()
|
||||||
return RedirectResponse(
|
return RedirectResponse(
|
||||||
url=f"/documents/{document.document_id}?error=save_pdf_failed",
|
url=f"/documents/{document.document_id}?error=save_pdf_failed",
|
||||||
status_code=303,
|
status_code=303,
|
||||||
|
|
|
||||||
|
|
@ -325,6 +325,7 @@
|
||||||
<th>Version</th>
|
<th>Version</th>
|
||||||
<th>Type</th>
|
<th>Type</th>
|
||||||
<th>Path</th>
|
<th>Path</th>
|
||||||
|
<th>Size</th>
|
||||||
<th>Created</th>
|
<th>Created</th>
|
||||||
<th>Notes</th>
|
<th>Notes</th>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
@ -342,7 +343,8 @@
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
</td>
|
</td>
|
||||||
<td>{{ version.created_at }}</td>
|
<td>{{ human_size(version.file_size_bytes) }}</td>
|
||||||
|
<td>{{ version.created_at }}</td>
|
||||||
<td>{{ version.notes or "" }}</td>
|
<td>{{ version.notes or "" }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
def human_size(num_bytes: int | None) -> str:
|
||||||
|
if not num_bytes:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
units = ["B", "KB", "MB", "GB", "TB", "PB"]
|
||||||
|
size = float(num_bytes)
|
||||||
|
|
||||||
|
for unit in units:
|
||||||
|
if size < 1024 or unit == units[-1]:
|
||||||
|
s = f"{size:.3f}".rstrip("0").rstrip(".")
|
||||||
|
if "." not in s:
|
||||||
|
s += ".0"
|
||||||
|
return f"{s} {unit}"
|
||||||
|
size /= 1024
|
||||||
Loading…
Reference in New Issue