refactor: use time-based hex document ids for new ingests

This commit is contained in:
Sean McElwain 2026-04-07 13:51:47 -05:00
parent 13590aeeca
commit 9ebaa6f99e
2 changed files with 27 additions and 3 deletions

23
app/logic/id_gen.py Normal file
View File

@ -0,0 +1,23 @@
import time
from sqlalchemy.orm import Session
from app.models.document import Document
def generate_document_id(db: Session) -> str:
    """Return a ``doc_<timestamp>-<counter>`` id with no matching Document row.

    The timestamp is the current Unix time in whole seconds, read once and
    rendered as lowercase hex. The counter starts at 0, is also rendered as
    lowercase hex, and is incremented until a lookup on
    ``Document.document_id`` finds no row for the candidate.

    Args:
        db: Session used to look up existing ``Document`` rows.

    Returns:
        The first candidate id for which the lookup returned nothing.

    NOTE(review): the lookup only checks rows visible to ``db``; nothing here
    reserves the id, so two callers within the same second could presumably
    receive the same value before either row is written -- confirm whether a
    unique constraint on ``document_id`` guards against this.
    """
    # Timestamp is captured once so every candidate shares the same stem.
    stem = f"doc_{int(time.time()):x}"

    def is_taken(doc_id: str) -> bool:
        # One lookup per candidate; any returned row means the id is in use.
        row = db.query(Document).filter(Document.document_id == doc_id).first()
        return bool(row)

    seq = 0
    while is_taken(f"{stem}-{seq:x}"):
        seq += 1
    return f"{stem}-{seq:x}"

View File

@ -9,7 +9,6 @@ import subprocess
import tempfile
from difflib import SequenceMatcher
from pathlib import Path
from uuid import uuid4
from PIL import Image
from sqlalchemy import func
@ -19,6 +18,7 @@ from app.core.config import DOCUMENT_ARCHIVE_ROOT, INBOX_ROOT, UPLOAD_ROOT
from app.models.document import Document
from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion
from app.logic.id_gen import generate_document_id
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"}
@ -304,7 +304,7 @@ def archive_document(
if not is_supported_file(source):
raise ValueError(f"Unsupported file type: {source.suffix}")
document_id = f"doc_{uuid4().hex[:12]}"
document_id = generate_document_id(db)
current_path = build_storage_path(document_id, source)
current_path.parent.mkdir(parents=True, exist_ok=True)
@ -436,7 +436,8 @@ def ingest_uploaded_file(
upload_root = Path(UPLOAD_ROOT)
upload_root.mkdir(parents=True, exist_ok=True)
staged_name = f"{uuid4().hex[:12]}_{Path(filename).name}"
document_id = generate_document_id(db)
staged_name = f"{document_id}_{Path(filename).name}"
staged_path = upload_root / staged_name
staged_path.write_bytes(file_bytes)