refactor: use time-based hex document ids for new ingests

This commit is contained in:
Sean McElwain 2026-04-07 13:51:47 -05:00
parent 13590aeeca
commit 9ebaa6f99e
2 changed files with 27 additions and 3 deletions

23
app/logic/id_gen.py Normal file
View File

@ -0,0 +1,23 @@
import time
from sqlalchemy.orm import Session
from app.models.document import Document
def generate_document_id(db: Session) -> str:
    """Return a time-based document id that no existing ``Document`` row uses.

    The id has the form ``doc_<ts>-<n>``, where ``<ts>`` is the current Unix
    time in whole seconds and ``<n>`` is a counter starting at 0, both written
    as lowercase hexadecimal without padding.  The counter is incremented until
    a lookup on ``Document.document_id`` finds no match.

    Args:
        db: Session used to query ``Document`` for existing ids.

    Returns:
        The first candidate id for which the lookup returned nothing.

    NOTE(review): this function only reads; it does not insert or reserve the
    id it returns.  Whether two callers can receive the same id before either
    one persists a row depends on how callers use it -- confirm at call sites.
    """
    # The clock is sampled once, so every candidate produced by this call
    # shares the same timestamp part and differs only in the counter.
    prefix = f"doc_{int(time.time()):x}"

    counter = 0
    while True:
        candidate = f"{prefix}-{counter:x}"
        lookup = db.query(Document).filter(Document.document_id == candidate)
        if not lookup.first():
            return candidate
        counter += 1

View File

@ -9,7 +9,6 @@ import subprocess
import tempfile import tempfile
from difflib import SequenceMatcher from difflib import SequenceMatcher
from pathlib import Path from pathlib import Path
from uuid import uuid4
from PIL import Image from PIL import Image
from sqlalchemy import func from sqlalchemy import func
@ -19,6 +18,7 @@ from app.core.config import DOCUMENT_ARCHIVE_ROOT, INBOX_ROOT, UPLOAD_ROOT
from app.models.document import Document from app.models.document import Document
from app.models.document_version import DocumentVersion from app.models.document_version import DocumentVersion
from app.models.text_version import TextVersion from app.models.text_version import TextVersion
from app.logic.id_gen import generate_document_id
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"} ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"}
@ -304,7 +304,7 @@ def archive_document(
if not is_supported_file(source): if not is_supported_file(source):
raise ValueError(f"Unsupported file type: {source.suffix}") raise ValueError(f"Unsupported file type: {source.suffix}")
document_id = f"doc_{uuid4().hex[:12]}" document_id = generate_document_id(db)
current_path = build_storage_path(document_id, source) current_path = build_storage_path(document_id, source)
current_path.parent.mkdir(parents=True, exist_ok=True) current_path.parent.mkdir(parents=True, exist_ok=True)
@ -436,7 +436,8 @@ def ingest_uploaded_file(
upload_root = Path(UPLOAD_ROOT) upload_root = Path(UPLOAD_ROOT)
upload_root.mkdir(parents=True, exist_ok=True) upload_root.mkdir(parents=True, exist_ok=True)
staged_name = f"{uuid4().hex[:12]}_{Path(filename).name}" document_id = generate_document_id(db)
staged_name = f"{document_id}_{Path(filename).name}"
staged_path = upload_root / staged_name staged_path = upload_root / staged_name
staged_path.write_bytes(file_bytes) staged_path.write_bytes(file_bytes)