refactor: use time-based hex document ids for new ingests
This commit is contained in:
parent
13590aeeca
commit
9ebaa6f99e
|
|
@ -0,0 +1,23 @@
|
||||||
|
import time
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from app.models.document import Document
|
||||||
|
|
||||||
|
|
||||||
|
def generate_document_id(db: Session) -> str:
    """Return a document id that is not yet present in the ``Document`` table.

    The id has the form ``doc_<ts>-<n>``, where ``<ts>`` is the current Unix
    time in whole seconds and ``<n>`` is a counter starting at 0, both
    rendered as lowercase hexadecimal without padding. The counter is
    incremented until no ``Document`` row with that ``document_id`` is found.

    Args:
        db: Session used to probe for existing document ids.

    Returns:
        The first candidate id for which the lookup found no row.
    """
    # The timestamp is sampled once, so every candidate produced by a single
    # call shares the same prefix and differs only in the counter suffix.
    prefix = f"doc_{int(time.time()):x}"

    counter = 0
    while True:
        candidate = f"{prefix}-{counter:x}"

        # Select only the id column: the probe needs to know whether a row
        # exists, not to load a full Document entity.
        match = (
            db.query(Document.document_id)
            .filter(Document.document_id == candidate)
            .first()
        )
        if match is None:
            # NOTE(review): the id is only checked here, not reserved. Two
            # callers probing before either inserts could receive the same
            # value -- confirm a unique constraint on document_id covers this.
            return candidate

        counter += 1
|
||||||
|
|
@ -9,7 +9,6 @@ import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from difflib import SequenceMatcher
|
from difflib import SequenceMatcher
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from uuid import uuid4
|
|
||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from sqlalchemy import func
|
from sqlalchemy import func
|
||||||
|
|
@ -19,6 +18,7 @@ from app.core.config import DOCUMENT_ARCHIVE_ROOT, INBOX_ROOT, UPLOAD_ROOT
|
||||||
from app.models.document import Document
|
from app.models.document import Document
|
||||||
from app.models.document_version import DocumentVersion
|
from app.models.document_version import DocumentVersion
|
||||||
from app.models.text_version import TextVersion
|
from app.models.text_version import TextVersion
|
||||||
|
from app.logic.id_gen import generate_document_id
|
||||||
|
|
||||||
|
|
||||||
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"}
|
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png"}
|
||||||
|
|
@ -304,7 +304,7 @@ def archive_document(
|
||||||
if not is_supported_file(source):
|
if not is_supported_file(source):
|
||||||
raise ValueError(f"Unsupported file type: {source.suffix}")
|
raise ValueError(f"Unsupported file type: {source.suffix}")
|
||||||
|
|
||||||
document_id = f"doc_{uuid4().hex[:12]}"
|
document_id = generate_document_id(db)
|
||||||
current_path = build_storage_path(document_id, source)
|
current_path = build_storage_path(document_id, source)
|
||||||
|
|
||||||
current_path.parent.mkdir(parents=True, exist_ok=True)
|
current_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
@ -436,7 +436,8 @@ def ingest_uploaded_file(
|
||||||
upload_root = Path(UPLOAD_ROOT)
|
upload_root = Path(UPLOAD_ROOT)
|
||||||
upload_root.mkdir(parents=True, exist_ok=True)
|
upload_root.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
staged_name = f"{uuid4().hex[:12]}_{Path(filename).name}"
|
document_id = generate_document_id(db)
|
||||||
|
staged_name = f"{document_id}_{Path(filename).name}"
|
||||||
staged_path = upload_root / staged_name
|
staged_path = upload_root / staged_name
|
||||||
staged_path.write_bytes(file_bytes)
|
staged_path.write_bytes(file_bytes)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue