update app
This commit is contained in:
parent
6ae16c1808
commit
610f25c2b8
|
|
@ -72,6 +72,10 @@ from sqlalchemy.orm import Session
|
|||
from app.core.config import FIELD_ENRICHED_ROOT, OCR_CORRECTED_ROOT
|
||||
from app.models.document import Document
|
||||
from app.models.document_version import DocumentVersion
|
||||
|
||||
from app.models.document_replica_layout_version import DocumentReplicaLayoutVersion
|
||||
from app.models.document_replica_output import DocumentReplicaOutput
|
||||
from app.models.document_replica_review_state import DocumentReplicaReviewState
|
||||
from app.models.text_version import TextVersion
|
||||
|
||||
|
||||
|
|
@ -685,19 +689,10 @@ def save_ocr_corrected_pdf_current(db: Session, document: Document, output_path:
|
|||
except Exception:
|
||||
share_path_value = None
|
||||
|
||||
document.share_path = share_path_value
|
||||
document.current_path = str(out_path)
|
||||
document.canonical_filename = out_path.name
|
||||
document.sha256_current = file_hash
|
||||
db.add(document)
|
||||
|
||||
# Replica outputs are non-destructive exports for now.
|
||||
# Do not replace the primary/current document path.
|
||||
db.commit()
|
||||
|
||||
keep_paths = {str(out_path)}
|
||||
if document.share_path:
|
||||
keep_paths.add(str(document.share_path))
|
||||
_prune_old_saved_files(db, document, keep_paths)
|
||||
|
||||
|
||||
def save_field_enriched_pdf_current(db: Session, document: Document, output_path: Path) -> None:
|
||||
if not document.current_path:
|
||||
|
|
@ -736,3 +731,263 @@ def save_field_enriched_pdf_current(db: Session, document: Document, output_path
|
|||
if document.share_path:
|
||||
keep_paths.add(str(document.share_path))
|
||||
_prune_old_saved_files(db, document, keep_paths)
|
||||
|
||||
|
||||
def _next_replica_layout_version_number(db: Session, document_id: int) -> int:
    """Return the next replica layout version number for a document.

    Reads the highest ``version_number`` stored for ``document_id`` and adds
    one. When the aggregate yields nothing (or 0), numbering starts at 1.
    """
    highest_query = db.query(
        func.max(DocumentReplicaLayoutVersion.version_number)
    ).filter(DocumentReplicaLayoutVersion.document_id == document_id)
    highest = highest_query.scalar()
    # ``or 0`` covers the empty-aggregate case where scalar() is None.
    return (highest or 0) + 1
|
||||
|
||||
|
||||
def _get_current_replica_review_state(document: Document) -> DocumentReplicaReviewState | None:
    """Return the first replica review state attached to ``document``.

    Returns ``None`` when the attribute is missing, ``None`` or empty.
    """
    states = getattr(document, "replica_review_states", None)
    if not states:
        return None
    return states[0]
|
||||
|
||||
|
||||
def _get_replica_source_context(document: Document):
    """Validate a document for replica generation and gather its inputs.

    Returns:
        Tuple ``(current_file, raw_ocr, reviewed, source_layout)``, where
        ``source_layout`` is the reviewed layout when it is non-empty and
        the raw OCR layout otherwise.

    Raises:
        ValueError: If the document has no current path, lacks a current raw
            OCR or reviewed text version, is not a PDF, has no OCR line
            boxes, has a reviewed line count that differs from the raw OCR
            line count, or has no usable source layout.
        FileNotFoundError: If the current file does not exist on disk.
    """
    stored_path = document.current_path
    if not stored_path:
        raise ValueError("Document has no current_path")

    current_file = Path(stored_path)
    if not current_file.exists():
        raise FileNotFoundError(f"Current file not found: {current_file}")

    raw_ocr = _latest_current_text_version(document, "raw_ocr")
    reviewed = _latest_current_text_version(document, "reviewed")

    if raw_ocr is None:
        raise ValueError("No current raw OCR version found")
    if reviewed is None:
        raise ValueError("No current reviewed text found")
    if current_file.suffix.lower() != ".pdf":
        raise ValueError("Replica PDF generation currently supports PDFs only")

    raw_lines = _flatten_layout_lines(raw_ocr.layout_json)
    reviewed_lines = _flatten_layout_lines(reviewed.layout_json)

    if not raw_lines:
        raise ValueError("No OCR line boxes found in raw OCR layout data")

    # An empty reviewed layout is tolerated; a non-empty one must line up
    # one-to-one with the raw OCR lines.
    line_counts_differ = len(reviewed_lines) != len(raw_lines) if reviewed_lines else False
    if line_counts_differ:
        raise ValueError("Reviewed line layout does not match raw OCR line layout")

    source_layout = reviewed.layout_json or raw_ocr.layout_json
    if not source_layout:
        raise ValueError("No source layout found")

    return current_file, raw_ocr, reviewed, source_layout
|
||||
|
||||
|
||||
def build_replica_layout(document: Document, mode: str = "shared") -> dict:
    """Build the replica layout description for a document's current PDF.

    For every page of the PDF, each non-blank text line of the source layout
    is converted from source-image coordinates to PDF page coordinates
    (scaled by page size over image size, with y measured from the page's
    bottom edge) and given a fitted font size.

    Args:
        document: Document whose current file and text versions are used.
        mode: Stored verbatim under ``"mode_source"`` in the result.

    Returns:
        A dict with ``schema_version``, ``mode_source``, ``current_path``,
        ``text_version_source`` and the per-page ``pages`` list.

    Raises:
        Whatever ``_get_replica_source_context`` raises for an unusable
        document.
    """
    current_file, raw_ocr, reviewed, source_layout = _get_replica_source_context(document)
    reader = PdfReader(str(current_file))

    layouts_by_page = {}
    for layout_entry in source_layout.get("pages", []):
        layouts_by_page[layout_entry["page"]] = layout_entry

    pages = []
    for page_num, pdf_page in enumerate(reader.pages, start=1):
        page_w = float(pdf_page.mediabox.width)
        page_h = float(pdf_page.mediabox.height)

        page_layout = layouts_by_page.get(page_num, {"lines": []})
        # Missing/zero image dimensions fall back to 1.0 to avoid dividing by zero.
        src_w = float(page_layout.get("image_width") or 1.0)
        src_h = float(page_layout.get("image_height") or 1.0)
        scale_x = page_w / src_w
        scale_y = page_h / src_h

        line_entries = []
        for line in page_layout.get("lines", []):
            text_line = (line.get("text") or "").strip()
            if text_line:
                left, top, right, bottom = line["bbox"]
                # Boxes are clamped to a minimum size so tiny OCR boxes
                # still get a usable font size.
                box_width = max(10.0, (right - left) * scale_x)
                box_height = max(6.0, (bottom - top) * scale_y)
                entry = {
                    "text": text_line,
                    "bbox_source": [left, top, right, bottom],
                    "pdf_x": left * scale_x,
                    # Flip the vertical axis: source y grows downward,
                    # the PDF y used here is measured from the bottom.
                    "pdf_y": page_h - (bottom * scale_y),
                    "box_width": box_width,
                    "box_height": box_height,
                    "font_family_guess": "Helvetica",
                    "font_size_guess": _fit_font_size(text_line, box_width, box_height),
                    "text_color_guess": "#000000",
                    "text_render_mode_clean": 0,
                    "text_render_mode_scan_backed": 3,
                }
                line_entries.append(entry)

        page_entry = {
            "page": page_num,
            "page_width": page_w,
            "page_height": page_h,
            "image_width": src_w,
            "image_height": src_h,
            "lines": line_entries,
        }
        pages.append(page_entry)

    version_source = {
        "raw_ocr_version_id": raw_ocr.id if raw_ocr else None,
        "reviewed_version_id": reviewed.id if reviewed else None,
    }
    return {
        "schema_version": 1,
        "mode_source": mode,
        "current_path": str(current_file),
        "text_version_source": version_source,
        "pages": pages,
    }
|
||||
|
||||
|
||||
def _save_replica_layout_version(
    db: Session,
    document: Document,
    layout_json: dict,
    mode: str,
    created_by: str = "save_replica_pdf",
) -> DocumentReplicaLayoutVersion:
    """Store ``layout_json`` as the document's new current layout version.

    Existing current versions of the document are un-flagged, a new
    ``heuristic`` version is added and flushed, and the document's replica
    review state (created if absent) is pointed at it with all review flags
    reset. Nothing is committed here; the caller owns the transaction.

    Returns:
        The newly added, flushed layout version.
    """
    currently_flagged = db.query(DocumentReplicaLayoutVersion).filter(
        DocumentReplicaLayoutVersion.document_id == document.id,
        DocumentReplicaLayoutVersion.is_current == True,  # noqa: E712
    )
    currently_flagged.update({"is_current": False}, synchronize_session=False)

    next_number = _next_replica_layout_version_number(db, document.id)
    version = DocumentReplicaLayoutVersion(
        document_id=document.id,
        version_number=next_number,
        version_type="heuristic",
        render_mode_source=mode,
        is_current=True,
        created_by=created_by,
        quality_flags=[],
        inference_metadata_json={"pipeline": "heuristic_replica_v1", "mode": mode},
        layout_json=layout_json,
    )
    db.add(version)
    # Flush so version.id is available for the review state below.
    db.flush()

    state = _get_current_replica_review_state(document)
    if state is None:
        state = DocumentReplicaReviewState(document_id=document.id)
        db.add(state)

    state.current_replica_layout_version_id = version.id
    # A fresh layout invalidates any earlier review outcome.
    for flag_name in (
        "is_reviewed",
        "is_approved",
        "needs_manual_adjustment",
        "needs_model_retry",
    ):
        setattr(state, flag_name, False)
    db.flush()

    return version
|
||||
|
||||
|
||||
def _render_replica_pdf_from_layout(
    current_file: Path,
    layout_json: dict,
    out_path: Path,
    mode: str,
) -> None:
    """Render a replica PDF from a layout description.

    One output page is drawn per rendered page image of ``current_file``,
    sized to the matching source PDF page. In ``"scan_backed"`` mode the page
    image is drawn first and text uses render mode 3; otherwise only text is
    drawn, with render mode 0. The result is copied to ``out_path`` and then
    passed to ``compress_pdf_with_ghostscript``.

    Raises:
        ValueError: If no page images were produced, so no canvas exists.
    """
    reader = PdfReader(str(current_file))
    out_path.parent.mkdir(parents=True, exist_ok=True)

    scan_backed = mode == "scan_backed"
    text_render_mode = 3 if scan_backed else 0

    with tempfile.TemporaryDirectory() as tmpdirname:
        work_dir = Path(tmpdirname)
        page_images = _render_pdf_page_images(current_file, work_dir)
        overlay_pdf_path = work_dir / "replica.pdf"
        pdf_canvas = None

        layout_by_page = {entry["page"]: entry for entry in layout_json.get("pages", [])}

        for page_num, img_path in enumerate(page_images, start=1):
            source_page = reader.pages[page_num - 1]
            page_size = (
                float(source_page.mediabox.width),
                float(source_page.mediabox.height),
            )
            page_w, page_h = page_size

            # The canvas is created lazily so its initial size matches page 1.
            if pdf_canvas is None:
                pdf_canvas = canvas.Canvas(str(overlay_pdf_path), pagesize=(page_w, page_h))
            else:
                pdf_canvas.setPageSize((page_w, page_h))

            if scan_backed:
                pdf_canvas.drawImage(ImageReader(str(img_path)), 0, 0, width=page_w, height=page_h)

            page_layout = layout_by_page.get(page_num, {"lines": []})
            for line in page_layout.get("lines", []):
                text_line = (line.get("text") or "").strip()
                if not text_line:
                    continue

                font_name = line.get("font_family_guess") or "Helvetica"
                font_size = float(line.get("font_size_guess") or 10)

                text_obj = pdf_canvas.beginText()
                text_obj.setTextRenderMode(text_render_mode)
                text_obj.setFont(font_name, font_size)
                # Origin is placed one unit above the stored pdf_y.
                text_obj.setTextOrigin(float(line["pdf_x"]), float(line["pdf_y"]) + 1)
                text_obj.textLine(text_line)
                pdf_canvas.drawText(text_obj)

            pdf_canvas.showPage()

        if pdf_canvas is None:
            raise ValueError("Failed to build replica PDF")

        pdf_canvas.save()
        # Copy out before the temporary directory is removed.
        shutil.copy2(overlay_pdf_path, out_path)

    compress_pdf_with_ghostscript(out_path)
|
||||
|
||||
|
||||
def save_replica_pdf(db: Session, document: Document, output_path: Path, mode: str) -> None:
    """Generate a replica PDF for ``document`` and record it in the database.

    The output file name is derived from ``output_path``: a trailing
    ``_v<digits>`` and any existing replica suffix are stripped, then
    ``_replica_clean`` or ``_replica_scan_backed`` is appended according to
    ``mode``. A new layout version and a ``DocumentReplicaOutput`` row are
    stored and committed. The document's current path is not changed and no
    sibling files are pruned.

    Args:
        db: Session used for the new rows; committed on success.
        document: Source document.
        output_path: Requested output location (its name is normalised).
        mode: ``"clean"`` or ``"scan_backed"``.

    Raises:
        ValueError: For an unsupported ``mode`` or a document that
            ``_get_replica_source_context`` rejects.
        FileNotFoundError: If the document's current file is missing.
    """
    import logging

    if mode not in {"clean", "scan_backed"}:
        raise ValueError(f"Unsupported replica mode: {mode}")

    current_file, _, _, _ = _get_replica_source_context(document)

    out_path = Path(output_path)
    out_path = out_path.with_name(re.sub(r"_v\d+(?=\.[^.]+$)", "", out_path.name))

    # Strip a previous replica suffix so repeated saves do not stack suffixes.
    stem = re.sub(r"(_replica_clean|_replica_scan_backed)$", "", out_path.stem)
    suffix = out_path.suffix or ".pdf"
    # ``mode`` was validated above, so this yields exactly one of the two
    # supported file-name suffixes.
    out_path = out_path.with_name(f"{stem}_replica_{mode}{suffix}")

    out_path.parent.mkdir(parents=True, exist_ok=True)

    layout_json = build_replica_layout(document, mode=mode)
    layout_version = _save_replica_layout_version(db, document, layout_json, mode=mode)

    _render_replica_pdf_from_layout(current_file, layout_json, out_path, mode=mode)

    file_hash = sha256_for_file(out_path)
    file_size = out_path.stat().st_size

    # Mirroring is best-effort: a failure must not prevent the replica from
    # being recorded, but it is logged instead of being silently discarded.
    # The mirror path is not stored anywhere for replica outputs.
    try:
        _mirror_to_secondary_owner(document, out_path)
    except Exception:
        logging.getLogger(__name__).warning(
            "Could not mirror replica PDF %s", out_path, exc_info=True
        )

    output = DocumentReplicaOutput(
        document_id=document.id,
        replica_layout_version_id=layout_version.id,
        output_type=mode,
        file_path=str(out_path),
        sha256=file_hash,
        file_size_bytes=file_size,
        created_by="save_replica_pdf",
        render_settings_json={"mode": mode},
    )
    db.add(output)

    # Replica outputs are non-destructive exports.
    # Do not replace the primary/current document path or prune sibling files.
    db.commit()
|
||||
|
|
|
|||
|
|
@ -18,3 +18,6 @@ __all__ = [
|
|||
"DocumentPreset",
|
||||
]
|
||||
from app.models.document_naming_field import DocumentNamingField
|
||||
from app.models.document_replica_layout_version import DocumentReplicaLayoutVersion
|
||||
from app.models.document_replica_output import DocumentReplicaOutput
|
||||
from app.models.document_replica_review_state import DocumentReplicaReviewState
|
||||
|
|
|
|||
|
|
@ -105,3 +105,17 @@ class Document(Base):
|
|||
cascade="all, delete-orphan",
|
||||
uselist=False,
|
||||
)
|
||||
|
||||
replica_layout_versions: Mapped[list["DocumentReplicaLayoutVersion"]] = relationship(
|
||||
back_populates="document",
|
||||
cascade="all, delete-orphan",
|
||||
order_by="DocumentReplicaLayoutVersion.version_number",
|
||||
)
|
||||
replica_outputs: Mapped[list["DocumentReplicaOutput"]] = relationship(
|
||||
back_populates="document",
|
||||
cascade="all, delete-orphan",
|
||||
)
|
||||
replica_review_states: Mapped[list["DocumentReplicaReviewState"]] = relationship(
|
||||
back_populates="document",
|
||||
cascade="all, delete-orphan",
|
||||
)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,37 @@
|
|||
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, JSON, String, Text
|
||||
from sqlalchemy.orm import relationship
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
class DocumentReplicaLayoutVersion(Base):
    """A stored version of a document's replica layout.

    Rows live in ``document_replica_layout_versions`` and belong to one
    document; rendered outputs reference the version they were built from.
    """

    __tablename__ = "document_replica_layout_versions"

    # Identity and owning document.
    id = Column(Integer, primary_key=True, index=True)
    document_id = Column(
        Integer,
        ForeignKey("documents.id"),
        nullable=False,
        index=True,
    )

    # Versioning.
    version_number = Column(Integer, nullable=False)
    version_type = Column(String, nullable=False, default="heuristic")
    render_mode_source = Column(String, nullable=False, default="shared")
    is_current = Column(Boolean, nullable=False, default=True)

    # Provenance: who created it and what it was derived from.
    created_by = Column(String, nullable=True)
    derived_from_text_version_id = Column(
        Integer,
        ForeignKey("text_versions.id"),
        nullable=True,
    )
    derived_from_replica_layout_version_id = Column(
        Integer,
        ForeignKey("document_replica_layout_versions.id"),
        nullable=True,
    )

    # Optional model details.
    model_name = Column(String, nullable=True)
    model_version = Column(String, nullable=True)
    prompt_version = Column(String, nullable=True)

    # Quality annotations and the layout payload itself.
    quality_score = Column(String, nullable=True)
    quality_note = Column(Text, nullable=True)
    quality_flags = Column(JSON, nullable=True)
    inference_metadata_json = Column(JSON, nullable=True)
    layout_json = Column(JSON, nullable=False)

    created_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document = relationship("Document", back_populates="replica_layout_versions")
    # Outputs are deleted together with the layout version they belong to.
    outputs = relationship(
        "DocumentReplicaOutput",
        back_populates="replica_layout_version",
        cascade="all, delete-orphan",
    )
    # Self-referential link to the layout version this one points back to.
    parent_layout_version = relationship(
        "DocumentReplicaLayoutVersion",
        remote_side=[id],
    )
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
from sqlalchemy import Column, DateTime, ForeignKey, Integer, JSON, String
|
||||
from sqlalchemy.orm import relationship
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
class DocumentReplicaOutput(Base):
    """A rendered replica file produced from a replica layout version.

    Rows live in ``document_replica_outputs`` and reference both the owning
    document and the layout version the file was rendered from.
    """

    __tablename__ = "document_replica_outputs"

    # Identity and parents.
    id = Column(Integer, primary_key=True, index=True)
    document_id = Column(
        Integer,
        ForeignKey("documents.id"),
        nullable=False,
        index=True,
    )
    replica_layout_version_id = Column(
        Integer,
        ForeignKey("document_replica_layout_versions.id"),
        nullable=False,
        index=True,
    )

    # What was written and where.
    output_type = Column(String, nullable=False)
    file_path = Column(String, nullable=False)
    sha256 = Column(String, nullable=True)
    file_size_bytes = Column(Integer, nullable=True)
    created_by = Column(String, nullable=True)
    render_settings_json = Column(JSON, nullable=True)

    created_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    document = relationship("Document", back_populates="replica_outputs")
    replica_layout_version = relationship(
        "DocumentReplicaLayoutVersion",
        back_populates="outputs",
    )
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, Text
|
||||
from sqlalchemy.orm import relationship
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
from app.db.base import Base
|
||||
|
||||
|
||||
class DocumentReplicaReviewState(Base):
    """Review status of a document's replica layout.

    Rows live in ``document_replica_review_states`` and may point at the
    layout version currently under review.
    """

    __tablename__ = "document_replica_review_states"

    # Identity and references.
    id = Column(Integer, primary_key=True, index=True)
    document_id = Column(
        Integer,
        ForeignKey("documents.id"),
        nullable=False,
        index=True,
    )
    current_replica_layout_version_id = Column(
        Integer,
        ForeignKey("document_replica_layout_versions.id"),
        nullable=True,
    )

    # Review flags; all default to False.
    is_reviewed = Column(Boolean, nullable=False, default=False)
    is_approved = Column(Boolean, nullable=False, default=False)
    needs_model_retry = Column(Boolean, nullable=False, default=False)
    needs_manual_adjustment = Column(Boolean, nullable=False, default=False)

    # Reviewer details.
    reviewed_by = Column(Text, nullable=True)
    review_note = Column(Text, nullable=True)

    # Timestamps; updated_at is refreshed on every update.
    reviewed_at = Column(DateTime(timezone=True), nullable=True)
    created_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    updated_at = Column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    document = relationship("Document", back_populates="replica_review_states")
|
||||
|
|
@ -22,6 +22,7 @@ from app.db.deps import get_db
|
|||
from app.logic.document_outputs import (
|
||||
save_field_enriched_pdf_current,
|
||||
save_ocr_corrected_pdf_current,
|
||||
save_replica_pdf,
|
||||
)
|
||||
from app.logic.storage_paths import build_proposed_storage_path
|
||||
from app.logic.extraction import (
|
||||
|
|
@ -1003,6 +1004,38 @@ def move_to_trash(document_id: str, db: Session = Depends(get_db)):
|
|||
|
||||
|
||||
|
||||
|
||||
def _resolve_document_output_path(document, output_path: str = "") -> Path:
    """Resolve and validate the PDF output path for a document.

    A blank ``output_path`` falls back to the proposed storage path with any
    trailing ``_v<digits>`` / ``_<digits>`` removed from the file name. The
    chosen path is forced to a ``.pdf`` suffix, must have its parent
    directory at or below the default save root, and that parent directory
    is created if missing.

    Raises:
        ValueError: With message ``"invalid_output_path"`` when the parent
            directory lies outside the default save root.
    """
    save_root = get_default_save_root()
    naming_fields = getattr(document, "naming_fields", None)
    naming_row = document.naming_fields[0] if naming_fields else None

    proposed = build_proposed_storage_path(
        document=document,
        save_root=save_root,
        naming_row=naming_row,
    )
    default_output_path = Path(proposed)
    cleaned_name = re.sub(r"(?:_v\d+|_\d+)(?=\.[^.]+$)", "", default_output_path.name)
    default_output_path = default_output_path.with_name(cleaned_name)
    if default_output_path.suffix.lower() != ".pdf":
        default_output_path = default_output_path.with_suffix(".pdf")

    requested = (output_path or "").strip()
    if requested:
        output_path_obj = Path(requested)
    else:
        output_path_obj = default_output_path

    if output_path_obj.suffix.lower() != ".pdf":
        output_path_obj = output_path_obj.with_suffix(".pdf")

    # Compare resolved paths so ``..`` segments and symlinked directories
    # cannot escape the save root.
    allowed_root = Path(save_root).resolve()
    resolved_parent = output_path_obj.parent.resolve()
    inside_root = allowed_root == resolved_parent or allowed_root in resolved_parent.parents
    if not inside_root:
        raise ValueError("invalid_output_path")

    output_path_obj.parent.mkdir(parents=True, exist_ok=True)
    return output_path_obj
|
||||
|
||||
@router.post("/{document_id}/save-pdf", response_class=RedirectResponse)
|
||||
def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
|
||||
if not _storage_available():
|
||||
|
|
@ -1024,41 +1057,14 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
|
|||
if document is None:
|
||||
return RedirectResponse(url="/documents/", status_code=303)
|
||||
|
||||
save_root = get_default_save_root()
|
||||
naming_row = document.naming_fields[0] if getattr(document, "naming_fields", None) else None
|
||||
|
||||
default_output_path = Path(
|
||||
build_proposed_storage_path(
|
||||
document=document,
|
||||
save_root=save_root,
|
||||
naming_row=naming_row,
|
||||
)
|
||||
)
|
||||
default_output_path = default_output_path.with_name(
|
||||
re.sub(r"(?:_v\d+|_\d+)(?=\.[^.]+$)", "", default_output_path.name)
|
||||
)
|
||||
if default_output_path.suffix.lower() != ".pdf":
|
||||
default_output_path = default_output_path.with_suffix(".pdf")
|
||||
|
||||
output_path_raw = (output_path or "").strip()
|
||||
if output_path_raw:
|
||||
output_path_obj = Path(output_path_raw)
|
||||
else:
|
||||
output_path_obj = default_output_path
|
||||
|
||||
if output_path_obj.suffix.lower() != ".pdf":
|
||||
output_path_obj = output_path_obj.with_suffix(".pdf")
|
||||
|
||||
allowed_root = Path(save_root).resolve()
|
||||
resolved_parent = output_path_obj.parent.resolve()
|
||||
if allowed_root != resolved_parent and allowed_root not in resolved_parent.parents:
|
||||
try:
|
||||
output_path_obj = _resolve_document_output_path(document, output_path)
|
||||
except ValueError:
|
||||
return RedirectResponse(
|
||||
url=f"/documents/{document.document_id}?error=invalid_output_path",
|
||||
status_code=303,
|
||||
)
|
||||
|
||||
output_path_obj.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
has_extracted = bool(getattr(document, "extracted_fields", None))
|
||||
has_additional = bool(getattr(document, "additional_fields", None))
|
||||
|
||||
|
|
@ -1079,6 +1085,70 @@ def save_pdf(document_id: str, output_path: str = Form(""), db: Session = Depend
|
|||
|
||||
|
||||
|
||||
|
||||
@router.post("/{document_id}/save-replica-pdf", response_class=RedirectResponse)
def save_replica_pdf_clean(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
    """Save a ``clean`` replica PDF for a document and redirect to its page.

    Redirects with ``error=storage_unavailable`` when storage is down, to the
    document list when the document does not exist, with
    ``error=invalid_output_path`` for a rejected output path, with
    ``error=save_replica_pdf_failed`` for any other failure, and with
    ``success=saved_replica_pdf`` otherwise.
    """
    if not _storage_available():
        return RedirectResponse(url=f"/documents/{document_id}?error=storage_unavailable", status_code=303)

    eager_loads = (
        selectinload(Document.text_versions),
        selectinload(Document.naming_fields),
        selectinload(Document.replica_review_states),
    )
    lookup = db.query(Document).options(*eager_loads).filter(Document.document_id == document_id)
    document = lookup.first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        output_path_obj = _resolve_document_output_path(document, output_path)
        save_replica_pdf(db, document, output_path_obj, mode="clean")
    except Exception as exc:
        if isinstance(exc, ValueError):
            # Validation problems are expected; only the path error gets its
            # own error code.
            if "invalid_output_path" in str(exc):
                return RedirectResponse(url=f"/documents/{document.document_id}?error=invalid_output_path", status_code=303)
        else:
            traceback.print_exc()
        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_replica_pdf_failed&tab=ocr-review", status_code=303)
    else:
        return RedirectResponse(url=f"/documents/{document.document_id}?success=saved_replica_pdf&tab=ocr-review", status_code=303)
|
||||
|
||||
|
||||
@router.post("/{document_id}/save-replica-pdf-scan-backed", response_class=RedirectResponse)
def save_replica_pdf_scan_backed(document_id: str, output_path: str = Form(""), db: Session = Depends(get_db)):
    """Save a ``scan_backed`` replica PDF and redirect to the document page.

    Redirects with ``error=storage_unavailable`` when storage is down, to the
    document list when the document does not exist, with
    ``error=invalid_output_path`` for a rejected output path, with
    ``error=save_replica_pdf_scan_backed_failed`` for any other failure, and
    with ``success=saved_replica_pdf_scan_backed`` otherwise.
    """
    if not _storage_available():
        return RedirectResponse(url=f"/documents/{document_id}?error=storage_unavailable", status_code=303)

    eager_loads = (
        selectinload(Document.text_versions),
        selectinload(Document.naming_fields),
        selectinload(Document.replica_review_states),
    )
    lookup = db.query(Document).options(*eager_loads).filter(Document.document_id == document_id)
    document = lookup.first()
    if document is None:
        return RedirectResponse(url="/documents/", status_code=303)

    try:
        output_path_obj = _resolve_document_output_path(document, output_path)
        save_replica_pdf(db, document, output_path_obj, mode="scan_backed")
    except Exception as exc:
        if isinstance(exc, ValueError):
            # Validation problems are expected; only the path error gets its
            # own error code.
            if "invalid_output_path" in str(exc):
                return RedirectResponse(url=f"/documents/{document.document_id}?error=invalid_output_path", status_code=303)
        else:
            traceback.print_exc()
        return RedirectResponse(url=f"/documents/{document.document_id}?error=save_replica_pdf_scan_backed_failed&tab=ocr-review", status_code=303)
    else:
        return RedirectResponse(url=f"/documents/{document.document_id}?success=saved_replica_pdf_scan_backed&tab=ocr-review", status_code=303)
|
||||
|
||||
@router.post("/{document_id}/save-field-enriched-pdf", response_class=RedirectResponse)
|
||||
def save_field_enriched_pdf(document_id: str, db: Session = Depends(get_db)):
|
||||
return RedirectResponse(
|
||||
|
|
@ -1459,12 +1529,13 @@ async def save_line_items(
|
|||
)
|
||||
|
||||
@router.get("/{document_id}/preview-file")
|
||||
def document_preview_file(document_id: str, db: Session = Depends(get_db)):
|
||||
def document_preview_file(document_id: str, path: str | None = None, db: Session = Depends(get_db)):
|
||||
document = db.query(Document).filter(Document.document_id == document_id).first()
|
||||
if document is None or not document.current_path:
|
||||
resolved_path = path or (document.current_path if document else None)
|
||||
if document is None or not resolved_path:
|
||||
return HTMLResponse(content="Preview file not found", status_code=404)
|
||||
|
||||
path_obj = Path(document.current_path)
|
||||
path_obj = Path(resolved_path)
|
||||
if not path_obj.exists() or not path_obj.is_file():
|
||||
return HTMLResponse(content="Preview file not found", status_code=404)
|
||||
|
||||
|
|
@ -1472,8 +1543,26 @@ def document_preview_file(document_id: str, db: Session = Depends(get_db)):
|
|||
return FileResponse(path=str(path_obj), media_type=media_type, filename=path_obj.name, headers={"Content-Disposition": "inline; filename=\"" + path_obj.name + "\""})
|
||||
|
||||
|
||||
|
||||
def _get_latest_replica_output(document, output_type: str):
    """Return the newest replica output of ``output_type`` for ``document``.

    Args:
        document: Object exposing an optional ``replica_outputs`` iterable.
        output_type: Value matched against each row's ``output_type``.

    Returns:
        The matching row with the greatest ``created_at``; on ties the
        earliest such row in ``replica_outputs`` wins. If no matching row has
        a timestamp, the first matching row is returned. ``None`` when
        nothing matches.
    """
    outputs = getattr(document, "replica_outputs", None) or []
    matches = [row for row in outputs if getattr(row, "output_type", None) == output_type]
    if not matches:
        return None

    # Rows without a timestamp are kept out of the comparison: ordering a
    # datetime against a numeric placeholder raises TypeError.
    dated = [row for row in matches if getattr(row, "created_at", None) is not None]
    if dated:
        return max(dated, key=lambda row: row.created_at)
    return matches[0]
|
||||
|
||||
|
||||
def _build_preview_url_for_path(request: Request, document_id: str, path_value: str | None):
    """Build a preview URL for ``path_value``, or ``None`` if it is unusable.

    Returns ``None`` for an empty path or one that is not an existing regular
    file. Otherwise returns the ``document_preview_file`` URL for
    ``document_id`` with the URL-quoted path in ``path`` and the file's
    modification time (whole seconds) in ``v``.
    """
    from urllib.parse import quote

    if not path_value:
        return None

    path_obj = Path(path_value)
    if not (path_obj.exists() and path_obj.is_file()):
        return None

    base = str(request.url_for("document_preview_file", document_id=document_id))
    # ``v`` changes whenever the file is rewritten.
    return f"{base}?path={quote(str(path_obj))}&v={int(path_obj.stat().st_mtime)}"
|
||||
|
||||
@router.get("/{document_id}", response_class=HTMLResponse)
|
||||
def document_detail(document_id: str, request: Request, queue: str | None = None, db: Session = Depends(get_db)):
|
||||
def document_detail(document_id: str, request: Request, queue: str | None = None, viewer_source: str = "scan", db: Session = Depends(get_db)):
|
||||
current_user = getattr(request.state, "current_user", None)
|
||||
document = (
|
||||
db.query(Document)
|
||||
|
|
@ -1511,12 +1600,26 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
actual_line_count = len(review_text_value.splitlines()) if review_text_value else 0
|
||||
line_numbers = list(range(1, max(actual_line_count, expected_line_count) + 1))
|
||||
|
||||
file_url = None
|
||||
replica_clean_output = _get_latest_replica_output(document, "clean")
|
||||
replica_scan_backed_output = _get_latest_replica_output(document, "scan_backed")
|
||||
|
||||
scan_path = document.current_path
|
||||
replica_path = replica_clean_output.file_path if replica_clean_output and replica_clean_output.file_path else None
|
||||
replica_scan_backed_path = replica_scan_backed_output.file_path if replica_scan_backed_output and replica_scan_backed_output.file_path else None
|
||||
|
||||
effective_viewer_source = viewer_source or "scan"
|
||||
preview_path = scan_path
|
||||
|
||||
if effective_viewer_source == "replica" and replica_path:
|
||||
preview_path = replica_path
|
||||
elif effective_viewer_source == "replica_scan_backed" and replica_scan_backed_path:
|
||||
preview_path = replica_scan_backed_path
|
||||
else:
|
||||
effective_viewer_source = "scan"
|
||||
preview_path = scan_path
|
||||
|
||||
storage_available = _storage_available()
|
||||
if document.current_path:
|
||||
current_path = Path(document.current_path)
|
||||
if current_path.exists() and current_path.is_file():
|
||||
file_url = str(request.url_for("document_preview_file", document_id=document.document_id))
|
||||
file_url = _build_preview_url_for_path(request, document.document_id, preview_path)
|
||||
|
||||
app_url = str(request.url_for("document_detail", document_id=document.document_id))
|
||||
error = request.query_params.get("error")
|
||||
|
|
@ -1615,6 +1718,9 @@ def document_detail(document_id: str, request: Request, queue: str | None = None
|
|||
"review_text_value": review_text_value,
|
||||
"file_url": file_url,
|
||||
"storage_available": storage_available,
|
||||
"viewer_source": effective_viewer_source,
|
||||
"replica_clean_output": replica_clean_output,
|
||||
"replica_scan_backed_output": replica_scan_backed_output,
|
||||
"version_rows": version_rows,
|
||||
"current_line_item_version": current_line_item_version,
|
||||
"ocr_version_options": ocr_version_options,
|
||||
|
|
|
|||
|
|
@ -6231,3 +6231,40 @@ table {
|
|||
}
|
||||
}
|
||||
/* ===== end line item queue card polish ===== */
|
||||
|
||||
|
||||
|
||||
.preview-card-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 0.75rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.preview-source-toggle {
|
||||
display: flex;
|
||||
gap: 0.45rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.preview-source-link {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
min-height: 2rem;
|
||||
padding: 0.35rem 0.7rem;
|
||||
border: 1px solid #d7dce5;
|
||||
border-radius: 999px;
|
||||
background: #fff;
|
||||
color: #334155;
|
||||
text-decoration: none;
|
||||
font-size: 0.82rem;
|
||||
line-height: 1;
|
||||
}
|
||||
|
||||
.preview-source-link.active {
|
||||
background: #0f172a;
|
||||
border-color: #0f172a;
|
||||
color: #fff;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>{% block title %}Document Processor{% endblock %}</title>
|
||||
<link rel="stylesheet" href="/static/app.css?v=171">
|
||||
<link rel="stylesheet" href="/static/app.css?v=174">
|
||||
<link rel="stylesheet" href="/static/app-shell.css?v=158">
|
||||
</head>
|
||||
<body>
|
||||
|
|
|
|||
|
|
@ -73,14 +73,26 @@ document.addEventListener("DOMContentLoaded", () => {
|
|||
<div class="success-message">OCR rerun successfully.</div>
|
||||
{% elif success == "regenerated_line_items" %}
|
||||
<div class="success-message">Line items regenerated successfully.</div>
|
||||
{% elif success == "saved_replica_pdf" %}
|
||||
<div class="success-message">Replica PDF saved.</div>
|
||||
{% elif success == "saved_replica_pdf_scan_backed" %}
|
||||
<div class="success-message">Scan-backed replica PDF saved.</div>
|
||||
{% elif success == "saved_reviewed_ocr" %}
|
||||
<div class="success-message">Reviewed OCR saved.</div>
|
||||
{% elif success == "saved_replica_pdf" %}
|
||||
<div class="success-message">Replica PDF saved.</div>
|
||||
{% elif success == "saved_replica_pdf_scan_backed" %}
|
||||
<div class="success-message">Scan-backed replica PDF saved.</div>
|
||||
{% elif success == "saved_reviewed_ocr" %}
|
||||
<div class="success-message">Reviewed OCR saved.</div>
|
||||
{% elif error == "rerun_ocr_failed" %}
|
||||
<div class="error-box">OCR rerun failed.</div>
|
||||
{% elif error == "deprecated_pdf_route_disabled" %}
|
||||
<div class="error-box">This deprecated PDF save route has been disabled. Use Save Document instead.</div>
|
||||
{% elif error == "save_replica_pdf_failed" %}
|
||||
<div class="error-box">Could not save replica PDF.</div>
|
||||
{% elif error == "save_replica_pdf_scan_backed_failed" %}
|
||||
<div class="error-box">Could not save scan-backed replica PDF.</div>
|
||||
{% elif error == "save_field_enriched_failed" %}
|
||||
<div class="error-box">Could not save field-enriched PDF.</div>
|
||||
{% endif %}
|
||||
|
|
@ -163,6 +175,14 @@ document.addEventListener("DOMContentLoaded", () => {
|
|||
<button type="button" id="toggle-path-edit" class="top-pill-button">Edit path</button>
|
||||
</div>
|
||||
</form>
|
||||
<div class="button-row" style="margin-top:0.6rem;">
|
||||
<form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf" style="display:inline;">
|
||||
<button type="submit">Save Replica PDF</button>
|
||||
</form>
|
||||
<form method="post" action="/documents/{{ document.document_id }}/save-replica-pdf-scan-backed" style="display:inline;">
|
||||
<button type="submit">Save Replica PDF (Scan-backed)</button>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
|
@ -200,7 +220,20 @@ document.addEventListener("DOMContentLoaded", () => {
|
|||
<div class="workspace-grid">
|
||||
<section>
|
||||
<div class="card preview-card">
|
||||
<div class="preview-card-header">
|
||||
<h2 class="card-title">Document preview</h2>
|
||||
|
||||
<div class="preview-source-toggle">
|
||||
<a class="preview-source-link{% if viewer_source == 'scan' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=scan">Scan</a>
|
||||
{% if replica_clean_output %}
|
||||
<a class="preview-source-link{% if viewer_source == 'replica' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica">Replica</a>
|
||||
{% endif %}
|
||||
{% if replica_scan_backed_output %}
|
||||
<a class="preview-source-link{% if viewer_source == 'replica_scan_backed' %} active{% endif %}" href="/documents/{{ document.document_id }}?tab={{ active_tab }}&viewer_source=replica_scan_backed">Replica (Scan-backed)</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
</div>
|
||||
{% if not storage_available %}
|
||||
<p class="empty-state">Storage mount unavailable. Preview is temporarily unavailable.</p>
|
||||
{% elif file_url %}
|
||||
|
|
|
|||
Loading…
Reference in New Issue