# utility-app/scripts/review-old-word-doc-generat...

import json
import re
from pathlib import Path

try:
    from docx import Document
except Exception:
    Document = None

# Root directory of the legacy app; the recursive file scans later in this
# script all start from here.
OLD_APP = Path("/mnt/storage/sftp/mcelwain/repository/word-doc-generator")
# Directory that receives the generated report and profile draft.  The path is
# relative, so it resolves against the current working directory.  It is
# created (with parents) as soon as this module runs.
OUT_DIR = Path("diagnostics")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Matches a brace-delimited placeholder such as {ClientName}.  The single
# capture group is the name between the braces: one or more ASCII letters,
# digits, "_", ":" or "-".
PLACEHOLDER_RE = re.compile(r"\{([A-Za-z0-9_:\-]+)\}")


def read_text(path):
    """Return the contents of *path* decoded as UTF-8, or "" on any failure.

    Bytes that are not valid UTF-8 are dropped instead of raising.  Any
    exception raised while opening or reading the file is swallowed, so the
    caller always receives a string.

    Args:
        path: A ``pathlib.Path``-like object pointing at the file to read.

    Returns:
        The decoded text, or an empty string if the file could not be read.
    """
    try:
        with path.open("r", encoding="utf-8", errors="ignore") as handle:
            contents = handle.read()
    except Exception:
        return ""
    return contents


def find_placeholders_in_text(text):
    """Return the distinct placeholder names found in *text*, sorted.

    A placeholder is whatever ``PLACEHOLDER_RE`` matches; only the captured
    name (the part between the braces) is kept.

    Args:
        text: The string to search.

    Returns:
        A sorted list of unique placeholder names (possibly empty).
    """
    names = {match.group(1) for match in PLACEHOLDER_RE.finditer(text)}
    return sorted(names)


def find_placeholders_in_docx(path):
    """Return the sorted, de-duplicated placeholder names used in a .docx file.

    Scans the document body (paragraphs and tables, including tables nested
    inside table cells) and the headers and footers of every section, so
    placeholders that only appear in a header or footer are reported too.

    Args:
        path: Location of the .docx file; passed straight to ``Document``.

    Returns:
        A sorted list of placeholder names.  The list is empty when
        python-docx is unavailable (``Document is None``) or when the file
        cannot be opened as a document.
    """
    if Document is None:
        return []

    try:
        doc = Document(path)
    except Exception:
        # Best-effort scan: a file that cannot be opened contributes nothing.
        return []

    found = set()

    def scan_paragraphs(paragraphs):
        for paragraph in paragraphs:
            found.update(find_placeholders_in_text(paragraph.text))

    def scan_table(table):
        for row in table.rows:
            for cell in row.cells:
                scan_paragraphs(cell.paragraphs)
                # Cells can hold tables of their own; recurse into them.
                for nested in cell.tables:
                    scan_table(nested)

    def scan_container(container):
        # Works for anything exposing .paragraphs and .tables
        # (the document body as well as header/footer objects).
        scan_paragraphs(container.paragraphs)
        for table in container.tables:
            scan_table(table)

    scan_container(doc)

    # Header/footer variants exposed on python-docx Section objects.  Older
    # python-docx releases lack some (or all) of these attributes, hence the
    # getattr() default below.
    part_names = (
        "header",
        "footer",
        "first_page_header",
        "first_page_footer",
        "even_page_header",
        "even_page_footer",
    )
    for section in doc.sections:
        for part_name in part_names:
            part = getattr(section, part_name, None)
            if part is None:
                continue
            # A linked part has no definition of its own (it inherits from the
            # previous section, which this loop has already visited), so there
            # is nothing new to scan.
            if getattr(part, "is_linked_to_previous", False):
                continue
            scan_container(part)

    return sorted(found)


def categorize_field(name):
    """Map a placeholder name to the heading of the form section it belongs to.

    Matching is case-insensitive.  Rules are evaluated in order and the first
    match wins, which is why the "client2" prefix is listed ahead of the more
    general "client" prefix.

    Args:
        name: The placeholder/field name.

    Returns:
        The section heading, or "Other Fields" when no rule matches.
    """
    key = name.lower()

    # (prefixes, exact names, heading) -- order is significant.
    rules = (
        (("client2",), (), "Client 2 Information"),
        (
            ("client",),
            ("dob", "ssn", "ssnlastfour", "alias", "email"),
            "Client Information",
        ),
        (("case",), (), "Case Information"),
        (("settlement",), (), "Settlement Information"),
        (
            ("installment", "fee"),
            (
                "nameoncard",
                "cardnumber",
                "securitycode",
                "expiration",
                "billingaddress",
                "billingzip",
            ),
            "Fee / Payment Information",
        ),
        (("debtcollector",), (), "Debt Collector Information"),
        (("disco",), (), "Discovery Information"),
        ((), ("today", "currentdate", "currentdatemm-dd-yyyy"), "Date Fields"),
        ((), ("notes",), "Notes"),
    )

    for prefixes, exact_names, heading in rules:
        # str.startswith(()) is False, so rules without prefixes only match
        # on their exact names.
        if key.startswith(prefixes) or key in exact_names:
            return heading

    return "Other Fields"


def field_type(name):
    """Guess the form input type for a field from its name.

    Matching is case-insensitive and substring-based, checked in this order:
    textarea keywords, then date, then email, then telephone.  Because it is
    a substring test, any name that merely contains "date" (for example
    "lastUpdated") is classified as a date.

    Args:
        name: The placeholder/field name.

    Returns:
        One of "textarea", "date", "email", "tel" or "text".
    """
    key = name.lower()

    # "dob" contains none of the keywords below, so handling it first gives
    # the same result as checking it alongside the "date" keyword.
    if key == "dob":
        return "date"

    # (input type, keywords) -- order is significant, first match wins.
    keyword_types = (
        ("textarea", ("notes", "appearanceinfo", "paymentoptions")),
        ("date", ("date",)),
        ("email", ("email",)),
        ("tel", ("phone", "fax")),
    )
    for input_type, keywords in keyword_types:
        if any(keyword in key for keyword in keywords):
            return input_type

    return "text"


def make_sections(fields):
    """Group field names into ordered form-section dictionaries.

    Each field is assigned a heading by ``categorize_field``.  Sections are
    emitted in a fixed display order, and headings with no fields are left
    out.  "Client Information" and "Case Information" are emitted as
    non-collapsible and open by default; every other section is collapsible
    and closed by default.

    Args:
        fields: Iterable of placeholder/field names.

    Returns:
        A list of dicts with the keys "heading", "collapsible",
        "defaultOpen" and "fields".  Each entry of "fields" has "name",
        "label", "type" and "required" (always False), sorted by name.
    """
    section_order = (
        "Date Fields",
        "Client Information",
        "Client 2 Information",
        "Case Information",
        "Discovery Information",
        "Settlement Information",
        "Fee / Payment Information",
        "Debt Collector Information",
        "Notes",
        "Other Fields",
    )
    always_open = {"Client Information", "Case Information"}

    by_heading = {}
    for field_name in fields:
        heading = categorize_field(field_name)
        if heading not in by_heading:
            by_heading[heading] = []
        by_heading[heading].append(field_name)

    def describe(field_name):
        # Label: split lower->Upper camelCase boundaries, turn underscores
        # into spaces, then title-case the result.
        spaced = re.sub(r"([a-z])([A-Z])", r"\1 \2", field_name)
        label = spaced.replace("_", " ").strip().title()
        return {
            "name": field_name,
            "label": label,
            "type": field_type(field_name),
            "required": False,
        }

    sections = []
    for heading in section_order:
        members = by_heading.get(heading)
        if not members:
            continue

        pinned = heading in always_open
        sections.append({
            "heading": heading,
            "collapsible": not pinned,
            "defaultOpen": pinned,
            "fields": [describe(member) for member in sorted(members)],
        })

    return sections


# Inventory the legacy app by file type.  rglob() is recursive, so every
# matching file anywhere below OLD_APP is included.
js_files = sorted(OLD_APP.rglob("*.js"))
html_files = sorted(OLD_APP.rglob("*.html"))
css_files = sorted(OLD_APP.rglob("*.css"))
docx_files = sorted(OLD_APP.rglob("*.docx"))
xlsx_files = sorted(OLD_APP.rglob("*.xlsx"))

all_text_placeholders = set()
function_hits = []  # (feature label, file path relative to OLD_APP) pairs

# The bare substring "ics" also occurs inside ordinary words ("italics",
# "topics", "metrics"), which flagged calendar support in unrelated files.
# Accept it only as a standalone token: not preceded by a letter and not
# followed by a lowercase letter, so ".ics", "'ics'" and "icsContent" still
# match.
ICS_TOKEN_RE = re.compile(r"(?<![A-Za-z])ics(?![a-z])")

# Feature label -> signatures whose presence in a JS/HTML file suggests the
# feature.  A signature is either a case-sensitive substring or a compiled
# regular expression.
function_terms = {
    "DOCX generation": ["docx", "Docxtemplater", "generateDocument", "generateDoc"],
    "Excel generation": ["xlsx", "generateExcel", "template.xlsx"],
    "vCard generation": ["vcard", "vCard", "BEGIN:VCARD"],
    "Calendar / ICS generation": [ICS_TOKEN_RE, "BEGIN:VCALENDAR", "VEVENT"],
    "Client folder generation": ["generateClientFolder", "client folder"],
    "Settlement calculations": ["settlementPayment", "settlementInstallment", "remainingBalance"],
}


def _has_signature(text, signature):
    """Return True when *signature* (substring or compiled regex) occurs in *text*."""
    if isinstance(signature, str):
        return signature in text
    return signature.search(text) is not None


# Pass 1: source files -- collect placeholder-looking tokens and feature hits.
for path in js_files + html_files:
    text = read_text(path)
    all_text_placeholders.update(find_placeholders_in_text(text))

    for label, terms in function_terms.items():
        if any(_has_signature(text, term) for term in terms):
            function_hits.append((label, str(path.relative_to(OLD_APP))))

template_rows = []
all_template_placeholders = set()

# Pass 2: DOCX templates -- record the placeholders used by each template.
for path in docx_files:
    placeholders = find_placeholders_in_docx(path)
    all_template_placeholders.update(placeholders)
    template_rows.append({
        "template": str(path.relative_to(OLD_APP)),
        "placeholder_count": len(placeholders),
        "placeholders": placeholders,
    })

# Union of everything seen in source files and in templates.
all_fields = sorted(all_text_placeholders | all_template_placeholders)

# Draft profile written next to the report.  Keys are added in a fixed order
# because that order is what appears in the JSON file.
profile_path = OUT_DIR / "legacy_word_doc_generator_profile_draft.json"

# Static metadata first ...
profile = {
    "id": "legacy_word_doc_generator",
    "name": "Legacy Word Doc Generator Profile",
    "description": "Draft profile generated from the legacy word-doc-generator app.",
    "template": "REPLACE_WITH_SELECTED_TEMPLATE.docx",
    "outputFilename": "legacy_document_{timestamp_YYYY-MM-DD_HH-mm-ss}.docx",
    "sourceApp": str(OLD_APP),
}
# ... then the parts derived from the scan results.
profile["sections"] = make_sections(all_fields)
profile["legacyFeatures"] = sorted({label for label, _rel in function_hits})
profile["templatesFound"] = template_rows

profile_path.write_text(json.dumps(profile, indent=2), encoding="utf-8")

# Build the Markdown review report as a list of lines, joined with "\n" when
# it is written out.
report = [
    "# Legacy Word Doc Generator Review",
    "",
    f"Source app: `{OLD_APP}`",
    "",
    "## Files Found",
    "",
    f"- JS files: {len(js_files)}",
    f"- HTML files: {len(html_files)}",
    f"- CSS files: {len(css_files)}",
    f"- DOCX templates: {len(docx_files)}",
    f"- XLSX files: {len(xlsx_files)}",
    "",
    "## Legacy Features Detected",
    "",
]

if function_hits:
    # dict.fromkeys() drops repeated (label, path) pairs while keeping the
    # first-seen order.
    report.extend(
        f"- {label}: `{rel}`" for label, rel in dict.fromkeys(function_hits)
    )
else:
    report.append("- No major legacy feature signatures detected.")

report.extend(["", "## Templates Found", ""])

# Without python-docx every template is listed with zero placeholders; say so
# explicitly instead of letting the zero counts look like real results.
docx_scan_skipped = Document is None and bool(docx_files)
if docx_scan_skipped:
    report.append(
        "> **Warning:** python-docx is not installed, so DOCX templates were "
        "not scanned; the placeholder counts below are not meaningful."
    )
    report.append("")

if template_rows:
    for row in template_rows:
        report.append(f"### `{row['template']}`")
        report.append(f"- Placeholder count: {row['placeholder_count']}")
        if row["placeholders"]:
            report.append("- Placeholders:")
            report.extend(f"  - `{{{name}}}`" for name in row["placeholders"])
        report.append("")
else:
    report.append("- No DOCX templates found.")

report.extend(["", "## All Fields Detected", ""])
report.extend(f"- `{{{name}}}`" for name in all_fields)

report.extend(["", "## Draft Profile", "", f"Generated: `{profile_path}`", ""])

report_path = OUT_DIR / "legacy_word_doc_generator_review.md"
report_path.write_text("\n".join(report), encoding="utf-8")

print(f"Wrote {report_path}")
print(f"Wrote {profile_path}")
print(f"Detected {len(all_fields)} unique fields/placeholders")
if docx_scan_skipped:
    print("Warning: python-docx is not installed; DOCX templates were not scanned")