# utility-app/scripts/build-legal-profile-from-ol...

import json
import re
import subprocess
from pathlib import Path

# Root of the legacy word-doc-generator app this script reads from.
# NOTE(review): absolute path — the script only works on a machine where this
# checkout exists at exactly this location.
OLD_APP = Path("/mnt/storage/sftp/mcelwain/repository/word-doc-generator")
# Directory holding index.html and the JS list modules imported via Node.
OLD_PUBLIC = OLD_APP / "public"
# Legacy form markup that is scanned for fields and labels.
OLD_HTML = OLD_PUBLIC / "index.html"

# Relative paths: both resolve against the current working directory.
# OUT_PROFILE is the JSON file this script writes.
OUT_PROFILE = Path("tools/doc_generator/content/document_types/legal_profile.json")
# Directory searched (recursively) for .docx templates.
TEMPLATES_OUT = Path("tools/doc_generator/content/templates/legacy")

# Matches the opening tag of an <input>, <select> or <textarea> element
# (case-insensitive); group 1 is the tag name.
TAG_RE = re.compile(r'<(input|select|textarea)\b[^>]*>', re.IGNORECASE | re.DOTALL)
# Captures quoted attributes as (name, value) pairs. Limitations: no whitespace
# is allowed around "=", unquoted values are not matched, and the value stops at
# the first quote of either kind (so title="it's" yields "it").
ATTR_RE = re.compile(r'([a-zA-Z_:][-a-zA-Z0-9_:.]*)=["\']([^"\']*)["\']')
# Captures (for-target, inner HTML) of every <label for="..."> element.
LABEL_RE = re.compile(
    r'<label[^>]*for=["\']([^"\']+)["\'][^>]*>(.*?)</label>',
    re.IGNORECASE | re.DOTALL
)

# Exact field names that are never emitted into the profile.
# NOTE(review): these look like file-upload inputs and values derived from
# other fields (last-four digits, formatted date strings) — confirm.
EXCLUDE_FIELD_NAMES = {
    "letterTemplateFile",
    "discoTemplateFile",
    "excelFile",
    "csvFile",
    "templateFile",
    "file",
    "SSNLastFour",
    "SSN2LastFour",
    "caseAccLastFour",
    "casePlaintiffFileName",
    "caseAnswerDateString",
    "caseAnswerDateYYYY-MM-DD",
    "caseAnswerDateYyyyMmDd",
    "caseAnswerFiledDateString",
    "caseFilingDateString",
    "caseDispositionDateString",
    "discoCosDateString",
    "discoResponseCosDateString",
}

# Regex patterns (applied with re.match) for numbered field families that are
# also skipped. Both "RemaingBalance" and "RemainingBalance" spellings are
# listed on purpose; the same pair appears in the profile's settlementSchedule.
EXCLUDE_PATTERNS = [
    r"^settlementPaymentDate\d{2}$",
    r"^settlementPaymentAmount\d{2}$",
    r"^settlementRemaingBalance\d{2}$",
    r"^settlementRemainingBalance\d{2}$",
    r"^debtCollector\d+AccLastFour$",
]


def attrs_from_tag(tag):
    """Return the quoted attributes of an HTML tag as a ``{name: value}`` dict.

    Attribute names are lower-cased because HTML attribute names are
    case-insensitive and callers look them up as ``"name"``, ``"id"`` and
    ``"type"``; tags themselves are already matched case-insensitively.
    Values are returned unchanged. When an attribute is repeated, the last
    occurrence wins.

    Args:
        tag: Source text of a single opening tag.

    Returns:
        dict mapping lower-cased attribute name to its raw value.
    """
    return {attr_name.lower(): attr_value for attr_name, attr_value in ATTR_RE.findall(tag)}


def clean_html_label(value):
    """Turn the inner HTML of a ``<label>`` into plain display text.

    Nested tags are removed, HTML entities are decoded (``&amp;`` -> ``&``,
    ``&nbsp;`` -> space), every colon is dropped, and runs of whitespace are
    collapsed to single spaces with the ends trimmed.

    Args:
        value: Raw inner HTML of a label element.

    Returns:
        The cleaned label text (possibly empty).
    """
    # Imported locally under its own name: the module-level name ``html`` is
    # reused by this script for the page text.
    from html import unescape

    value = re.sub(r"<[^>]+>", "", value)
    # Decode entities only after tag stripping so that an escaped "&lt;b&gt;"
    # is kept as literal text instead of being removed as a tag.
    value = unescape(value)
    value = value.replace(":", "")
    # \s also matches the non-breaking space produced by &nbsp;.
    value = re.sub(r"\s+", " ", value).strip()
    return value


def nice_label(name):
    """Derive a human-readable label from a camelCase / snake / kebab field name.

    A space is inserted at every lower-to-upper ASCII letter boundary,
    underscores and hyphens become spaces, the result is title-cased, and a
    few known abbreviations are restored to upper case.
    """
    # Zero-width split point between a lowercase and an uppercase ASCII letter.
    spaced = re.sub(r"(?<=[a-z])(?=[A-Z])", " ", name)
    titled = spaced.translate(str.maketrans("_-", "  ")).title()

    # title() lower-cases the tail of abbreviations; put the known ones back.
    for wrong, right in (
        ("Ssn", "SSN"),
        ("Dob", "DOB"),
        ("Mm Dd Yyyy", "MM DD YYYY"),
    ):
        titled = titled.replace(wrong, right)
    return titled


def should_exclude(name):
    """Report whether a field name must be left out of the profile.

    A name is excluded when it is empty/None, is listed in
    EXCLUDE_FIELD_NAMES, ends with "TemplateFile", or matches any regex in
    EXCLUDE_PATTERNS.
    """
    if not name:
        return True

    if name in EXCLUDE_FIELD_NAMES or name.endswith("TemplateFile"):
        return True

    for pattern in EXCLUDE_PATTERNS:
        if re.match(pattern, name):
            return True
    return False


def run_node_list_extractor():
    """Collect the key names of the lookup objects exported by the legacy JS modules.

    A short ES-module script is run with ``node``. It imports a fixed set of
    files from ``OLD_PUBLIC`` and, for every export that is a non-array
    object, records that object's sorted keys under the export's name.

    This is best-effort: if Node is missing, exits non-zero, or prints
    something that is not a JSON object, a warning is written to stderr and an
    empty dict is returned. Modules that fail to import are skipped and
    reported on stderr as well.

    Returns:
        dict mapping export name to a sorted list of key names, or ``{}``.
    """
    import sys

    js = f"""
import {{ pathToFileURL }} from 'url';

const files = [
  'casePlaintiffInfo.js',
  'opposingCounselInfo.js',
  'judgeInfo.js',
  'caseFilingAttorneyInfo.js',
  'filingAttorneyInfo.js',
  'debtCollectorInfo.js'
];

const base = {json.dumps(str(OLD_PUBLIC))};
const result = {{}};

for (const file of files) {{
  try {{
    const mod = await import(pathToFileURL(`${{base}}/${{file}}`).href);
    for (const [exportName, value] of Object.entries(mod)) {{
      if (value && typeof value === 'object' && !Array.isArray(value)) {{
        result[exportName] = Object.keys(value).sort();
      }}
    }}
  }} catch (err) {{
    console.error(`skipped ${{file}}: ${{err}}`);
  }}
}}

console.log(JSON.stringify(result));
"""
    try:
        completed = subprocess.run(
            ["node", "--input-type=module", "-e", js],
            check=True,
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # OSError: node not installed / not executable; SubprocessError:
        # non-zero exit; ValueError: output could not be decoded as text.
        print(f"warning: legacy list extraction failed: {exc}", file=sys.stderr)
        detail = (getattr(exc, "stderr", None) or "").strip()
        if detail:
            print(detail, file=sys.stderr)
        return {}

    # Surface the per-module "skipped ..." diagnostics captured from Node.
    diagnostics = completed.stderr.strip()
    if diagnostics:
        print(diagnostics, file=sys.stderr)

    try:
        result = json.loads(completed.stdout)
    except ValueError as exc:
        print(f"warning: legacy list extraction returned invalid JSON: {exc}", file=sys.stderr)
        return {}

    # Callers use .get() on the result, so anything but an object is unusable.
    return result if isinstance(result, dict) else {}


def field_type(name, tag_name, attrs):
    """Choose the profile field type for one form control.

    Precedence: a ``<textarea>`` tag, then an explicit HTML ``type`` of
    date/email/tel/number, then keywords in the field name (date/dob, email,
    phone/fax), then "autocomplete" when the field is backed by a named list,
    otherwise "text".
    """
    lowered = name.lower()

    if tag_name.lower() == "textarea":
        return "textarea"

    declared = attrs.get("type", "").lower()
    if declared in {"date", "email", "tel", "number"}:
        return declared

    if lowered == "dob":
        return "date"

    # Name-based heuristics, tried in this order.
    keyword_types = (
        (("date",), "date"),
        (("email",), "email"),
        (("phone", "fax"), "tel"),
    )
    for keywords, inferred in keyword_types:
        if any(keyword in lowered for keyword in keywords):
            return inferred

    return "autocomplete" if list_name_for_field(name) else "text"


def list_name_for_field(name):
    """Return the name of the option list backing a field, or None if it has none."""
    exact_matches = {
        "casePlaintiff": "plaintiffs",
        "caseOpposingCounsel": "opposingCounsel",
        "caseDivisionJudge": "judges",
        "caseFilingAttorney": "filingAttorneys",
        "caseState": "states",
        "homeState": "states",
        "client2homeState": "states",
        "caseDesignation": "caseDesignations",
    }

    list_name = exact_matches.get(name)
    if list_name is not None:
        return list_name

    # Numbered collector name fields (debtCollector1Name, ...) share one list.
    if re.fullmatch(r"debtCollector\d+Name", name):
        return "debtCollectors"
    return None


def section_for(name):
    """Return the heading of the form section a field belongs to.

    Matching is done on the lower-cased name, by prefix or exact name, using
    the first rule that applies; the one exception is "numCollectors", which
    is compared case-sensitively. Unmatched names go to "Other Fields".
    """
    lowered = name.lower()

    # (heading, lower-cased prefixes, exact lower-cased names) — order matters.
    rules = (
        ("Client 2 Information", ("client2",), ()),
        (
            "Client Information",
            ("client",),
            (
                "ssn", "dob", "alias", "email",
                "homeaddress", "homecity", "homestate", "homezip", "homecounty",
                "homephone", "cellphone",
            ),
        ),
        ("Case Information", ("case",), ()),
        ("Discovery Information", ("disco",), ()),
        ("Settlement Information", ("settlement",), ()),
        (
            "Fee / Payment Information",
            ("installment", "fee"),
            (
                "nameoncard", "cardnumber", "securitycode", "expiration",
                "billingaddress", "billingzip",
            ),
        ),
        ("Debt Collector Information", ("debtcollector",), ()),
        ("Notes", (), ("notes",)),
    )

    for heading, prefixes, exact_names in rules:
        if lowered.startswith(prefixes) or lowered in exact_names:
            return heading

    # No rule above can match this name, so checking it last is equivalent.
    if name == "numCollectors":
        return "Debt Collector Information"

    return "Other Fields"


def discover_templates():
    """List the .docx templates found under ``TEMPLATES_OUT``.

    Files are visited in sorted path order. Each entry holds:

    * ``id`` — slug of the file stem (lower-case, non-alphanumerics collapsed
      to ``_``). If two files in different folders share a stem, later ones
      get a numeric suffix (``_2``, ``_3``, ...) so ids stay unique.
    * ``label`` — path relative to ``TEMPLATES_OUT`` without the file suffix,
      with ``/`` shown as `` / `` and underscores shown as spaces.
    * ``template`` — POSIX path relative to the templates content root.
    * ``outputFilename`` — output name pattern containing literal
      ``{caseNumber}`` and ``{timestamp_...}`` placeholders.

    Returns:
        list of template dicts (empty when no templates are found).
    """
    templates = []
    used_ids = set()

    for path in sorted(TEMPLATES_OUT.rglob("*.docx")):
        # Skip Word's "~$name.docx" owner/lock files (left beside a document
        # while it is open) and anything that is not a regular file.
        if path.name.startswith("~$") or not path.is_file():
            continue

        rel = path.relative_to(Path("tools/doc_generator/content/templates")).as_posix()

        base_id = re.sub(r"[^a-zA-Z0-9]+", "_", path.stem).strip("_").lower() or "template"
        template_id = base_id
        counter = 2
        while template_id in used_ids:
            template_id = f"{base_id}_{counter}"
            counter += 1
        used_ids.add(template_id)

        # with_suffix("") removes only the trailing extension, never a
        # ".docx" that happens to occur inside a folder or file name.
        label = path.relative_to(TEMPLATES_OUT).with_suffix("").as_posix()
        label = label.replace("/", " / ")
        label = label.replace("_", " ")

        templates.append({
            "id": template_id,
            "label": label,
            "template": rel,
            "outputFilename": f"{template_id}_{{caseNumber}}_{{timestamp_YYYY-MM-DD_HH-mm-ss}}.docx"
        })

    return templates


# Read the legacy form markup; bytes that are not valid UTF-8 are dropped
# instead of raising.
html = OLD_HTML.read_text(encoding="utf-8", errors="ignore")

# Cleaned label text keyed by the target named in each <label for="...">.
labels = {
    field_id: clean_html_label(label)
    for field_id, label in LABEL_RE.findall(html)
}

# fields_seen keeps field names in first-seen document order; field_meta maps
# each of those names to the (tag name, attributes) of its first occurrence.
fields_seen = []
field_meta = {}

for match in TAG_RE.finditer(html):
    tag_name = match.group(1)
    tag = match.group(0)
    attrs = attrs_from_tag(tag)

    # The name attribute is preferred as the field key; id is the fallback.
    name = attrs.get("name") or attrs.get("id")
    if should_exclude(name):
        continue

    # Test membership on the dict (constant time) rather than scanning the
    # list; both always hold exactly the same names.
    if name not in field_meta:
        fields_seen.append(name)
        field_meta[name] = (tag_name, attrs)

# Section heading -> list of field definitions, in document order.
grouped = {}

for name in fields_seen:
    tag_name, attrs = field_meta[name]
    ftype = field_type(name, tag_name, attrs)

    # Labels are keyed by the <label for="..."> target, which is an element
    # id. The field key prefers the name attribute, so when name and id
    # differ the label has to be looked up by id as well before falling back
    # to a label generated from the field name.
    label = labels.get(name) or labels.get(attrs.get("id")) or nice_label(name)

    field = {
        "name": name,
        "label": label,
        "type": ftype,
        "required": False
    }

    # Fields backed by an option list carry the list's name.
    list_name = list_name_for_field(name)
    if list_name:
        field["list"] = list_name

    grouped.setdefault(section_for(name), []).append(field)

# Order in which sections appear in the generated profile.
preferred_order = [
    "Client Information",
    "Client 2 Information",
    "Case Information",
    "Discovery Information",
    "Settlement Information",
    "Fee / Payment Information",
    "Debt Collector Information",
    "Notes",
    "Other Fields",
]

sections = []

for heading in preferred_order:
    fields = grouped.get(heading)
    if fields:
        # The two primary sections start open and cannot be collapsed; every
        # other section is collapsible and starts closed.
        is_primary = heading in {"Client Information", "Case Information"}
        sections.append(dict(
            heading=heading,
            collapsible=not is_primary,
            defaultOpen=is_primary,
            fields=fields,
        ))

# Key names of the objects exported by the legacy JS modules, keyed by export
# name; an empty dict when the Node extraction fails.
lists_raw = run_node_list_extractor()

# Option lists, keyed by the list names that list_name_for_field() returns.
# NOTE(review): the lookups assume specific JS export names (e.g.
# "casePlaintiffInfo"); a module that exports under another name silently
# yields an empty list here — confirm against the legacy files.
lists = {
    "plaintiffs": lists_raw.get("casePlaintiffInfo", []),
    # Two candidate export names are tried; the first non-empty one is used.
    "opposingCounsel": lists_raw.get("caseOpposingCounselInfo", []) or lists_raw.get("opposingCounselInfo", []),
    "judges": lists_raw.get("judgeInfo", []),
    "filingAttorneys": lists_raw.get("caseFilingAttorneyInfo", []) or lists_raw.get("filingAttorneyInfo", []),
    "debtCollectors": lists_raw.get("debtCollectorInfo", []),
    # The remaining lists are hard-coded rather than extracted.
    "states": ["MO", "KS"],
    "caseDesignations": [
        "Associate Circuit",
        "Circuit",
        "Limited Actions",
        "Small Claims"
    ]
}

# Every .docx template found under TEMPLATES_OUT.
templates = discover_templates()

# The document-type profile that is serialised to OUT_PROFILE.
profile = {
    "id": "legal_profile",
    "name": "Legal Profile",
    "description": "Consumer debt defense legal profile based on the legacy app form fields. Additional template fields are calculated at generation time.",
    # Default template: the first discovered one (sorted by path), or a fixed
    # legacy file name when none were found.
    "template": templates[0]["template"] if templates else "legacy/Canned-Emails.docx",
    "outputFilename": "legal_{caseNumber}_{timestamp_YYYY-MM-DD_HH-mm-ss}.docx",
    "lists": lists,
    "templates": templates,
    # NOTE(review): this block is passed through verbatim; its schema
    # ("script", "runOn", "outputsDynamic", ...) is defined by the consumer of
    # the profile, not by this script — confirm there.
    "calculations": [
        {
            "script": "legacy_legal",
            "runOn": "generate",
            "description": "Generate old-template compatible calculated fields.",
            "outputsDynamic": {
                "settlementSchedule": {
                    "countField": "settlementInstallmentNo",
                    # presumably a two-digit index suffix, matching the \d{2}
                    # in EXCLUDE_PATTERNS — TODO confirm
                    "indexFormat": "decimal2",
                    "maxCount": 120,
                    # Both the misspelled "RemaingBalance" and the corrected
                    # spelling are listed, mirroring EXCLUDE_PATTERNS.
                    "fields": [
                        "settlementPaymentDate",
                        "settlementPaymentAmount",
                        "settlementRemaingBalance",
                        "settlementRemainingBalance"
                    ]
                }
            }
        }
    ],
    "sections": sections
}

# write_text() does not create missing directories, so make sure the
# destination folder exists before writing (a no-op when it already does).
OUT_PROFILE.parent.mkdir(parents=True, exist_ok=True)
OUT_PROFILE.write_text(json.dumps(profile, indent=2), encoding="utf-8")

# Short summary: output path, total field count, and field count per section.
print(f"Wrote {OUT_PROFILE}")
print(f"Visible HTML fields: {len(fields_seen)}")
for section in sections:
    print(f"- {section['heading']}: {len(section['fields'])}")