#!/usr/bin/env python3
"""Generate colored PDF highlight annotations from a CSV file.

Reconstructed contents of ``app/highlight.py`` (patch subject: "feat:
generate colored PDF highlight annotations").

Each CSV row describes one highlight:

* ``file``   -- PDF path relative to ``INPUT_DIR``
* ``page``   -- 1-based page number
* ``x``, ``y``, ``w``, ``h`` -- rectangle, passed to ``fitz.Rect`` as
  ``(x, y, x + w, y + h)``
* ``color``  -- one of the keys of ``COLORS`` (anything else falls back to
  ``DEFAULT_COLOR``)

Optional columns: ``author``, ``category``, ``note_text``, ``opacity``.

For every input PDF an annotated copy is written to ``OUTPUT_DIR`` with
``_highlighted`` appended to the file stem.
"""

import argparse
import math
from pathlib import Path

import fitz
import pandas as pd

BASE_DIR = Path(__file__).resolve().parents[1]
INPUT_DIR = BASE_DIR / "input"
OUTPUT_DIR = BASE_DIR / "output"
DATA_DIR = BASE_DIR / "data"

REQUIRED_COLUMNS = frozenset({"file", "page", "x", "y", "w", "h", "color"})
DEFAULT_COLOR = "green"
DEFAULT_OPACITY = 0.35

# RGB triples with components in the 0..1 range.
COLORS = {
    "green": (0, 1, 0),
    "red": (1, 0, 0),
    "blue": (0, 0, 1),
    "yellow": (1, 1, 0),
}


def _cell(row, key, default):
    """Return ``row[key]``, or *default* if the column is absent or the cell is empty."""
    value = row.get(key, default)
    # pandas reads empty CSV cells as NaN; str(NaN) would otherwise leak the
    # literal text "nan" into the annotation metadata.
    if value is None or pd.isna(value):
        return default
    return value


def _rect_from_row(row):
    """Build the highlight rectangle from the ``x``/``y``/``w``/``h`` cells.

    Raises:
        KeyError: if one of the geometry columns is absent.
        ValueError: if a cell is not a finite number (e.g. an empty cell).
    """
    x, y, w, h = (float(row[key]) for key in ("x", "y", "w", "h"))
    if not all(math.isfinite(v) for v in (x, y, w, h)):
        raise ValueError(
            f"highlight geometry must be finite numbers: x={x}, y={y}, w={w}, h={h}"
        )
    return fitz.Rect(x, y, x + w, y + h)


def _page_index(value, page_count):
    """Convert a 1-based page cell to a 0-based index, or ``None`` if unusable."""
    try:
        index = int(value) - 1
    except (TypeError, ValueError, OverflowError):
        # Empty cell (NaN), non-numeric text, or infinity.
        return None
    return index if 0 <= index < page_count else None


def _highlighted_name(filename):
    """Return *filename* as a relative ``Path`` with ``_highlighted`` added to its stem.

    Only the final path component is changed and the original suffix is kept
    verbatim, so ``report.PDF`` becomes ``report_highlighted.PDF`` and a
    ``.pdf`` substring elsewhere in the path is left alone.
    """
    path = Path(filename)
    return path.with_name(f"{path.stem}_highlighted{path.suffix}")


def add_highlight(page, row):
    """Add one highlight annotation described by *row* to *page*.

    Args:
        page: PyMuPDF page object that receives the annotation.
        row: Mapping-like row (pandas ``Series`` or ``dict``) holding ``x``,
            ``y``, ``w``, ``h`` and optionally ``color``, ``author``,
            ``category``, ``note_text`` and ``opacity``.

    Raises:
        KeyError: if a geometry column is absent.
        ValueError: if the geometry or the opacity is not a usable number.
    """
    # Resolve and validate every input before touching the page, so that a
    # bad row never leaves a half-configured annotation behind.
    rect = _rect_from_row(row)

    color_name = str(_cell(row, "color", DEFAULT_COLOR)).lower().strip()
    if color_name not in COLORS:
        # Keep the default subject label in sync with the color actually drawn.
        color_name = DEFAULT_COLOR
    color = COLORS[color_name]

    # A zero opacity is treated like a missing value and replaced by the default.
    opacity = float(_cell(row, "opacity", DEFAULT_OPACITY) or DEFAULT_OPACITY)

    info = {
        "title": str(_cell(row, "author", "PDF Annotation Merge")),
        "subject": str(_cell(row, "category", f"{color_name.title()} Highlight")),
        "content": str(_cell(row, "note_text", "")),
    }

    annot = page.add_highlight_annot(rect)
    annot.set_colors(stroke=color)
    annot.set_info(info)
    annot.update(opacity=opacity)


def generate_highlights(csv_path):
    """Apply every highlight listed in *csv_path* and write annotated PDFs.

    Rows are grouped by their ``file`` column. Input files that do not exist
    and rows with an unusable page, geometry or opacity are reported with a
    ``SKIP:`` line and do not abort the run.

    Args:
        csv_path: Path of the CSV file to read.

    Raises:
        ValueError: if the CSV lacks one of ``REQUIRED_COLUMNS``.
    """
    df = pd.read_csv(csv_path)

    # Validate first so an unusable CSV has no side effects on disk.
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"CSV is missing required columns: {sorted(missing)}")

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # NOTE: groupby drops rows whose ``file`` cell is empty (NaN) by default.
    for file_key, group in df.groupby("file"):
        # The column may have been parsed as a number; paths need text.
        filename = str(file_key)
        input_pdf = INPUT_DIR / filename
        output_pdf = OUTPUT_DIR / _highlighted_name(filename)

        if not input_pdf.is_file():
            print(f"SKIP: missing input file: {input_pdf}")
            continue

        # ``file`` may contain sub-directories; mirror them under OUTPUT_DIR.
        output_pdf.parent.mkdir(parents=True, exist_ok=True)

        # The context manager closes the document even if a row or the save fails.
        with fitz.open(input_pdf) as doc:
            for _, row in group.iterrows():
                page_index = _page_index(row["page"], len(doc))
                if page_index is None:
                    print(f"SKIP: invalid page {row['page']} for {filename}")
                    continue
                try:
                    add_highlight(doc[page_index], row)
                except (TypeError, ValueError) as err:
                    print(f"SKIP: invalid highlight row for {filename}: {err}")

            doc.save(output_pdf, garbage=4, deflate=True)

        print(f"WROTE: {output_pdf}")


def main():
    """Parse command-line arguments and generate the highlighted PDFs."""
    parser = argparse.ArgumentParser(
        description="Generate PDF highlight annotations from CSV."
    )
    parser.add_argument(
        "-c",
        "--csv",
        default=str(DATA_DIR / "highlights.csv"),
        help="Path to highlights CSV file. Default: data/highlights.csv",
    )
    args = parser.parse_args()
    generate_highlights(Path(args.csv))


if __name__ == "__main__":
    main()