diff options
author | Joanne <zehan_ding@brown.edu> | 2025-06-17 13:02:50 -0400 |
---|---|---|
committer | Joanne <zehan_ding@brown.edu> | 2025-06-17 13:02:50 -0400 |
commit | 2aa2c26b95a539d220e46b20cdfbef6ae39d6c43 (patch) | |
tree | 344a6f798f692fdd4921ab5a6762e907f5ad7b06 /summarize_dash_ts.py | |
parent | 430db63077868fa54829721d6530a810aa4d4588 (diff) | |
parent | ccfdf905400cd4b81d8cde0f16bb0e15cd65621b (diff) |
Merge branch 'agent-paper-main' of https://github.com/brown-dash/Dash-Web into joanne-tutorialagent
Diffstat (limited to 'summarize_dash_ts.py')
-rw-r--r-- | summarize_dash_ts.py | 248 |
1 file changed, 248 insertions, 0 deletions
diff --git a/summarize_dash_ts.py b/summarize_dash_ts.py new file mode 100644 index 000000000..69f80fde5 --- /dev/null +++ b/summarize_dash_ts.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +""" +summarize_dash_ts.py – v4 (periodic-save edition) + +• Dumps every .ts/.tsx file (skipping node_modules, etc.) +• Calls GPT-4o with Structured Outputs (JSON-schema “const” on filename) +• Prints each raw JSON reply (unless --quiet) +• Flushes the growing summary file to disk every N files (default 10) + +pip install openai tqdm rich +""" + +from __future__ import annotations + +import argparse +import json +import os +import pathlib +import sys +from textwrap import dedent +from typing import Dict, Iterable, List + +import openai +from rich.console import Console +from rich.tree import Tree +from tqdm import tqdm + +PERIODIC_SAVE_EVERY = 10 # ← change here if you want finer or coarser saves + + +# ───────────────────────── CLI ────────────────────────── +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(prog="summarize_dash_ts.py") + p.add_argument("-r", "--root", type=pathlib.Path, default=".", help="Repo root") + p.add_argument("--model", default="gpt-4o-2024-08-06") + p.add_argument("--api-key", help="OpenAI API key (else env var)") + p.add_argument("--max-tokens", type=int, default=512) + p.add_argument( + "--skip-dirs", + nargs="*", + default=["node_modules", ".git", "dist", "build", ".next"], + ) + p.add_argument( + "--preview", type=int, default=5, help="How many summaries to echo at the end" + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress the per-file raw JSON spam once you trust the run", + ) + return p.parse_args() + + +# ────────────────── helpers ────────────────── +def iter_ts(root: pathlib.Path, skip: List[str]) -> Iterable[pathlib.Path]: + for dpath, dnames, fnames in os.walk(root): + dnames[:] = [d for d in dnames if d not in skip] + for fn in fnames: + if fn.endswith((".ts", ".tsx")): + yield pathlib.Path(dpath) 
/ fn + + +def safe_open(p: pathlib.Path): + try: + return p.open(encoding="utf-8") + except UnicodeDecodeError: + return p.open(encoding="utf-8", errors="replace") + + +def make_tree(paths: list[pathlib.Path], root: pathlib.Path) -> Tree: + t = Tree(str(root)) + nodes: dict[pathlib.Path, Tree] = {root: t} + for p in sorted(paths): + cur = root + for part in p.relative_to(root).parts: + cur = cur / part + if cur not in nodes: + nodes[cur] = nodes[cur.parent].add(part) + return t + + +def write_tree_with_summaries(*, tree: Tree, summaries: dict[pathlib.Path, str], + root: pathlib.Path, out_path: pathlib.Path) -> None: + tmp = out_path.with_suffix(".tmp") + with tmp.open("w", encoding="utf-8") as f: + + def walk(node: Tree, rel_path: pathlib.Path = pathlib.Path("."), indent: str = ""): + last = node.children[-1] if node.children else None + for child in node.children: + marker = "└── " if child is last else "├── " + new_indent = indent + (" " if child is last else "│ ") + child_rel = rel_path / child.label # ← **the missing bit** + + # absolute path used as dict-key during summarization loop + abs_path = root / child_rel + if abs_path in summaries: + f.write(f"{indent}{marker}{child.label} – {summaries[abs_path]}\n") + else: + f.write(f"{indent}{marker}{child.label}\n") + + walk(child, child_rel, new_indent) + + walk(tree) + tmp.replace(out_path) + + +# ────────────────── prompt bits ────────────────── +SYSTEM = """ +You are an expert TypeScript code summarizer for the Dash hypermedia code-base. + +You will be given ONE complete file and its **exact** relative path. + +Return ONLY JSON matching this shape: + +{ + "filename": "<EXACT path you were given>", + "summary": "<3–5 sentences, <80 words>" +} + +No markdown, no extra keys. 
+""".strip() + +OVERVIEW = dedent( + """ + Dash is a browser-based hypermedia system from Brown University that lets users + mix PDFs, web pages, audio, video, ink and rich-text on a free-form canvas, + create Vannevar-Bush-style “trails”, and tag/spatially arrange docs for + nonlinear workflows. 99 % of the code-base is TypeScript/React. + """ +).strip() + +SCHEMA_BASE = { + "type": "object", + "properties": { + "filename": {"type": "string"}, + "summary": {"type": "string"}, + }, + "required": ["filename", "summary"], + "additionalProperties": False, +} + + +def ask_llm( + client: openai.OpenAI, + model: str, + rel_path: str, + code: str, + max_tokens: int, + verbose: bool = True, +) -> str: + schema = { + "name": "dash_file_summary", + "strict": True, + "schema": dict( + SCHEMA_BASE, + properties=dict( + SCHEMA_BASE["properties"], filename={"type": "string", "const": rel_path} + ), + ), + } + + messages = [ + {"role": "system", "content": SYSTEM}, + { + "role": "user", + "content": f"{OVERVIEW}\n\n(PATH = {rel_path})\n\n===== BEGIN FILE =====\n{code}\n===== END FILE =====", + }, + ] + + comp = client.chat.completions.create( + model=model, + messages=messages, + response_format={"type": "json_schema", "json_schema": schema}, + max_tokens=max_tokens, + ) + + raw = comp.choices[0].message.content + if verbose: + print(f"\n📝 Raw JSON for {rel_path}:\n{raw}\n") + + data = json.loads(raw) + if data["filename"] != rel_path: + Console().print( + f"[red]⚠︎ Filename mismatch – model said {data['filename']!r}[/red]" + ) + data["filename"] = rel_path + return data["summary"].strip() + + +# ────────────────── main ────────────────── +def main() -> None: + args = parse_args() + openai.api_key = args.api_key or os.getenv("OPENAI_API_KEY") or sys.exit( + "Need OPENAI_API_KEY" + ) + + root = args.root.resolve() + con = Console() + con.print(f":mag: [bold]Scanning[/bold] {root}") + + files = list(iter_ts(root, args.skip_dirs)) + if not files: + con.print("[yellow]No TS/TSX files 
found[/yellow]") + return + + # 1. full dump of file contents (unchanged) + tree = make_tree(files, root) + (root / "ts_files_with_content.txt").write_text( + Console(record=True, width=120).print(tree, end="") or "" + ) + with (root / "ts_files_with_content.txt").open("a", encoding="utf-8") as fp: + for p in tqdm(files, desc="Dumping source"): + fp.write(f"{p.relative_to(root)}\n{'-'*80}\n") + fp.write(safe_open(p).read()) + fp.write(f"\n{'='*80}\n\n") + + # 2. summaries (periodic save) + client = openai.OpenAI() + summaries: Dict[pathlib.Path, str] = {} + out_file = root / "ts_files_with_summaries.txt" + + for idx, p in enumerate(tqdm(files, desc="GPT-4o summarizing"), 1): + summaries[p] = ask_llm( + client, + args.model, + str(p.relative_to(root)), + safe_open(p).read(), + args.max_tokens, + verbose=not args.quiet, + ) + + if idx % PERIODIC_SAVE_EVERY == 0: + write_tree_with_summaries(tree=tree, summaries=summaries, root=root, out_path=out_file) + con.print(f"[green]✔ Flushed after {idx} files[/green]") + + # final flush + write_tree_with_summaries(tree=tree, summaries=summaries, root=root, out_path=out_file) + + # preview + con.print("\n[cyan]Sample summaries:[/cyan]") + for i, (p, s) in enumerate(list(summaries.items())[: args.preview], 1): + con.print(f"{i}. {p.relative_to(root)} → {s}") + + con.print(f":sparkles: Done – wrote [bold]{out_file}[/bold]") + + +if __name__ == "__main__": + main() |