diff options
author | Joanne <zehan_ding@brown.edu> | 2025-06-17 13:02:50 -0400 |
---|---|---|
committer | Joanne <zehan_ding@brown.edu> | 2025-06-17 13:02:50 -0400 |
commit | 2aa2c26b95a539d220e46b20cdfbef6ae39d6c43 (patch) | |
tree | 344a6f798f692fdd4921ab5a6762e907f5ad7b06 /summarize_dash_ts.py | |
parent | 430db63077868fa54829721d6530a810aa4d4588 (diff) | |
parent | ccfdf905400cd4b81d8cde0f16bb0e15cd65621b (diff) |
Merge branch 'agent-paper-main' of https://github.com/brown-dash/Dash-Web into joanne-tutorialagent
Diffstat (limited to 'summarize_dash_ts.py')
-rw-r--r-- | summarize_dash_ts.py | 248 |
1 file changed, 248 insertions, 0 deletions
diff --git a/summarize_dash_ts.py b/summarize_dash_ts.py new file mode 100644 index 000000000..69f80fde5 --- /dev/null +++ b/summarize_dash_ts.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +""" +summarize_dash_ts.py – v4 (periodic-save edition) + +• Dumps every .ts/.tsx file (skipping node_modules, etc.) +• Calls GPT-4o with Structured Outputs (JSON-schema “const” on filename) +• Prints each raw JSON reply (unless --quiet) +• Flushes the growing summary file to disk every N files (default 10) + +pip install openai tqdm rich +""" + +from __future__ import annotations + +import argparse +import json +import os +import pathlib +import sys +from textwrap import dedent +from typing import Dict, Iterable, List + +import openai +from rich.console import Console +from rich.tree import Tree +from tqdm import tqdm + +PERIODIC_SAVE_EVERY = 10 # ← change here if you want finer or coarser saves + + +# ───────────────────────── CLI ────────────────────────── +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(prog="summarize_dash_ts.py") + p.add_argument("-r", "--root", type=pathlib.Path, default=".", help="Repo root") + p.add_argument("--model", default="gpt-4o-2024-08-06") + p.add_argument("--api-key", help="OpenAI API key (else env var)") + p.add_argument("--max-tokens", type=int, default=512) + p.add_argument( + "--skip-dirs", + nargs="*", + default=["node_modules", ".git", "dist", "build", ".next"], + ) + p.add_argument( + "--preview", type=int, default=5, help="How many summaries to echo at the end" + ) + p.add_argument( + "--quiet", + action="store_true", + help="Suppress the per-file raw JSON spam once you trust the run", + ) + return p.parse_args() + + +# ────────────────── helpers ────────────────── +def iter_ts(root: pathlib.Path, skip: List[str]) -> Iterable[pathlib.Path]: + for dpath, dnames, fnames in os.walk(root): + dnames[:] = [d for d in dnames if d not in skip] + for fn in fnames: + if fn.endswith((".ts", ".tsx")): + yield pathlib.Path(dpath) 
/ fn + + +def safe_open(p: pathlib.Path): + try: + return p.open(encoding="utf-8") + except UnicodeDecodeError: + return p.open(encoding="utf-8", errors="replace") + + +def make_tree(paths: list[pathlib.Path], root: pathlib.Path) -> Tree: + t = Tree(str(root)) + nodes: dict[pathlib.Path, Tree] = {root: t} + for p in sorted(paths): + cur = root + for part in p.relative_to(root).parts: + cur = cur / part + if cur not in nodes: + nodes[cur] = nodes[cur.parent].add(part) + return t + + +def write_tree_with_summaries(*, tree: Tree, summaries: dict[pathlib.Path, str], + root: pathlib.Path, out_path: pathlib.Path) -> None: + tmp = out_path.with_suffix(".tmp") + with tmp.open("w", encoding="utf-8") as f: + + def walk(node: Tree, rel_path: pathlib.Path = pathlib.Path("."), indent: str = ""): + last = node.children[-1] if node.children else None + for child in node.children: + marker = "└── " if child is last else "├── " + new_indent = indent + (" " if child is last else "│ ") + child_rel = rel_path / child.label # ← **the missing bit** + + # absolute path used as dict-key during summarization loop + abs_path = root / child_rel + if abs_path in summaries: + f.write(f"{indent}{marker}{child.label} – {summaries[abs_path]}\n") + else: + f.write(f"{indent}{marker}{child.label}\n") + + walk(child, child_rel, new_indent) + + walk(tree) + tmp.replace(out_path) + + +# ────────────────── prompt bits ────────────────── +SYSTEM = """ +You are an expert TypeScript code summarizer for the Dash hypermedia code-base. + +You will be given ONE complete file and its **exact** relative path. + +Return ONLY JSON matching this shape: + +{ + "filename": "<EXACT path you were given>", + "summary": "<3–5 sentences, <80 words>" +} + +No markdown, no extra keys. 
+""".strip() + +OVERVIEW = dedent( + """ + Dash is a browser-based hypermedia system from Brown University that lets users + mix PDFs, web pages, audio, video, ink and rich-text on a free-form canvas, + create Vannevar-Bush-style “trails”, and tag/spatially arrange docs for + nonlinear workflows. 99 % of the code-base is TypeScript/React. + """ +).strip() + +SCHEMA_BASE = { + "type": "object", + "properties": { + "filename": {"type": "string"}, + "summary": {"type": "string"}, + }, + "required": ["filename", "summary"], + "additionalProperties": False, +} + + +def ask_llm( + client: openai.OpenAI, + model: str, + rel_path: str, + code: str, + max_tokens: int, + verbose: bool = True, +) -> str: + schema = { + "name": "dash_file_summary", + "strict": True, + "schema": dict( + SCHEMA_BASE, + properties=dict( + SCHEMA_BASE["properties"], filename={"type": "string", "const": rel_path} + ), + ), + } + + messages = [ + {"role": "system", "content": SYSTEM}, + { + "role": "user", + "content": f"{OVERVIEW}\n\n(PATH = {rel_path})\n\n===== BEGIN FILE =====\n{code}\n===== END FILE =====", + }, + ] + + comp = client.chat.completions.create( + model=model, + messages=messages, + response_format={"type": "json_schema", "json_schema": schema}, + max_tokens=max_tokens, + ) + + raw = comp.choices[0].message.content + if verbose: + print(f"\n📝 Raw JSON for {rel_path}:\n{raw}\n") + + data = json.loads(raw) + if data["filename"] != rel_path: + Console().print( + f"[red]⚠︎ Filename mismatch – model said {data['filename']!r}[/red]" + ) + data["filename"] = rel_path + return data["summary"].strip() + + +# ────────────────── main ────────────────── +def main() -> None: + args = parse_args() + openai.api_key = args.api_key or os.getenv("OPENAI_API_KEY") or sys.exit( + "Need OPENAI_API_KEY" + ) + + root = args.root.resolve() + con = Console() + con.print(f":mag: [bold]Scanning[/bold] {root}") + + files = list(iter_ts(root, args.skip_dirs)) + if not files: + con.print("[yellow]No TS/TSX files 
found[/yellow]") + return + + # 1. full dump of file contents (unchanged) + tree = make_tree(files, root) + (root / "ts_files_with_content.txt").write_text( + Console(record=True, width=120).print(tree, end="") or "" + ) + with (root / "ts_files_with_content.txt").open("a", encoding="utf-8") as fp: + for p in tqdm(files, desc="Dumping source"): + fp.write(f"{p.relative_to(root)}\n{'-'*80}\n") + fp.write(safe_open(p).read()) + fp.write(f"\n{'='*80}\n\n") + + # 2. summaries (periodic save) + client = openai.OpenAI() + summaries: Dict[pathlib.Path, str] = {} + out_file = root / "ts_files_with_summaries.txt" + + for idx, p in enumerate(tqdm(files, desc="GPT-4o summarizing"), 1): + summaries[p] = ask_llm( + client, + args.model, + str(p.relative_to(root)), + safe_open(p).read(), + args.max_tokens, + verbose=not args.quiet, + ) + + if idx % PERIODIC_SAVE_EVERY == 0: + write_tree_with_summaries(tree=tree, summaries=summaries, root=root, out_path=out_file) + con.print(f"[green]✔ Flushed after {idx} files[/green]") + + # final flush + write_tree_with_summaries(tree=tree, summaries=summaries, root=root, out_path=out_file) + + # preview + con.print("\n[cyan]Sample summaries:[/cyan]") + for i, (p, s) in enumerate(list(summaries.items())[: args.preview], 1): + con.print(f"{i}. {p.relative_to(root)} → {s}") + + con.print(f":sparkles: Done – wrote [bold]{out_file}[/bold]") + + +if __name__ == "__main__": + main() |