"""Export the conversation content of every execution so an external LLM
can batch-generate summaries.

One file is exported per execution, containing:
- the execution ID
- each round's user input and assistant output
- the list of file changes

Output directory: export/session_summaries/
Output filename:  {exec_id_short}.txt

Usage:
    python -B scripts/ops/export_session_conversations.py
    python -B scripts/ops/export_session_conversations.py --limit 50
    python -B scripts/ops/export_session_conversations.py --output-dir /path/to/dir
"""
import json
import os
import sys
from pathlib import Path

# Make sibling modules under scripts/ops importable.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _env_paths import ensure_repo_root
ensure_repo_root()

from extract_kiro_session import (
    find_kiro_agent_dir,
    find_workspace_session_dir,
    find_all_executions,
    find_execution_log,
    load_sessions_json,
    parse_messages,
    parse_actions,
    ts_fmt,
    DEFAULT_GLOBAL_STORAGE,
)

# Default output directory (relative to the repo root).
OUTPUT_DIR = Path("export/session_summaries")

# Maximum number of characters kept per conversation round before truncation.
MAX_ROUND_CHARS = 2000


def extract_conversation_text(log: dict) -> dict:
    """Extract the conversation text from an execution log.

    Args:
        log: Parsed execution-log JSON (keys: "context", "actions",
            "executionId", "workflowType", "status", "startTime").

    Returns:
        {
            "exec_id": str,
            "workflow": str,
            "status": str,
            "start_time": str,
            "rounds": [{"role": "user"|"assistant", "text": str}, ...],
            "files_modified": [str],
            "files_created": [str],
        }
    """
    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)

    rounds = []
    for msg in conversation:
        role = msg.get("role", "")
        if role not in ("human", "bot"):
            continue
        texts = []
        for entry in msg.get("entries", []):
            if entry.get("type", "") != "text":
                continue
            t = entry.get("text", "").strip()
            if not t:
                continue
            # Skip auto-injected steering / system-prompt content.
            # NOTE(review): this span was reconstructed from a garbled source;
            # confirm the exact skipped prefixes against the original file.
            if t.startswith("## Included Rules") or t.startswith("<system>"):
                continue
            texts.append(t)
        if not texts:
            continue
        combined = "\n\n".join(texts)
        if len(combined) > MAX_ROUND_CHARS:
            combined = combined[:MAX_ROUND_CHARS] + "\n[...截断...]"
        rounds.append({
            "role": "user" if role == "human" else "assistant",
            "text": combined,
        })

    # File changes: a step carrying an "original" snapshot modified an
    # existing file; otherwise it created the file.
    files_modified = []
    files_created = []
    for step in timeline:
        fc = step.get("_file_change")
        if not fc:
            continue
        fname = fc.get("file", "?")
        if fc.get("original"):
            if fname not in files_modified:
                files_modified.append(fname)
        elif fname not in files_created:
            files_created.append(fname)

    return {
        "exec_id": log.get("executionId", "?"),
        "workflow": log.get("workflowType", "?"),
        "status": log.get("status", "?"),
        "start_time": ts_fmt(log.get("startTime")),
        "rounds": rounds,
        "files_modified": files_modified,
        "files_created": files_created,
    }


def render_conversation_file(data: dict) -> str:
    """Render extracted conversation data as plain text for LLM reading.

    Args:
        data: The dict produced by :func:`extract_conversation_text`.

    Returns:
        A single newline-joined text document with a metadata header,
        numbered USER/ASSISTANT rounds, and an ``---END---`` trailer.
    """
    out = []
    out.append(f"EXECUTION_ID: {data['exec_id']}")
    out.append(f"WORKFLOW: {data['workflow']}")
    out.append(f"STATUS: {data['status']}")
    out.append(f"START_TIME: {data['start_time']}")
    # Only the first 20 file names are listed to keep the header compact.
    if data["files_modified"]:
        out.append(f"FILES_MODIFIED: {', '.join(data['files_modified'][:20])}")
    if data["files_created"]:
        out.append(f"FILES_CREATED: {', '.join(data['files_created'][:20])}")
    out.append(f"ROUNDS: {len(data['rounds'])}")
    out.append("---")
    for i, r in enumerate(data["rounds"], 1):
        role_label = "USER" if r["role"] == "user" else "ASSISTANT"
        out.append(f"\n[{role_label} #{i}]")
        out.append(r["text"])
    out.append("\n---END---")
    return "\n".join(out)


def main():
    """CLI entry point: export qualifying executions and write a manifest."""
    import argparse
    parser = argparse.ArgumentParser(description="导出 session 对话内容供 LLM 批量总结")
    parser.add_argument("--limit", type=int, help="最多导出 N 条")
    parser.add_argument("--output-dir", type=str, help="输出目录")
    parser.add_argument("--skip-existing", action="store_true", help="跳过已导出的")
    args = parser.parse_args()

    out_dir = Path(args.output_dir) if args.output_dir else OUTPUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)

    gs = DEFAULT_GLOBAL_STORAGE
    ws = os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[export] kiro.kiroagent dir not found")
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId")
                    for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}

    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[export] no executions found")
        return

    # Only export top-level executions (sub-agent executions usually have
    # no independent user input).
    count = 0
    skipped = 0
    errors = 0
    for execution in all_execs:
        eid = execution.get("executionId", "")
        short = eid[:8]
        if args.skip_existing:
            out_file = out_dir / f"{short}.txt"
            if out_file.exists():
                skipped += 1
                continue
        log_path = find_execution_log(agent_dir, execution["_hex_dir"], execution)
        if not log_path:
            continue
        try:
            with open(log_path, "r", encoding="utf-8") as f:
                log = json.load(f)
        except Exception:
            # Unreadable/corrupt log: count it and move on (best-effort export).
            errors += 1
            continue

        # Skip sub-agent executions (no independent user input).
        if log.get("workflowType") in ("sub-agent",):
            continue

        data = extract_conversation_text(log)
        # Skip executions with no conversation content.
        if not data["rounds"]:
            continue

        out_file = out_dir / f"{short}.txt"
        out_file.write_text(render_conversation_file(data), encoding="utf-8")
        count += 1
        if count % 50 == 0:
            print(f"[export] {count} exported...")
        if args.limit and count >= args.limit:
            break

    # Write the manifest (list of IDs, used later to match imported summaries).
    manifest = out_dir / "_manifest.json"
    existing = sorted([f.stem for f in out_dir.glob("*.txt") if f.stem != "_manifest"])
    manifest.write_text(
        json.dumps({"count": len(existing), "ids": existing}, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8"
    )

    print(f"[export] done: {count} exported, {skipped} skipped, {errors} errors")
    print(f"[export] output: {out_dir}")
    print(f"[export] manifest: {manifest}")
    print(f"\n下一步:")
    print(f"  1. 用 LLM 处理 {out_dir}/*.txt,为每个文件生成 50-200 字中文摘要")
    print(f"  2. 将结果写入 {out_dir}/_summaries.json,格式:")
    print(f'     {{"exec_id_short": "摘要文本", ...}}')
    print(f"  3. 运行导入脚本将摘要写入索引")


if __name__ == "__main__":
    main()