微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

View File

@@ -0,0 +1,242 @@
"""导出所有 execution 的对话内容,供外部 LLM 批量生成摘要。
每个 execution 导出一个文件,包含:
- execution ID
- 每轮 user input 和 assistant output
- 文件变更列表
输出目录: export/session_summaries/
输出格式: {exec_id_short}.txt
用法:
python -B scripts/ops/export_session_conversations.py
python -B scripts/ops/export_session_conversations.py --limit 50
python -B scripts/ops/export_session_conversations.py --output-dir /path/to/dir
"""
import json
import os
import sys
from pathlib import Path
# 添加 scripts/ops 到 path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _env_paths import ensure_repo_root
ensure_repo_root()
from extract_kiro_session import (
find_kiro_agent_dir,
find_workspace_session_dir,
find_all_executions,
find_execution_log,
load_sessions_json,
parse_messages,
parse_actions,
DEFAULT_GLOBAL_STORAGE,
)
OUTPUT_DIR = Path("export/session_summaries")
def extract_conversation_text(log: dict) -> dict:
    """Extract the conversation text from an execution log.

    Args:
        log: Parsed execution-log JSON object.

    Returns:
        {
            "exec_id": str,
            "workflow": str,
            "status": str,
            "start_time": str,
            "rounds": [{"role": "user"|"assistant", "text": str}, ...],
            "files_modified": [str],
            "files_created": [str],
        }
    """
    # Fix: this import previously sat mid-function (between the file-change
    # loop and the return); hoist it to the top of the function body.
    from extract_kiro_session import ts_fmt

    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)

    rounds = []
    for msg in conversation:
        role = msg.get("role", "")
        if role not in ("human", "bot"):
            continue
        texts = []
        for entry in msg.get("entries", []):
            et = entry.get("type", "")
            if et == "text":
                t = entry.get("text", "").strip()
                if t:
                    # Skip auto-injected steering/system prompt text.
                    if t.startswith("## Included Rules") or t.startswith("<steering"):
                        continue
                    texts.append(t)
            elif et == "toolUse":
                # Record only the tool name, not its arguments (saves tokens).
                texts.append(f"[调用工具: {entry.get('name', '?')}]")
            elif et == "toolUseResponse":
                ok = "成功" if entry.get("success") else "失败"
                texts.append(f"[工具结果: {entry.get('name', '?')} {ok}]")
        if texts:
            combined = "\n".join(texts)
            # Truncate overly long single messages (saves tokens).
            if len(combined) > 2000:
                combined = combined[:2000] + "\n[...截断...]"
            rounds.append({
                "role": "user" if role == "human" else "assistant",
                "text": combined,
            })

    # File changes: an entry carrying an "original" snapshot is treated as a
    # modification; otherwise it is a newly created file.
    files_modified = []
    files_created = []
    for step in timeline:
        fc = step.get("_file_change")
        if fc:
            fname = fc.get("file", "?")
            if fc.get("original"):
                if fname not in files_modified:
                    files_modified.append(fname)
            else:
                if fname not in files_created:
                    files_created.append(fname)

    return {
        "exec_id": log.get("executionId", "?"),
        "workflow": log.get("workflowType", "?"),
        "status": log.get("status", "?"),
        "start_time": ts_fmt(log.get("startTime")),
        "rounds": rounds,
        "files_modified": files_modified,
        "files_created": files_created,
    }
def render_conversation_file(data: dict) -> str:
    """Render an extracted conversation as plain text for LLM consumption."""
    lines = [
        f"EXECUTION_ID: {data['exec_id']}",
        f"WORKFLOW: {data['workflow']}",
        f"STATUS: {data['status']}",
        f"START_TIME: {data['start_time']}",
    ]
    # File lists are capped at 20 entries each to keep the header compact.
    if data["files_modified"]:
        lines.append(f"FILES_MODIFIED: {', '.join(data['files_modified'][:20])}")
    if data["files_created"]:
        lines.append(f"FILES_CREATED: {', '.join(data['files_created'][:20])}")
    lines.append(f"ROUNDS: {len(data['rounds'])}")
    lines.append("---")
    for idx, turn in enumerate(data["rounds"], 1):
        label = "USER" if turn["role"] == "user" else "ASSISTANT"
        lines.append(f"\n[{label} #{idx}]")
        lines.append(turn["text"])
    lines.append("\n---END---")
    return "\n".join(lines)
def main() -> None:
    """Export conversation text files for all matching executions.

    Command-line flags:
        --limit N        export at most N executions
        --output-dir D   write files to D instead of OUTPUT_DIR
        --skip-existing  do not re-export files that already exist
    """
    import argparse
    parser = argparse.ArgumentParser(description="导出 session 对话内容供 LLM 批量总结")
    parser.add_argument("--limit", type=int, help="最多导出 N 条")
    parser.add_argument("--output-dir", type=str, help="输出目录")
    parser.add_argument("--skip-existing", action="store_true", help="跳过已导出的")
    args = parser.parse_args()
    out_dir = Path(args.output_dir) if args.output_dir else OUTPUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)
    gs = DEFAULT_GLOBAL_STORAGE
    ws = os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[export] kiro.kiroagent dir not found")
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    # Restrict to chat sessions recorded for this workspace, when available.
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}
    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[export] no executions found")
        return
    # Only export top-level executions (not sub-agents).
    # Sub-agent executions usually have no independent user input.
    count = 0
    skipped = 0
    errors = 0
    for execution in all_execs:
        eid = execution.get("executionId", "")
        # 8-char ID prefix as the filename; NOTE(review): assumed unique
        # enough — a collision would silently overwrite. Confirm upstream.
        short = eid[:8]
        if args.skip_existing:
            out_file = out_dir / f"{short}.txt"
            if out_file.exists():
                skipped += 1
                continue
        log_path = find_execution_log(agent_dir, execution["_hex_dir"], execution)
        if not log_path:
            continue
        try:
            with open(log_path, "r", encoding="utf-8") as f:
                log = json.load(f)
        except Exception:
            # Best-effort: count unreadable/corrupt logs and move on.
            errors += 1
            continue
        # Skip sub-agent executions (no independent user input).
        if log.get("workflowType") in ("sub-agent",):
            continue
        data = extract_conversation_text(log)
        # Skip executions with no conversational content.
        if not data["rounds"]:
            continue
        out_file = out_dir / f"{short}.txt"
        out_file.write_text(render_conversation_file(data), encoding="utf-8")
        count += 1
        if count % 50 == 0:
            print(f"[export] {count} exported...")
        if args.limit and count >= args.limit:
            break
    # Build a manifest (list of IDs, used later to match imported summaries).
    manifest = out_dir / "_manifest.json"
    existing = sorted([f.stem for f in out_dir.glob("*.txt") if f.stem != "_manifest"])
    manifest.write_text(
        json.dumps({"count": len(existing), "ids": existing}, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8"
    )
    print(f"[export] done: {count} exported, {skipped} skipped, {errors} errors")
    print(f"[export] output: {out_dir}")
    print(f"[export] manifest: {manifest}")
    print(f"\n下一步:")
    print(f" 1. 用 LLM 处理 {out_dir}/*.txt为每个文件生成 50-200 字中文摘要")
    print(f" 2. 将结果写入 {out_dir}/_summaries.json格式")
    print(f' {{"exec_id_short": "摘要文本", ...}}')
    print(f" 3. 运行导入脚本将摘要写入索引")
# Standard script entry-point guard.
if __name__ == "__main__":
    main()