#!/usr/bin/env python3 """extract_kiro_session — Kiro 执行日志全量提取器 v2。 改进点(相比 v1): 1. 系统提示词去重:首次保存到 _system_prompts/sp_{hash8}.md,后续引用 2. 目录分层:YYYY-MM/DD/{chatSessionId_short}/ 下按对话组织 3. 子代理递归提取:主 execution + 子 execution 放同一目录,按调用顺序编号 4. ID 替换:kiro-diff URI → 真实文件路径,terminalId → 进程描述 5. CONTEXT TRANSFER 中的 steering 内容折叠引用 6. 无内容的 model action 压缩为一行 用法: python scripts/ops/extract_kiro_session.py # 提取最新 execution python scripts/ops/extract_kiro_session.py --all # 提取所有未索引的 python scripts/ops/extract_kiro_session.py --recent 20 # 提取最近 N 个未索引的 python scripts/ops/extract_kiro_session.py --execution-id XX # 提取指定 execution """ import base64 import hashlib import json import os import re import sys from datetime import datetime, timezone, timedelta from typing import Optional from _env_paths import ensure_repo_root ensure_repo_root() CST = timezone(timedelta(hours=8)) # Kiro 固定的 execution manifest 文件名 MANIFEST_FILENAME = "f62de366d0006e17ea00a01f6624aabf" # 输出路径 SESSION_LOG_DIR = os.path.join("docs", "audit", "session_logs") INDEX_PATH = os.path.join(SESSION_LOG_DIR, "_session_index.json") # 精简版:仅主对话 INDEX_FULL_PATH = os.path.join(SESSION_LOG_DIR, "_session_index_full.json") # 完整版:主对话 + 子代理 SYSTEM_PROMPTS_DIR = os.path.join(SESSION_LOG_DIR, "_system_prompts") # globalStorage 默认路径 DEFAULT_GLOBAL_STORAGE = os.path.join( os.environ.get("APPDATA", ""), "Kiro", "User", "globalStorage" ) # ═══════════════════════════════════════════════════════════ # 工具函数 # ═══════════════════════════════════════════════════════════ def ts_fmt(ms) -> str: if not ms: return "N/A" try: return datetime.fromtimestamp(ms / 1000, tz=CST).strftime("%Y-%m-%d %H:%M:%S") except Exception: return str(ms) def ts_iso(ms) -> str: if not ms: return "" try: return datetime.fromtimestamp(ms / 1000, tz=CST).isoformat() except Exception: return "" def ts_date_parts(ms) -> tuple[str, str, str]: """返回 (YYYY-MM, DD, HHMMSS) 用于目录分层和文件命名""" try: dt = datetime.fromtimestamp(ms / 1000, tz=CST) return dt.strftime("%Y-%m"), 
def _resolve_chat_dir(day_dir: str, chat_short: str, first_hms: str) -> str:
    """Pick the output directory (with per-day sequence number) for a chatSession.

    Rules:
    1. First look in day_dir for an existing directory of the same chatSession
       (later rounds of a same-day conversation)
    2. Then search the whole SESSION_LOG_DIR (cross-day case: the first round
       of the chatSession happened on another date)
    3. Otherwise allocate a new sequence number under day_dir and create it
       - directory format: {seq:02d}_{chat_short}_{first_hms}/
    """
    os.makedirs(day_dir, exist_ok=True)
    # 1. Same-day lookup.
    for d in os.listdir(day_dir):
        if os.path.isdir(os.path.join(day_dir, d)) and chat_short in d:
            return os.path.join(day_dir, d)
    # 2. Cross-day search: walk every YYYY-MM/DD/ directory.
    log_root = SESSION_LOG_DIR
    if os.path.isdir(log_root):
        for ym in os.listdir(log_root):
            ym_path = os.path.join(log_root, ym)
            # Skip plain files and special dirs such as _system_prompts.
            if not os.path.isdir(ym_path) or ym.startswith("_"):
                continue
            for dd in os.listdir(ym_path):
                dd_path = os.path.join(ym_path, dd)
                if not os.path.isdir(dd_path):
                    continue
                for d in os.listdir(dd_path):
                    if os.path.isdir(os.path.join(dd_path, d)) and chat_short in d:
                        return os.path.join(dd_path, d)
    # 3. New chatSession: allocate the next two-digit sequence number.
    existing_seqs = []
    for d in os.listdir(day_dir):
        if os.path.isdir(os.path.join(day_dir, d)) and len(d) >= 2 and d[:2].isdigit():
            existing_seqs.append(int(d[:2]))
    next_seq = max(existing_seqs, default=0) + 1
    new_dir = os.path.join(day_dir, f"{next_seq:02d}_{chat_short}_{first_hms}")
    os.makedirs(new_dir, exist_ok=True)
    return new_dir


def _write_cross_day_ref(exec_day_dir: str, chat_short: str, chat_dir: str):
    """Write a cross-day pointer file under the execution's date directory.

    When a chatSession spans days, later day_dirs have no directory for that
    conversation; _ref_{chatShort}.md records where it is actually archived.
    """
    os.makedirs(exec_day_dir, exist_ok=True)
    ref_path = os.path.join(exec_day_dir, f"_ref_{chat_short}.md")
    if os.path.isfile(ref_path):
        return  # already present, do not rewrite
    rel_target = os.path.relpath(chat_dir, exec_day_dir).replace("\\", "/")
    with open(ref_path, "w", encoding="utf-8") as f:
        f.write(f"# 跨天对话指引\n\n")
        f.write(f"chatSession `{chat_short}` 的完整记录归档在:\n\n")
        f.write(f"→ `{rel_target}`\n\n")
        f.write(f"(绝对路径:`{chat_dir.replace(chr(92), '/')}`)\n")


def trunc(s, n=3000) -> str:
    """Clip s to n characters, appending a truncation marker; non-str input is str()-ed."""
    if not isinstance(s, str):
        return str(s)
    return s if len(s) <= n else s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"


def safe_json(obj, n=5000) -> str:
    """Dump obj as pretty JSON (falling back to str on error) and clip to n characters."""
    try:
        s = json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception:
        s = str(obj)
    return s if len(s) <= n else s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
[截断,原文共 {len(s)} 字符]" def fence(content: str, lang: str = "") -> str: """生成安全的 Markdown 代码围栏。 检测 content 中最长连续反引号序列,外层用更多反引号包裹。 同时转义行首 # 避免被解析为 Markdown 标题。 如果内容中有未闭合的围栏,在末尾补上关闭围栏。 """ if not content: return f"```{lang}\n```" # 修复内容中未闭合的围栏(原始数据截断导致) fence_stack = [] for line in content.split("\n"): stripped = line.strip() m = re.match(r"^(`{3,})", stripped) if m: ticks = len(m.group(1)) # 如果栈顶有相同 tick 数的开启围栏,且当前行是纯关闭围栏 if fence_stack and fence_stack[-1] == ticks and stripped == '`' * ticks: fence_stack.pop() else: fence_stack.append(ticks) # 补上所有未闭合的围栏(从栈顶开始关闭) if fence_stack: suffix_lines = ['`' * t for t in reversed(fence_stack)] content = content + "\n" + "\n".join(suffix_lines) # 找出内容中最长的连续反引号 max_ticks = 2 cur = 0 for ch in content: if ch == '`': cur += 1 if cur > max_ticks: max_ticks = cur else: cur = 0 outer = '`' * (max_ticks + 1) # 转义行首 # —— 加零宽空格使其不被解析为标题 safe = _escape_heading(content) return f"{outer}{lang}\n{safe}\n{outer}" def _escape_heading(text: str) -> str: """转义文本中行首的 # 符号,防止被 Markdown 解析为标题。 在 # 前插入零宽空格 (\\u200b)。 """ lines = text.split('\n') out = [] for line in lines: if line.lstrip().startswith('#'): # 找到第一个 # 的位置,在前面插入零宽空格 idx = 0 while idx < len(line) and line[idx] in (' ', '\t'): idx += 1 out.append(line[:idx] + '\u200b' + line[idx:]) else: out.append(line) return '\n'.join(out) def hash8(text: str) -> str: return hashlib.sha256(text.encode("utf-8")).hexdigest()[:8] # ═══════════════════════════════════════════════════════════ # 系统提示词去重 # ═══════════════════════════════════════════════════════════ def save_system_prompt(text: str) -> str: """保存系统提示词到 _system_prompts/,返回引用文件名。 如果已存在相同 hash 的文件则跳过。 """ h = hash8(text) filename = f"sp_{h}.md" filepath = os.path.join(SYSTEM_PROMPTS_DIR, filename) if not os.path.isfile(filepath): os.makedirs(SYSTEM_PROMPTS_DIR, exist_ok=True) with open(filepath, "w", encoding="utf-8") as f: f.write(f"# 系统提示词 (hash: {h})\n\n") f.write(text) return filename def is_system_prompt(text: str) -> bool: 
"""判断文本是否为系统提示词(Kiro 注入的 / 等)""" if not text: return False # 系统提示词通常以 开头或包含 return ( "" in text[:200] or "" in text[:500] or text.strip().startswith("You are Kiro") ) def is_steering_block(text: str) -> bool: """判断文本是否为 steering-reminder 注入""" return "" in text[:100] # ═══════════════════════════════════════════════════════════ # ID 替换与路径还原 # ═══════════════════════════════════════════════════════════ # kiro-diff URI 模式:kiro-diff:/path?commitId=xxx&executionId=yyy KIRO_DIFF_PATTERN = re.compile( r'kiro-diff:(/[^?]+)\?commitId(?:%3D|=)([^&]+)&executionId(?:%3D|=)([^"\'}\s]+)' ) def resolve_kiro_diff_uri(uri: str) -> str: """将 kiro-diff: URI 替换为可读的文件路径描述""" m = KIRO_DIFF_PATTERN.search(uri) if m: filepath = m.group(1) commit_id = m.group(2) return f"{filepath} (版本: {commit_id[:8]})" return uri def resolve_ids_in_text(text: str) -> str: """在文本中替换已知的 ID 模式为可读信息""" if not text or not isinstance(text, str): return str(text) if text else "" # 替换 kiro-diff URI text = KIRO_DIFF_PATTERN.sub( lambda m: f'[文件快照] {m.group(1)} (版本 {m.group(2)[:8]})', text ) # 替换 file:///c%3A/ 编码路径 text = re.sub( r'file:///([a-zA-Z])%3A/', lambda m: f'{m.group(1).upper()}:/', text ) return text def resolve_tool_args(name: str, args: dict) -> dict: """对工具调用参数做可读性增强""" resolved = dict(args) # document 类型中的 target 可能是 kiro-diff URI if "target" in resolved and isinstance(resolved["target"], str): resolved["target"] = resolve_kiro_diff_uri(resolved["target"]) # editCode / strReplace 中的 path if "path" in resolved and isinstance(resolved["path"], str): resolved["path"] = resolve_ids_in_text(resolved["path"]) # document entries 中的 modified/original for key in ("modified", "original", "local"): if key in resolved and isinstance(resolved[key], str): resolved[key] = resolve_ids_in_text(resolved[key]) return resolved # ═══════════════════════════════════════════════════════════ # 定位逻辑 # ═══════════════════════════════════════════════════════════ def find_kiro_agent_dir(global_storage: str) -> Optional[str]: 
def decode_base64url_dir(dirname: str) -> str:
    """Decode a base64url-encoded workspace directory name ("" on failure).

    Kiro stores the trailing "==" padding as "__" in directory names.
    """
    try:
        b64 = dirname.replace("__", "==")
        return base64.urlsafe_b64decode(b64).decode("utf-8", errors="replace")
    except Exception:
        return ""


def find_workspace_session_dir(agent_dir: str, workspace_path: str) -> Optional[str]:
    """Locate the workspace-sessions entry whose decoded name matches workspace_path."""
    ws_dir = os.path.join(agent_dir, "workspace-sessions")
    if not os.path.isdir(ws_dir):
        return None
    # Compare case-insensitively with normalised path separators.
    target = workspace_path.replace("\\", "/").rstrip("/").lower()
    for entry in os.scandir(ws_dir):
        if entry.is_dir():
            decoded = decode_base64url_dir(entry.name)
            if decoded.replace("\\", "/").rstrip("/").lower() == target:
                return entry.path
    return None


def load_sessions_json(session_dir: str) -> list[dict]:
    """Load sessions.json from session_dir; returns [] on any error.

    Accepts both a bare list and a {"sessions": [...]} wrapper.
    """
    sessions_file = os.path.join(session_dir, "sessions.json")
    if not os.path.isfile(sessions_file):
        return []
    try:
        with open(sessions_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        if isinstance(data, dict) and "sessions" in data:
            return data["sessions"]
        return []
    except Exception:
        return []


def load_session_detail(session_dir: str, session_id: str) -> Optional[dict]:
    """Load the per-session detail file {session_id}.json; None if missing or corrupt."""
    path = os.path.join(session_dir, f"{session_id}.json")
    if not os.path.isfile(path):
        return None
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None


def find_all_manifests(agent_dir: str) -> list[tuple[str, str]]:
    """Return (hex_dir_name, manifest_path) for every 32-hex-char dir holding a manifest."""
    results = []
    hex_pattern = re.compile(r"^[0-9a-f]{32}$")
    for entry in os.scandir(agent_dir):
        if entry.is_dir() and hex_pattern.match(entry.name):
            manifest = os.path.join(entry.path, MANIFEST_FILENAME)
            if os.path.isfile(manifest):
                results.append((entry.name, manifest))
    return results


def load_manifest(manifest_path: str) -> list[dict]:
    """Load a manifest file; returns [] on any error.

    Supports the legacy bare-list format as well as the newer
    {"executions": [...], "version": ...} format.
    """
    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        # Newer Kiro manifest format: {"executions": [...], "version": ...}
        if isinstance(data, dict) and "executions" in data:
            return data["executions"]
        return []
    except Exception:
        return []


def find_execution_log(agent_dir: str, hex_dir: str, execution: dict) -> Optional[str]:
    """Locate the full execution-log file for *execution* under agent_dir/hex_dir.

    Scans files >= 1000 bytes two levels down; a file matches when its first
    500 characters contain the executionId and its JSON payload confirms it.
    Returns the path or None.
    """
    exec_id = execution.get("executionId", "")
    hex_path = os.path.join(agent_dir, hex_dir)
    for entry in os.scandir(hex_path):
        if not entry.is_dir():
            continue
        for sub_entry in os.scandir(entry.path):
            if not sub_entry.is_file() or sub_entry.stat().st_size < 1000:
                continue
            try:
                with open(sub_entry.path, "r", encoding="utf-8") as f:
                    head = f.read(500)
                    if exec_id in head:
                        f.seek(0)
                        data = json.load(f)
                        if data.get("executionId") == exec_id:
                            return sub_entry.path
            except Exception:
                continue
    return None


def find_all_executions(
    agent_dir: str,
    chat_session_ids: Optional[set[str]] = None,
    execution_id: Optional[str] = None,
) -> list[dict]:
    """Collect matching executions from every manifest, sorted by endTime desc.

    When execution_id is given, returns the first execution whose id equals or
    starts with it. Otherwise executions are filtered by chat_session_ids
    (only when the entry has a chatSessionId). Each entry is annotated with
    "_hex_dir" so the caller can locate its log file later.
    """
    manifests = find_all_manifests(agent_dir)
    all_execs = []
    for hex_dir, manifest_path in manifests:
        entries = load_manifest(manifest_path)
        for entry in entries:
            entry["_hex_dir"] = hex_dir
            if execution_id:
                eid = entry.get("executionId", "")
                if eid == execution_id or eid.startswith(execution_id):
                    return [entry]
            # Filter only when the execution carries a chatSessionId that is
            # not part of the workspace's session set.
            csid = entry.get("chatSessionId")
            if chat_session_ids and csid and csid not in chat_session_ids:
                continue
            all_execs.append(entry)
    all_execs.sort(key=lambda e: e.get("endTime", 0), reverse=True)
    return all_execs


# ═══════════════════════════════════════════════════════════
# Parsing
# ═══════════════════════════════════════════════════════════


def parse_messages(messages: list) -> list[dict]:
    """Parse context.messages, applying system-prompt dedup and ID resolution."""
    conversation = []
    for i, msg in enumerate(messages):
        entries = msg.get("entries", [])
        parsed = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            etype = entry.get("type", "unknown")
            if etype == "text":
                text = entry.get("text", "")
                # System prompts are stored once and referenced by file name.
                if is_system_prompt(text):
                    sp_file = save_system_prompt(text)
                    parsed.append({
                        "type": "system_prompt_ref",
                        "ref_file": sp_file,
                        "char_count": len(text),
                    })
                elif is_steering_block(text):
                    # Collapse steering content, keeping only the file names.
                    steering_files = re.findall(r'(\w[\w-]+\.md):', text)
                    parsed.append({
                        "type": "steering_ref",
                        "files": steering_files or ["(steering block)"],
                        "char_count": len(text),
                    })
                else:
                    parsed.append({"type": "text", "text": resolve_ids_in_text(text)})
            elif etype == "toolUse":
                args = resolve_tool_args(entry.get("name", ""), entry.get("args", {}))
                parsed.append({
                    "type": "toolUse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "args": args,
                })
            elif etype == "toolUseResponse":
                msg_text = entry.get("message", "")
                parsed.append({
                    "type": "toolUseResponse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "message": resolve_ids_in_text(msg_text),
                    "success": entry.get("success"),
                })
            elif etype == "document":
                doc = entry.get("document", {})
                doc_type = doc.get("type", "")
                target = doc.get("target", "")
                # steering documents: keep only the display name.
                if doc_type == "steering":
                    display_name = doc.get("displayName", "")
                    parsed.append({
                        "type": "steering_doc",
                        "name": display_name or "steering",
                    })
                else:
                    parsed.append({
                        "type": "document",
                        "doc_type": doc_type,
                        "target": resolve_ids_in_text(target) if target else "",
                    })
            else:
                # Unknown entry type: record only its keys for diagnostics.
                parsed.append({"type": etype, "raw_keys": list(entry.keys())})
        conversation.append({
            "index": i,
            "role": msg.get("role", "?"),
            "messageId": msg.get("messageId", "?"),
            "entries": parsed,
        })
    return conversation


def parse_actions(actions: list) -> list[dict]:
    """Parse actions, compressing content-less model actions to one summary entry."""
    timeline = []
    for i, action in enumerate(actions):
        atype = action.get("actionType", "")
        astate = action.get("actionState", "")
        # Compress model actions that carry no input/output.
        if atype == "model" and "output" not in action and "input" not in action:
            timeline.append({
                "index": i,
                "actionType": "model",
                "actionState": astate,
                "emittedAt": ts_fmt(action.get("emittedAt")),
                "_compressed": True,
            })
            continue
        entry = {
            "index": i,
            "actionId": action.get("actionId"),
            "actionType": atype,
            "actionState": astate,
            "emittedAt": ts_fmt(action.get("emittedAt")),
        }
        if action.get("subExecutionId"):
            entry["subExecutionId"] = action["subExecutionId"]
        if action.get("endTime"):
            entry["endTime"] = ts_fmt(action["endTime"])
        for k in ("intentResult", "input", "output"):
            if k in action:
                val = action[k]
                # Apply ID resolution to string fields of input/output dicts.
                if isinstance(val, dict):
                    val = dict(val)  # copy: never mutate the raw log data
                    # Pull file-change info (originalContent/modifiedContent of
                    # write/create actions) into a dedicated "_file_change" slot.
                    if k == "input" and ("originalContent" in val or "modifiedContent" in val):
                        file_path = val.get("file", val.get("path", "?"))
                        entry["_file_change"] = {
                            "file": resolve_ids_in_text(str(file_path)),
                            "original": val.get("originalContent", ""),
                            "modified": val.get("modifiedContent", ""),
                        }
                        # Strip the large text bodies from input, keep metadata.
                        slim = {vk: resolve_ids_in_text(str(vv)) if isinstance(vv, str) else vv
                                for vk, vv in val.items()
                                if vk not in ("originalContent", "modifiedContent")}
                        entry[k] = slim
                        continue
                    for vk, vv in val.items():
                        if isinstance(vv, str):
                            val[vk] = resolve_ids_in_text(vv)
                entry[k] = val
        timeline.append(entry)
    return timeline


def extract_sub_execution_ids(actions: list) -> list[str]:
    """Extract every distinct subExecutionId from actions, in order of appearance."""
    seen = set()
    result = []
    for action in actions:
        sid = action.get("subExecutionId")
        if sid and sid not in seen:
            seen.add(sid)
            result.append(sid)
    return result


# ═══════════════════════════════════════════════════════════
# Diff snapshot collection
# ═══════════════════════════════════════════════════════════


def collect_diffs(agent_dir: str, hex_dir: str, execution: dict) -> dict[str, dict]:
    """Placeholder for diff collection; always returns {}.

    Kiro inlines originalContent / modifiedContent in write/create action
    inputs, so extracting from actions is more reliable than reading the
    diff snapshots stored under the fixed 74a08cf8.../commitId/ directory.
    """
    # Diff snapshots live under a fixed 74a08cf8.../commitId/ directory,
    # but the action inputs already inline the content, so read from actions.
    return {}


# ═══════════════════════════════════════════════════════════
# Prompt-log matching
# ═══════════════════════════════════════════════════════════

PROMPT_LOG_DIR = os.path.join("docs", "audit", "prompt_logs")
def find_matching_prompt_log(start_time_ms: int) -> Optional[str]:
    """Find the prompt_log file closest to an execution's startTime.

    File name format: prompt_log_YYYYMMDD_HHMMSS.md.
    Match window: the nearest file within +/- 5 minutes of startTime.
    Returns the file path or None.
    """
    if not os.path.isdir(PROMPT_LOG_DIR):
        return None
    try:
        exec_dt = datetime.fromtimestamp(start_time_ms / 1000, tz=CST)
    except Exception:
        return None
    best_match = None
    best_delta = float("inf")
    pattern = re.compile(r"prompt_log_(\d{8}_\d{6})\.md$")
    for f in os.scandir(PROMPT_LOG_DIR):
        if not f.is_file():
            continue
        m = pattern.match(f.name)
        if not m:
            continue
        try:
            log_dt = datetime.strptime(m.group(1), "%Y%m%d_%H%M%S").replace(tzinfo=CST)
            delta = abs((exec_dt - log_dt).total_seconds())
            if delta < 300 and delta < best_delta:  # 5-minute window
                best_delta = delta
                best_match = f.path
        except Exception:
            continue
    return best_match


# ═══════════════════════════════════════════════════════════
# Markdown generation
# ═══════════════════════════════════════════════════════════


def _msg_semantic_label(msg: dict) -> str:
    """Build a one-line semantic label for a conversation message (quick navigation)."""
    entries = msg.get("entries", [])
    if not entries:
        return ""
    parts = []
    for e in entries:
        et = e["type"]
        if et == "system_prompt_ref":
            parts.append("系统提示词")
        elif et == "steering_ref":
            parts.append(f"Steering({len(e.get('files', []))})")
        elif et == "steering_doc":
            parts.append(f"Steering:`{e.get('name', '?')}`")
        elif et == "toolUse":
            name = e.get("name", "?")
            # Pull one key argument as context for well-known tools.
            args = e.get("args", {})
            ctx = ""
            if name in ("readFile", "readCode", "readMultipleFiles"):
                ctx = args.get("path", "") or ",".join(args.get("paths", [])[:2])
            elif name in ("fsWrite", "strReplace", "editCode"):
                ctx = args.get("path", "")
            elif name == "grepSearch":
                ctx = args.get("query", "")[:30]
            elif name == "invokeSubAgent":
                ctx = args.get("name", "")
            elif name == "executePwsh":
                ctx = (args.get("command", "") or "")[:40]
            elif name == "taskStatus":
                ctx = args.get("status", "")
            if ctx:
                parts.append(f"调用 `{name}` → {ctx}")
            else:
                parts.append(f"调用 `{name}`")
        elif et == "toolUseResponse":
            name = e.get("name", "?")
            ok = "✅" if e.get("success") else "❌"
            parts.append(f"结果 `{name}` {ok}")
        elif et == "document":
            parts.append(f"文档:{e.get('doc_type', '?')}")
        elif et == "text":
            # Text entries: preview the first 50 chars (for both bot and human).
            role = msg.get("role", "")
            if role in ("bot", "human"):
                text = (e.get("text") or "").strip()
                if text:
                    preview = text[:50].replace("\n", " ")
                    if len(text) > 50:
                        preview += "…"
                    icon = "💬" if role == "bot" else "📝"
                    parts.append(f"{icon} `{preview}`")
    return ", ".join(parts) if parts else ""


def _step_semantic_label(step: dict) -> str:
    """Build an icon-decorated semantic label for an action-timeline step."""
    at = step.get("actionType", "?")
    state = step.get("actionState", "?")
    fc = step.get("_file_change")
    sub_eid = step.get("subExecutionId")
    # State icon.
    if state == "Error":
        state_icon = "❌"
    elif state in ("Success", "Accepted"):
        state_icon = "✅"
    else:
        state_icon = "⏳"
    # Type icon plus context.
    if at in ("write", "append") and fc:
        fname = fc.get("file", "?")
        short = fname.rsplit("/", 1)[-1] if "/" in fname else fname
        orig = fc.get("original", "")
        if orig:
            return f"⚡ `{at}` 修改 `{short}` {state_icon}"
        else:
            return f"⚡ `{at}` 新建 `{short}` {state_icon}"
    elif at == "invokeSubAgent":
        inp = step.get("input", {})
        # Raw Kiro logs use subAgentName; the tool schema uses name.
        agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
        return f"🔀 `invokeSubAgent` → {agent_name} {state_icon}"
    elif at == "subagentResponse":
        return f"🔀 `subagentResponse` {state_icon}"
    elif at in ("readFiles", "readCode"):
        inp = step.get("input", {})
        if isinstance(inp, dict):
            files = inp.get("files", [])
            # files entries can be dicts ({"path": ...}) or plain strings.
            if files and isinstance(files[0], dict):
                paths = [f.get("path", "?") for f in files[:2]]
            else:
                paths = [str(f) for f in files[:2]]
            ctx = ", ".join(paths)
        else:
            ctx = ""
        return f"📖 `{at}` {ctx} {state_icon}" if ctx else f"📖 `{at}` {state_icon}"
    elif at == "search":
        return f"🔍 `search` {state_icon}"
    elif at == "say":
        return f"💬 `say` {state_icon}"
    elif at == "taskStatus":
        return f"📋 `taskStatus` {state_icon}"
    elif at == "steering":
        return f"📄 `steering` {state_icon}"
    elif at == "runCommand":
        return f"🖥️ `runCommand` {state_icon}"
    elif at == "getDiagnostics":
        return f"🩺 `getDiagnostics` {state_icon}"
    elif at == "ContextualHookInvoked":
        inp = step.get("input", {})
        hook_name = inp.get("name", "?") if isinstance(inp, dict) else "?"
        return f"🪝 Hook `{hook_name}` {state_icon}"
    elif at == "intentClassification":
        ir = step.get("intentResult", {})
        cls = ir.get("classification", "?") if isinstance(ir, dict) else "?"
        return f"🎯 意图: `{cls}` {state_icon}"
    elif at == "replace":
        inp = step.get("input", {})
        path = inp.get("file", inp.get("path", "?")) if isinstance(inp, dict) else "?"
        short = path.rsplit("/", 1)[-1] if "/" in str(path) else str(path)
        return f"✏️ `replace` `{short}` {state_icon}"
    else:
        return f"`{at}` [{state}]"


def _build_execution_summary(
    log: dict,
    conversation: list[dict],
    timeline: list[dict],
    sub_file_map: Optional[dict[str, str]] = None,
) -> dict:
    """Build a structured execution summary (pure rules, zero LLM cost).

    The returned dict is shared by the markdown renderer and the index store.
    """
    dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
    files_modified = []
    files_created = []
    sub_agents = []
    errors = []
    for step in timeline:
        if step.get("_compressed"):
            continue
        idx = step.get("index", "?")
        at = step.get("actionType", "?")
        state = step.get("actionState", "?")
        fc = step.get("_file_change")
        if fc:
            fname = fc.get("file", "?")
            # Presence of originalContent distinguishes modify from create.
            if fc.get("original"):
                files_modified.append(fname)
            else:
                files_created.append(fname)
        if at == "invokeSubAgent":
            inp = step.get("input", {})
            agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
            sub_agents.append(agent_name)
        if state == "Error":
            errors.append(f"Step {idx}: `{at}`")
    for msg in conversation:
        for e in msg.get("entries", []):
            if e.get("type") == "toolUseResponse" and not e.get("success"):
                errors.append(f"Msg {msg['index']}: `{e.get('name', '?')}`")
    # De-duplicate file names, keeping first-seen order.
    files_modified = list(dict.fromkeys(files_modified))
    files_created = list(dict.fromkeys(files_created))
    # description is produced later by an external LLM (Bailian API),
    # never during extraction.
    description = ""
    return {
        "workflow": log.get("workflowType", "?"),
        "status": log.get("status", "?"),
        "duration_s": round(dur, 1),
        "msg_count": len(conversation),
        "action_count": len(timeline),
        "files_modified": files_modified,
        "files_created": files_created,
        "sub_agents": sub_agents,
        "errors": errors,
        "description": description,
    }


def _render_summary_md(summary: dict, sub_file_map: Optional[dict[str, str]] = None) -> str:
    """Render the structured summary as Markdown (placed at the top of the output file)."""
    L = []
    # One-line overview.
    status_icon = "✅" if summary["status"] == "succeed" else "❌"
    L.append(f"{status_icon} `{summary['workflow']}` | {summary['duration_s']}s | "
             f"{summary['msg_count']} msgs | {summary['action_count']} actions")
    L.append("")
    desc = summary.get("description", "")
    if desc:
        L.append(f"> {desc}")
        L.append("")
    fm = summary["files_modified"]
    fc = summary["files_created"]
    if fm or fc:
        L.append(f"**文件变更** (修改 {len(fm)} / 新建 {len(fc)})")
        for f in fm:
            L.append(f"- ⚡ 修改 `{f}`")
        for f in fc:
            L.append(f"- ✨ 新建 `{f}`")
        L.append("")
    sa = summary["sub_agents"]
    if sa:
        L.append(f"**子代理** ({len(sa)}): {', '.join(f'`{a}`' for a in sa)}")
        L.append("")
    errs = summary["errors"]
    if errs:
        L.append(f"**错误** ({len(errs)})")
        for e in errs:
            L.append(f"- ❌ {e}")
        L.append("")
    if not fm and not fc and not sa and not errs:
        L.append("*(无文件变更、子代理调用或错误)*")
    return "\n".join(L)
f"{summary['msg_count']} msgs | {summary['action_count']} actions") L.append("") desc = summary.get("description", "") if desc: L.append(f"> {desc}") L.append("") fm = summary["files_modified"] fc = summary["files_created"] if fm or fc: L.append(f"**文件变更** (修改 {len(fm)} / 新建 {len(fc)})") for f in fm: L.append(f"- ⚡ 修改 `{f}`") for f in fc: L.append(f"- ✨ 新建 `{f}`") L.append("") sa = summary["sub_agents"] if sa: L.append(f"**子代理** ({len(sa)}): {', '.join(f'`{a}`' for a in sa)}") L.append("") errs = summary["errors"] if errs: L.append(f"**错误** ({len(errs)})") for e in errs: L.append(f"- ❌ {e}") L.append("") if not fm and not fc and not sa and not errs: L.append("*(无文件变更、子代理调用或错误)*") return "\n".join(L) def _build_nav_summary( conversation: list[dict], timeline: list[dict], sub_file_map: Optional[dict[str, str]] = None, ) -> str: """生成快速导航摘要:文件变更、子代理、错误。""" file_changes = [] sub_agents = [] errors = [] for step in timeline: if step.get("_compressed"): continue idx = step.get("index", "?") at = step.get("actionType", "?") state = step.get("actionState", "?") # 文件变更 fc = step.get("_file_change") if fc: fname = fc.get("file", "?") orig = fc.get("original", "") action = "修改" if orig else "新建" file_changes.append(f"- Step {idx}: ⚡ {action} `{fname}`") # 子代理 if at == "invokeSubAgent": inp = step.get("input", {}) agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?" 
sub_eid = step.get("subExecutionId", "") sub_path = "" if sub_file_map and sub_eid and sub_eid in sub_file_map: sub_path = f" → `{sub_file_map[sub_eid].replace(chr(92), '/')}`" sub_agents.append(f"- Step {idx}: 🔀 `{agent_name}`{sub_path}") # 错误 if state == "Error": errors.append(f"- Step {idx}: ❌ `{at}`") # 对话中的错误工具结果 for msg in conversation: for e in msg.get("entries", []): if e.get("type") == "toolUseResponse" and not e.get("success"): errors.append(f"- Msg {msg['index']}: ❌ `{e.get('name', '?')}`") lines = [] if file_changes: lines.append(f"**文件变更** ({len(file_changes)})") lines.extend(file_changes) lines.append("") if sub_agents: lines.append(f"**子代理调用** ({len(sub_agents)})") lines.extend(sub_agents) lines.append("") if errors: lines.append(f"**错误** ({len(errors)})") lines.extend(errors) lines.append("") if not lines: lines.append("*(无文件变更、子代理调用或错误)*") return "\n".join(lines) def generate_full_record( log: dict, conversation: list[dict], timeline: list[dict], diffs: dict[str, dict], session_info: Optional[dict] = None, prompt_log_path: Optional[str] = None, is_sub: bool = False, sub_index: int = 0, sub_file_map: Optional[dict[str, str]] = None, prev_msg_count: int = 0, ) -> tuple[str, dict]: """生成单个 execution 的 Markdown 全量记录。 Args: log: 原始 execution log JSON conversation: parse_messages 输出 timeline: parse_actions 输出 diffs: collect_diffs 输出 session_info: 会话配置(仅主 execution 有) prompt_log_path: 匹配的 prompt_log 文件路径 is_sub: 是否为子代理 execution sub_index: 子代理序号(从 1 开始) prev_msg_count: 前一轮 execution 的消息数,用于去重(跳过累积的历史消息) """ L = [] exec_id = log.get("executionId", "?") chat_id = log.get("chatSessionId", "?") # 构建结构化摘要(供 md 和索引共用) _summary = _build_execution_summary(log, conversation, timeline, sub_file_map) # 标题 if is_sub: L.append(f"# 子代理 Execution #{sub_index}\n") else: L.append("# Kiro 会话全量记录\n") L.append(f"> 生成时间: {datetime.now(CST).strftime('%Y-%m-%d %H:%M:%S')} CST\n") # ── 0. 
执行摘要(文件最前面,AI 读前 20 行即可掌握全貌)── L.append("## 📋 执行摘要\n") L.append(_render_summary_md(_summary, sub_file_map)) L.append("") # ── 1. 元数据 ── L.append("## 1. 元数据\n") L.append("| 字段 | 值 |") L.append("|------|-----|") L.append(f"| executionId | `{exec_id}` |") L.append(f"| chatSessionId | `{chat_id}` |") L.append(f"| workflowType | `{log.get('workflowType', '?')}` |") L.append(f"| autonomyMode | `{log.get('autonomyMode', '?')}` |") L.append(f"| status | `{log.get('status', '?')}` |") L.append(f"| startTime | `{ts_fmt(log.get('startTime'))}` |") L.append(f"| endTime | `{ts_fmt(log.get('endTime'))}` |") dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000 L.append(f"| duration | `{dur:.1f}s` |") L.append(f"| contextUsage | `{log.get('contextUsagePercentage', 0):.2f}%` |") L.append("") if session_info and not is_sub: L.append(f"- 会话标题: `{session_info.get('title', '?')}`") L.append(f"- 创建时间: `{ts_fmt(int(session_info.get('dateCreated', 0)))}`") L.append(f"- 工作区: `{session_info.get('workspaceDirectory', '?')}`") L.append("") if prompt_log_path and not is_sub: rel = os.path.relpath(prompt_log_path).replace("\\", "/") L.append(f"- 关联 prompt_log: `{rel}`") L.append("") # ── 2. 用户输入 ── L.append("## 2. 用户输入\n") input_text = "" for msg in log.get("input", {}).get("data", {}).get("messages", []): for entry in msg.get("content", msg.get("entries", [])): if isinstance(entry, dict) and entry.get("text"): input_text += entry["text"] + "\n" if input_text.strip(): L.append(fence(input_text.strip()) + "\n") else: L.append("*(无用户输入)*\n") # ── 3. 对话记录 ── L.append("## 3. 
对话记录\n") # 去重:同一 chatSession 的非首轮 execution,context.messages 包含前几轮的累积历史 # prev_msg_count > 0 时跳过前 N 条,只渲染本轮新增的消息 new_msgs = conversation[prev_msg_count:] if prev_msg_count > 0 else conversation h = sum(1 for m in new_msgs if m["role"] == "human") b = sum(1 for m in new_msgs if m["role"] == "bot") t = sum(1 for m in new_msgs if m["role"] == "tool") if prev_msg_count > 0: L.append(f"共 {len(new_msgs)} 条新增消息 (跳过前 {prev_msg_count} 条历史): human={h}, bot={b}, tool={t}\n") else: L.append(f"共 {len(new_msgs)} 条消息: human={h}, bot={b}, tool={t}\n") for msg in new_msgs: emoji = {"human": "👤", "bot": "🤖", "tool": "🔧"}.get(msg["role"], "❓") # 生成语义标签 msg_label = _msg_semantic_label(msg) label_suffix = f" — {msg_label}" if msg_label else "" # P0: 压缩 hook 输出的空消息(特征:HUMAN 消息只含 "Output:\nCommand executed..." 或 "Output:\nCommand timed out...") if msg["role"] == "human" and len(msg["entries"]) == 1: e0 = msg["entries"][0] if e0["type"] == "text": _txt = (e0.get("text") or "").strip() if _txt.startswith("Output:") and ("Exit Code:" in _txt) and len(_txt) < 200: # 提取 exit code import re as _re _ec_match = _re.search(r"Exit Code:\s*(-?\d+)", _txt) _ec = _ec_match.group(1) if _ec_match else "?" 
L.append(f"### Msg {msg['index']}: 👤 HUMAN — 🪝 Hook 输出 (exit={_ec})\n") continue L.append(f"### Msg {msg['index']}: {emoji} {msg['role'].upper()}{label_suffix}\n") for entry in msg["entries"]: et = entry["type"] if et == "system_prompt_ref": ref = entry["ref_file"] chars = entry["char_count"] sp_path = f"docs/audit/session_logs/_system_prompts/{ref}" L.append(f"**[系统提示词]** → `{sp_path}` ({chars} 字符)\n") elif et == "steering_ref": files = ", ".join(entry["files"]) chars = entry["char_count"] L.append(f"**[Steering]** 引用: {files} ({chars} 字符)\n") elif et == "text": text = entry.get("text", "") if not text: L.append("*(空)*\n") else: L.append(fence(text) + "\n") elif et == "toolUse": name = entry.get("name", "?") args = entry.get("args", {}) L.append(f"**[🔧 调用]** `{name}`\n") # P1: strReplace/editCode 的代码变更用 diff 格式展示 if name in ("strReplace", "editCode") and isinstance(args, dict): _path = args.get("path", "?") _lang = "python" if _path.endswith(".py") else "sql" if _path.endswith(".sql") else "" L.append(f"- 文件: `{_path}`\n") _old = args.get("oldStr", args.get("old_str", "")) _new = args.get("newStr", args.get("new_str", "")) _sel = args.get("selector", "") _op = args.get("operation", "") _repl = args.get("replacement", "") if _sel: L.append(f"- selector: `{_sel}`" + (f" ({_op})" if _op else "")) if _old: L.append(f"- 删除:\n" + fence(trunc(_old, 2000), _lang)) if _new: L.append(f"- 插入:\n" + fence(trunc(_new, 2000), _lang)) if _repl: L.append(f"- 替换为:\n" + fence(trunc(_repl, 2000), _lang)) L.append("") else: L.append(fence(safe_json(args, 5000), "json") + "\n") elif et == "toolUseResponse": ok = "✅" if entry.get("success") else "❌" L.append(f"**[📋 结果]** `{entry.get('name', '?')}` {ok}\n") msg_text = entry.get("message", "") if msg_text: L.append(fence(trunc(msg_text, 5000)) + "\n") elif et == "document": target = entry.get("target", "") L.append(f"**[📄 文档]** type=`{entry.get('doc_type')}` target=`{target}`\n") elif et == "steering_doc": L.append(f"**[📄 Steering]** 
`{entry.get('name', 'steering')}`\n") else: L.append(f"**[{et}]** keys={entry.get('raw_keys')}\n") # ── 4. Actions 时间线 ── L.append("## 4. Actions 时间线\n") L.append(f"共 {len(timeline)} 个\n") for step in timeline: if step.get("_compressed"): L.append(f"- `model` [{step.get('actionState')}] @ {step.get('emittedAt')}\n") continue at = step.get('actionType', '?') state = step.get('actionState', '?') # 生成语义标签 step_label = _step_semantic_label(step) L.append(f"### Step {step['index']}: {step_label} @ {step.get('emittedAt','?')}\n") if step.get("subExecutionId"): sub_eid = step["subExecutionId"] L.append(f"- subExecutionId: `{sub_eid}`") # 标注子代理文件路径(如果有映射) if sub_file_map and sub_eid in sub_file_map: sub_path = sub_file_map[sub_eid].replace("\\", "/") L.append(f"- 子代理记录: `{sub_path}`") if step.get("endTime"): L.append(f"- endTime: {step['endTime']}") # 文件变更展示 if step.get("_file_change"): fc = step["_file_change"] fname = fc.get("file", "?") orig = fc.get("original", "") mod = fc.get("modified", "") lang = "python" if fname.endswith(".py") else "sql" if fname.endswith(".sql") else "" L.append(f"- 文件变更: `{fname}`") if orig and mod: L.append(f" - 修改前 ({len(orig)} 字符):\n" + fence(trunc(orig, 3000), lang)) L.append(f" - 修改后 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang)) elif mod: L.append(f" - 新建 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang)) # 特殊处理各种 action type 的内容展示 _at = step.get("actionType", "") if _at == "say": _say_msg = (step.get("output") or {}).get("message", "") if _say_msg: L.append(f"- 💬 AI 回复:\n\n{_say_msg}\n") else: for k in ("intentResult", "input", "output"): if k in step: L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json")) elif _at == "invokeSubAgent": _sub_input = step.get("input") or {} _sub_prompt = _sub_input.get("prompt", "") _sub_name = _sub_input.get("name", "") or _sub_input.get("subAgentName", "") or "?" 
if _sub_prompt: L.append(f"- 📤 委托 `{_sub_name}`:\n\n{trunc(_sub_prompt, 3000)}\n") _sub_output = step.get("output") or {} _sub_resp = _sub_output.get("response", "") if _sub_resp: L.append(f"- 📥 子代理输出:\n\n{_sub_resp}\n") elif not _sub_prompt: for k in ("intentResult", "input", "output"): if k in step: L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json")) elif _at == "subagentResponse": _sr_input = step.get("input") or {} _sr_resp = _sr_input.get("response", "") if _sr_resp: L.append(f"- 📥 子代理返回:\n\n{_sr_resp}\n") else: for k in ("intentResult", "input", "output"): if k in step: L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json")) elif _at == "ContextualHookInvoked": # P1: hook 执行——提取名称、命令、exitCode _hi = step.get("input") or {} _ho = step.get("output") or {} _h_name = _hi.get("name", "?") _h_cmd = _ho.get("command", "") _h_result = _ho.get("result", {}) _h_exit = _h_result.get("exitCode", "?") if isinstance(_h_result, dict) else "?" _h_out = _h_result.get("output", "") if isinstance(_h_result, dict) else "" if _h_cmd: L.append(f"- `$ {_h_cmd}`") L.append(f"- Exit: `{_h_exit}`") if _h_out and _h_out != "Command executed successfully with no output.": L.append(f"- 输出:\n" + fence(trunc(_h_out, 2000))) elif _at == "intentClassification": # P1: 意图分类——压缩为一行 _ir = step.get("intentResult", {}) _cls = _ir.get("classification", "?") if isinstance(_ir, dict) else "?" L.append(f"- 分类结果: `{_cls}`") elif _at == "runCommand": # P0: 命令执行——提取命令、exitCode、输出 _rc_in = step.get("input") or {} _rc_out = step.get("output") or {} _rc_cmd = _rc_in.get("command", "") if isinstance(_rc_in, dict) else "" _rc_result = _rc_out.get("result", {}) if isinstance(_rc_out, dict) else {} _rc_exit = _rc_result.get("exitCode", "?") if isinstance(_rc_result, dict) else "?" 
_rc_output = _rc_result.get("output", "") if isinstance(_rc_result, dict) else "" if _rc_cmd: L.append(f"- `$ {_rc_cmd}`") L.append(f"- Exit: `{_rc_exit}`") if _rc_output: L.append(f"- 输出:\n" + fence(trunc(_rc_output, 3000))) elif _at == "search": # P2: 搜索——提取 query 和 why _s_in = step.get("input") or {} _s_query = _s_in.get("query", "") if isinstance(_s_in, dict) else "" _s_why = _s_in.get("why", "") if isinstance(_s_in, dict) else "" if _s_query: L.append(f"- 🔍 `{trunc(_s_query, 100)}`") if _s_why: L.append(f"- 原因: {trunc(_s_why, 200)}") # 展示搜索结果(如果有) _s_out = step.get("output") if _s_out and isinstance(_s_out, dict): _s_files = _s_out.get("files", []) if _s_files: L.append(f"- 结果: {len(_s_files)} 个文件") elif _at == "steering": # P2: steering——提取文件名列表 _st_in = step.get("input") or {} _st_docs = _st_in.get("documents", []) if isinstance(_st_in, dict) else [] if _st_docs: import urllib.parse names = [] for d in _st_docs[:10]: if isinstance(d, str): # URL 编码的路径,提取文件名 decoded = urllib.parse.unquote(d) name = decoded.rsplit("/", 1)[-1] if "/" in decoded else decoded names.append(name) if names: L.append(f"- 文件: {', '.join(names)}") else: for k in ("input", "output"): if k in step: L.append(f"- {k}:\n" + fence(safe_json(step[k], 3000), "json")) elif _at == "getDiagnostics": # P2: 诊断——提取路径和问题数 _gd_in = step.get("input") or {} _gd_out = step.get("output") or {} _gd_paths = _gd_in.get("paths", []) if isinstance(_gd_in, dict) else [] if _gd_paths: L.append(f"- 文件: {', '.join(str(p) for p in _gd_paths[:5])}") if isinstance(_gd_out, dict): _gd_diags = _gd_out.get("diagnostics", []) if isinstance(_gd_diags, list): L.append(f"- 问题数: {len(_gd_diags)}") for d in _gd_diags[:5]: if isinstance(d, dict): L.append(f" - {d.get('severity', '?')}: {d.get('message', '?')[:100]}") elif not _gd_diags: L.append("- ✅ 无问题") elif _at in ("readFiles", "readCode"): # P3: 文件读取——只展示路径,不展示内容 _rf_in = step.get("input") or {} if isinstance(_rf_in, dict): _rf_files = _rf_in.get("files", []) paths = [] 
# ── (interior tail of generate_full_record — its `def` lies in an earlier chunk;
#     indentation reconstructed from the collapsed source, verify against original) ──
                # readFiles/readCode (P3): list only the paths, never the contents.
                for f in _rf_files[:5]:
                    if isinstance(f, dict):
                        paths.append(f.get("path", "?"))
                    else:
                        paths.append(str(f))
                if paths:
                    L.append(f"- 文件: {', '.join(paths)}")
            else:
                # NOTE(review): `else` paired with the `isinstance(_rf_in, dict)` check
                # (input not dict-shaped → dump raw JSON payloads) — confirm nesting.
                for k in ("intentResult", "input", "output"):
                    if k in step:
                        L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
        # Blank separator line after every timeline step.
        L.append("")

    # ── Section 5: resource usage table ──
    usage = log.get("usageSummary", [])
    if usage:
        L.append("## 5. 资源消耗\n")
        L.append("| 工具 | 消耗 | 单位 |")
        L.append("|------|------|------|")
        total = 0
        for u in usage:
            tools = ", ".join(u.get("usedTools", ["-"]))
            amt = u.get("usage", 0)
            total += amt
            L.append(f"| {tools} | {amt} | {u.get('unit', '?')} |")
        L.append(f"| **合计** | **{total:.4f}** | |")
        L.append("")
    # Return the rendered markdown plus the structured summary for the index.
    return "\n".join(L), _summary


# ═══════════════════════════════════════════════════════════
# Index management
# ═══════════════════════════════════════════════════════════


def load_index() -> dict:
    """Load the slim index (main conversations only); empty v2 shell on failure."""
    if os.path.isfile(INDEX_PATH):
        try:
            with open(INDEX_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            # Corrupt or unreadable index file: fall through and start fresh.
            pass
    return {"version": 2, "entries": {}}


def load_full_index() -> dict:
    """Load the full index (main conversations + sub-agents); empty v2 shell on failure."""
    if os.path.isfile(INDEX_FULL_PATH):
        try:
            with open(INDEX_FULL_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            # Corrupt or unreadable index file: fall through and start fresh.
            pass
    return {"version": 2, "entries": {}}


def save_index(index: dict) -> None:
    """Persist the slim index, then refresh the per-day slim indexes."""
    os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
    with open(INDEX_PATH, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)
    _save_day_indexes(index, "_day_index.json")


def save_full_index(index: dict) -> None:
    """Persist the full index, then refresh the per-day full indexes."""
    os.makedirs(os.path.dirname(INDEX_FULL_PATH), exist_ok=True)
    with open(INDEX_FULL_PATH, "w", encoding="utf-8") as f:
        json.dump(index, f, ensure_ascii=False, indent=2)
    _save_day_indexes(index, "_day_index_full.json")


def _save_day_indexes(index: dict, filename: str) -> None:
    """Split the root index by the {ym}/{dd} part of each entry's output_dir
    and write one day-level index file into each day directory.

    Path format: docs/audit/session_logs/{ym}/{dd}/{filename}
    Each day-level index contains only the entries whose output_dir lies
    under that day's directory.
    """
    entries = index.get("entries", {})
    if not entries:
        return
    # Group entries by their day directory key ("YYYY-MM/DD").
    day_groups: dict[str, dict[str, dict]] = {}
    prefix = SESSION_LOG_DIR.replace("\\", "/")
    for eid, ent in entries.items():
        out_dir = ent.get("output_dir", "").replace("\\", "/")
        if not out_dir.startswith(prefix):
            # Entry points outside the session-log tree — not part of any day index.
            continue
        # out_dir format: docs/audit/session_logs/2026-03/03/01_abc12345_013337
        # → day_dir:      docs/audit/session_logs/2026-03/03
        rel = out_dir[len(prefix):].lstrip("/")
        parts = rel.split("/")
        if len(parts) >= 2:
            day_key = f"{parts[0]}/{parts[1]}"  # e.g. "2026-03/03"
            day_groups.setdefault(day_key, {})[eid] = ent
    for day_key, day_entries in day_groups.items():
        day_dir = os.path.join(SESSION_LOG_DIR, day_key.replace("/", os.sep))
        day_idx_path = os.path.join(day_dir, filename)
        os.makedirs(day_dir, exist_ok=True)
        day_data = {"version": 2, "entries": day_entries}
        with open(day_idx_path, "w", encoding="utf-8") as f:
            json.dump(day_data, f, ensure_ascii=False, indent=2)


def update_index(index: dict, exec_id: str, output_dir: str, meta: dict,
                 summary: Optional[dict] = None, is_sub: bool = False,
                 parent_exec_id: str = "") -> None:
    """Add one index record, with a structured summary for cheap AI queries.

    With is_sub=True the entry is flagged as a sub-agent record and
    parent_exec_id is stored alongside it.
    """
    entry = {
        "output_dir": output_dir.replace("\\", "/"),
        "chatSessionId": meta.get("chatSessionId", ""),
        "startTime": meta.get("startTime", ""),
        "endTime": meta.get("endTime", ""),
        "status": meta.get("status", ""),
        "workflowType": meta.get("workflowType", ""),
        "indexed_at": datetime.now(CST).isoformat(),
    }
    if is_sub:
        entry["is_sub"] = True
        # NOTE(review): parent_exec_id recorded only for sub entries, per docstring.
        if parent_exec_id:
            entry["parent_exec_id"] = parent_exec_id
    if summary:
        entry["summary"] = {
            "duration_s": summary.get("duration_s", 0),
            "msg_count": summary.get("msg_count", 0),
            "action_count": summary.get("action_count", 0),
            "files_modified": summary.get("files_modified", []),
            "files_created": summary.get("files_created", []),
            "sub_agents": summary.get("sub_agents", []),
            "errors": summary.get("errors", []),
        }
        if summary.get("description"):
            entry["description"] = summary["description"]
    index["entries"][exec_id] = entry
# ═══════════════════════════════════════════════════════════
# Main extraction logic
# ═══════════════════════════════════════════════════════════


def extract_single_execution(
    agent_dir: str,
    hex_dir: str,
    execution: dict,
    session_dir: Optional[str],
    index: dict,
    full_index: Optional[dict] = None,
    sub_mode: bool = False,
    sub_index: int = 0,
    output_base_dir: Optional[str] = None,
    parent_exec_id: str = "",
) -> Optional[str]:
    """Extract a single execution and write its record to disk.

    Args:
        agent_dir: kiro.kiroagent directory
        hex_dir: the 32-char hex directory containing the execution
        execution: the execution entry from the manifest
        session_dir: workspace-sessions subdirectory (used for session info)
        index: slim index dict (main conversations only)
        full_index: full index dict (main + sub-agents); None disables writing it
        sub_mode: whether this is a sub-agent extraction
        sub_index: ordinal of the sub-agent
        output_base_dir: output directory in sub-agent mode (same dir as main)
        parent_exec_id: parent execution ID when extracting a sub-agent

    Returns:
        The output directory path, or None (failed / already indexed).
    """
    exec_id = execution.get("executionId", "")
    chat_id = execution.get("chatSessionId", "")
    # Skip entries already indexed whose files still exist (sub-agents are not
    # checked: they always follow their main execution).
    if not sub_mode and exec_id in index.get("entries", {}):
        existing_dir = index["entries"][exec_id].get("output_dir", "")
        if existing_dir and os.path.isdir(existing_dir):
            return None
        # Files were cleaned up: drop the stale entry and re-extract.
        del index["entries"][exec_id]
    # Load the execution log.
    log_path = find_execution_log(agent_dir, hex_dir, execution)
    if not log_path:
        return None
    try:
        with open(log_path, "r", encoding="utf-8") as f:
            log = json.load(f)
    except Exception:
        return None
    # Backfill chatSessionId from the full log (may be missing in newer manifests).
    if not chat_id:
        chat_id = log.get("chatSessionId", "")
    # Parse conversation and action timeline.
    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)
    diffs = collect_diffs(agent_dir, hex_dir, execution)
    # Session info (main execution only).
    session_info = None
    if not sub_mode and session_dir and chat_id:
        session_info = load_session_detail(session_dir, chat_id)
    # prompt_log matching (main execution only).
    prompt_log = None
    if not sub_mode:
        start_time = log.get("startTime", 0)
        prompt_log = find_matching_prompt_log(start_time)
    # Execution start time (drives directory layout and filename suffixes).
    _start_ms = log.get("startTime") or execution.get("startTime", 0)
    _ym, _dd, _hms = ts_date_parts(_start_ms)
    # Decide the output directory.
    if sub_mode and output_base_dir:
        out_dir = output_base_dir
    else:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        out_dir = _resolve_chat_dir(day_dir, chat_short, _hms)
    os.makedirs(out_dir, exist_ok=True)
    # Cross-day pointer: if this execution's date differs from the date of the
    # chatSession's directory, drop a _ref_{chatShort}.md into the execution's day dir.
    if not sub_mode:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        # out_dir's parent is the day_dir of the chatSession's first round.
        chat_day_dir = os.path.dirname(out_dir)
        exec_day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        if os.path.normpath(chat_day_dir) != os.path.normpath(exec_day_dir):
            _write_cross_day_ref(exec_day_dir, chat_short, out_dir)
    # First recursively extract sub-agents, collecting sub_file_map
    # (subExecutionId → file path) for cross-linking in the main record.
    sub_file_map: dict[str, str] = {}
    if not sub_mode:
        sub_exec_ids = extract_sub_execution_ids(actions)
        for si, sub_eid in enumerate(sub_exec_ids, 1):
            sub_execs = find_all_executions(agent_dir, execution_id=sub_eid)
            if sub_execs:
                sub_exec = sub_execs[0]
                extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=sub_exec["_hex_dir"],
                    execution=sub_exec,
                    session_dir=session_dir,
                    index=index,
                    full_index=full_index,
                    sub_mode=True,
                    sub_index=si,
                    output_base_dir=out_dir,
                    parent_exec_id=exec_id,
                )
                sub_filename = f"sub_{si:02d}_{sub_eid[:8]}.md"
                sub_file_map[sub_eid] = os.path.join(out_dir, sub_filename)
    # Compute prev_msg_count: message count of the previous round of the same
    # chatSession, used for dedup — skips the history accumulated in
    # context.messages. Strategy: parse msg counts from the summaries of
    # existing main_*.md files in the same directory (not from the index,
    # because parallel extraction may not have merged indexes yet).
    prev_msg_count = 0
    existing_mains = []
    cur_msg_count = len(conversation)
    if not sub_mode and out_dir and os.path.isdir(out_dir):
        existing_mains = sorted(
            f for f in os.listdir(out_dir)
            if f.startswith("main_") and f.endswith(".md")
        )
        if existing_mains:
            # Parse the msg count from each existing main file's summary line,
            # e.g. `✅ chat-agent | 1406.6s | 44 msgs | 266 actions`, and from the
            # conversation header's "skipped first N history" line to get the
            # cumulative count: cumulative = prev_msg_count of that round + its
            # newly-added msg_count.
            import re as _re_prev
            for _mf in existing_mains:
                try:
                    _mf_path = os.path.join(out_dir, _mf)
                    _mf_msgs = 0
                    _mf_skipped = 0
                    with open(_mf_path, "r", encoding="utf-8") as _fh:
                        for _ln_idx, _ln in enumerate(_fh):
                            if _ln_idx > 200:
                                break
                            # Summary line: "44 msgs".
                            _mc_match = _re_prev.search(r'\|\s*(\d+)\s*msgs\s*\|', _ln)
                            if _mc_match:
                                _mf_msgs = int(_mc_match.group(1))
                            # Conversation header: "N new messages (skipped first M)".
                            _skip_match = _re_prev.search(r'共\s*(\d+)\s*条新增消息\s*\(跳过前\s*(\d+)\s*条', _ln)
                            if _skip_match:
                                _mf_msgs = int(_skip_match.group(1))
                                _mf_skipped = int(_skip_match.group(2))
                                break
                            # Conversation header (first round): "N messages:".
                            _full_match = _re_prev.search(r'共\s*(\d+)\s*条消息:', _ln)
                            if _full_match and _mf_skipped == 0:
                                _mf_msgs = int(_full_match.group(1))
                    # Cumulative = skipped + newly added in that round.
                    _cumulative = _mf_skipped + _mf_msgs
                    if _cumulative > prev_msg_count and _cumulative < cur_msg_count:
                        prev_msg_count = _cumulative
                except Exception:
                    # Best effort: an unreadable main file just contributes nothing.
                    pass
    # Render the Markdown plus structured summary (main gets sub_file_map).
    md, summary = generate_full_record(
        log=log,
        conversation=conversation,
        timeline=timeline,
        diffs=diffs,
        session_info=session_info,
        prompt_log_path=prompt_log,
        is_sub=sub_mode,
        sub_index=sub_index,
        sub_file_map=sub_file_map if not sub_mode else None,
        prev_msg_count=prev_msg_count,
    )
    # Write the record file.
    if sub_mode:
        filename = f"sub_{sub_index:02d}_{exec_id[:8]}.md"
    else:
        # Auto-number: reuse existing_mains scanned above (prev_msg_count branch);
        # rescan if that branch was not entered.
        if not existing_mains and os.path.isdir(out_dir):
            existing_mains = sorted(
                f for f in os.listdir(out_dir)
                if f.startswith("main_") and f.endswith(".md")
            )
        main_idx = len(existing_mains) + 1
        filename = f"main_{main_idx:02d}_{exec_id[:8]}.md"
    filepath = os.path.join(out_dir, filename)
    # Surrogate characters (e.g. \udccb) cannot be utf-8 encoded; map them to U+FFFD.
    md_safe = md.encode("utf-8", errors="surrogateescape").decode("utf-8", errors="replace")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(md_safe)
    # Update the indexes.
    _meta = {
        "chatSessionId": chat_id,
        "startTime": ts_fmt(log.get("startTime")),
        "endTime": ts_fmt(log.get("endTime")),
        "status": log.get("status", ""),
        "workflowType": log.get("workflowType", ""),
    }
    if not sub_mode:
        # Main conversation: written to both the slim and the full index.
        update_index(index, exec_id, out_dir, _meta, summary=summary)
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta, summary=summary)
    else:
        # Sub-agent: written to the full index only.
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta, summary=summary,
                         is_sub=True, parent_exec_id=parent_exec_id)
    return out_dir


# ═══════════════════════════════════════════════════════════
# Entry points
# ═══════════════════════════════════════════════════════════


def extract_latest(global_storage: Optional[str] = None, workspace_path: Optional[str] = None):
    """Extract every unindexed terminal-state execution (called by agent_on_stop).

    Executions of the same chatSession are extracted in ascending startTime order
    so that prev_msg_count is computed correctly. The current round is usually
    still "running" when the on-stop hook fires; it is back-filled on the next call.
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId")
                    for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        return
    index = load_index()
    full_index = load_full_index()
    # Filter: unindexed + terminal state.
    ready = [e for e in all_execs
             if e.get("executionId", "") not in index.get("entries", {})
             and e.get("status", "") in TERMINAL_STATUSES]
    if not ready:
        return
    # Group by chatSessionId; within each group sort ascending by startTime so
    # executions of one conversation are extracted in order and prev_msg_count
    # can be computed correctly.
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in ready:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
    extracted_count = 0
    for cid, group_execs in chat_groups.items():
        for execution in group_execs:
            result = extract_single_execution(
                agent_dir=agent_dir,
                hex_dir=execution["_hex_dir"],
                execution=execution,
                session_dir=session_dir,
                index=index,
                full_index=full_index,
            )
            if result:
                extracted_count += 1
                print(f"[session-extract] extracted: {result}")
    if extracted_count > 0:
        save_index(index)
        save_full_index(full_index)
        if extracted_count > 1:
            print(f"[session-extract] total: {extracted_count} executions")


def extract_all_unindexed(
    global_storage: Optional[str] = None,
    workspace_path: Optional[str] = None,
    limit: Optional[int] = None,
    workers: int = 8,
):
    """Extract all unindexed executions (multi-threaded)."""
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        # Supports both chatSessionId (legacy) and sessionId (new) field names.
        chat_ids = {s.get("chatSessionId") or s.get("sessionId")
                    for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}
    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[session-extract] no executions found")
        return
    index = load_index()
    full_index = load_full_index()
    # Filter unindexed executions (terminal states only; skip "running" etc.).
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
    todo = [e for e in all_execs
            if e.get("executionId", "") not in index.get("entries", {})
            and e.get("status", "") in TERMINAL_STATUSES]
    if limit:
        todo = todo[:limit]
    if not todo:
        print("[session-extract] all indexed, nothing to do")
        return
    print(f"[session-extract] {len(todo)} executions to extract (workers={workers})")
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed
    # Group by chatSessionId; extract each group serially in startTime order
    # (context.messages of one chatSession is cumulative, so ordered extraction
    # is required for deduplication).
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in todo:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
    lock = threading.Lock()
    count = 0

    def _extract_group(group_execs):
        """Serially extract all executions of one chatSession."""
        # Thread-local indexes; merged into the shared ones under `lock` below.
        local_index = {"version": 2, "entries": {}}
        local_full = {"version": 2, "entries": {}}
        results = []
        for execution in group_execs:
            try:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=execution["_hex_dir"],
                    execution=execution,
                    session_dir=session_dir,
                    index=local_index,
                    full_index=local_full,
                )
                if result:
                    results.append(result)
            except Exception as e:
                eid = execution.get("executionId", "?")[:8]
                print(f"[session-extract] ✗ {eid}: {e}")
        return results, local_index.get("entries", {}), local_full.get("entries", {})

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_extract_group, execs): cid
                   for cid, execs in chat_groups.items()}
        for future in as_completed(futures):
            results, idx_entries, full_entries = future.result()
            if results:
                with lock:
                    count += len(results)
                    index["entries"].update(idx_entries)
                    full_index["entries"].update(full_entries)
                    # Periodic checkpoint so a crash loses at most ~50 records.
                    if count % 50 == 0:
                        save_index(index)
                        save_full_index(full_index)
                        print(f"[session-extract] [{count}/{len(todo)}] checkpoint saved")
                    elif count % 10 == 0:
                        print(f"[session-extract] [{count}/{len(todo)}]")
    # Final save.
    save_index(index)
    save_full_index(full_index)
    print(f"[session-extract] done, extracted {count}/{len(todo)}")


def extract_by_id(
    execution_id: str,
    global_storage: Optional[str] = None,
):
    """Extract the execution with the given executionId (prefix match allowed)."""
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    execs = find_all_executions(agent_dir, execution_id=execution_id)
    if not execs:
        print(f"[session-extract] execution not found: {execution_id}")
        return
    # Verify the hit really matches the requested execution (prefix match).
    matched = execs[0]
    if not matched.get("executionId", "").startswith(execution_id):
        print(f"[session-extract] execution not found: {execution_id}")
        return
    index = load_index()
    full_index = load_full_index()
    result = extract_single_execution(
        agent_dir=agent_dir,
        hex_dir=execs[0]["_hex_dir"],
        execution=execs[0],
        session_dir=None,
        index=index,
        full_index=full_index,
    )
    if result:
        save_index(index)
        save_full_index(full_index)
        print(f"[session-extract] extracted: {result}")


# ═══════════════════════════════════════════════════════════
# CLI entry point
# ═══════════════════════════════════════════════════════════


def main():
    """Parse CLI flags and dispatch to the matching extraction mode."""
    import argparse
    parser = argparse.ArgumentParser(description="Kiro 执行日志全量提取器 v2")
    parser.add_argument("--all", action="store_true", help="提取所有未索引的 execution")
    parser.add_argument("--recent", type=int, metavar="N", help="提取最近 N 条未索引的")
    parser.add_argument("--workers", type=int, default=8, help="并行线程数(默认 8)")
    parser.add_argument("--execution-id", type=str, help="提取指定 executionId")
    parser.add_argument("--global-storage", type=str, help="globalStorage 路径")
    parser.add_argument("--workspace", type=str, help="workspace 路径")
    args = parser.parse_args()
    gs = args.global_storage
    ws = args.workspace
    if args.execution_id:
        extract_by_id(args.execution_id, global_storage=gs)
    elif args.all:
        extract_all_unindexed(global_storage=gs, workspace_path=ws, workers=args.workers)
    elif args.recent:
        extract_all_unindexed(global_storage=gs, workspace_path=ws, limit=args.recent, workers=args.workers)
    else:
        extract_latest(global_storage=gs, workspace_path=ws)


if __name__ == "__main__":
    main()