1896 lines
76 KiB
Python
1896 lines
76 KiB
Python
#!/usr/bin/env python3
|
||
"""extract_kiro_session — Kiro 执行日志全量提取器 v2。
|
||
|
||
改进点(相比 v1):
|
||
1. 系统提示词去重:首次保存到 _system_prompts/sp_{hash8}.md,后续引用
|
||
2. 目录分层:YYYY-MM/DD/{chatSessionId_short}/ 下按对话组织
|
||
3. 子代理递归提取:主 execution + 子 execution 放同一目录,按调用顺序编号
|
||
4. ID 替换:kiro-diff URI → 真实文件路径,terminalId → 进程描述
|
||
5. CONTEXT TRANSFER 中的 steering 内容折叠引用
|
||
6. 无内容的 model action 压缩为一行
|
||
|
||
用法:
|
||
python scripts/ops/extract_kiro_session.py # 提取最新 execution
|
||
python scripts/ops/extract_kiro_session.py --all # 提取所有未索引的
|
||
python scripts/ops/extract_kiro_session.py --recent 20 # 提取最近 N 个未索引的
|
||
python scripts/ops/extract_kiro_session.py --execution-id XX # 提取指定 execution
|
||
"""
|
||
|
||
import base64
|
||
import hashlib
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
from datetime import datetime, timezone, timedelta
|
||
from typing import Optional
|
||
|
||
from _env_paths import ensure_repo_root
|
||
|
||
ensure_repo_root()
|
||
|
||
# Timezone used for every rendered timestamp (UTC+8, China Standard Time).
CST = timezone(timedelta(hours=8))

# Fixed filename Kiro uses for its execution manifest.
MANIFEST_FILENAME = "f62de366d0006e17ea00a01f6624aabf"

# Output locations.
SESSION_LOG_DIR = os.path.join("docs", "audit", "session_logs")
INDEX_PATH = os.path.join(SESSION_LOG_DIR, "_session_index.json")  # slim index: main conversations only
INDEX_FULL_PATH = os.path.join(SESSION_LOG_DIR, "_session_index_full.json")  # full index: main + sub-agents
SYSTEM_PROMPTS_DIR = os.path.join(SESSION_LOG_DIR, "_system_prompts")

# Default Kiro globalStorage location (Windows %APPDATA%).
DEFAULT_GLOBAL_STORAGE = os.path.join(
    os.environ.get("APPDATA", ""),
    "Kiro", "User", "globalStorage",
)
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 工具函数
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def ts_fmt(ms) -> str:
    """Render epoch-milliseconds as 'YYYY-MM-DD HH:MM:SS' in CST; 'N/A' for falsy input."""
    if not ms:
        return "N/A"
    try:
        dt = datetime.fromtimestamp(ms / 1000, tz=CST)
    except Exception:
        # unparseable value: fall back to its string form
        return str(ms)
    return dt.strftime("%Y-%m-%d %H:%M:%S")
|
||
|
||
def ts_iso(ms) -> str:
    """Render epoch-milliseconds as an ISO-8601 string in CST; '' for falsy or bad input."""
    if not ms:
        return ""
    try:
        dt = datetime.fromtimestamp(ms / 1000, tz=CST)
    except Exception:
        return ""
    return dt.isoformat()
|
||
|
||
def ts_date_parts(ms) -> tuple[str, str, str]:
    """Return (YYYY-MM, DD, HHMMSS) parts for directory layering and file naming.

    Fix: falsy *ms* (None/0) now returns the 'unknown' sentinel directly.
    Previously ms=0 silently produced 1970 epoch parts, inconsistent with
    ts_fmt ('N/A') and ts_iso ('') for the same input.
    """
    if not ms:
        return "unknown", "00", "000000"
    try:
        dt = datetime.fromtimestamp(ms / 1000, tz=CST)
        return dt.strftime("%Y-%m"), dt.strftime("%d"), dt.strftime("%H%M%S")
    except Exception:
        return "unknown", "00", "000000"
|
||
|
||
|
||
def _resolve_chat_dir(day_dir: str, chat_short: str, first_hms: str) -> str:
|
||
"""为 chatSession 确定带当天序号的输出目录。
|
||
|
||
规则:
|
||
1. 先在 day_dir 下查找已有的同 chatSession 目录(同一天的后续轮次)
|
||
2. 再在整个 SESSION_LOG_DIR 下搜索(跨天场景:chatSession 首轮在其他日期)
|
||
3. 都没找到则在 day_dir 下分配新序号创建
|
||
- 目录格式:{seq:02d}_{chat_short}_{first_hms}/
|
||
"""
|
||
os.makedirs(day_dir, exist_ok=True)
|
||
|
||
# 1. 在当天目录下查找
|
||
for d in os.listdir(day_dir):
|
||
if os.path.isdir(os.path.join(day_dir, d)) and chat_short in d:
|
||
return os.path.join(day_dir, d)
|
||
|
||
# 2. 跨天搜索:遍历所有 YYYY-MM/DD/ 目录
|
||
log_root = SESSION_LOG_DIR
|
||
if os.path.isdir(log_root):
|
||
for ym in os.listdir(log_root):
|
||
ym_path = os.path.join(log_root, ym)
|
||
if not os.path.isdir(ym_path) or ym.startswith("_"):
|
||
continue
|
||
for dd in os.listdir(ym_path):
|
||
dd_path = os.path.join(ym_path, dd)
|
||
if not os.path.isdir(dd_path):
|
||
continue
|
||
for d in os.listdir(dd_path):
|
||
if os.path.isdir(os.path.join(dd_path, d)) and chat_short in d:
|
||
return os.path.join(dd_path, d)
|
||
|
||
# 3. 新 chatSession:分配序号
|
||
existing_seqs = []
|
||
for d in os.listdir(day_dir):
|
||
if os.path.isdir(os.path.join(day_dir, d)) and len(d) >= 2 and d[:2].isdigit():
|
||
existing_seqs.append(int(d[:2]))
|
||
next_seq = max(existing_seqs, default=0) + 1
|
||
new_dir = os.path.join(day_dir, f"{next_seq:02d}_{chat_short}_{first_hms}")
|
||
os.makedirs(new_dir, exist_ok=True)
|
||
return new_dir
|
||
|
||
|
||
def _write_cross_day_ref(exec_day_dir: str, chat_short: str, chat_dir: str):
|
||
"""在 execution 所在日期目录下生成跨天指引文件。
|
||
|
||
当一个 chatSession 跨天时,后续日期的 day_dir 下不会有该对话的目录,
|
||
生成 _ref_{chatShort}.md 告知该对话归在哪个目录。
|
||
"""
|
||
os.makedirs(exec_day_dir, exist_ok=True)
|
||
ref_path = os.path.join(exec_day_dir, f"_ref_{chat_short}.md")
|
||
if os.path.isfile(ref_path):
|
||
return # 已存在,不重复写
|
||
rel_target = os.path.relpath(chat_dir, exec_day_dir).replace("\\", "/")
|
||
with open(ref_path, "w", encoding="utf-8") as f:
|
||
f.write(f"# 跨天对话指引\n\n")
|
||
f.write(f"chatSession `{chat_short}` 的完整记录归档在:\n\n")
|
||
f.write(f"→ `{rel_target}`\n\n")
|
||
f.write(f"(绝对路径:`{chat_dir.replace(chr(92), '/')}`)\n")
|
||
|
||
|
||
def trunc(s, n=3000) -> str:
    """Clamp *s* to at most *n* characters, appending a truncation marker.

    Fix: non-string values are stringified first and then truncated like any
    other text. Previously a non-str input was returned via str(s) at full
    length, bypassing the cap entirely (a huge dict/list repr leaked through).
    """
    if not isinstance(s, str):
        s = str(s)
    return s if len(s) <= n else s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
|
||
|
||
def safe_json(obj, n=5000) -> str:
    """Pretty-print *obj* as JSON (falling back to str()), capped at *n* characters."""
    try:
        rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception:
        # not JSON-serializable: use the plain repr-ish string form
        rendered = str(obj)
    if len(rendered) <= n:
        return rendered
    return rendered[:n] + f"\n... [截断,原文共 {len(rendered)} 字符]"
|
||
|
||
def fence(content: str, lang: str = "") -> str:
    """Wrap *content* in a Markdown code fence that its own body cannot break.

    - Fences left unclosed inside the content (e.g. truncated source data)
      are closed before wrapping.
    - The outer fence uses one more backtick than the longest backtick run
      found in the content (minimum 3).
    - Leading '#' characters are escaped so they don't render as headings.
    """
    if not content:
        return f"```{lang}\n```"

    # Repair fences the content itself left open (raw data truncation).
    open_fences: list[int] = []
    for line in content.split("\n"):
        stripped = line.strip()
        m = re.match(r"^(`{3,})", stripped)
        if not m:
            continue
        ticks = len(m.group(1))
        # A bare run of backticks matching the innermost open fence closes it;
        # anything else opens a new one.
        if open_fences and open_fences[-1] == ticks and stripped == '`' * ticks:
            open_fences.pop()
        else:
            open_fences.append(ticks)
    if open_fences:
        closers = "\n".join('`' * t for t in reversed(open_fences))
        content = f"{content}\n{closers}"

    # Outer fence must beat the longest backtick run inside the content.
    longest_run = max((len(run) for run in re.findall(r"`+", content)), default=0)
    outer = '`' * (max(2, longest_run) + 1)

    # Zero-width-space-escape line-leading '#' so it is not parsed as a heading.
    return f"{outer}{lang}\n{_escape_heading(content)}\n{outer}"
|
||
|
||
|
||
|
||
def _escape_heading(text: str) -> str:
|
||
"""转义文本中行首的 # 符号,防止被 Markdown 解析为标题。
|
||
在 # 前插入零宽空格 (\\u200b)。
|
||
"""
|
||
lines = text.split('\n')
|
||
out = []
|
||
for line in lines:
|
||
if line.lstrip().startswith('#'):
|
||
# 找到第一个 # 的位置,在前面插入零宽空格
|
||
idx = 0
|
||
while idx < len(line) and line[idx] in (' ', '\t'):
|
||
idx += 1
|
||
out.append(line[:idx] + '\u200b' + line[idx:])
|
||
else:
|
||
out.append(line)
|
||
return '\n'.join(out)
|
||
|
||
|
||
def hash8(text: str) -> str:
    """First 8 hex chars of the SHA-256 digest of *text* (UTF-8)."""
    digest = hashlib.sha256(text.encode("utf-8"))
    return digest.hexdigest()[:8]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 系统提示词去重
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def save_system_prompt(text: str) -> str:
    """Persist a system prompt under _system_prompts/ and return its filename.

    Files are content-addressed by an 8-char SHA-256 prefix; if a file with
    the same hash already exists it is reused as-is.
    """
    digest = hash8(text)
    filename = f"sp_{digest}.md"
    filepath = os.path.join(SYSTEM_PROMPTS_DIR, filename)
    if not os.path.isfile(filepath):
        os.makedirs(SYSTEM_PROMPTS_DIR, exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as fh:
            fh.write(f"# 系统提示词 (hash: {digest})\n\n")
            fh.write(text)
    return filename
|
||
|
||
|
||
def is_system_prompt(text: str) -> bool:
    """Heuristically detect a Kiro-injected system prompt (<identity>/<capabilities>, ...)."""
    if not text:
        return False
    # System prompts start with <identity>, contain <capabilities> near the
    # top, or open with "You are Kiro".
    return (
        "<identity>" in text[:200]
        or "<capabilities>" in text[:500]
        or text.strip().startswith("You are Kiro")
    )
|
||
|
||
|
||
def is_steering_block(text: str) -> bool:
    """Detect a <steering-reminder> injection (the marker must appear in the first 100 chars)."""
    return "<steering-reminder>" in text[:100]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# ID 替换与路径还原
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
# kiro-diff URI shape: kiro-diff:/path?commitId=xxx&executionId=yyy
# (the '=' may arrive percent-encoded as %3D)
KIRO_DIFF_PATTERN = re.compile(
    r'kiro-diff:(/[^?]+)\?commitId(?:%3D|=)([^&]+)&executionId(?:%3D|=)([^"\'}\s]+)'
)
|
||
|
||
def resolve_kiro_diff_uri(uri: str) -> str:
    """Rewrite a kiro-diff: URI into a readable file-path description; pass through otherwise."""
    match = KIRO_DIFF_PATTERN.search(uri)
    if match is None:
        return uri
    path, commit = match.group(1), match.group(2)
    return f"{path} (版本: {commit[:8]})"
|
||
|
||
|
||
def resolve_ids_in_text(text: str) -> str:
    """Replace known opaque ID patterns in *text* with human-readable equivalents."""
    if not text or not isinstance(text, str):
        return str(text) if text else ""
    # kiro-diff URIs -> "[文件快照] <path> (版本 <commit8>)"
    text = KIRO_DIFF_PATTERN.sub(
        lambda m: f'[文件快照] {m.group(1)} (版本 {m.group(2)[:8]})',
        text,
    )
    # percent-encoded drive letters: file:///c%3A/ -> C:/
    return re.sub(
        r'file:///([a-zA-Z])%3A/',
        lambda m: f'{m.group(1).upper()}:/',
        text,
    )
|
||
|
||
|
||
def resolve_tool_args(name: str, args: dict) -> dict:
    """Return a copy of tool-call *args* with opaque IDs/URIs made readable.

    (*name* is accepted for signature parity; the substitutions are keyed on
    argument names, not on the tool name.)
    """
    out = dict(args)

    # document-style entries may carry a kiro-diff URI in "target"
    target = out.get("target")
    if isinstance(target, str):
        out["target"] = resolve_kiro_diff_uri(target)

    # editCode/strReplace "path" and document "modified"/"original"/"local"
    # string fields get generic ID resolution
    for key in ("path", "modified", "original", "local"):
        value = out.get(key)
        if isinstance(value, str):
            out[key] = resolve_ids_in_text(value)

    return out
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 定位逻辑
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def find_kiro_agent_dir(global_storage: str) -> Optional[str]:
    """Return the kiro.kiroagent directory under *global_storage*, or None if absent."""
    candidate = os.path.join(global_storage, "kiro.kiroagent")
    if os.path.isdir(candidate):
        return candidate
    return None
|
||
|
||
|
||
def decode_base64url_dir(dirname: str) -> str:
|
||
try:
|
||
b64 = dirname.replace("__", "==")
|
||
return base64.urlsafe_b64decode(b64).decode("utf-8", errors="replace")
|
||
except Exception:
|
||
return ""
|
||
|
||
|
||
def find_workspace_session_dir(agent_dir: str, workspace_path: str) -> Optional[str]:
    """Find the workspace-sessions subdirectory whose decoded name matches *workspace_path*.

    Paths are compared case-insensitively with forward slashes and no trailing slash.
    """
    ws_root = os.path.join(agent_dir, "workspace-sessions")
    if not os.path.isdir(ws_root):
        return None

    def _norm(p: str) -> str:
        return p.replace("\\", "/").rstrip("/").lower()

    wanted = _norm(workspace_path)
    for entry in os.scandir(ws_root):
        if entry.is_dir() and _norm(decode_base64url_dir(entry.name)) == wanted:
            return entry.path
    return None
|
||
|
||
|
||
def load_sessions_json(session_dir: str) -> list[dict]:
    """Load sessions.json from *session_dir*.

    Accepts either a bare list or a {"sessions": [...]} wrapper; returns []
    for a missing file, parse failure, or unexpected shape.
    """
    sessions_path = os.path.join(session_dir, "sessions.json")
    if not os.path.isfile(sessions_path):
        return []
    try:
        with open(sessions_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except Exception:
        return []
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "sessions" in payload:
        return payload["sessions"]
    return []
|
||
|
||
|
||
def load_session_detail(session_dir: str, session_id: str) -> Optional[dict]:
    """Load {session_id}.json from *session_dir*; None if missing or unparseable."""
    detail_path = os.path.join(session_dir, f"{session_id}.json")
    if not os.path.isfile(detail_path):
        return None
    try:
        with open(detail_path, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return None
|
||
|
||
|
||
def find_all_manifests(agent_dir: str) -> list[tuple[str, str]]:
    """List (hex_dir_name, manifest_path) for every 32-hex subdirectory holding a manifest."""
    is_hex32 = re.compile(r"^[0-9a-f]{32}$").match
    found = []
    for entry in os.scandir(agent_dir):
        if not entry.is_dir() or not is_hex32(entry.name):
            continue
        manifest = os.path.join(entry.path, MANIFEST_FILENAME)
        if os.path.isfile(manifest):
            found.append((entry.name, manifest))
    return found
|
||
|
||
|
||
def load_manifest(manifest_path: str) -> list[dict]:
    """Load a Kiro execution manifest.

    Handles both the legacy bare-list format and the newer
    {"executions": [...], "version": ...} wrapper; returns [] on any failure.
    """
    try:
        with open(manifest_path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except Exception:
        return []
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "executions" in payload:
        return payload["executions"]
    return []
|
||
|
||
|
||
|
||
def find_execution_log(agent_dir: str, hex_dir: str, execution: dict) -> Optional[str]:
    """Locate the full execution-log JSON file for *execution* under agent_dir/hex_dir.

    Scans every file in the second-level subdirectories, cheaply pre-filtering
    by checking the first 500 chars for the executionId before parsing JSON.

    Fix: an execution without an executionId previously matched every file's
    head ('' in head is always True), forcing a JSON parse of each candidate
    and risking a spurious match; we now bail out immediately.
    """
    exec_id = execution.get("executionId", "")
    if not exec_id:
        return None
    hex_path = os.path.join(agent_dir, hex_dir)
    for entry in os.scandir(hex_path):
        if not entry.is_dir():
            continue
        for sub in os.scandir(entry.path):
            # skip non-files and anything too small to be a real log
            if not sub.is_file() or sub.stat().st_size < 1000:
                continue
            try:
                with open(sub.path, "r", encoding="utf-8") as fh:
                    head = fh.read(500)
                    if exec_id in head:
                        fh.seek(0)
                        data = json.load(fh)
                        if data.get("executionId") == exec_id:
                            return sub.path
            except Exception:
                continue
    return None
|
||
|
||
|
||
def find_all_executions(
    agent_dir: str,
    chat_session_ids: Optional[set[str]] = None,
    execution_id: Optional[str] = None,
) -> list[dict]:
    """Collect matching executions from every manifest, sorted by endTime descending.

    If *execution_id* is given, the first exact or prefix match is returned
    alone. Each entry is tagged with its source directory under "_hex_dir".
    NOTE(review): when execution_id is given but never matched, the full
    filtered list is returned instead of [] — confirm callers rely on that.
    """
    matched = []
    for hex_dir, manifest_path in find_all_manifests(agent_dir):
        for exec_entry in load_manifest(manifest_path):
            exec_entry["_hex_dir"] = hex_dir
            if execution_id:
                eid = exec_entry.get("executionId", "")
                if eid == execution_id or eid.startswith(execution_id):
                    return [exec_entry]
            # Only drop an execution when it carries a chatSessionId that is
            # absent from the workspace's session-id set.
            csid = exec_entry.get("chatSessionId")
            if chat_session_ids and csid and csid not in chat_session_ids:
                continue
            matched.append(exec_entry)
    matched.sort(key=lambda e: e.get("endTime", 0), reverse=True)
    return matched
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 解析逻辑
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def parse_messages(messages: list) -> list[dict]:
    """Parse context.messages: dedupe system prompts, fold steering, resolve IDs."""

    def _one_entry(raw: dict) -> dict:
        # Convert a single raw message entry into its slimmed representation.
        kind = raw.get("type", "unknown")
        if kind == "text":
            text = raw.get("text", "")
            if is_system_prompt(text):
                # first occurrence goes to _system_prompts/, later ones reference it
                return {
                    "type": "system_prompt_ref",
                    "ref_file": save_system_prompt(text),
                    "char_count": len(text),
                }
            if is_steering_block(text):
                # fold steering content down to the referenced file names
                names = re.findall(r'(\w[\w-]+\.md):', text)
                return {
                    "type": "steering_ref",
                    "files": names or ["(steering block)"],
                    "char_count": len(text),
                }
            return {"type": "text", "text": resolve_ids_in_text(text)}
        if kind == "toolUse":
            return {
                "type": "toolUse",
                "id": raw.get("id"),
                "name": raw.get("name"),
                "args": resolve_tool_args(raw.get("name", ""), raw.get("args", {})),
            }
        if kind == "toolUseResponse":
            return {
                "type": "toolUseResponse",
                "id": raw.get("id"),
                "name": raw.get("name"),
                "message": resolve_ids_in_text(raw.get("message", "")),
                "success": raw.get("success"),
            }
        if kind == "document":
            doc = raw.get("document", {})
            doc_type = doc.get("type", "")
            if doc_type == "steering":
                # steering documents collapse to their display name
                return {
                    "type": "steering_doc",
                    "name": doc.get("displayName", "") or "steering",
                }
            target = doc.get("target", "")
            return {
                "type": "document",
                "doc_type": doc_type,
                "target": resolve_ids_in_text(target) if target else "",
            }
        # unknown entry type: keep only the key names for debugging
        return {"type": kind, "raw_keys": list(raw.keys())}

    conversation = []
    for idx, msg in enumerate(messages):
        parsed = [_one_entry(e) for e in msg.get("entries", []) if isinstance(e, dict)]
        conversation.append({
            "index": idx,
            "role": msg.get("role", "?"),
            "messageId": msg.get("messageId", "?"),
            "entries": parsed,
        })
    return conversation
|
||
|
||
|
||
def parse_actions(actions: list) -> list[dict]:
    """Parse the action list; content-free model actions collapse to one summary row."""
    timeline = []
    for idx, action in enumerate(actions):
        kind = action.get("actionType", "")
        state = action.get("actionState", "")

        # content-free model actions -> a single compressed row
        if kind == "model" and "output" not in action and "input" not in action:
            timeline.append({
                "index": idx,
                "actionType": "model",
                "actionState": state,
                "emittedAt": ts_fmt(action.get("emittedAt")),
                "_compressed": True,
            })
            continue

        row = {
            "index": idx,
            "actionId": action.get("actionId"),
            "actionType": kind,
            "actionState": state,
            "emittedAt": ts_fmt(action.get("emittedAt")),
        }
        if action.get("subExecutionId"):
            row["subExecutionId"] = action["subExecutionId"]
        if action.get("endTime"):
            row["endTime"] = ts_fmt(action["endTime"])

        for field in ("intentResult", "input", "output"):
            if field not in action:
                continue
            payload = action[field]
            if isinstance(payload, dict):
                payload = dict(payload)  # shallow copy: never mutate the source data
                if field == "input" and ("originalContent" in payload or "modifiedContent" in payload):
                    # write/create actions inline the before/after file contents
                    target_file = payload.get("file", payload.get("path", "?"))
                    row["_file_change"] = {
                        "file": resolve_ids_in_text(str(target_file)),
                        "original": payload.get("originalContent", ""),
                        "modified": payload.get("modifiedContent", ""),
                    }
                    # keep the metadata but strip the bulky content fields
                    row[field] = {
                        k: resolve_ids_in_text(v) if isinstance(v, str) else v
                        for k, v in payload.items()
                        if k not in ("originalContent", "modifiedContent")
                    }
                    continue
                for k, v in payload.items():
                    if isinstance(v, str):
                        payload[k] = resolve_ids_in_text(v)
            row[field] = payload
        timeline.append(row)
    return timeline
|
||
|
||
|
||
def extract_sub_execution_ids(actions: list) -> list[str]:
    """Collect unique subExecutionIds in order of first appearance."""
    ordered: dict = {}
    for action in actions:
        sid = action.get("subExecutionId")
        if sid:
            ordered.setdefault(sid, None)  # dict preserves insertion order
    return list(ordered)
|
||
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# Diff 快照收集
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def collect_diffs(agent_dir: str, hex_dir: str, execution: dict) -> dict[str, dict]:
    """Placeholder: file-diff info is taken from inlined action contents instead.

    Kiro inlines originalContent/modifiedContent in write/create action inputs,
    which is more reliable than reading the snapshot store under
    74a08cf8.../commitId/, so this intentionally returns an empty mapping.
    """
    return {}
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# Prompt Log 匹配
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
# Prompt logs live next to the session logs.
PROMPT_LOG_DIR = os.path.join("docs", "audit", "prompt_logs")

def find_matching_prompt_log(start_time_ms: int) -> Optional[str]:
    """Find the prompt_log file closest to the execution start time.

    File name format: prompt_log_YYYYMMDD_HHMMSS.md. The match window is
    +/- 5 minutes around startTime; the closest candidate wins.
    """
    if not os.path.isdir(PROMPT_LOG_DIR):
        return None
    try:
        anchor = datetime.fromtimestamp(start_time_ms / 1000, tz=CST)
    except Exception:
        return None

    name_re = re.compile(r"prompt_log_(\d{8}_\d{6})\.md$")
    best_path = None
    best_gap = float("inf")
    for candidate in os.scandir(PROMPT_LOG_DIR):
        if not candidate.is_file():
            continue
        m = name_re.match(candidate.name)
        if not m:
            continue
        try:
            stamp = datetime.strptime(m.group(1), "%Y%m%d_%H%M%S").replace(tzinfo=CST)
        except Exception:
            continue
        gap = abs((anchor - stamp).total_seconds())
        if gap < 300 and gap < best_gap:  # 5-minute window
            best_gap = gap
            best_path = candidate.path
    return best_path
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# Markdown 生成
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def _msg_semantic_label(msg: dict) -> str:
|
||
"""为对话消息生成语义标签,用于快速定位。"""
|
||
entries = msg.get("entries", [])
|
||
if not entries:
|
||
return ""
|
||
parts = []
|
||
for e in entries:
|
||
et = e["type"]
|
||
if et == "system_prompt_ref":
|
||
parts.append("系统提示词")
|
||
elif et == "steering_ref":
|
||
parts.append(f"Steering({len(e.get('files', []))})")
|
||
elif et == "steering_doc":
|
||
parts.append(f"Steering:`{e.get('name', '?')}`")
|
||
elif et == "toolUse":
|
||
name = e.get("name", "?")
|
||
# 提取关键参数作为上下文
|
||
args = e.get("args", {})
|
||
ctx = ""
|
||
if name in ("readFile", "readCode", "readMultipleFiles"):
|
||
ctx = args.get("path", "") or ",".join(args.get("paths", [])[:2])
|
||
elif name in ("fsWrite", "strReplace", "editCode"):
|
||
ctx = args.get("path", "")
|
||
elif name == "grepSearch":
|
||
ctx = args.get("query", "")[:30]
|
||
elif name == "invokeSubAgent":
|
||
ctx = args.get("name", "")
|
||
elif name == "executePwsh":
|
||
ctx = (args.get("command", "") or "")[:40]
|
||
elif name == "taskStatus":
|
||
ctx = args.get("status", "")
|
||
if ctx:
|
||
parts.append(f"调用 `{name}` → {ctx}")
|
||
else:
|
||
parts.append(f"调用 `{name}`")
|
||
elif et == "toolUseResponse":
|
||
name = e.get("name", "?")
|
||
ok = "✅" if e.get("success") else "❌"
|
||
parts.append(f"结果 `{name}` {ok}")
|
||
elif et == "document":
|
||
parts.append(f"文档:{e.get('doc_type', '?')}")
|
||
elif et == "text":
|
||
# 文本内容:提取前 50 字符作为预览(bot 和 human 都加)
|
||
role = msg.get("role", "")
|
||
if role in ("bot", "human"):
|
||
text = (e.get("text") or "").strip()
|
||
if text:
|
||
preview = text[:50].replace("\n", " ")
|
||
if len(text) > 50:
|
||
preview += "…"
|
||
icon = "💬" if role == "bot" else "📝"
|
||
parts.append(f"{icon} `{preview}`")
|
||
return ", ".join(parts) if parts else ""
|
||
|
||
|
||
def _step_semantic_label(step: dict) -> str:
|
||
"""为 action step 生成带图标的语义标签。"""
|
||
at = step.get("actionType", "?")
|
||
state = step.get("actionState", "?")
|
||
fc = step.get("_file_change")
|
||
sub_eid = step.get("subExecutionId")
|
||
|
||
# 状态图标
|
||
if state == "Error":
|
||
state_icon = "❌"
|
||
elif state in ("Success", "Accepted"):
|
||
state_icon = "✅"
|
||
else:
|
||
state_icon = "⏳"
|
||
|
||
# 类型图标 + 上下文
|
||
if at in ("write", "append") and fc:
|
||
fname = fc.get("file", "?")
|
||
short = fname.rsplit("/", 1)[-1] if "/" in fname else fname
|
||
orig = fc.get("original", "")
|
||
if orig:
|
||
return f"⚡ `{at}` 修改 `{short}` {state_icon}"
|
||
else:
|
||
return f"⚡ `{at}` 新建 `{short}` {state_icon}"
|
||
elif at == "invokeSubAgent":
|
||
inp = step.get("input", {})
|
||
# Kiro 原始 log 用 subAgentName,工具 schema 用 name
|
||
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
|
||
return f"🔀 `invokeSubAgent` → {agent_name} {state_icon}"
|
||
elif at == "subagentResponse":
|
||
return f"🔀 `subagentResponse` {state_icon}"
|
||
elif at in ("readFiles", "readCode"):
|
||
inp = step.get("input", {})
|
||
if isinstance(inp, dict):
|
||
files = inp.get("files", [])
|
||
if files and isinstance(files[0], dict):
|
||
paths = [f.get("path", "?") for f in files[:2]]
|
||
else:
|
||
paths = [str(f) for f in files[:2]]
|
||
ctx = ", ".join(paths)
|
||
else:
|
||
ctx = ""
|
||
return f"📖 `{at}` {ctx} {state_icon}" if ctx else f"📖 `{at}` {state_icon}"
|
||
elif at == "search":
|
||
return f"🔍 `search` {state_icon}"
|
||
elif at == "say":
|
||
return f"💬 `say` {state_icon}"
|
||
elif at == "taskStatus":
|
||
return f"📋 `taskStatus` {state_icon}"
|
||
elif at == "steering":
|
||
return f"📄 `steering` {state_icon}"
|
||
elif at == "runCommand":
|
||
return f"🖥️ `runCommand` {state_icon}"
|
||
elif at == "getDiagnostics":
|
||
return f"🩺 `getDiagnostics` {state_icon}"
|
||
elif at == "ContextualHookInvoked":
|
||
inp = step.get("input", {})
|
||
hook_name = inp.get("name", "?") if isinstance(inp, dict) else "?"
|
||
return f"🪝 Hook `{hook_name}` {state_icon}"
|
||
elif at == "intentClassification":
|
||
ir = step.get("intentResult", {})
|
||
cls = ir.get("classification", "?") if isinstance(ir, dict) else "?"
|
||
return f"🎯 意图: `{cls}` {state_icon}"
|
||
elif at == "replace":
|
||
inp = step.get("input", {})
|
||
path = inp.get("file", inp.get("path", "?")) if isinstance(inp, dict) else "?"
|
||
short = path.rsplit("/", 1)[-1] if "/" in str(path) else str(path)
|
||
return f"✏️ `replace` `{short}` {state_icon}"
|
||
else:
|
||
return f"`{at}` [{state}]"
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def _build_execution_summary(
|
||
log: dict,
|
||
conversation: list[dict],
|
||
timeline: list[dict],
|
||
sub_file_map: Optional[dict[str, str]] = None,
|
||
) -> dict:
|
||
"""构建结构化执行摘要(零 LLM 成本,纯规则化提取)。
|
||
返回 dict 供 md 渲染和索引存储共用。
|
||
"""
|
||
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
|
||
|
||
files_modified = []
|
||
files_created = []
|
||
sub_agents = []
|
||
errors = []
|
||
|
||
for step in timeline:
|
||
if step.get("_compressed"):
|
||
continue
|
||
idx = step.get("index", "?")
|
||
at = step.get("actionType", "?")
|
||
state = step.get("actionState", "?")
|
||
|
||
fc = step.get("_file_change")
|
||
if fc:
|
||
fname = fc.get("file", "?")
|
||
if fc.get("original"):
|
||
files_modified.append(fname)
|
||
else:
|
||
files_created.append(fname)
|
||
|
||
if at == "invokeSubAgent":
|
||
inp = step.get("input", {})
|
||
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
|
||
sub_agents.append(agent_name)
|
||
|
||
if state == "Error":
|
||
errors.append(f"Step {idx}: `{at}`")
|
||
|
||
for msg in conversation:
|
||
for e in msg.get("entries", []):
|
||
if e.get("type") == "toolUseResponse" and not e.get("success"):
|
||
errors.append(f"Msg {msg['index']}: `{e.get('name', '?')}`")
|
||
|
||
# 去重文件名
|
||
files_modified = list(dict.fromkeys(files_modified))
|
||
files_created = list(dict.fromkeys(files_created))
|
||
|
||
# description 由外部 LLM 生成(百炼 API),提取阶段不生成
|
||
description = ""
|
||
|
||
return {
|
||
"workflow": log.get("workflowType", "?"),
|
||
"status": log.get("status", "?"),
|
||
"duration_s": round(dur, 1),
|
||
"msg_count": len(conversation),
|
||
"action_count": len(timeline),
|
||
"files_modified": files_modified,
|
||
"files_created": files_created,
|
||
"sub_agents": sub_agents,
|
||
"errors": errors,
|
||
"description": description,
|
||
}
|
||
|
||
|
||
def _render_summary_md(summary: dict, sub_file_map: Optional[dict[str, str]] = None) -> str:
|
||
"""将结构化摘要渲染为 Markdown 文本(放在文件最前面)。"""
|
||
L = []
|
||
# 一句话概览
|
||
status_icon = "✅" if summary["status"] == "succeed" else "❌"
|
||
L.append(f"{status_icon} `{summary['workflow']}` | {summary['duration_s']}s | "
|
||
f"{summary['msg_count']} msgs | {summary['action_count']} actions")
|
||
L.append("")
|
||
|
||
desc = summary.get("description", "")
|
||
if desc:
|
||
L.append(f"> {desc}")
|
||
L.append("")
|
||
|
||
fm = summary["files_modified"]
|
||
fc = summary["files_created"]
|
||
if fm or fc:
|
||
L.append(f"**文件变更** (修改 {len(fm)} / 新建 {len(fc)})")
|
||
for f in fm:
|
||
L.append(f"- ⚡ 修改 `{f}`")
|
||
for f in fc:
|
||
L.append(f"- ✨ 新建 `{f}`")
|
||
L.append("")
|
||
|
||
sa = summary["sub_agents"]
|
||
if sa:
|
||
L.append(f"**子代理** ({len(sa)}): {', '.join(f'`{a}`' for a in sa)}")
|
||
L.append("")
|
||
|
||
errs = summary["errors"]
|
||
if errs:
|
||
L.append(f"**错误** ({len(errs)})")
|
||
for e in errs:
|
||
L.append(f"- ❌ {e}")
|
||
L.append("")
|
||
|
||
if not fm and not fc and not sa and not errs:
|
||
L.append("*(无文件变更、子代理调用或错误)*")
|
||
|
||
return "\n".join(L)
|
||
|
||
|
||
def _build_nav_summary(
|
||
conversation: list[dict],
|
||
timeline: list[dict],
|
||
sub_file_map: Optional[dict[str, str]] = None,
|
||
) -> str:
|
||
"""生成快速导航摘要:文件变更、子代理、错误。"""
|
||
file_changes = []
|
||
sub_agents = []
|
||
errors = []
|
||
|
||
for step in timeline:
|
||
if step.get("_compressed"):
|
||
continue
|
||
idx = step.get("index", "?")
|
||
at = step.get("actionType", "?")
|
||
state = step.get("actionState", "?")
|
||
|
||
# 文件变更
|
||
fc = step.get("_file_change")
|
||
if fc:
|
||
fname = fc.get("file", "?")
|
||
orig = fc.get("original", "")
|
||
action = "修改" if orig else "新建"
|
||
file_changes.append(f"- Step {idx}: ⚡ {action} `{fname}`")
|
||
|
||
# 子代理
|
||
if at == "invokeSubAgent":
|
||
inp = step.get("input", {})
|
||
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
|
||
sub_eid = step.get("subExecutionId", "")
|
||
sub_path = ""
|
||
if sub_file_map and sub_eid and sub_eid in sub_file_map:
|
||
sub_path = f" → `{sub_file_map[sub_eid].replace(chr(92), '/')}`"
|
||
sub_agents.append(f"- Step {idx}: 🔀 `{agent_name}`{sub_path}")
|
||
|
||
# 错误
|
||
if state == "Error":
|
||
errors.append(f"- Step {idx}: ❌ `{at}`")
|
||
|
||
# 对话中的错误工具结果
|
||
for msg in conversation:
|
||
for e in msg.get("entries", []):
|
||
if e.get("type") == "toolUseResponse" and not e.get("success"):
|
||
errors.append(f"- Msg {msg['index']}: ❌ `{e.get('name', '?')}`")
|
||
|
||
lines = []
|
||
if file_changes:
|
||
lines.append(f"**文件变更** ({len(file_changes)})")
|
||
lines.extend(file_changes)
|
||
lines.append("")
|
||
if sub_agents:
|
||
lines.append(f"**子代理调用** ({len(sub_agents)})")
|
||
lines.extend(sub_agents)
|
||
lines.append("")
|
||
if errors:
|
||
lines.append(f"**错误** ({len(errors)})")
|
||
lines.extend(errors)
|
||
lines.append("")
|
||
if not lines:
|
||
lines.append("*(无文件变更、子代理调用或错误)*")
|
||
|
||
return "\n".join(lines)
|
||
|
||
|
||
def generate_full_record(
|
||
log: dict,
|
||
conversation: list[dict],
|
||
timeline: list[dict],
|
||
diffs: dict[str, dict],
|
||
session_info: Optional[dict] = None,
|
||
prompt_log_path: Optional[str] = None,
|
||
is_sub: bool = False,
|
||
sub_index: int = 0,
|
||
sub_file_map: Optional[dict[str, str]] = None,
|
||
prev_msg_count: int = 0,
|
||
) -> tuple[str, dict]:
|
||
"""生成单个 execution 的 Markdown 全量记录。
|
||
|
||
Args:
|
||
log: 原始 execution log JSON
|
||
conversation: parse_messages 输出
|
||
timeline: parse_actions 输出
|
||
diffs: collect_diffs 输出
|
||
session_info: 会话配置(仅主 execution 有)
|
||
prompt_log_path: 匹配的 prompt_log 文件路径
|
||
is_sub: 是否为子代理 execution
|
||
sub_index: 子代理序号(从 1 开始)
|
||
prev_msg_count: 前一轮 execution 的消息数,用于去重(跳过累积的历史消息)
|
||
"""
|
||
L = []
|
||
exec_id = log.get("executionId", "?")
|
||
chat_id = log.get("chatSessionId", "?")
|
||
|
||
# 构建结构化摘要(供 md 和索引共用)
|
||
_summary = _build_execution_summary(log, conversation, timeline, sub_file_map)
|
||
|
||
# 标题
|
||
if is_sub:
|
||
L.append(f"# 子代理 Execution #{sub_index}\n")
|
||
else:
|
||
L.append("# Kiro 会话全量记录\n")
|
||
L.append(f"> 生成时间: {datetime.now(CST).strftime('%Y-%m-%d %H:%M:%S')} CST\n")
|
||
|
||
# ── 0. 执行摘要(文件最前面,AI 读前 20 行即可掌握全貌)──
|
||
L.append("## 📋 执行摘要\n")
|
||
L.append(_render_summary_md(_summary, sub_file_map))
|
||
L.append("")
|
||
|
||
# ── 1. 元数据 ──
|
||
L.append("## 1. 元数据\n")
|
||
L.append("| 字段 | 值 |")
|
||
L.append("|------|-----|")
|
||
L.append(f"| executionId | `{exec_id}` |")
|
||
L.append(f"| chatSessionId | `{chat_id}` |")
|
||
L.append(f"| workflowType | `{log.get('workflowType', '?')}` |")
|
||
L.append(f"| autonomyMode | `{log.get('autonomyMode', '?')}` |")
|
||
L.append(f"| status | `{log.get('status', '?')}` |")
|
||
L.append(f"| startTime | `{ts_fmt(log.get('startTime'))}` |")
|
||
L.append(f"| endTime | `{ts_fmt(log.get('endTime'))}` |")
|
||
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
|
||
L.append(f"| duration | `{dur:.1f}s` |")
|
||
L.append(f"| contextUsage | `{log.get('contextUsagePercentage', 0):.2f}%` |")
|
||
L.append("")
|
||
|
||
if session_info and not is_sub:
|
||
L.append(f"- 会话标题: `{session_info.get('title', '?')}`")
|
||
L.append(f"- 创建时间: `{ts_fmt(int(session_info.get('dateCreated', 0)))}`")
|
||
L.append(f"- 工作区: `{session_info.get('workspaceDirectory', '?')}`")
|
||
L.append("")
|
||
|
||
if prompt_log_path and not is_sub:
|
||
rel = os.path.relpath(prompt_log_path).replace("\\", "/")
|
||
L.append(f"- 关联 prompt_log: `{rel}`")
|
||
L.append("")
|
||
|
||
# ── 2. 用户输入 ──
|
||
L.append("## 2. 用户输入\n")
|
||
input_text = ""
|
||
for msg in log.get("input", {}).get("data", {}).get("messages", []):
|
||
for entry in msg.get("content", msg.get("entries", [])):
|
||
if isinstance(entry, dict) and entry.get("text"):
|
||
input_text += entry["text"] + "\n"
|
||
if input_text.strip():
|
||
L.append(fence(input_text.strip()) + "\n")
|
||
else:
|
||
L.append("*(无用户输入)*\n")
|
||
|
||
# ── 3. 对话记录 ──
|
||
L.append("## 3. 对话记录\n")
|
||
|
||
# 去重:同一 chatSession 的非首轮 execution,context.messages 包含前几轮的累积历史
|
||
# prev_msg_count > 0 时跳过前 N 条,只渲染本轮新增的消息
|
||
new_msgs = conversation[prev_msg_count:] if prev_msg_count > 0 else conversation
|
||
h = sum(1 for m in new_msgs if m["role"] == "human")
|
||
b = sum(1 for m in new_msgs if m["role"] == "bot")
|
||
t = sum(1 for m in new_msgs if m["role"] == "tool")
|
||
if prev_msg_count > 0:
|
||
L.append(f"共 {len(new_msgs)} 条新增消息 (跳过前 {prev_msg_count} 条历史): human={h}, bot={b}, tool={t}\n")
|
||
else:
|
||
L.append(f"共 {len(new_msgs)} 条消息: human={h}, bot={b}, tool={t}\n")
|
||
|
||
for msg in new_msgs:
|
||
emoji = {"human": "👤", "bot": "🤖", "tool": "🔧"}.get(msg["role"], "❓")
|
||
# 生成语义标签
|
||
msg_label = _msg_semantic_label(msg)
|
||
label_suffix = f" — {msg_label}" if msg_label else ""
|
||
|
||
# P0: 压缩 hook 输出的空消息(特征:HUMAN 消息只含 "Output:\nCommand executed..." 或 "Output:\nCommand timed out...")
|
||
if msg["role"] == "human" and len(msg["entries"]) == 1:
|
||
e0 = msg["entries"][0]
|
||
if e0["type"] == "text":
|
||
_txt = (e0.get("text") or "").strip()
|
||
if _txt.startswith("Output:") and ("Exit Code:" in _txt) and len(_txt) < 200:
|
||
# 提取 exit code
|
||
import re as _re
|
||
_ec_match = _re.search(r"Exit Code:\s*(-?\d+)", _txt)
|
||
_ec = _ec_match.group(1) if _ec_match else "?"
|
||
L.append(f"### Msg {msg['index']}: 👤 HUMAN — 🪝 Hook 输出 (exit={_ec})\n")
|
||
continue
|
||
|
||
L.append(f"### Msg {msg['index']}: {emoji} {msg['role'].upper()}{label_suffix}\n")
|
||
|
||
for entry in msg["entries"]:
|
||
et = entry["type"]
|
||
|
||
if et == "system_prompt_ref":
|
||
ref = entry["ref_file"]
|
||
chars = entry["char_count"]
|
||
sp_path = f"docs/audit/session_logs/_system_prompts/{ref}"
|
||
L.append(f"**[系统提示词]** → `{sp_path}` ({chars} 字符)\n")
|
||
|
||
elif et == "steering_ref":
|
||
files = ", ".join(entry["files"])
|
||
chars = entry["char_count"]
|
||
L.append(f"**[Steering]** 引用: {files} ({chars} 字符)\n")
|
||
|
||
elif et == "text":
|
||
text = entry.get("text", "")
|
||
if not text:
|
||
L.append("*(空)*\n")
|
||
else:
|
||
L.append(fence(text) + "\n")
|
||
|
||
elif et == "toolUse":
|
||
name = entry.get("name", "?")
|
||
args = entry.get("args", {})
|
||
L.append(f"**[🔧 调用]** `{name}`\n")
|
||
# P1: strReplace/editCode 的代码变更用 diff 格式展示
|
||
if name in ("strReplace", "editCode") and isinstance(args, dict):
|
||
_path = args.get("path", "?")
|
||
_lang = "python" if _path.endswith(".py") else "sql" if _path.endswith(".sql") else ""
|
||
L.append(f"- 文件: `{_path}`\n")
|
||
_old = args.get("oldStr", args.get("old_str", ""))
|
||
_new = args.get("newStr", args.get("new_str", ""))
|
||
_sel = args.get("selector", "")
|
||
_op = args.get("operation", "")
|
||
_repl = args.get("replacement", "")
|
||
if _sel:
|
||
L.append(f"- selector: `{_sel}`" + (f" ({_op})" if _op else ""))
|
||
if _old:
|
||
L.append(f"- 删除:\n" + fence(trunc(_old, 2000), _lang))
|
||
if _new:
|
||
L.append(f"- 插入:\n" + fence(trunc(_new, 2000), _lang))
|
||
if _repl:
|
||
L.append(f"- 替换为:\n" + fence(trunc(_repl, 2000), _lang))
|
||
L.append("")
|
||
else:
|
||
L.append(fence(safe_json(args, 5000), "json") + "\n")
|
||
|
||
elif et == "toolUseResponse":
|
||
ok = "✅" if entry.get("success") else "❌"
|
||
L.append(f"**[📋 结果]** `{entry.get('name', '?')}` {ok}\n")
|
||
msg_text = entry.get("message", "")
|
||
if msg_text:
|
||
L.append(fence(trunc(msg_text, 5000)) + "\n")
|
||
|
||
elif et == "document":
|
||
target = entry.get("target", "")
|
||
L.append(f"**[📄 文档]** type=`{entry.get('doc_type')}` target=`{target}`\n")
|
||
|
||
elif et == "steering_doc":
|
||
L.append(f"**[📄 Steering]** `{entry.get('name', 'steering')}`\n")
|
||
|
||
else:
|
||
L.append(f"**[{et}]** keys={entry.get('raw_keys')}\n")
|
||
|
||
# ── 4. Actions 时间线 ──
|
||
L.append("## 4. Actions 时间线\n")
|
||
L.append(f"共 {len(timeline)} 个\n")
|
||
|
||
for step in timeline:
|
||
if step.get("_compressed"):
|
||
L.append(f"- `model` [{step.get('actionState')}] @ {step.get('emittedAt')}\n")
|
||
continue
|
||
|
||
at = step.get('actionType', '?')
|
||
state = step.get('actionState', '?')
|
||
# 生成语义标签
|
||
step_label = _step_semantic_label(step)
|
||
L.append(f"### Step {step['index']}: {step_label} @ {step.get('emittedAt','?')}\n")
|
||
if step.get("subExecutionId"):
|
||
sub_eid = step["subExecutionId"]
|
||
L.append(f"- subExecutionId: `{sub_eid}`")
|
||
# 标注子代理文件路径(如果有映射)
|
||
if sub_file_map and sub_eid in sub_file_map:
|
||
sub_path = sub_file_map[sub_eid].replace("\\", "/")
|
||
L.append(f"- 子代理记录: `{sub_path}`")
|
||
if step.get("endTime"):
|
||
L.append(f"- endTime: {step['endTime']}")
|
||
# 文件变更展示
|
||
if step.get("_file_change"):
|
||
fc = step["_file_change"]
|
||
fname = fc.get("file", "?")
|
||
orig = fc.get("original", "")
|
||
mod = fc.get("modified", "")
|
||
lang = "python" if fname.endswith(".py") else "sql" if fname.endswith(".sql") else ""
|
||
L.append(f"- 文件变更: `{fname}`")
|
||
if orig and mod:
|
||
L.append(f" - 修改前 ({len(orig)} 字符):\n" + fence(trunc(orig, 3000), lang))
|
||
L.append(f" - 修改后 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
|
||
elif mod:
|
||
L.append(f" - 新建 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
|
||
# 特殊处理各种 action type 的内容展示
|
||
_at = step.get("actionType", "")
|
||
if _at == "say":
|
||
_say_msg = (step.get("output") or {}).get("message", "")
|
||
if _say_msg:
|
||
L.append(f"- 💬 AI 回复:\n\n{_say_msg}\n")
|
||
else:
|
||
for k in ("intentResult", "input", "output"):
|
||
if k in step:
|
||
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
|
||
elif _at == "invokeSubAgent":
|
||
_sub_input = step.get("input") or {}
|
||
_sub_prompt = _sub_input.get("prompt", "")
|
||
_sub_name = _sub_input.get("name", "") or _sub_input.get("subAgentName", "") or "?"
|
||
if _sub_prompt:
|
||
L.append(f"- 📤 委托 `{_sub_name}`:\n\n{trunc(_sub_prompt, 3000)}\n")
|
||
_sub_output = step.get("output") or {}
|
||
_sub_resp = _sub_output.get("response", "")
|
||
if _sub_resp:
|
||
L.append(f"- 📥 子代理输出:\n\n{_sub_resp}\n")
|
||
elif not _sub_prompt:
|
||
for k in ("intentResult", "input", "output"):
|
||
if k in step:
|
||
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
|
||
elif _at == "subagentResponse":
|
||
_sr_input = step.get("input") or {}
|
||
_sr_resp = _sr_input.get("response", "")
|
||
if _sr_resp:
|
||
L.append(f"- 📥 子代理返回:\n\n{_sr_resp}\n")
|
||
else:
|
||
for k in ("intentResult", "input", "output"):
|
||
if k in step:
|
||
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
|
||
elif _at == "ContextualHookInvoked":
|
||
# P1: hook 执行——提取名称、命令、exitCode
|
||
_hi = step.get("input") or {}
|
||
_ho = step.get("output") or {}
|
||
_h_name = _hi.get("name", "?")
|
||
_h_cmd = _ho.get("command", "")
|
||
_h_result = _ho.get("result", {})
|
||
_h_exit = _h_result.get("exitCode", "?") if isinstance(_h_result, dict) else "?"
|
||
_h_out = _h_result.get("output", "") if isinstance(_h_result, dict) else ""
|
||
if _h_cmd:
|
||
L.append(f"- `$ {_h_cmd}`")
|
||
L.append(f"- Exit: `{_h_exit}`")
|
||
if _h_out and _h_out != "Command executed successfully with no output.":
|
||
L.append(f"- 输出:\n" + fence(trunc(_h_out, 2000)))
|
||
elif _at == "intentClassification":
|
||
# P1: 意图分类——压缩为一行
|
||
_ir = step.get("intentResult", {})
|
||
_cls = _ir.get("classification", "?") if isinstance(_ir, dict) else "?"
|
||
L.append(f"- 分类结果: `{_cls}`")
|
||
elif _at == "runCommand":
|
||
# P0: 命令执行——提取命令、exitCode、输出
|
||
_rc_in = step.get("input") or {}
|
||
_rc_out = step.get("output") or {}
|
||
_rc_cmd = _rc_in.get("command", "") if isinstance(_rc_in, dict) else ""
|
||
_rc_result = _rc_out.get("result", {}) if isinstance(_rc_out, dict) else {}
|
||
_rc_exit = _rc_result.get("exitCode", "?") if isinstance(_rc_result, dict) else "?"
|
||
_rc_output = _rc_result.get("output", "") if isinstance(_rc_result, dict) else ""
|
||
if _rc_cmd:
|
||
L.append(f"- `$ {_rc_cmd}`")
|
||
L.append(f"- Exit: `{_rc_exit}`")
|
||
if _rc_output:
|
||
L.append(f"- 输出:\n" + fence(trunc(_rc_output, 3000)))
|
||
elif _at == "search":
|
||
# P2: 搜索——提取 query 和 why
|
||
_s_in = step.get("input") or {}
|
||
_s_query = _s_in.get("query", "") if isinstance(_s_in, dict) else ""
|
||
_s_why = _s_in.get("why", "") if isinstance(_s_in, dict) else ""
|
||
if _s_query:
|
||
L.append(f"- 🔍 `{trunc(_s_query, 100)}`")
|
||
if _s_why:
|
||
L.append(f"- 原因: {trunc(_s_why, 200)}")
|
||
# 展示搜索结果(如果有)
|
||
_s_out = step.get("output")
|
||
if _s_out and isinstance(_s_out, dict):
|
||
_s_files = _s_out.get("files", [])
|
||
if _s_files:
|
||
L.append(f"- 结果: {len(_s_files)} 个文件")
|
||
elif _at == "steering":
|
||
# P2: steering——提取文件名列表
|
||
_st_in = step.get("input") or {}
|
||
_st_docs = _st_in.get("documents", []) if isinstance(_st_in, dict) else []
|
||
if _st_docs:
|
||
import urllib.parse
|
||
names = []
|
||
for d in _st_docs[:10]:
|
||
if isinstance(d, str):
|
||
# URL 编码的路径,提取文件名
|
||
decoded = urllib.parse.unquote(d)
|
||
name = decoded.rsplit("/", 1)[-1] if "/" in decoded else decoded
|
||
names.append(name)
|
||
if names:
|
||
L.append(f"- 文件: {', '.join(names)}")
|
||
else:
|
||
for k in ("input", "output"):
|
||
if k in step:
|
||
L.append(f"- {k}:\n" + fence(safe_json(step[k], 3000), "json"))
|
||
elif _at == "getDiagnostics":
|
||
# P2: 诊断——提取路径和问题数
|
||
_gd_in = step.get("input") or {}
|
||
_gd_out = step.get("output") or {}
|
||
_gd_paths = _gd_in.get("paths", []) if isinstance(_gd_in, dict) else []
|
||
if _gd_paths:
|
||
L.append(f"- 文件: {', '.join(str(p) for p in _gd_paths[:5])}")
|
||
if isinstance(_gd_out, dict):
|
||
_gd_diags = _gd_out.get("diagnostics", [])
|
||
if isinstance(_gd_diags, list):
|
||
L.append(f"- 问题数: {len(_gd_diags)}")
|
||
for d in _gd_diags[:5]:
|
||
if isinstance(d, dict):
|
||
L.append(f" - {d.get('severity', '?')}: {d.get('message', '?')[:100]}")
|
||
elif not _gd_diags:
|
||
L.append("- ✅ 无问题")
|
||
elif _at in ("readFiles", "readCode"):
|
||
# P3: 文件读取——只展示路径,不展示内容
|
||
_rf_in = step.get("input") or {}
|
||
if isinstance(_rf_in, dict):
|
||
_rf_files = _rf_in.get("files", [])
|
||
paths = []
|
||
for f in _rf_files[:5]:
|
||
if isinstance(f, dict):
|
||
paths.append(f.get("path", "?"))
|
||
else:
|
||
paths.append(str(f))
|
||
if paths:
|
||
L.append(f"- 文件: {', '.join(paths)}")
|
||
else:
|
||
for k in ("intentResult", "input", "output"):
|
||
if k in step:
|
||
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
|
||
L.append("")
|
||
|
||
# ── 5. 资源消耗 ──
|
||
usage = log.get("usageSummary", [])
|
||
if usage:
|
||
L.append("## 5. 资源消耗\n")
|
||
L.append("| 工具 | 消耗 | 单位 |")
|
||
L.append("|------|------|------|")
|
||
total = 0
|
||
for u in usage:
|
||
tools = ", ".join(u.get("usedTools", ["-"]))
|
||
amt = u.get("usage", 0)
|
||
total += amt
|
||
L.append(f"| {tools} | {amt} | {u.get('unit', '?')} |")
|
||
L.append(f"| **合计** | **{total:.4f}** | |")
|
||
L.append("")
|
||
|
||
return "\n".join(L), _summary
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 索引管理
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def load_index() -> dict:
    """Load the slim session index (main conversations only).

    Returns an empty v2 skeleton when the index file is missing or
    unreadable/unparseable.
    """
    try:
        with open(INDEX_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return {"version": 2, "entries": {}}
|
||
|
||
|
||
def load_full_index() -> dict:
    """Load the full session index (main conversations + sub-agents).

    Returns an empty v2 skeleton when the index file is missing or
    unreadable/unparseable.
    """
    try:
        with open(INDEX_FULL_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return {"version": 2, "entries": {}}
|
||
|
||
|
||
def save_index(index: dict):
    """Persist the slim index to disk and refresh its per-day index files."""
    parent_dir = os.path.dirname(INDEX_PATH)
    os.makedirs(parent_dir, exist_ok=True)
    with open(INDEX_PATH, "w", encoding="utf-8") as fh:
        json.dump(index, fh, ensure_ascii=False, indent=2)
    _save_day_indexes(index, "_day_index.json")
|
||
|
||
|
||
def save_full_index(index: dict):
    """Persist the full index to disk and refresh its per-day index files."""
    parent_dir = os.path.dirname(INDEX_FULL_PATH)
    os.makedirs(parent_dir, exist_ok=True)
    with open(INDEX_FULL_PATH, "w", encoding="utf-8") as fh:
        json.dump(index, fh, ensure_ascii=False, indent=2)
    _save_day_indexes(index, "_day_index_full.json")
|
||
|
||
|
||
def _save_day_indexes(index: dict, filename: str):
    """Split the root-level index by the {ym}/{dd} part of each entry's
    output_dir and write one index file per day directory.

    Path layout: docs/audit/session_logs/{ym}/{dd}/{filename}
    Each day-level index contains only the entries whose output_dir lives
    under that day's directory.

    Args:
        index: root index dict ({"version": 2, "entries": {...}}).
        filename: index file name to write into each day dir
            (e.g. "_day_index.json" or "_day_index_full.json").
    """
    entries = index.get("entries", {})
    if not entries:
        return

    # Group entries by their day directory.
    day_groups: dict[str, dict[str, dict]] = {}
    prefix = SESSION_LOG_DIR.replace("\\", "/")
    for eid, ent in entries.items():
        out_dir = ent.get("output_dir", "").replace("\\", "/")
        # Require a path-component boundary after the prefix so sibling
        # directories such as ".../session_logs_backup" are not matched.
        if not out_dir.startswith(prefix + "/"):
            continue
        # out_dir looks like: docs/audit/session_logs/2026-03/03/01_abc12345_013337
        # -> day key "2026-03/03"
        parts = out_dir[len(prefix) + 1:].split("/")
        if len(parts) >= 2:
            day_key = f"{parts[0]}/{parts[1]}"
            day_groups.setdefault(day_key, {})[eid] = ent

    for day_key, day_entries in day_groups.items():
        day_dir = os.path.join(SESSION_LOG_DIR, day_key.replace("/", os.sep))
        os.makedirs(day_dir, exist_ok=True)
        day_idx_path = os.path.join(day_dir, filename)
        day_data = {"version": 2, "entries": day_entries}
        with open(day_idx_path, "w", encoding="utf-8") as f:
            json.dump(day_data, f, ensure_ascii=False, indent=2)
|
||
|
||
|
||
def update_index(index: dict, exec_id: str, output_dir: str, meta: dict,
                 summary: Optional[dict] = None, is_sub: bool = False,
                 parent_exec_id: str = ""):
    """Add one index record, with a structured digest for cheap AI queries.

    When is_sub is True the entry is flagged as a sub-agent record and, if
    given, parent_exec_id is stored alongside it.
    """
    record = {"output_dir": output_dir.replace("\\", "/")}
    # Copy the fixed metadata fields in a stable order.
    for key in ("chatSessionId", "startTime", "endTime", "status", "workflowType"):
        record[key] = meta.get(key, "")
    record["indexed_at"] = datetime.now(CST).isoformat()

    if is_sub:
        record["is_sub"] = True
        if parent_exec_id:
            record["parent_exec_id"] = parent_exec_id

    if summary:
        digest = {k: summary.get(k, 0)
                  for k in ("duration_s", "msg_count", "action_count")}
        digest.update({k: summary.get(k, [])
                       for k in ("files_modified", "files_created",
                                 "sub_agents", "errors")})
        record["summary"] = digest
        if summary.get("description"):
            record["description"] = summary["description"]

    index["entries"][exec_id] = record
|
||
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 主提取逻辑
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def extract_single_execution(
    agent_dir: str,
    hex_dir: str,
    execution: dict,
    session_dir: Optional[str],
    index: dict,
    full_index: Optional[dict] = None,
    sub_mode: bool = False,
    sub_index: int = 0,
    output_base_dir: Optional[str] = None,
    parent_exec_id: str = "",
) -> Optional[str]:
    """Extract a single execution and write its Markdown record to disk.

    Args:
        agent_dir: the kiro.kiroagent directory.
        hex_dir: 32-char hex directory containing the execution.
        execution: execution entry from the manifest.
        session_dir: workspace-sessions subdirectory (used to load session info).
        index: slim index dict (main conversations only).
        full_index: full index dict (main + sub-agents); None to skip writing it.
        sub_mode: whether this call extracts a sub-agent execution.
        sub_index: ordinal of the sub-agent (used in its file name).
        output_base_dir: in sub-agent mode, the output directory (same dir
            as the parent execution).
        parent_exec_id: parent execution ID when extracting a sub-agent.

    Returns:
        The output directory path, or None on failure / already indexed.
    """
    exec_id = execution.get("executionId", "")
    chat_id = execution.get("chatSessionId", "")

    # Skip entries already indexed whose files still exist (sub-agents are
    # not checked here because they follow their main execution).
    if not sub_mode and exec_id in index.get("entries", {}):
        existing_dir = index["entries"][exec_id].get("output_dir", "")
        if existing_dir and os.path.isdir(existing_dir):
            return None
        # Files were cleaned up: drop the stale index entry and re-extract.
        del index["entries"][exec_id]

    # Load the execution log.
    log_path = find_execution_log(agent_dir, hex_dir, execution)
    if not log_path:
        return None

    try:
        with open(log_path, "r", encoding="utf-8") as f:
            log = json.load(f)
    except Exception:
        return None

    # Backfill chatSessionId from the full log (may be absent from newer
    # manifest entries).
    if not chat_id:
        chat_id = log.get("chatSessionId", "")

    # Parse the log into conversation / timeline / diffs.
    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)
    diffs = collect_diffs(agent_dir, hex_dir, execution)

    # Session info (main execution only).
    session_info = None
    if not sub_mode and session_dir and chat_id:
        session_info = load_session_detail(session_dir, chat_id)

    # prompt_log matching (main execution only).
    prompt_log = None
    if not sub_mode:
        start_time = log.get("startTime", 0)
        prompt_log = find_matching_prompt_log(start_time)

    # Execution start time (used as the time suffix in directory/file names).
    _start_ms = log.get("startTime") or execution.get("startTime", 0)
    _ym, _dd, _hms = ts_date_parts(_start_ms)

    # Determine the output directory.
    if sub_mode and output_base_dir:
        out_dir = output_base_dir
    else:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        out_dir = _resolve_chat_dir(day_dir, chat_short, _hms)

    os.makedirs(out_dir, exist_ok=True)

    # Cross-day pointer: when the execution's date differs from the date of
    # the chatSession's directory, drop a _ref_{chatShort}.md under the
    # execution-date day_dir so the record can still be found.
    if not sub_mode:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        # out_dir's parent is the day_dir of the chatSession's first round.
        chat_day_dir = os.path.dirname(out_dir)
        exec_day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        if os.path.normpath(chat_day_dir) != os.path.normpath(exec_day_dir):
            _write_cross_day_ref(exec_day_dir, chat_short, out_dir)

    # Recursively extract sub-agents first, collecting sub_file_map
    # (subExecutionId -> file path) for cross-references in the main record.
    sub_file_map: dict[str, str] = {}
    if not sub_mode:
        sub_exec_ids = extract_sub_execution_ids(actions)
        for si, sub_eid in enumerate(sub_exec_ids, 1):
            sub_execs = find_all_executions(agent_dir, execution_id=sub_eid)
            if sub_execs:
                sub_exec = sub_execs[0]
                extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=sub_exec["_hex_dir"],
                    execution=sub_exec,
                    session_dir=session_dir,
                    index=index,
                    full_index=full_index,
                    sub_mode=True,
                    sub_index=si,
                    output_base_dir=out_dir,
                    parent_exec_id=exec_id,
                )
                sub_filename = f"sub_{si:02d}_{sub_eid[:8]}.md"
                sub_file_map[sub_eid] = os.path.join(out_dir, sub_filename)

    # Compute prev_msg_count: the cumulative message count of the previous
    # execution rounds of the same chatSession. Used for dedup — skips the
    # history accumulated in context.messages.
    # Strategy: parse the msg count out of existing main_*.md files in the
    # same directory (not from the index, which may not yet be merged when
    # extracting in parallel).
    prev_msg_count = 0
    existing_mains = []
    cur_msg_count = len(conversation)
    if not sub_mode and out_dir and os.path.isdir(out_dir):
        existing_mains = sorted(
            f for f in os.listdir(out_dir)
            if f.startswith("main_") and f.endswith(".md")
        )
        if existing_mains:
            # Parse msg counts from each existing main file's summary line.
            # Summary format: ✅ `chat-agent` | 1406.6s | 44 msgs | 266 actions
            # Also read the conversation section's "skipped first N history"
            # line to get the cumulative count:
            #   cumulative = that round's prev_msg_count + its new msg_count
            import re as _re_prev
            for _mf in existing_mains:
                try:
                    _mf_path = os.path.join(out_dir, _mf)
                    _mf_msgs = 0
                    _mf_skipped = 0
                    with open(_mf_path, "r", encoding="utf-8") as _fh:
                        # Only scan the file head; the counts appear early.
                        for _ln_idx, _ln in enumerate(_fh):
                            if _ln_idx > 200:
                                break
                            # Summary line, e.g. "| 44 msgs |"
                            _mc_match = _re_prev.search(r'\|\s*(\d+)\s*msgs\s*\|', _ln)
                            if _mc_match:
                                _mf_msgs = int(_mc_match.group(1))
                            # Conversation line: 共 23 条新增消息 (跳过前 44 条历史)
                            _skip_match = _re_prev.search(r'共\s*(\d+)\s*条新增消息\s*\(跳过前\s*(\d+)\s*条', _ln)
                            if _skip_match:
                                _mf_msgs = int(_skip_match.group(1))
                                _mf_skipped = int(_skip_match.group(2))
                                break
                            # Conversation line (first round): 共 11 条消息:
                            _full_match = _re_prev.search(r'共\s*(\d+)\s*条消息:', _ln)
                            if _full_match and _mf_skipped == 0:
                                _mf_msgs = int(_full_match.group(1))
                    # Cumulative count = skipped history + that round's new msgs.
                    _cumulative = _mf_skipped + _mf_msgs
                    if _cumulative > prev_msg_count and _cumulative < cur_msg_count:
                        prev_msg_count = _cumulative
                except Exception:
                    pass

    # Generate the Markdown + structured summary (main execution carries
    # sub_file_map for sub-agent cross-references).
    md, summary = generate_full_record(
        log=log,
        conversation=conversation,
        timeline=timeline,
        diffs=diffs,
        session_info=session_info,
        prompt_log_path=prompt_log,
        is_sub=sub_mode,
        sub_index=sub_index,
        sub_file_map=sub_file_map if not sub_mode else None,
        prev_msg_count=prev_msg_count,
    )

    # Write out the record file.
    if sub_mode:
        filename = f"sub_{sub_index:02d}_{exec_id[:8]}.md"
    else:
        # Auto-numbering: reuse existing_mains scanned above (in the
        # prev_msg_count branch); rescan if that branch was not entered.
        if not existing_mains and os.path.isdir(out_dir):
            existing_mains = sorted(
                f for f in os.listdir(out_dir)
                if f.startswith("main_") and f.endswith(".md")
            )
        main_idx = len(existing_mains) + 1
        filename = f"main_{main_idx:02d}_{exec_id[:8]}.md"

    filepath = os.path.join(out_dir, filename)
    # Surrogate characters (e.g. \udccb) cannot be encoded as UTF-8;
    # replace them with U+FFFD.
    md_safe = md.encode("utf-8", errors="surrogateescape").decode("utf-8", errors="replace")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(md_safe)

    # Update the indexes.
    _meta = {
        "chatSessionId": chat_id,
        "startTime": ts_fmt(log.get("startTime")),
        "endTime": ts_fmt(log.get("endTime")),
        "status": log.get("status", ""),
        "workflowType": log.get("workflowType", ""),
    }
    if not sub_mode:
        # Main conversation: written to both the slim and the full index.
        update_index(index, exec_id, out_dir, _meta, summary=summary)
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta, summary=summary)
    else:
        # Sub-agent: written to the full index only.
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta,
                         summary=summary, is_sub=True,
                         parent_exec_id=parent_exec_id)

    return out_dir
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 入口函数
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def extract_latest(global_storage: Optional[str] = None, workspace_path: Optional[str] = None):
    """Extract all unindexed terminal-state executions (called by agent_on_stop).

    Executions of the same chatSession are extracted in ascending startTime
    order so that prev_msg_count is computed correctly. The current round is
    usually still in "running" state when agent-on-stop fires; it is picked
    up on the next invocation.
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()

    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        return

    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        # Support both chatSessionId (old) and sessionId (new) field names.
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}

    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")

    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        return

    index = load_index()
    full_index = load_full_index()

    # Filter: not yet indexed AND in a terminal state.
    ready = [e for e in all_execs
             if e.get("executionId", "") not in index.get("entries", {})
             and e.get("status", "") in TERMINAL_STATUSES]

    if not ready:
        return

    # Group by chatSessionId; within each group, sort by startTime ascending.
    # In-order extraction per conversation is required for prev_msg_count
    # (and thus message dedup) to come out right.
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in ready:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))

    extracted_count = 0
    for cid, group_execs in chat_groups.items():
        for execution in group_execs:
            result = extract_single_execution(
                agent_dir=agent_dir,
                hex_dir=execution["_hex_dir"],
                execution=execution,
                session_dir=session_dir,
                index=index,
                full_index=full_index,
            )
            if result:
                extracted_count += 1
                print(f"[session-extract] extracted: {result}")

    # Persist the indexes only when something was actually extracted.
    if extracted_count > 0:
        save_index(index)
        save_full_index(full_index)
        if extracted_count > 1:
            print(f"[session-extract] total: {extracted_count} executions")
|
||
|
||
|
||
def extract_all_unindexed(
    global_storage: Optional[str] = None,
    workspace_path: Optional[str] = None,
    limit: Optional[int] = None,
    workers: int = 8,
):
    """Extract all unindexed executions, parallelized across chat sessions.

    Each chatSession group is extracted serially (required for message
    dedup); groups run concurrently on a thread pool of *workers* threads.
    The indexes are checkpoint-saved every 50 extracted executions and once
    more at the end.
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()

    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return

    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        # Support both chatSessionId (old) and sessionId (new) field names.
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}

    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[session-extract] no executions found")
        return

    index = load_index()
    full_index = load_full_index()

    # Keep only unindexed executions in a terminal state
    # (skip "running" and other non-terminal states).
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
    todo = [e for e in all_execs
            if e.get("executionId", "") not in index.get("entries", {})
            and e.get("status", "") in TERMINAL_STATUSES]
    if limit:
        todo = todo[:limit]
    if not todo:
        print("[session-extract] all indexed, nothing to do")
        return

    print(f"[session-extract] {len(todo)} executions to extract (workers={workers})")

    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Group by chatSessionId; each group is extracted serially sorted by
    # startTime (context.messages accumulates within a chatSession, so
    # in-order extraction is required for dedup).
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in todo:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))

    lock = threading.Lock()
    count = 0

    def _extract_group(group_execs):
        """Serially extract all executions of one chatSession.

        Uses thread-local index dicts; the caller merges them under the
        lock so workers never mutate the shared indexes concurrently.
        """
        local_index = {"version": 2, "entries": {}}
        local_full = {"version": 2, "entries": {}}
        results = []
        for execution in group_execs:
            try:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=execution["_hex_dir"],
                    execution=execution,
                    session_dir=session_dir,
                    index=local_index,
                    full_index=local_full,
                )
                if result:
                    results.append(result)
            except Exception as e:
                eid = execution.get("executionId", "?")[:8]
                print(f"[session-extract] ✗ {eid}: {e}")
        return results, local_index.get("entries", {}), local_full.get("entries", {})

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_extract_group, execs): cid
                   for cid, execs in chat_groups.items()}
        for future in as_completed(futures):
            results, idx_entries, full_entries = future.result()
            if results:
                with lock:
                    count += len(results)
                    # Merge the worker's thread-local entries into the
                    # shared indexes.
                    index["entries"].update(idx_entries)
                    full_index["entries"].update(full_entries)
                    if count % 50 == 0:
                        save_index(index)
                        save_full_index(full_index)
                        print(f"[session-extract] [{count}/{len(todo)}] checkpoint saved")
                    elif count % 10 == 0:
                        print(f"[session-extract] [{count}/{len(todo)}]")

    # Final save.
    save_index(index)
    save_full_index(full_index)
    print(f"[session-extract] done, extracted {count}/{len(todo)}")
|
||
|
||
|
||
def extract_by_id(
    execution_id: str,
    global_storage: Optional[str] = None,
):
    """Extract the execution matching the given executionId (prefix match)."""
    storage = global_storage or DEFAULT_GLOBAL_STORAGE
    agent_dir = find_kiro_agent_dir(storage)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return

    candidates = find_all_executions(agent_dir, execution_id=execution_id)
    # Verify we really hit the requested execution (prefix match).
    if not candidates or not candidates[0].get("executionId", "").startswith(execution_id):
        print(f"[session-extract] execution not found: {execution_id}")
        return
    target = candidates[0]

    index = load_index()
    full_index = load_full_index()
    out_dir = extract_single_execution(
        agent_dir=agent_dir,
        hex_dir=target["_hex_dir"],
        execution=target,
        session_dir=None,
        index=index,
        full_index=full_index,
    )
    if out_dir:
        save_index(index)
        save_full_index(full_index)
        print(f"[session-extract] extracted: {out_dir}")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CLI 入口
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def main():
    """CLI entry point: parse arguments and dispatch to an extraction mode."""
    import argparse
    parser = argparse.ArgumentParser(description="Kiro 执行日志全量提取器 v2")
    parser.add_argument("--all", action="store_true", help="提取所有未索引的 execution")
    parser.add_argument("--recent", type=int, metavar="N", help="提取最近 N 条未索引的")
    parser.add_argument("--workers", type=int, default=8, help="并行线程数(默认 8)")
    parser.add_argument("--execution-id", type=str, help="提取指定 executionId")
    parser.add_argument("--global-storage", type=str, help="globalStorage 路径")
    parser.add_argument("--workspace", type=str, help="workspace 路径")
    args = parser.parse_args()

    storage = args.global_storage
    workspace = args.workspace

    # Dispatch precedence: explicit id > --all > --recent > default (latest).
    if args.execution_id:
        extract_by_id(args.execution_id, global_storage=storage)
    elif args.all:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              workers=args.workers)
    elif args.recent:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              limit=args.recent, workers=args.workers)
    else:
        extract_latest(global_storage=storage, workspace_path=workspace)
|
||
|
||
|
||
# Script entry point: run the CLI only when executed directly.
if __name__ == "__main__":
    main()
|