Files
Neo-ZQYY/scripts/ops/extract_kiro_session.py.bak

1896 lines
76 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""extract_kiro_session — Kiro 执行日志全量提取器 v2。
改进点(相比 v1
1. 系统提示词去重:首次保存到 _system_prompts/sp_{hash8}.md后续引用
2. 目录分层YYYY-MM/DD/{chatSessionId_short}/ 下按对话组织
3. 子代理递归提取:主 execution + 子 execution 放同一目录,按调用顺序编号
4. ID 替换kiro-diff URI → 真实文件路径terminalId → 进程描述
5. CONTEXT TRANSFER 中的 steering 内容折叠引用
6. 无内容的 model action 压缩为一行
用法:
python scripts/ops/extract_kiro_session.py # 提取最新 execution
python scripts/ops/extract_kiro_session.py --all # 提取所有未索引的
python scripts/ops/extract_kiro_session.py --recent 20 # 提取最近 N 个未索引的
python scripts/ops/extract_kiro_session.py --execution-id XX # 提取指定 execution
"""
import base64
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone, timedelta
from typing import Optional
from _env_paths import ensure_repo_root
ensure_repo_root()
CST = timezone(timedelta(hours=8))
# Kiro 固定的 execution manifest 文件名
MANIFEST_FILENAME = "f62de366d0006e17ea00a01f6624aabf"
# 输出路径
SESSION_LOG_DIR = os.path.join("docs", "audit", "session_logs")
INDEX_PATH = os.path.join(SESSION_LOG_DIR, "_session_index.json") # 精简版:仅主对话
INDEX_FULL_PATH = os.path.join(SESSION_LOG_DIR, "_session_index_full.json") # 完整版:主对话 + 子代理
SYSTEM_PROMPTS_DIR = os.path.join(SESSION_LOG_DIR, "_system_prompts")
# globalStorage 默认路径
DEFAULT_GLOBAL_STORAGE = os.path.join(
os.environ.get("APPDATA", ""),
"Kiro", "User", "globalStorage"
)
# ═══════════════════════════════════════════════════════════
# 工具函数
# ═══════════════════════════════════════════════════════════
def ts_fmt(ms) -> str:
    """Render an epoch-millisecond timestamp as 'YYYY-MM-DD HH:MM:SS' in CST.

    Falsy input yields "N/A"; unconvertible input falls back to str(ms).
    """
    if not ms:
        return "N/A"
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
        return moment.strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return str(ms)
def ts_iso(ms) -> str:
    """Epoch-ms → ISO-8601 string in CST; empty string for falsy or bad input."""
    if not ms:
        return ""
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
        return moment.isoformat()
    except Exception:
        return ""
def ts_date_parts(ms) -> tuple[str, str, str]:
    """Return (YYYY-MM, DD, HHMMSS) in CST for directory layout / file naming.

    Any conversion failure yields the sentinel ("unknown", "00", "000000").
    """
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
    except Exception:
        return "unknown", "00", "000000"
    return moment.strftime("%Y-%m"), moment.strftime("%d"), moment.strftime("%H%M%S")
def _resolve_chat_dir(day_dir: str, chat_short: str, first_hms: str) -> str:
    """Pick (or create) the per-chatSession output directory, numbered per day.

    Resolution order:
      1. Look in day_dir for an existing directory of the same chatSession
         (later rounds of a conversation on the same day).
      2. Search the whole SESSION_LOG_DIR tree for the cross-day case
         (the chatSession's first round happened on another date).
      3. Otherwise allocate the next two-digit sequence number in day_dir.
    Directory name format: {seq:02d}_{chat_short}_{first_hms}/
    """
    os.makedirs(day_dir, exist_ok=True)
    # 1. Same-day lookup: any existing dir whose name contains the chat id.
    for d in os.listdir(day_dir):
        if os.path.isdir(os.path.join(day_dir, d)) and chat_short in d:
            return os.path.join(day_dir, d)
    # 2. Cross-day search: walk every YYYY-MM/DD/ dir under the log root
    #    (underscore-prefixed dirs like _system_prompts are skipped).
    log_root = SESSION_LOG_DIR
    if os.path.isdir(log_root):
        for ym in os.listdir(log_root):
            ym_path = os.path.join(log_root, ym)
            if not os.path.isdir(ym_path) or ym.startswith("_"):
                continue
            for dd in os.listdir(ym_path):
                dd_path = os.path.join(ym_path, dd)
                if not os.path.isdir(dd_path):
                    continue
                for d in os.listdir(dd_path):
                    if os.path.isdir(os.path.join(dd_path, d)) and chat_short in d:
                        return os.path.join(dd_path, d)
    # 3. New chatSession: allocate one past the highest two-digit prefix
    #    already present in day_dir.
    existing_seqs = []
    for d in os.listdir(day_dir):
        if os.path.isdir(os.path.join(day_dir, d)) and len(d) >= 2 and d[:2].isdigit():
            existing_seqs.append(int(d[:2]))
    next_seq = max(existing_seqs, default=0) + 1
    new_dir = os.path.join(day_dir, f"{next_seq:02d}_{chat_short}_{first_hms}")
    os.makedirs(new_dir, exist_ok=True)
    return new_dir
def _write_cross_day_ref(exec_day_dir: str, chat_short: str, chat_dir: str):
"""在 execution 所在日期目录下生成跨天指引文件。
当一个 chatSession 跨天时,后续日期的 day_dir 下不会有该对话的目录,
生成 _ref_{chatShort}.md 告知该对话归在哪个目录。
"""
os.makedirs(exec_day_dir, exist_ok=True)
ref_path = os.path.join(exec_day_dir, f"_ref_{chat_short}.md")
if os.path.isfile(ref_path):
return # 已存在,不重复写
rel_target = os.path.relpath(chat_dir, exec_day_dir).replace("\\", "/")
with open(ref_path, "w", encoding="utf-8") as f:
f.write(f"# 跨天对话指引\n\n")
f.write(f"chatSession `{chat_short}` 的完整记录归档在:\n\n")
f.write(f"→ `{rel_target}`\n\n")
f.write(f"(绝对路径:`{chat_dir.replace(chr(92), '/')}`\n")
def trunc(s, n=3000) -> str:
    """Clamp string s to n chars, appending a truncation note; non-str → str(s)."""
    if not isinstance(s, str):
        return str(s)
    if len(s) <= n:
        return s
    return s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
def safe_json(obj, n=5000) -> str:
    """Pretty-print obj as JSON (non-ASCII kept); str() fallback; clamp to n chars."""
    try:
        rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception:
        rendered = str(obj)
    if len(rendered) <= n:
        return rendered
    return rendered[:n] + f"\n... [截断,原文共 {len(rendered)} 字符]"
def fence(content: str, lang: str = "") -> str:
    """Build a Markdown code fence that safely wraps arbitrary content.

    - Scans content for its longest run of backticks and uses one more
      backtick for the outer fence, so inner fences cannot close it.
    - Escapes line-leading '#' so lines are not parsed as headings.
    - If the content itself contains unclosed fences (truncated raw data),
      closing fences are appended before wrapping.
    """
    if not content:
        return f"```{lang}\n```"
    # Repair unclosed fences inside the content (caused by truncated raw data).
    fence_stack = []
    for line in content.split("\n"):
        stripped = line.strip()
        m = re.match(r"^(`{3,})", stripped)
        if m:
            ticks = len(m.group(1))
            # Pop only when the line is a bare closing fence whose tick count
            # matches the top of the stack; anything else opens a new fence.
            if fence_stack and fence_stack[-1] == ticks and stripped == '`' * ticks:
                fence_stack.pop()
            else:
                fence_stack.append(ticks)
    # Append closers for every fence still open (innermost first).
    if fence_stack:
        suffix_lines = ['`' * t for t in reversed(fence_stack)]
        content = content + "\n" + "\n".join(suffix_lines)
    # Longest backtick run in the content decides the outer fence width.
    max_ticks = 2
    cur = 0
    for ch in content:
        if ch == '`':
            cur += 1
            if cur > max_ticks:
                max_ticks = cur
        else:
            cur = 0
    outer = '`' * (max_ticks + 1)
    # Escape leading '#' — a zero-width space keeps it from becoming a heading.
    safe = _escape_heading(content)
    return f"{outer}{lang}\n{safe}\n{outer}"
def _escape_heading(text: str) -> str:
"""转义文本中行首的 # 符号,防止被 Markdown 解析为标题。
在 # 前插入零宽空格 (\\u200b)。
"""
lines = text.split('\n')
out = []
for line in lines:
if line.lstrip().startswith('#'):
# 找到第一个 # 的位置,在前面插入零宽空格
idx = 0
while idx < len(line) and line[idx] in (' ', '\t'):
idx += 1
out.append(line[:idx] + '\u200b' + line[idx:])
else:
out.append(line)
return '\n'.join(out)
def hash8(text: str) -> str:
    """First 8 hex chars of the SHA-256 digest of text (UTF-8 encoded)."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return digest[:8]
# ═══════════════════════════════════════════════════════════
# 系统提示词去重
# ═══════════════════════════════════════════════════════════
def save_system_prompt(text: str) -> str:
    """Persist a system prompt under _system_prompts/ and return its filename.

    Files are content-addressed as sp_{hash8}.md, so an identical prompt is
    stored once; later occurrences reuse the existing file.
    """
    h = hash8(text)
    filename = f"sp_{h}.md"
    filepath = os.path.join(SYSTEM_PROMPTS_DIR, filename)
    if not os.path.isfile(filepath):
        os.makedirs(SYSTEM_PROMPTS_DIR, exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# 系统提示词 (hash: {h})\n\n")
            f.write(text)
    return filename
def is_system_prompt(text: str) -> bool:
    """Heuristic: does text look like a Kiro-injected system prompt?

    Matches <identity> in the first 200 chars, <capabilities> in the first
    500 chars, or a stripped prefix of "You are Kiro".
    """
    if not text:
        return False
    if "<identity>" in text[:200]:
        return True
    if "<capabilities>" in text[:500]:
        return True
    return text.strip().startswith("You are Kiro")
def is_steering_block(text: str) -> bool:
    """True when the text opens with a <steering-reminder> injection."""
    head = text[:100]
    return "<steering-reminder>" in head
# ═══════════════════════════════════════════════════════════
# ID 替换与路径还原
# ═══════════════════════════════════════════════════════════
# kiro-diff URI 模式kiro-diff:/path?commitId=xxx&executionId=yyy
KIRO_DIFF_PATTERN = re.compile(
r'kiro-diff:(/[^?]+)\?commitId(?:%3D|=)([^&]+)&executionId(?:%3D|=)([^"\'}\s]+)'
)
def resolve_kiro_diff_uri(uri: str) -> str:
    """Translate a kiro-diff: URI into '<path> (版本: <commit8>)'.

    Anything that does not match KIRO_DIFF_PATTERN passes through unchanged.
    """
    match = KIRO_DIFF_PATTERN.search(uri)
    if not match:
        return uri
    path, commit = match.group(1), match.group(2)
    return f"{path} (版本: {commit[:8]})"
def resolve_ids_in_text(text: str) -> str:
    """Replace known opaque ID patterns in text with human-readable forms.

    - kiro-diff URIs become '[文件快照] <path> (版本 <commit8>)'
    - 'file:///c%3A/'-style encoded drive prefixes become 'C:/'
    Falsy input → ""; other non-str input → str(text).
    """
    if not text or not isinstance(text, str):
        return str(text) if text else ""

    def _diff_repl(m):
        return f'[文件快照] {m.group(1)} (版本 {m.group(2)[:8]})'

    def _drive_repl(m):
        return f'{m.group(1).upper()}:/'

    text = KIRO_DIFF_PATTERN.sub(_diff_repl, text)
    text = re.sub(r'file:///([a-zA-Z])%3A/', _drive_repl, text)
    return text
def resolve_tool_args(name: str, args: dict) -> dict:
    """Return a copy of tool-call args with known ID patterns made readable."""
    readable = dict(args)
    # 'target' on document-type calls may be a kiro-diff URI.
    target = readable.get("target")
    if isinstance(target, str):
        readable["target"] = resolve_kiro_diff_uri(target)
    # Paths (editCode/strReplace) and inline document fields get generic
    # ID resolution.
    for key in ("path", "modified", "original", "local"):
        value = readable.get(key)
        if isinstance(value, str):
            readable[key] = resolve_ids_in_text(value)
    return readable
# ═══════════════════════════════════════════════════════════
# 定位逻辑
# ═══════════════════════════════════════════════════════════
def find_kiro_agent_dir(global_storage: str) -> Optional[str]:
    """Path to the kiro.kiroagent dir under global_storage, or None if absent."""
    candidate = os.path.join(global_storage, "kiro.kiroagent")
    if os.path.isdir(candidate):
        return candidate
    return None
def decode_base64url_dir(dirname: str) -> str:
    """Decode a Kiro workspace dir name (base64url, '__' stands for '==').

    Returns "" when the name cannot be decoded.
    """
    try:
        padded = dirname.replace("__", "==")
        raw = base64.urlsafe_b64decode(padded)
        return raw.decode("utf-8", errors="replace")
    except Exception:
        return ""
def find_workspace_session_dir(agent_dir: str, workspace_path: str) -> Optional[str]:
    """Find the workspace-sessions subdir whose encoded name is workspace_path.

    Both sides are normalized before comparison: separators to '/', trailing
    slashes stripped, lower-cased (Windows paths are case-insensitive).
    """
    ws_root = os.path.join(agent_dir, "workspace-sessions")
    if not os.path.isdir(ws_root):
        return None
    wanted = workspace_path.replace("\\", "/").rstrip("/").lower()
    for entry in os.scandir(ws_root):
        if not entry.is_dir():
            continue
        decoded = decode_base64url_dir(entry.name)
        if decoded.replace("\\", "/").rstrip("/").lower() == wanted:
            return entry.path
    return None
def load_sessions_json(session_dir: str) -> list[dict]:
    """Read session_dir/sessions.json; accept a bare list or {"sessions": [...]}.

    Returns [] when the file is missing, unreadable, or oddly shaped.
    """
    path = os.path.join(session_dir, "sessions.json")
    if not os.path.isfile(path):
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except Exception:
        return []
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        return payload.get("sessions", [])
    return []
def load_session_detail(session_dir: str, session_id: str) -> Optional[dict]:
    """Load session_dir/{session_id}.json; None when missing or unparsable."""
    detail_path = os.path.join(session_dir, f"{session_id}.json")
    if not os.path.isfile(detail_path):
        return None
    try:
        with open(detail_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None
def find_all_manifests(agent_dir: str) -> list[tuple[str, str]]:
    """List (hex_dir_name, manifest_path) for every 32-hex-char directory in
    agent_dir that contains the fixed Kiro manifest file."""
    hex_name = re.compile(r"^[0-9a-f]{32}$")
    found = []
    for entry in os.scandir(agent_dir):
        if not entry.is_dir() or not hex_name.match(entry.name):
            continue
        manifest = os.path.join(entry.path, MANIFEST_FILENAME)
        if os.path.isfile(manifest):
            found.append((entry.name, manifest))
    return found
def load_manifest(manifest_path: str) -> list[dict]:
    """Parse a Kiro execution manifest; [] on any error.

    Accepts both the legacy bare-list format and the newer
    {"executions": [...], "version": ...} wrapper.
    """
    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except Exception:
        return []
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "executions" in payload:
        return payload["executions"]
    return []
def find_execution_log(agent_dir: str, hex_dir: str, execution: dict) -> Optional[str]:
    """Locate the full execution-log JSON file for one execution.

    Scans every file two levels below agent_dir/hex_dir; a candidate must be
    at least 1000 bytes, mention the executionId within its first 500 chars,
    and parse as JSON whose executionId matches exactly.  None if not found.
    """
    exec_id = execution.get("executionId", "")
    root = os.path.join(agent_dir, hex_dir)
    for sub in os.scandir(root):
        if not sub.is_dir():
            continue
        for candidate in os.scandir(sub.path):
            # Tiny files can never hold a full execution log.
            if not candidate.is_file() or candidate.stat().st_size < 1000:
                continue
            try:
                with open(candidate.path, "r", encoding="utf-8") as f:
                    if exec_id not in f.read(500):
                        continue
                    f.seek(0)
                    data = json.load(f)
                if data.get("executionId") == exec_id:
                    return candidate.path
            except Exception:
                continue
    return None
def find_all_executions(
    agent_dir: str,
    chat_session_ids: Optional[set[str]] = None,
    execution_id: Optional[str] = None,
) -> list[dict]:
    """Collect matching executions from every manifest, newest endTime first.

    Args:
        agent_dir: the kiro.kiroagent directory.
        chat_session_ids: when given, executions carrying a chatSessionId
            outside this set are dropped (entries without one are kept).
        execution_id: when given, return exactly the first execution whose
            id equals or starts with it, or [] when none matches.

    Returns:
        List of manifest entries, each annotated with "_hex_dir" (the name
        of the manifest's containing hex directory).

    Bug fix: previously, when execution_id was supplied but never matched,
    the loop fell through and returned ALL executions sorted by endTime —
    silently selecting a wrong execution downstream.  Id-lookup mode now
    never accumulates and returns [] on a miss.
    """
    all_execs = []
    for hex_dir, manifest_path in find_all_manifests(agent_dir):
        for entry in load_manifest(manifest_path):
            entry["_hex_dir"] = hex_dir
            if execution_id:
                eid = entry.get("executionId", "")
                if eid == execution_id or eid.startswith(execution_id):
                    return [entry]
                continue  # id-lookup mode: never accumulate non-matches
            # Filter only when the execution has a chatSessionId that falls
            # outside the workspace's session set.
            csid = entry.get("chatSessionId")
            if chat_session_ids and csid and csid not in chat_session_ids:
                continue
            all_execs.append(entry)
    if execution_id:
        return []  # explicit miss instead of falling back to everything
    all_execs.sort(key=lambda e: e.get("endTime", 0), reverse=True)
    return all_execs
# ═══════════════════════════════════════════════════════════
# 解析逻辑
# ═══════════════════════════════════════════════════════════
def parse_messages(messages: list) -> list[dict]:
    """Parse context.messages into a normalized conversation list.

    Per entry:
      - system prompts are deduplicated into a file reference
        (via save_system_prompt), keeping only filename + char count
      - steering blocks are collapsed to their referenced .md file names
      - tool calls/responses and documents get ID patterns made readable
      - unknown entry types are kept as a type + key-list stub
    Non-dict entries are skipped.
    """
    conversation = []
    for i, msg in enumerate(messages):
        entries = msg.get("entries", [])
        parsed = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            etype = entry.get("type", "unknown")
            if etype == "text":
                text = entry.get("text", "")
                # System prompt → store once, keep only a reference.
                if is_system_prompt(text):
                    sp_file = save_system_prompt(text)
                    parsed.append({
                        "type": "system_prompt_ref",
                        "ref_file": sp_file,
                        "char_count": len(text),
                    })
                elif is_steering_block(text):
                    # Steering content is folded; keep just the file names.
                    steering_files = re.findall(r'(\w[\w-]+\.md):', text)
                    parsed.append({
                        "type": "steering_ref",
                        "files": steering_files or ["(steering block)"],
                        "char_count": len(text),
                    })
                else:
                    parsed.append({"type": "text", "text": resolve_ids_in_text(text)})
            elif etype == "toolUse":
                args = resolve_tool_args(entry.get("name", ""), entry.get("args", {}))
                parsed.append({
                    "type": "toolUse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "args": args,
                })
            elif etype == "toolUseResponse":
                msg_text = entry.get("message", "")
                parsed.append({
                    "type": "toolUseResponse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "message": resolve_ids_in_text(msg_text),
                    "success": entry.get("success"),
                })
            elif etype == "document":
                doc = entry.get("document", {})
                doc_type = doc.get("type", "")
                target = doc.get("target", "")
                # Steering documents keep only their display name.
                if doc_type == "steering":
                    display_name = doc.get("displayName", "")
                    parsed.append({
                        "type": "steering_doc",
                        "name": display_name or "steering",
                    })
                else:
                    parsed.append({
                        "type": "document",
                        "doc_type": doc_type,
                        "target": resolve_ids_in_text(target) if target else "",
                    })
            else:
                # Unknown entry type: record its shape for later inspection.
                parsed.append({"type": etype, "raw_keys": list(entry.keys())})
        conversation.append({
            "index": i,
            "role": msg.get("role", "?"),
            "messageId": msg.get("messageId", "?"),
            "entries": parsed,
        })
    return conversation
def parse_actions(actions: list) -> list[dict]:
    """Parse the actions array into a readable timeline.

    Model actions with neither input nor output are compressed into a
    one-line stub (_compressed=True).  write/create inputs that inline
    originalContent/modifiedContent get those blobs lifted into a
    "_file_change" sub-dict, with only slim metadata kept in `input`.
    String fields in intentResult/input/output dicts get ID resolution.
    """
    timeline = []
    for i, action in enumerate(actions):
        atype = action.get("actionType", "")
        astate = action.get("actionState", "")
        # Content-less model action → compressed one-line summary.
        if atype == "model" and "output" not in action and "input" not in action:
            timeline.append({
                "index": i,
                "actionType": "model",
                "actionState": astate,
                "emittedAt": ts_fmt(action.get("emittedAt")),
                "_compressed": True,
            })
            continue
        entry = {
            "index": i,
            "actionId": action.get("actionId"),
            "actionType": atype,
            "actionState": astate,
            "emittedAt": ts_fmt(action.get("emittedAt")),
        }
        if action.get("subExecutionId"):
            entry["subExecutionId"] = action["subExecutionId"]
        if action.get("endTime"):
            entry["endTime"] = ts_fmt(action["endTime"])
        for k in ("intentResult", "input", "output"):
            if k in action:
                val = action[k]
                # Apply ID resolution to string fields of dict payloads.
                if isinstance(val, dict):
                    val = dict(val)  # copy — never mutate the raw log data
                    # Inline file change (write/create): lift the content into
                    # _file_change, keep only slim metadata under `input`.
                    if k == "input" and ("originalContent" in val or "modifiedContent" in val):
                        file_path = val.get("file", val.get("path", "?"))
                        entry["_file_change"] = {
                            "file": resolve_ids_in_text(str(file_path)),
                            "original": val.get("originalContent", ""),
                            "modified": val.get("modifiedContent", ""),
                        }
                        # Strip the large text blobs from the stored input.
                        slim = {vk: resolve_ids_in_text(str(vv)) if isinstance(vv, str) else vv
                                for vk, vv in val.items()
                                if vk not in ("originalContent", "modifiedContent")}
                        entry[k] = slim
                        continue  # continues the (intentResult, input, output) loop
                    for vk, vv in val.items():
                        if isinstance(vv, str):
                            val[vk] = resolve_ids_in_text(vv)
                entry[k] = val
        timeline.append(entry)
    return timeline
def extract_sub_execution_ids(actions: list) -> list[str]:
    """Unique subExecutionIds from actions, in first-appearance order."""
    # dict.fromkeys preserves insertion order while deduplicating.
    ordered = dict.fromkeys(
        a.get("subExecutionId") for a in actions if a.get("subExecutionId")
    )
    return list(ordered)
# ═══════════════════════════════════════════════════════════
# Diff 快照收集
# ═══════════════════════════════════════════════════════════
def collect_diffs(agent_dir: str, hex_dir: str, execution: dict) -> dict[str, dict]:
    """Intentional no-op placeholder: always returns {}.

    Kiro inlines originalContent/modifiedContent directly in write/create
    action inputs, which is more reliable than reading the diff-snapshot
    store (74a08cf8.../commitId/), so diffs are taken from actions instead.
    """
    return {}
# ═══════════════════════════════════════════════════════════
# Prompt Log 匹配
# ═══════════════════════════════════════════════════════════
PROMPT_LOG_DIR = os.path.join("docs", "audit", "prompt_logs")
def find_matching_prompt_log(start_time_ms: int) -> Optional[str]:
    """Match an execution startTime to the nearest prompt_log file.

    File names look like prompt_log_YYYYMMDD_HHMMSS.md; the closest file
    within a ±5-minute window wins.  None when no log qualifies.
    """
    if not os.path.isdir(PROMPT_LOG_DIR):
        return None
    try:
        exec_dt = datetime.fromtimestamp(start_time_ms / 1000, tz=CST)
    except Exception:
        return None
    name_re = re.compile(r"prompt_log_(\d{8}_\d{6})\.md$")
    best_path = None
    best_delta = float("inf")
    for f in os.scandir(PROMPT_LOG_DIR):
        if not f.is_file():
            continue
        m = name_re.match(f.name)
        if not m:
            continue
        try:
            log_dt = datetime.strptime(m.group(1), "%Y%m%d_%H%M%S").replace(tzinfo=CST)
        except Exception:
            continue
        delta = abs((exec_dt - log_dt).total_seconds())
        # Keep the closest log inside the 5-minute window.
        if delta < 300 and delta < best_delta:
            best_delta = delta
            best_path = f.path
    return best_path
# ═══════════════════════════════════════════════════════════
# Markdown 生成
# ═══════════════════════════════════════════════════════════
def _msg_semantic_label(msg: dict) -> str:
"""为对话消息生成语义标签,用于快速定位。"""
entries = msg.get("entries", [])
if not entries:
return ""
parts = []
for e in entries:
et = e["type"]
if et == "system_prompt_ref":
parts.append("系统提示词")
elif et == "steering_ref":
parts.append(f"Steering({len(e.get('files', []))})")
elif et == "steering_doc":
parts.append(f"Steering:`{e.get('name', '?')}`")
elif et == "toolUse":
name = e.get("name", "?")
# 提取关键参数作为上下文
args = e.get("args", {})
ctx = ""
if name in ("readFile", "readCode", "readMultipleFiles"):
ctx = args.get("path", "") or ",".join(args.get("paths", [])[:2])
elif name in ("fsWrite", "strReplace", "editCode"):
ctx = args.get("path", "")
elif name == "grepSearch":
ctx = args.get("query", "")[:30]
elif name == "invokeSubAgent":
ctx = args.get("name", "")
elif name == "executePwsh":
ctx = (args.get("command", "") or "")[:40]
elif name == "taskStatus":
ctx = args.get("status", "")
if ctx:
parts.append(f"调用 `{name}` → {ctx}")
else:
parts.append(f"调用 `{name}`")
elif et == "toolUseResponse":
name = e.get("name", "?")
ok = "" if e.get("success") else ""
parts.append(f"结果 `{name}` {ok}")
elif et == "document":
parts.append(f"文档:{e.get('doc_type', '?')}")
elif et == "text":
# 文本内容:提取前 50 字符作为预览bot 和 human 都加)
role = msg.get("role", "")
if role in ("bot", "human"):
text = (e.get("text") or "").strip()
if text:
preview = text[:50].replace("\n", " ")
if len(text) > 50:
preview += ""
icon = "💬" if role == "bot" else "📝"
parts.append(f"{icon} `{preview}`")
return ", ".join(parts) if parts else ""
def _step_semantic_label(step: dict) -> str:
"""为 action step 生成带图标的语义标签。"""
at = step.get("actionType", "?")
state = step.get("actionState", "?")
fc = step.get("_file_change")
sub_eid = step.get("subExecutionId")
# 状态图标
if state == "Error":
state_icon = ""
elif state in ("Success", "Accepted"):
state_icon = ""
else:
state_icon = ""
# 类型图标 + 上下文
if at in ("write", "append") and fc:
fname = fc.get("file", "?")
short = fname.rsplit("/", 1)[-1] if "/" in fname else fname
orig = fc.get("original", "")
if orig:
return f"⚡ `{at}` 修改 `{short}` {state_icon}"
else:
return f"⚡ `{at}` 新建 `{short}` {state_icon}"
elif at == "invokeSubAgent":
inp = step.get("input", {})
# Kiro 原始 log 用 subAgentName工具 schema 用 name
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
return f"🔀 `invokeSubAgent` → {agent_name} {state_icon}"
elif at == "subagentResponse":
return f"🔀 `subagentResponse` {state_icon}"
elif at in ("readFiles", "readCode"):
inp = step.get("input", {})
if isinstance(inp, dict):
files = inp.get("files", [])
if files and isinstance(files[0], dict):
paths = [f.get("path", "?") for f in files[:2]]
else:
paths = [str(f) for f in files[:2]]
ctx = ", ".join(paths)
else:
ctx = ""
return f"📖 `{at}` {ctx} {state_icon}" if ctx else f"📖 `{at}` {state_icon}"
elif at == "search":
return f"🔍 `search` {state_icon}"
elif at == "say":
return f"💬 `say` {state_icon}"
elif at == "taskStatus":
return f"📋 `taskStatus` {state_icon}"
elif at == "steering":
return f"📄 `steering` {state_icon}"
elif at == "runCommand":
return f"🖥️ `runCommand` {state_icon}"
elif at == "getDiagnostics":
return f"🩺 `getDiagnostics` {state_icon}"
elif at == "ContextualHookInvoked":
inp = step.get("input", {})
hook_name = inp.get("name", "?") if isinstance(inp, dict) else "?"
return f"🪝 Hook `{hook_name}` {state_icon}"
elif at == "intentClassification":
ir = step.get("intentResult", {})
cls = ir.get("classification", "?") if isinstance(ir, dict) else "?"
return f"🎯 意图: `{cls}` {state_icon}"
elif at == "replace":
inp = step.get("input", {})
path = inp.get("file", inp.get("path", "?")) if isinstance(inp, dict) else "?"
short = path.rsplit("/", 1)[-1] if "/" in str(path) else str(path)
return f"✏️ `replace` `{short}` {state_icon}"
else:
return f"`{at}` [{state}]"
def _build_execution_summary(
log: dict,
conversation: list[dict],
timeline: list[dict],
sub_file_map: Optional[dict[str, str]] = None,
) -> dict:
"""构建结构化执行摘要(零 LLM 成本,纯规则化提取)。
返回 dict 供 md 渲染和索引存储共用。
"""
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
files_modified = []
files_created = []
sub_agents = []
errors = []
for step in timeline:
if step.get("_compressed"):
continue
idx = step.get("index", "?")
at = step.get("actionType", "?")
state = step.get("actionState", "?")
fc = step.get("_file_change")
if fc:
fname = fc.get("file", "?")
if fc.get("original"):
files_modified.append(fname)
else:
files_created.append(fname)
if at == "invokeSubAgent":
inp = step.get("input", {})
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
sub_agents.append(agent_name)
if state == "Error":
errors.append(f"Step {idx}: `{at}`")
for msg in conversation:
for e in msg.get("entries", []):
if e.get("type") == "toolUseResponse" and not e.get("success"):
errors.append(f"Msg {msg['index']}: `{e.get('name', '?')}`")
# 去重文件名
files_modified = list(dict.fromkeys(files_modified))
files_created = list(dict.fromkeys(files_created))
# description 由外部 LLM 生成(百炼 API提取阶段不生成
description = ""
return {
"workflow": log.get("workflowType", "?"),
"status": log.get("status", "?"),
"duration_s": round(dur, 1),
"msg_count": len(conversation),
"action_count": len(timeline),
"files_modified": files_modified,
"files_created": files_created,
"sub_agents": sub_agents,
"errors": errors,
"description": description,
}
def _render_summary_md(summary: dict, sub_file_map: Optional[dict[str, str]] = None) -> str:
"""将结构化摘要渲染为 Markdown 文本(放在文件最前面)。"""
L = []
# 一句话概览
status_icon = "" if summary["status"] == "succeed" else ""
L.append(f"{status_icon} `{summary['workflow']}` | {summary['duration_s']}s | "
f"{summary['msg_count']} msgs | {summary['action_count']} actions")
L.append("")
desc = summary.get("description", "")
if desc:
L.append(f"> {desc}")
L.append("")
fm = summary["files_modified"]
fc = summary["files_created"]
if fm or fc:
L.append(f"**文件变更** (修改 {len(fm)} / 新建 {len(fc)})")
for f in fm:
L.append(f"- ⚡ 修改 `{f}`")
for f in fc:
L.append(f"- ✨ 新建 `{f}`")
L.append("")
sa = summary["sub_agents"]
if sa:
L.append(f"**子代理** ({len(sa)}): {', '.join(f'`{a}`' for a in sa)}")
L.append("")
errs = summary["errors"]
if errs:
L.append(f"**错误** ({len(errs)})")
for e in errs:
L.append(f"- ❌ {e}")
L.append("")
if not fm and not fc and not sa and not errs:
L.append("*(无文件变更、子代理调用或错误)*")
return "\n".join(L)
def _build_nav_summary(
conversation: list[dict],
timeline: list[dict],
sub_file_map: Optional[dict[str, str]] = None,
) -> str:
"""生成快速导航摘要:文件变更、子代理、错误。"""
file_changes = []
sub_agents = []
errors = []
for step in timeline:
if step.get("_compressed"):
continue
idx = step.get("index", "?")
at = step.get("actionType", "?")
state = step.get("actionState", "?")
# 文件变更
fc = step.get("_file_change")
if fc:
fname = fc.get("file", "?")
orig = fc.get("original", "")
action = "修改" if orig else "新建"
file_changes.append(f"- Step {idx}: ⚡ {action} `{fname}`")
# 子代理
if at == "invokeSubAgent":
inp = step.get("input", {})
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
sub_eid = step.get("subExecutionId", "")
sub_path = ""
if sub_file_map and sub_eid and sub_eid in sub_file_map:
sub_path = f" → `{sub_file_map[sub_eid].replace(chr(92), '/')}`"
sub_agents.append(f"- Step {idx}: 🔀 `{agent_name}`{sub_path}")
# 错误
if state == "Error":
errors.append(f"- Step {idx}: ❌ `{at}`")
# 对话中的错误工具结果
for msg in conversation:
for e in msg.get("entries", []):
if e.get("type") == "toolUseResponse" and not e.get("success"):
errors.append(f"- Msg {msg['index']}: ❌ `{e.get('name', '?')}`")
lines = []
if file_changes:
lines.append(f"**文件变更** ({len(file_changes)})")
lines.extend(file_changes)
lines.append("")
if sub_agents:
lines.append(f"**子代理调用** ({len(sub_agents)})")
lines.extend(sub_agents)
lines.append("")
if errors:
lines.append(f"**错误** ({len(errors)})")
lines.extend(errors)
lines.append("")
if not lines:
lines.append("*(无文件变更、子代理调用或错误)*")
return "\n".join(lines)
def generate_full_record(
log: dict,
conversation: list[dict],
timeline: list[dict],
diffs: dict[str, dict],
session_info: Optional[dict] = None,
prompt_log_path: Optional[str] = None,
is_sub: bool = False,
sub_index: int = 0,
sub_file_map: Optional[dict[str, str]] = None,
prev_msg_count: int = 0,
) -> tuple[str, dict]:
"""生成单个 execution 的 Markdown 全量记录。
Args:
log: 原始 execution log JSON
conversation: parse_messages 输出
timeline: parse_actions 输出
diffs: collect_diffs 输出
session_info: 会话配置(仅主 execution 有)
prompt_log_path: 匹配的 prompt_log 文件路径
is_sub: 是否为子代理 execution
sub_index: 子代理序号(从 1 开始)
prev_msg_count: 前一轮 execution 的消息数,用于去重(跳过累积的历史消息)
"""
L = []
exec_id = log.get("executionId", "?")
chat_id = log.get("chatSessionId", "?")
# 构建结构化摘要(供 md 和索引共用)
_summary = _build_execution_summary(log, conversation, timeline, sub_file_map)
# 标题
if is_sub:
L.append(f"# 子代理 Execution #{sub_index}\n")
else:
L.append("# Kiro 会话全量记录\n")
L.append(f"> 生成时间: {datetime.now(CST).strftime('%Y-%m-%d %H:%M:%S')} CST\n")
# ── 0. 执行摘要文件最前面AI 读前 20 行即可掌握全貌)──
L.append("## 📋 执行摘要\n")
L.append(_render_summary_md(_summary, sub_file_map))
L.append("")
# ── 1. 元数据 ──
L.append("## 1. 元数据\n")
L.append("| 字段 | 值 |")
L.append("|------|-----|")
L.append(f"| executionId | `{exec_id}` |")
L.append(f"| chatSessionId | `{chat_id}` |")
L.append(f"| workflowType | `{log.get('workflowType', '?')}` |")
L.append(f"| autonomyMode | `{log.get('autonomyMode', '?')}` |")
L.append(f"| status | `{log.get('status', '?')}` |")
L.append(f"| startTime | `{ts_fmt(log.get('startTime'))}` |")
L.append(f"| endTime | `{ts_fmt(log.get('endTime'))}` |")
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
L.append(f"| duration | `{dur:.1f}s` |")
L.append(f"| contextUsage | `{log.get('contextUsagePercentage', 0):.2f}%` |")
L.append("")
if session_info and not is_sub:
L.append(f"- 会话标题: `{session_info.get('title', '?')}`")
L.append(f"- 创建时间: `{ts_fmt(int(session_info.get('dateCreated', 0)))}`")
L.append(f"- 工作区: `{session_info.get('workspaceDirectory', '?')}`")
L.append("")
if prompt_log_path and not is_sub:
rel = os.path.relpath(prompt_log_path).replace("\\", "/")
L.append(f"- 关联 prompt_log: `{rel}`")
L.append("")
# ── 2. 用户输入 ──
L.append("## 2. 用户输入\n")
input_text = ""
for msg in log.get("input", {}).get("data", {}).get("messages", []):
for entry in msg.get("content", msg.get("entries", [])):
if isinstance(entry, dict) and entry.get("text"):
input_text += entry["text"] + "\n"
if input_text.strip():
L.append(fence(input_text.strip()) + "\n")
else:
L.append("*(无用户输入)*\n")
# ── 3. 对话记录 ──
L.append("## 3. 对话记录\n")
# 去重:同一 chatSession 的非首轮 executioncontext.messages 包含前几轮的累积历史
# prev_msg_count > 0 时跳过前 N 条,只渲染本轮新增的消息
new_msgs = conversation[prev_msg_count:] if prev_msg_count > 0 else conversation
h = sum(1 for m in new_msgs if m["role"] == "human")
b = sum(1 for m in new_msgs if m["role"] == "bot")
t = sum(1 for m in new_msgs if m["role"] == "tool")
if prev_msg_count > 0:
L.append(f"{len(new_msgs)} 条新增消息 (跳过前 {prev_msg_count} 条历史): human={h}, bot={b}, tool={t}\n")
else:
L.append(f"{len(new_msgs)} 条消息: human={h}, bot={b}, tool={t}\n")
for msg in new_msgs:
emoji = {"human": "👤", "bot": "🤖", "tool": "🔧"}.get(msg["role"], "")
# 生成语义标签
msg_label = _msg_semantic_label(msg)
label_suffix = f"{msg_label}" if msg_label else ""
# P0: 压缩 hook 输出的空消息特征HUMAN 消息只含 "Output:\nCommand executed..." 或 "Output:\nCommand timed out..."
if msg["role"] == "human" and len(msg["entries"]) == 1:
e0 = msg["entries"][0]
if e0["type"] == "text":
_txt = (e0.get("text") or "").strip()
if _txt.startswith("Output:") and ("Exit Code:" in _txt) and len(_txt) < 200:
# 提取 exit code
import re as _re
_ec_match = _re.search(r"Exit Code:\s*(-?\d+)", _txt)
_ec = _ec_match.group(1) if _ec_match else "?"
L.append(f"### Msg {msg['index']}: 👤 HUMAN — 🪝 Hook 输出 (exit={_ec})\n")
continue
L.append(f"### Msg {msg['index']}: {emoji} {msg['role'].upper()}{label_suffix}\n")
for entry in msg["entries"]:
et = entry["type"]
if et == "system_prompt_ref":
ref = entry["ref_file"]
chars = entry["char_count"]
sp_path = f"docs/audit/session_logs/_system_prompts/{ref}"
L.append(f"**[系统提示词]** → `{sp_path}` ({chars} 字符)\n")
elif et == "steering_ref":
files = ", ".join(entry["files"])
chars = entry["char_count"]
L.append(f"**[Steering]** 引用: {files} ({chars} 字符)\n")
elif et == "text":
text = entry.get("text", "")
if not text:
L.append("*(空)*\n")
else:
L.append(fence(text) + "\n")
elif et == "toolUse":
name = entry.get("name", "?")
args = entry.get("args", {})
L.append(f"**[🔧 调用]** `{name}`\n")
# P1: strReplace/editCode 的代码变更用 diff 格式展示
if name in ("strReplace", "editCode") and isinstance(args, dict):
_path = args.get("path", "?")
_lang = "python" if _path.endswith(".py") else "sql" if _path.endswith(".sql") else ""
L.append(f"- 文件: `{_path}`\n")
_old = args.get("oldStr", args.get("old_str", ""))
_new = args.get("newStr", args.get("new_str", ""))
_sel = args.get("selector", "")
_op = args.get("operation", "")
_repl = args.get("replacement", "")
if _sel:
L.append(f"- selector: `{_sel}`" + (f" ({_op})" if _op else ""))
if _old:
L.append(f"- 删除:\n" + fence(trunc(_old, 2000), _lang))
if _new:
L.append(f"- 插入:\n" + fence(trunc(_new, 2000), _lang))
if _repl:
L.append(f"- 替换为:\n" + fence(trunc(_repl, 2000), _lang))
L.append("")
else:
L.append(fence(safe_json(args, 5000), "json") + "\n")
elif et == "toolUseResponse":
ok = "" if entry.get("success") else ""
L.append(f"**[📋 结果]** `{entry.get('name', '?')}` {ok}\n")
msg_text = entry.get("message", "")
if msg_text:
L.append(fence(trunc(msg_text, 5000)) + "\n")
elif et == "document":
target = entry.get("target", "")
L.append(f"**[📄 文档]** type=`{entry.get('doc_type')}` target=`{target}`\n")
elif et == "steering_doc":
L.append(f"**[📄 Steering]** `{entry.get('name', 'steering')}`\n")
else:
L.append(f"**[{et}]** keys={entry.get('raw_keys')}\n")
# ── 4. Actions 时间线 ──
L.append("## 4. Actions 时间线\n")
L.append(f"{len(timeline)}\n")
for step in timeline:
if step.get("_compressed"):
L.append(f"- `model` [{step.get('actionState')}] @ {step.get('emittedAt')}\n")
continue
at = step.get('actionType', '?')
state = step.get('actionState', '?')
# 生成语义标签
step_label = _step_semantic_label(step)
L.append(f"### Step {step['index']}: {step_label} @ {step.get('emittedAt','?')}\n")
if step.get("subExecutionId"):
sub_eid = step["subExecutionId"]
L.append(f"- subExecutionId: `{sub_eid}`")
# 标注子代理文件路径(如果有映射)
if sub_file_map and sub_eid in sub_file_map:
sub_path = sub_file_map[sub_eid].replace("\\", "/")
L.append(f"- 子代理记录: `{sub_path}`")
if step.get("endTime"):
L.append(f"- endTime: {step['endTime']}")
# 文件变更展示
if step.get("_file_change"):
fc = step["_file_change"]
fname = fc.get("file", "?")
orig = fc.get("original", "")
mod = fc.get("modified", "")
lang = "python" if fname.endswith(".py") else "sql" if fname.endswith(".sql") else ""
L.append(f"- 文件变更: `{fname}`")
if orig and mod:
L.append(f" - 修改前 ({len(orig)} 字符):\n" + fence(trunc(orig, 3000), lang))
L.append(f" - 修改后 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
elif mod:
L.append(f" - 新建 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
# 特殊处理各种 action type 的内容展示
_at = step.get("actionType", "")
if _at == "say":
_say_msg = (step.get("output") or {}).get("message", "")
if _say_msg:
L.append(f"- 💬 AI 回复:\n\n{_say_msg}\n")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "invokeSubAgent":
_sub_input = step.get("input") or {}
_sub_prompt = _sub_input.get("prompt", "")
_sub_name = _sub_input.get("name", "") or _sub_input.get("subAgentName", "") or "?"
if _sub_prompt:
L.append(f"- 📤 委托 `{_sub_name}`:\n\n{trunc(_sub_prompt, 3000)}\n")
_sub_output = step.get("output") or {}
_sub_resp = _sub_output.get("response", "")
if _sub_resp:
L.append(f"- 📥 子代理输出:\n\n{_sub_resp}\n")
elif not _sub_prompt:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "subagentResponse":
_sr_input = step.get("input") or {}
_sr_resp = _sr_input.get("response", "")
if _sr_resp:
L.append(f"- 📥 子代理返回:\n\n{_sr_resp}\n")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "ContextualHookInvoked":
# P1: hook 执行——提取名称、命令、exitCode
_hi = step.get("input") or {}
_ho = step.get("output") or {}
_h_name = _hi.get("name", "?")
_h_cmd = _ho.get("command", "")
_h_result = _ho.get("result", {})
_h_exit = _h_result.get("exitCode", "?") if isinstance(_h_result, dict) else "?"
_h_out = _h_result.get("output", "") if isinstance(_h_result, dict) else ""
if _h_cmd:
L.append(f"- `$ {_h_cmd}`")
L.append(f"- Exit: `{_h_exit}`")
if _h_out and _h_out != "Command executed successfully with no output.":
L.append(f"- 输出:\n" + fence(trunc(_h_out, 2000)))
elif _at == "intentClassification":
# P1: 意图分类——压缩为一行
_ir = step.get("intentResult", {})
_cls = _ir.get("classification", "?") if isinstance(_ir, dict) else "?"
L.append(f"- 分类结果: `{_cls}`")
elif _at == "runCommand":
# P0: 命令执行——提取命令、exitCode、输出
_rc_in = step.get("input") or {}
_rc_out = step.get("output") or {}
_rc_cmd = _rc_in.get("command", "") if isinstance(_rc_in, dict) else ""
_rc_result = _rc_out.get("result", {}) if isinstance(_rc_out, dict) else {}
_rc_exit = _rc_result.get("exitCode", "?") if isinstance(_rc_result, dict) else "?"
_rc_output = _rc_result.get("output", "") if isinstance(_rc_result, dict) else ""
if _rc_cmd:
L.append(f"- `$ {_rc_cmd}`")
L.append(f"- Exit: `{_rc_exit}`")
if _rc_output:
L.append(f"- 输出:\n" + fence(trunc(_rc_output, 3000)))
elif _at == "search":
# P2: 搜索——提取 query 和 why
_s_in = step.get("input") or {}
_s_query = _s_in.get("query", "") if isinstance(_s_in, dict) else ""
_s_why = _s_in.get("why", "") if isinstance(_s_in, dict) else ""
if _s_query:
L.append(f"- 🔍 `{trunc(_s_query, 100)}`")
if _s_why:
L.append(f"- 原因: {trunc(_s_why, 200)}")
# 展示搜索结果(如果有)
_s_out = step.get("output")
if _s_out and isinstance(_s_out, dict):
_s_files = _s_out.get("files", [])
if _s_files:
L.append(f"- 结果: {len(_s_files)} 个文件")
elif _at == "steering":
# P2: steering——提取文件名列表
_st_in = step.get("input") or {}
_st_docs = _st_in.get("documents", []) if isinstance(_st_in, dict) else []
if _st_docs:
import urllib.parse
names = []
for d in _st_docs[:10]:
if isinstance(d, str):
# URL 编码的路径,提取文件名
decoded = urllib.parse.unquote(d)
name = decoded.rsplit("/", 1)[-1] if "/" in decoded else decoded
names.append(name)
if names:
L.append(f"- 文件: {', '.join(names)}")
else:
for k in ("input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 3000), "json"))
elif _at == "getDiagnostics":
# P2: 诊断——提取路径和问题数
_gd_in = step.get("input") or {}
_gd_out = step.get("output") or {}
_gd_paths = _gd_in.get("paths", []) if isinstance(_gd_in, dict) else []
if _gd_paths:
L.append(f"- 文件: {', '.join(str(p) for p in _gd_paths[:5])}")
if isinstance(_gd_out, dict):
_gd_diags = _gd_out.get("diagnostics", [])
if isinstance(_gd_diags, list):
L.append(f"- 问题数: {len(_gd_diags)}")
for d in _gd_diags[:5]:
if isinstance(d, dict):
L.append(f" - {d.get('severity', '?')}: {d.get('message', '?')[:100]}")
elif not _gd_diags:
L.append("- ✅ 无问题")
elif _at in ("readFiles", "readCode"):
# P3: 文件读取——只展示路径,不展示内容
_rf_in = step.get("input") or {}
if isinstance(_rf_in, dict):
_rf_files = _rf_in.get("files", [])
paths = []
for f in _rf_files[:5]:
if isinstance(f, dict):
paths.append(f.get("path", "?"))
else:
paths.append(str(f))
if paths:
L.append(f"- 文件: {', '.join(paths)}")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
L.append("")
# ── 5. 资源消耗 ──
usage = log.get("usageSummary", [])
if usage:
L.append("## 5. 资源消耗\n")
L.append("| 工具 | 消耗 | 单位 |")
L.append("|------|------|------|")
total = 0
for u in usage:
tools = ", ".join(u.get("usedTools", ["-"]))
amt = u.get("usage", 0)
total += amt
L.append(f"| {tools} | {amt} | {u.get('unit', '?')} |")
L.append(f"| **合计** | **{total:.4f}** | |")
L.append("")
return "\n".join(L), _summary
# ═══════════════════════════════════════════════════════════
# 索引管理
# ═══════════════════════════════════════════════════════════
def load_index() -> dict:
    """Load the trimmed session index (main conversations only).

    Returns a fresh, empty v2 index structure when the file is missing
    or cannot be parsed.
    """
    if not os.path.isfile(INDEX_PATH):
        return {"version": 2, "entries": {}}
    try:
        with open(INDEX_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return {"version": 2, "entries": {}}
def load_full_index() -> dict:
    """Load the full session index (main conversations + sub-agents).

    Returns a fresh, empty v2 index structure when the file is missing
    or cannot be parsed.
    """
    if not os.path.isfile(INDEX_FULL_PATH):
        return {"version": 2, "entries": {}}
    try:
        with open(INDEX_FULL_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return {"version": 2, "entries": {}}
def save_index(index: dict):
    """Persist the trimmed index to disk, then refresh the per-day indexes."""
    os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
    payload = json.dumps(index, ensure_ascii=False, indent=2)
    with open(INDEX_PATH, "w", encoding="utf-8") as fh:
        fh.write(payload)
    _save_day_indexes(index, "_day_index.json")
def save_full_index(index: dict):
    """Persist the full index to disk, then refresh the per-day full indexes."""
    os.makedirs(os.path.dirname(INDEX_FULL_PATH), exist_ok=True)
    payload = json.dumps(index, ensure_ascii=False, indent=2)
    with open(INDEX_FULL_PATH, "w", encoding="utf-8") as fh:
        fh.write(payload)
    _save_day_indexes(index, "_day_index_full.json")
def _save_day_indexes(index: dict, filename: str):
    """Split the root-level index by {ym}/{dd} from output_dir and write one
    index file per day directory.

    Path layout: docs/audit/session_logs/{ym}/{dd}/{filename}
    Each day-level index contains only the entries whose output_dir lives
    under that day's directory.
    """
    entries = index.get("entries", {})
    if not entries:
        return
    prefix = SESSION_LOG_DIR.replace("\\", "/")
    # Bucket entries by their "{ym}/{dd}" day key.
    buckets: dict[str, dict[str, dict]] = {}
    for exec_id, entry in entries.items():
        out_dir = entry.get("output_dir", "").replace("\\", "/")
        if not out_dir.startswith(prefix):
            continue
        # out_dir looks like docs/audit/session_logs/2026-03/03/01_abc12345_013337;
        # its day dir is docs/audit/session_logs/2026-03/03.
        segments = out_dir[len(prefix):].lstrip("/").split("/")
        if len(segments) < 2:
            continue
        day_key = "/".join(segments[:2])  # e.g. "2026-03/03"
        buckets.setdefault(day_key, {})[exec_id] = entry
    for day_key, day_entries in buckets.items():
        day_dir = os.path.join(SESSION_LOG_DIR, *day_key.split("/"))
        os.makedirs(day_dir, exist_ok=True)
        day_payload = {"version": 2, "entries": day_entries}
        with open(os.path.join(day_dir, filename), "w", encoding="utf-8") as fh:
            json.dump(day_payload, fh, ensure_ascii=False, indent=2)
def update_index(index: dict, exec_id: str, output_dir: str, meta: dict,
                 summary: Optional[dict] = None, is_sub: bool = False,
                 parent_exec_id: str = ""):
    """Record one index entry, with a structured summary for cheap AI lookup.

    When is_sub=True the entry is flagged as a sub-agent and, if provided,
    parent_exec_id is recorded alongside it.
    """
    record = {
        "output_dir": output_dir.replace("\\", "/"),
        "chatSessionId": meta.get("chatSessionId", ""),
        "startTime": meta.get("startTime", ""),
        "endTime": meta.get("endTime", ""),
        "status": meta.get("status", ""),
        "workflowType": meta.get("workflowType", ""),
        "indexed_at": datetime.now(CST).isoformat(),
    }
    if is_sub:
        record["is_sub"] = True
        if parent_exec_id:
            record["parent_exec_id"] = parent_exec_id
    if summary:
        record["summary"] = {
            "duration_s": summary.get("duration_s", 0),
            "msg_count": summary.get("msg_count", 0),
            "action_count": summary.get("action_count", 0),
            "files_modified": summary.get("files_modified", []),
            "files_created": summary.get("files_created", []),
            "sub_agents": summary.get("sub_agents", []),
            "errors": summary.get("errors", []),
        }
        description = summary.get("description")
        if description:
            record["description"] = description
    index["entries"][exec_id] = record
# ═══════════════════════════════════════════════════════════
# 主提取逻辑
# ═══════════════════════════════════════════════════════════
def extract_single_execution(
    agent_dir: str,
    hex_dir: str,
    execution: dict,
    session_dir: Optional[str],
    index: dict,
    full_index: Optional[dict] = None,
    sub_mode: bool = False,
    sub_index: int = 0,
    output_base_dir: Optional[str] = None,
    parent_exec_id: str = "",
) -> Optional[str]:
    """Extract a single execution and write its Markdown record to disk.

    Args:
        agent_dir: the kiro.kiroagent directory.
        hex_dir: the 32-char hex directory this execution lives in.
        execution: the execution entry from the manifest.
        session_dir: workspace-sessions subdirectory (used to load session info).
        index: trimmed index dict (main conversations only).
        full_index: full index dict (main + sub-agents); None disables writing it.
        sub_mode: True when extracting a sub-agent execution.
        sub_index: ordinal number of the sub-agent within its parent.
        output_base_dir: output directory in sub-agent mode (same directory
            as the parent execution).
        parent_exec_id: parent execution ID, for sub-agent entries.

    Returns:
        The output directory path, or None (on failure or if already indexed).
    """
    exec_id = execution.get("executionId", "")
    chat_id = execution.get("chatSessionId", "")
    # Skip entries already indexed whose output still exists (sub-agents are
    # not checked: they always follow their parent execution).
    if not sub_mode and exec_id in index.get("entries", {}):
        existing_dir = index["entries"][exec_id].get("output_dir", "")
        if existing_dir and os.path.isdir(existing_dir):
            return None
        # Output was cleaned up: drop the stale index entry and re-extract.
        del index["entries"][exec_id]
    # Load the execution log.
    log_path = find_execution_log(agent_dir, hex_dir, execution)
    if not log_path:
        return None
    try:
        with open(log_path, "r", encoding="utf-8") as f:
            log = json.load(f)
    except Exception:
        return None
    # Backfill chatSessionId from the full log (newer manifest entries may omit it).
    if not chat_id:
        chat_id = log.get("chatSessionId", "")
    # Parse messages, actions and collected diffs.
    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)
    diffs = collect_diffs(agent_dir, hex_dir, execution)
    # Session info (main execution only).
    session_info = None
    if not sub_mode and session_dir and chat_id:
        session_info = load_session_detail(session_dir, chat_id)
    # prompt_log matching (main execution only).
    prompt_log = None
    if not sub_mode:
        start_time = log.get("startTime", 0)
        prompt_log = find_matching_prompt_log(start_time)
    # Execution start time (used as the time suffix in directory/file names).
    _start_ms = log.get("startTime") or execution.get("startTime", 0)
    _ym, _dd, _hms = ts_date_parts(_start_ms)
    # Resolve the output directory.
    if sub_mode and output_base_dir:
        out_dir = output_base_dir
    else:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        out_dir = _resolve_chat_dir(day_dir, chat_short, _hms)
    os.makedirs(out_dir, exist_ok=True)
    # Cross-day pointer: when the execution's date differs from the date of
    # the chatSession directory, drop a _ref_{chatShort}.md into the
    # execution-date day directory.
    if not sub_mode:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        # out_dir's parent is the day dir of the chatSession's first round.
        chat_day_dir = os.path.dirname(out_dir)
        exec_day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        if os.path.normpath(chat_day_dir) != os.path.normpath(exec_day_dir):
            _write_cross_day_ref(exec_day_dir, chat_short, out_dir)
    # Recursively extract sub-agents first, collecting sub_file_map
    # (subExecutionId -> file path).
    sub_file_map: dict[str, str] = {}
    if not sub_mode:
        sub_exec_ids = extract_sub_execution_ids(actions)
        for si, sub_eid in enumerate(sub_exec_ids, 1):
            sub_execs = find_all_executions(agent_dir, execution_id=sub_eid)
            if sub_execs:
                sub_exec = sub_execs[0]
                extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=sub_exec["_hex_dir"],
                    execution=sub_exec,
                    session_dir=session_dir,
                    index=index,
                    full_index=full_index,
                    sub_mode=True,
                    sub_index=si,
                    output_base_dir=out_dir,
                    parent_exec_id=exec_id,
                )
                sub_filename = f"sub_{si:02d}_{sub_eid[:8]}.md"
                sub_file_map[sub_eid] = os.path.join(out_dir, sub_filename)
    # Compute prev_msg_count: the message total of the previous execution
    # round within the same chatSession. Used for dedup — skips the
    # accumulated history inside context.messages.
    # Strategy: parse msg counts from the summaries of the main_*.md files
    # already present in the same directory (the index is not used because
    # during parallel extraction it may not be merged yet).
    prev_msg_count = 0
    existing_mains = []
    cur_msg_count = len(conversation)
    if not sub_mode and out_dir and os.path.isdir(out_dir):
        existing_mains = sorted(
            f for f in os.listdir(out_dir)
            if f.startswith("main_") and f.endswith(".md")
        )
        if existing_mains:
            # Parse the msg count from each existing main file's summary line.
            # Summary format: ✅ `chat-agent` | 1406.6s | 44 msgs | 266 actions
            # Also check the conversation section's "skip first N history" line
            # to recover the cumulative message count:
            # cumulative = that round's prev_msg_count + that round's new msgs.
            import re as _re_prev
            for _mf in existing_mains:
                try:
                    _mf_path = os.path.join(out_dir, _mf)
                    _mf_msgs = 0
                    _mf_skipped = 0
                    with open(_mf_path, "r", encoding="utf-8") as _fh:
                        for _ln_idx, _ln in enumerate(_fh):
                            if _ln_idx > 200:
                                break
                            # Summary line: e.g. "44 msgs"
                            _mc_match = _re_prev.search(r'\|\s*(\d+)\s*msgs\s*\|', _ln)
                            if _mc_match:
                                _mf_msgs = int(_mc_match.group(1))
                            # Conversation line (later rounds): new msgs + skipped history
                            _skip_match = _re_prev.search(r'\s*(\d+)\s*条新增消息\s*\(跳过前\s*(\d+)\s*条', _ln)
                            if _skip_match:
                                _mf_msgs = int(_skip_match.group(1))
                                _mf_skipped = int(_skip_match.group(2))
                                break
                            # Conversation line (first round): total message count
                            _full_match = _re_prev.search(r'\s*(\d+)\s*条消息:', _ln)
                            if _full_match and _mf_skipped == 0:
                                _mf_msgs = int(_full_match.group(1))
                    # Cumulative count = skipped history + that round's new msgs.
                    _cumulative = _mf_skipped + _mf_msgs
                    if _cumulative > prev_msg_count and _cumulative < cur_msg_count:
                        prev_msg_count = _cumulative
                except Exception:
                    pass
    # Generate Markdown + structured summary (main executions get sub_file_map).
    md, summary = generate_full_record(
        log=log,
        conversation=conversation,
        timeline=timeline,
        diffs=diffs,
        session_info=session_info,
        prompt_log_path=prompt_log,
        is_sub=sub_mode,
        sub_index=sub_index,
        sub_file_map=sub_file_map if not sub_mode else None,
        prev_msg_count=prev_msg_count,
    )
    # Write the output file.
    if sub_mode:
        filename = f"sub_{sub_index:02d}_{exec_id[:8]}.md"
    else:
        # Auto-numbering: reuse existing_mains scanned above (prev_msg_count
        # branch); rescan if that branch was not taken.
        if not existing_mains and os.path.isdir(out_dir):
            existing_mains = sorted(
                f for f in os.listdir(out_dir)
                if f.startswith("main_") and f.endswith(".md")
            )
        main_idx = len(existing_mains) + 1
        filename = f"main_{main_idx:02d}_{exec_id[:8]}.md"
    filepath = os.path.join(out_dir, filename)
    # Surrogate chars (e.g. \udccb) cannot be utf-8 encoded; replace with U+FFFD.
    md_safe = md.encode("utf-8", errors="surrogateescape").decode("utf-8", errors="replace")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(md_safe)
    # Update the indexes.
    _meta = {
        "chatSessionId": chat_id,
        "startTime": ts_fmt(log.get("startTime")),
        "endTime": ts_fmt(log.get("endTime")),
        "status": log.get("status", ""),
        "workflowType": log.get("workflowType", ""),
    }
    if not sub_mode:
        # Main conversation: write to both the trimmed and the full index.
        update_index(index, exec_id, out_dir, _meta, summary=summary)
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta, summary=summary)
    else:
        # Sub-agent: write to the full index only.
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta,
                         summary=summary, is_sub=True,
                         parent_exec_id=parent_exec_id)
    return out_dir
# ═══════════════════════════════════════════════════════════
# 入口函数
# ═══════════════════════════════════════════════════════════
def extract_latest(global_storage: Optional[str] = None, workspace_path: Optional[str] = None):
    """Extract every un-indexed terminal execution (called by agent_on_stop).

    Executions of the same chatSession are processed in ascending startTime
    order so prev_msg_count deduplication is computed correctly. The round
    in progress is usually still "running" when the on-stop hook fires; it
    is picked up by the next invocation.
    """
    storage_root = global_storage or DEFAULT_GLOBAL_STORAGE
    workspace = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(storage_root)
    if not agent_dir:
        return
    session_dir = find_workspace_session_dir(agent_dir, workspace)
    chat_ids = None
    if session_dir:
        chat_ids = {
            s.get("chatSessionId") or s.get("sessionId")
            for s in load_sessions_json(session_dir)
            if s.get("chatSessionId") or s.get("sessionId")
        }
    terminal_statuses = ("succeed", "failed", "stopped", "aborted")
    executions = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not executions:
        return
    index = load_index()
    full_index = load_full_index()
    # Keep only un-indexed executions that reached a terminal state.
    indexed = index.get("entries", {})
    pending = [
        e for e in executions
        if e.get("executionId", "") not in indexed
        and e.get("status", "") in terminal_statuses
    ]
    if not pending:
        return
    # Bucket by chatSessionId; each bucket is processed oldest-first so the
    # cumulative message history dedups correctly (prev_msg_count).
    buckets: dict[str, list[dict]] = {}
    for e in pending:
        buckets.setdefault(e.get("chatSessionId", "") or "unknown", []).append(e)
    extracted_count = 0
    for bucket in buckets.values():
        bucket.sort(key=lambda x: x.get("startTime", 0))
        for execution in bucket:
            out_dir = extract_single_execution(
                agent_dir=agent_dir,
                hex_dir=execution["_hex_dir"],
                execution=execution,
                session_dir=session_dir,
                index=index,
                full_index=full_index,
            )
            if out_dir:
                extracted_count += 1
                print(f"[session-extract] extracted: {out_dir}")
    if extracted_count > 0:
        save_index(index)
        save_full_index(full_index)
    if extracted_count > 1:
        print(f"[session-extract] total: {extracted_count} executions")
def extract_all_unindexed(
    global_storage: Optional[str] = None,
    workspace_path: Optional[str] = None,
    limit: Optional[int] = None,
    workers: int = 8,
):
    """Extract every un-indexed execution (multi-threaded).

    Work is grouped by chatSessionId: groups run in parallel across a thread
    pool, while executions inside a group run serially in startTime order
    (their context.messages are cumulative, so in-order extraction is needed
    for deduplication).
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        # Accept both field names: chatSessionId (old) and sessionId (new).
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}
    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[session-extract] no executions found")
        return
    index = load_index()
    full_index = load_full_index()
    # Keep only un-indexed executions in a terminal state (skip "running" etc.).
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
    todo = [e for e in all_execs
            if e.get("executionId", "") not in index.get("entries", {})
            and e.get("status", "") in TERMINAL_STATUSES]
    if limit:
        todo = todo[:limit]
    if not todo:
        print("[session-extract] all indexed, nothing to do")
        return
    print(f"[session-extract] {len(todo)} executions to extract (workers={workers})")
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed
    # Group by chatSessionId; each group is extracted serially in startTime
    # order (the same chatSession's context.messages accumulate, so ordered
    # extraction is required for dedup).
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in todo:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
    lock = threading.Lock()
    count = 0
    def _extract_group(group_execs):
        """Serially extract every execution of one chatSession."""
        local_index = {"version": 2, "entries": {}}
        local_full = {"version": 2, "entries": {}}
        results = []
        for execution in group_execs:
            try:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=execution["_hex_dir"],
                    execution=execution,
                    session_dir=session_dir,
                    index=local_index,
                    full_index=local_full,
                )
                if result:
                    results.append(result)
            except Exception as e:
                eid = execution.get("executionId", "?")[:8]
                print(f"[session-extract] ✗ {eid}: {e}")
        return results, local_index.get("entries", {}), local_full.get("entries", {})
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_extract_group, execs): cid
                   for cid, execs in chat_groups.items()}
        for future in as_completed(futures):
            results, idx_entries, full_entries = future.result()
            if results:
                with lock:
                    count += len(results)
                    index["entries"].update(idx_entries)
                    full_index["entries"].update(full_entries)
                    if count % 50 == 0:
                        save_index(index)
                        save_full_index(full_index)
                        print(f"[session-extract] [{count}/{len(todo)}] checkpoint saved")
                    elif count % 10 == 0:
                        print(f"[session-extract] [{count}/{len(todo)}]")
    # Final save.
    save_index(index)
    save_full_index(full_index)
    print(f"[session-extract] done, extracted {count}/{len(todo)}")
def extract_by_id(
    execution_id: str,
    global_storage: Optional[str] = None,
):
    """Extract the execution matching *execution_id* (prefix match allowed)."""
    agent_dir = find_kiro_agent_dir(global_storage or DEFAULT_GLOBAL_STORAGE)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    candidates = find_all_executions(agent_dir, execution_id=execution_id)
    target = candidates[0] if candidates else None
    # Confirm the lookup really matched the requested ID (prefix match).
    if target is None or not target.get("executionId", "").startswith(execution_id):
        print(f"[session-extract] execution not found: {execution_id}")
        return
    index = load_index()
    full_index = load_full_index()
    out_dir = extract_single_execution(
        agent_dir=agent_dir,
        hex_dir=target["_hex_dir"],
        execution=target,
        session_dir=None,
        index=index,
        full_index=full_index,
    )
    if out_dir:
        save_index(index)
        save_full_index(full_index)
        print(f"[session-extract] extracted: {out_dir}")
# ═══════════════════════════════════════════════════════════
# CLI 入口
# ═══════════════════════════════════════════════════════════
def main():
    """CLI entry point: parse arguments and dispatch to an extraction mode."""
    import argparse
    parser = argparse.ArgumentParser(description="Kiro 执行日志全量提取器 v2")
    parser.add_argument("--all", action="store_true", help="提取所有未索引的 execution")
    parser.add_argument("--recent", type=int, metavar="N", help="提取最近 N 条未索引的")
    parser.add_argument("--workers", type=int, default=8, help="并行线程数(默认 8")
    parser.add_argument("--execution-id", type=str, help="提取指定 executionId")
    parser.add_argument("--global-storage", type=str, help="globalStorage 路径")
    parser.add_argument("--workspace", type=str, help="workspace 路径")
    opts = parser.parse_args()
    storage = opts.global_storage
    workspace = opts.workspace
    if opts.execution_id:
        extract_by_id(opts.execution_id, global_storage=storage)
    elif opts.all:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              workers=opts.workers)
    elif opts.recent:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              limit=opts.recent, workers=opts.workers)
    else:
        extract_latest(global_storage=storage, workspace_path=workspace)


if __name__ == "__main__":
    main()