Files
Neo-ZQYY/scripts/ops/extract_kiro_session.py

2182 lines
88 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""extract_kiro_session — full extractor for Kiro execution logs, v2.

Improvements over v1:
1. System-prompt dedup: stored once as _system_prompts/sp_{hash8}.md, referenced afterwards.
2. Layered directories: conversations organized under YYYY-MM/DD/{chatSessionId_short}/.
3. Recursive sub-agent extraction: main execution + sub executions share one
   directory, numbered in call order.
4. ID resolution: kiro-diff URIs → real file paths, terminalId → process description.
5. Steering content inside CONTEXT TRANSFER collapsed into folded references.
6. Content-less model actions compressed to a single line.

Usage:
    python scripts/ops/extract_kiro_session.py                    # extract the latest execution
    python scripts/ops/extract_kiro_session.py --all              # extract everything not yet indexed
    python scripts/ops/extract_kiro_session.py --recent 20        # extract the latest N un-indexed
    python scripts/ops/extract_kiro_session.py --execution-id XX  # extract a specific execution
"""
import base64
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone, timedelta
from typing import Optional
from _env_paths import ensure_repo_root
ensure_repo_root()
# China Standard Time (UTC+8) — every timestamp in the generated logs is rendered in this zone.
CST = timezone(timedelta(hours=8))
# Fixed filename Kiro uses for the execution manifest inside each 32-hex directory.
MANIFEST_FILENAME = "f62de366d0006e17ea00a01f6624aabf"
# Output paths (relative to the repo root established by ensure_repo_root()).
SESSION_LOG_DIR = os.path.join("docs", "audit", "session_logs")
INDEX_PATH = os.path.join(SESSION_LOG_DIR, "_session_index.json")  # slim index: main conversations only
INDEX_FULL_PATH = os.path.join(SESSION_LOG_DIR, "_session_index_full.json")  # full index: main conversations + sub-agents
SYSTEM_PROMPTS_DIR = os.path.join(SESSION_LOG_DIR, "_system_prompts")
# Default Kiro globalStorage location (%APPDATA%\Kiro\User\globalStorage on Windows).
DEFAULT_GLOBAL_STORAGE = os.path.join(
    os.environ.get("APPDATA", ""),
    "Kiro", "User", "globalStorage"
)
# ═══════════════════════════════════════════════════════════
# 工具函数
# ═══════════════════════════════════════════════════════════
def ts_fmt(ms) -> str:
    """Render a millisecond epoch timestamp as 'YYYY-MM-DD HH:MM:SS' in CST.

    Falsy input yields "N/A"; anything unparseable falls back to str(ms).
    """
    if not ms:
        return "N/A"
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
    except Exception:
        # Wrong type or out-of-range value — show the raw value instead.
        return str(ms)
    return moment.strftime("%Y-%m-%d %H:%M:%S")
def ts_iso(ms) -> str:
    """Render a millisecond epoch timestamp as an ISO-8601 string in CST ('' on failure)."""
    if not ms:
        return ""
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
    except Exception:
        return ""
    return moment.isoformat()
def ts_date_parts(ms) -> tuple[str, str, str]:
    """Split a ms timestamp into (YYYY-MM, DD, HHMMSS) for directory layout and filenames.

    Any failure yields the placeholder triple ("unknown", "00", "000000").
    """
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
    except Exception:
        return "unknown", "00", "000000"
    return moment.strftime("%Y-%m"), moment.strftime("%d"), moment.strftime("%H%M%S")
def _resolve_chat_dir(day_dir: str, chat_short: str, first_hms: str) -> str:
    """Pick (or create) the per-chatSession output directory with a daily sequence number.

    Lookup order:
    1. An existing directory for this chatSession under day_dir (later rounds
       on the same day).
    2. Any YYYY-MM/DD directory under SESSION_LOG_DIR (the chatSession's first
       round happened on an earlier date).
    3. Otherwise allocate the next sequence number and create
       ``{seq:02d}_{chat_short}_{first_hms}/`` under day_dir.
    """
    os.makedirs(day_dir, exist_ok=True)

    def _match_in(parent: str) -> Optional[str]:
        # First subdirectory of `parent` whose name contains chat_short, if any.
        for name in os.listdir(parent):
            full = os.path.join(parent, name)
            if os.path.isdir(full) and chat_short in name:
                return full
        return None

    # 1. Same-day lookup.
    found = _match_in(day_dir)
    if found:
        return found
    # 2. Cross-day lookup: walk every YYYY-MM/DD directory under the log root.
    if os.path.isdir(SESSION_LOG_DIR):
        for ym in os.listdir(SESSION_LOG_DIR):
            ym_path = os.path.join(SESSION_LOG_DIR, ym)
            if ym.startswith("_") or not os.path.isdir(ym_path):
                continue
            for dd in os.listdir(ym_path):
                dd_path = os.path.join(ym_path, dd)
                if not os.path.isdir(dd_path):
                    continue
                found = _match_in(dd_path)
                if found:
                    return found
    # 3. Brand-new chatSession: take the next two-digit sequence number.
    taken = [
        int(name[:2])
        for name in os.listdir(day_dir)
        if len(name) >= 2 and name[:2].isdigit()
        and os.path.isdir(os.path.join(day_dir, name))
    ]
    fresh = os.path.join(day_dir, f"{max(taken, default=0) + 1:02d}_{chat_short}_{first_hms}")
    os.makedirs(fresh, exist_ok=True)
    return fresh
def _write_cross_day_ref(exec_day_dir: str, chat_short: str, chat_dir: str):
    """Drop a pointer file into the execution's day directory for cross-day chats.

    When a chatSession spans several days, the later day directories hold no
    folder for it; ``_ref_{chatShort}.md`` records where the conversation is
    actually archived. An existing pointer file is left untouched.
    """
    os.makedirs(exec_day_dir, exist_ok=True)
    ref_path = os.path.join(exec_day_dir, f"_ref_{chat_short}.md")
    if os.path.isfile(ref_path):
        return  # written once already — keep it
    rel_target = os.path.relpath(chat_dir, exec_day_dir).replace("\\", "/")
    body = (
        f"# 跨天对话指引\n\n"
        f"chatSession `{chat_short}` 的完整记录归档在:\n\n"
        f"→ `{rel_target}`\n\n"
        f"(绝对路径:`{chat_dir.replace(chr(92), '/')}`\n"
    )
    with open(ref_path, "w", encoding="utf-8") as fh:
        fh.write(body)
def trunc(s, n=3000) -> str:
    """Clip string s to at most n chars, appending a truncation notice.

    Non-string inputs are stringified and returned untouched.
    """
    if not isinstance(s, str):
        return str(s)
    if len(s) <= n:
        return s
    return s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
def safe_json(obj, n=5000) -> str:
    """Pretty-print obj as JSON, clipped to n chars with a truncation notice.

    Objects that json cannot serialize fall back to str(obj).
    """
    try:
        text = json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception:
        text = str(obj)
    if len(text) <= n:
        return text
    return text[:n] + f"\n... [截断,原文共 {len(text)} 字符]"
def fence(content: str, lang: str = "") -> str:
    """Wrap content in a Markdown code fence that its own backticks cannot break.

    - Fences the content opened but never closed (truncated raw data) are
      closed at the end.
    - The outer fence uses one more backtick than the longest run inside
      (minimum three).
    - Line-leading '#' is escaped so it never renders as a heading.
    """
    if not content:
        return f"```{lang}\n```"
    # Repair fences left open inside the content.
    open_fences: list[int] = []
    for raw_line in content.split("\n"):
        line = raw_line.strip()
        m = re.match(r"^(`{3,})", line)
        if not m:
            continue
        ticks = len(m.group(1))
        # A bare backtick run matching the innermost open fence closes it;
        # anything else (e.g. ```python) opens a new fence.
        if open_fences and open_fences[-1] == ticks and line == '`' * ticks:
            open_fences.pop()
        else:
            open_fences.append(ticks)
    if open_fences:
        closers = "\n".join('`' * t for t in reversed(open_fences))
        content = f"{content}\n{closers}"
    # Outer fence: one backtick longer than the longest run in the content.
    runs = re.findall(r"`+", content)
    longest = max((len(r) for r in runs), default=2)
    outer = '`' * (max(longest, 2) + 1)
    return f"{outer}{lang}\n{_escape_heading(content)}\n{outer}"
def _escape_heading(text: str) -> str:
    """Insert a zero-width space (\\u200b) before any line-leading '#'.

    Prevents content lines from being parsed as Markdown headings.
    """
    escaped = []
    for line in text.split('\n'):
        if line.lstrip().startswith('#'):
            # Keep the leading spaces/tabs, then inject the zero-width space.
            indent = len(line) - len(line.lstrip(' \t'))
            escaped.append(f"{line[:indent]}\u200b{line[indent:]}")
        else:
            escaped.append(line)
    return '\n'.join(escaped)
def hash8(text: str) -> str:
    """Return the first 8 hex chars of the SHA-256 digest of text (UTF-8)."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return digest[:8]
# ═══════════════════════════════════════════════════════════
# 系统提示词去重
# ═══════════════════════════════════════════════════════════
def save_system_prompt(text: str) -> str:
    """Persist a system prompt under _system_prompts/, once per content hash.

    Returns the reference filename ``sp_{hash8}.md``. A file with the same
    hash is assumed identical and reused untouched.
    """
    digest = hash8(text)
    filename = f"sp_{digest}.md"
    filepath = os.path.join(SYSTEM_PROMPTS_DIR, filename)
    if os.path.isfile(filepath):
        return filename
    os.makedirs(SYSTEM_PROMPTS_DIR, exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as fh:
        fh.write(f"# 系统提示词 (hash: {digest})\n\n{text}")
    return filename
def is_system_prompt(text: str) -> bool:
    """Heuristic: does text look like a Kiro-injected system prompt?

    System prompts open with <identity>, mention <capabilities> early, or
    start with "You are Kiro".
    """
    if not text:
        return False
    if "<identity>" in text[:200]:
        return True
    if "<capabilities>" in text[:500]:
        return True
    return text.strip().startswith("You are Kiro")
def is_steering_block(text: str) -> bool:
    """True when the text carries a <steering-reminder> injection near its start."""
    head = text[:100]
    return "<steering-reminder>" in head
# ═══════════════════════════════════════════════════════════
# ID 替换与路径还原
# ═══════════════════════════════════════════════════════════
# kiro-diff URI shape: kiro-diff:/path?commitId=xxx&executionId=yyy
# The '=' of either query parameter may appear URL-encoded as %3D.
# Groups: (1) file path, (2) commitId, (3) executionId.
KIRO_DIFF_PATTERN = re.compile(
    r'kiro-diff:(/[^?]+)\?commitId(?:%3D|=)([^&]+)&executionId(?:%3D|=)([^"\'}\s]+)'
)
def resolve_kiro_diff_uri(uri: str) -> str:
    """Rewrite a kiro-diff: URI as '<path> (版本: <commit8>)'.

    URIs that do not match KIRO_DIFF_PATTERN pass through unchanged.
    """
    match = KIRO_DIFF_PATTERN.search(uri)
    if match is None:
        return uri
    path, commit = match.group(1), match.group(2)
    return f"{path} (版本: {commit[:8]})"
def resolve_ids_in_text(text: str) -> str:
    """Replace known opaque ID patterns in text with readable equivalents.

    Falsy input becomes ''; truthy non-strings are stringified unchanged.
    """
    if not text or not isinstance(text, str):
        return str(text) if text else ""
    # kiro-diff URIs → "[文件快照] <path> (版本 <commit8>)"
    text = KIRO_DIFF_PATTERN.sub(
        lambda m: f'[文件快照] {m.group(1)} (版本 {m.group(2)[:8]})',
        text,
    )
    # file:///c%3A/-style encoded drive letters → "C:/"
    return re.sub(
        r'file:///([a-zA-Z])%3A/',
        lambda m: f'{m.group(1).upper()}:/',
        text,
    )
def resolve_tool_args(name: str, args: dict) -> dict:
    """Return a copy of tool-call args with known ID patterns made readable.

    `name` is accepted for signature parity; resolution is keyed purely on
    which argument fields are present.
    """
    readable = dict(args)
    # document-type tools may carry a kiro-diff URI in 'target'.
    target = readable.get("target")
    if isinstance(target, str):
        readable["target"] = resolve_kiro_diff_uri(target)
    # Paths (editCode/strReplace) and inline document bodies may embed IDs.
    for key in ("path", "modified", "original", "local"):
        value = readable.get(key)
        if isinstance(value, str):
            readable[key] = resolve_ids_in_text(value)
    return readable
# ═══════════════════════════════════════════════════════════
# 定位逻辑
# ═══════════════════════════════════════════════════════════
def find_kiro_agent_dir(global_storage: str) -> Optional[str]:
    """Return the kiro.kiroagent directory under globalStorage, or None if absent."""
    candidate = os.path.join(global_storage, "kiro.kiroagent")
    if os.path.isdir(candidate):
        return candidate
    return None
def decode_base64url_dir(dirname: str) -> str:
    """Decode a workspace-session directory name (URL-safe base64) to its path.

    Kiro apparently stores the '=' padding as '_', so '__' is mapped back to
    '==' before decoding. Returns '' when decoding fails.

    NOTE(review): the replace is global rather than anchored at the end — a
    legitimate mid-name '__' (two '/' bytes in base64url) would be corrupted;
    confirm Kiro's encoding scheme before tightening.
    """
    try:
        padded = dirname.replace("__", "==")
        raw = base64.urlsafe_b64decode(padded)
        return raw.decode("utf-8", errors="replace")
    except Exception:
        return ""
def find_workspace_session_dir(agent_dir: str, workspace_path: str) -> Optional[str]:
    """Locate the workspace-sessions subdirectory whose encoded name matches workspace_path.

    Comparison is case-insensitive, with separators normalized to '/' and
    trailing slashes stripped. Returns None when nothing matches.
    """
    ws_root = os.path.join(agent_dir, "workspace-sessions")
    if not os.path.isdir(ws_root):
        return None
    wanted = workspace_path.replace("\\", "/").rstrip("/").lower()
    for entry in os.scandir(ws_root):
        if not entry.is_dir():
            continue
        decoded = decode_base64url_dir(entry.name)
        if decoded.replace("\\", "/").rstrip("/").lower() == wanted:
            return entry.path
    return None
def load_sessions_json(session_dir: str) -> list[dict]:
    """Load sessions.json from session_dir; [] when missing or unparseable.

    Accepts both a bare list and the {'sessions': [...]} wrapper shape.
    """
    sessions_file = os.path.join(session_dir, "sessions.json")
    if not os.path.isfile(sessions_file):
        return []
    try:
        with open(sessions_file, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except Exception:
        return []
    if isinstance(data, list):
        return data
    if isinstance(data, dict) and "sessions" in data:
        return data["sessions"]
    return []
def load_session_detail(session_dir: str, session_id: str) -> Optional[dict]:
    """Load {session_id}.json from session_dir; None when missing or unparseable."""
    detail_path = os.path.join(session_dir, f"{session_id}.json")
    if not os.path.isfile(detail_path):
        return None
    try:
        with open(detail_path, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return None
def find_all_manifests(agent_dir: str) -> list[tuple[str, str]]:
    """List (hex_dir_name, manifest_path) for every 32-hex subdir holding a manifest."""
    hex_name = re.compile(r"^[0-9a-f]{32}$")
    found = []
    for entry in os.scandir(agent_dir):
        if not entry.is_dir() or not hex_name.match(entry.name):
            continue
        manifest = os.path.join(entry.path, MANIFEST_FILENAME)
        if os.path.isfile(manifest):
            found.append((entry.name, manifest))
    return found
def load_manifest(manifest_path: str) -> list[dict]:
    """Parse a manifest file into its execution list; [] on any failure.

    Handles both the legacy bare-list format and the newer
    {"executions": [...], "version": ...} wrapper.
    """
    try:
        with open(manifest_path, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except Exception:
        return []
    if isinstance(data, list):
        return data
    if isinstance(data, dict) and "executions" in data:
        return data["executions"]
    return []
def find_execution_log(agent_dir: str, hex_dir: str, execution: dict) -> Optional[str]:
    """Scan agent_dir/hex_dir for the JSON file holding this execution's full log.

    Cheap pre-filter: the executionId must appear in the first 500 characters
    before the whole file is parsed and its executionId field verified.
    Files under 1000 bytes are skipped — they cannot be full logs.
    """
    exec_id = execution.get("executionId", "")
    hex_path = os.path.join(agent_dir, hex_dir)
    for sub_dir in os.scandir(hex_path):
        if not sub_dir.is_dir():
            continue
        for candidate in os.scandir(sub_dir.path):
            if not candidate.is_file() or candidate.stat().st_size < 1000:
                continue
            try:
                with open(candidate.path, "r", encoding="utf-8") as fh:
                    if exec_id not in fh.read(500):
                        continue
                    fh.seek(0)
                    data = json.load(fh)
                if data.get("executionId") == exec_id:
                    return candidate.path
            except Exception:
                continue
    return None
def find_all_executions(
    agent_dir: str,
    chat_session_ids: Optional[set[str]] = None,
    execution_id: Optional[str] = None,
) -> list[dict]:
    """Collect matching executions from every manifest, newest endTime first.

    Args:
        agent_dir: the kiro.kiroagent directory.
        chat_session_ids: when given, drop executions whose chatSessionId is
            set but not in this set (executions without a chatSessionId are
            always kept).
        execution_id: when given, return a single-element list holding the
            first execution whose id equals or starts with it, or [] when
            nothing matches.

    Each returned entry gains a "_hex_dir" key recording which hex directory
    its manifest came from.
    """
    all_execs = []
    for hex_dir, manifest_path in find_all_manifests(agent_dir):
        for entry in load_manifest(manifest_path):
            entry["_hex_dir"] = hex_dir
            if execution_id:
                eid = entry.get("executionId", "")
                if eid == execution_id or eid.startswith(execution_id):
                    return [entry]
                # Bug fix: with an explicit execution_id, a non-matching entry
                # must not fall through into the result list — previously a
                # miss returned every execution instead of [].
                continue
            # Keep executions with no chatSessionId; filter only those whose
            # id falls outside this workspace's session set.
            csid = entry.get("chatSessionId")
            if chat_session_ids and csid and csid not in chat_session_ids:
                continue
            all_execs.append(entry)
    all_execs.sort(key=lambda e: e.get("endTime", 0), reverse=True)
    return all_execs
# ═══════════════════════════════════════════════════════════
# 解析逻辑
# ═══════════════════════════════════════════════════════════
def parse_messages(messages: list) -> list[dict]:
    """Parse context.messages, deduplicating system prompts and resolving opaque IDs.

    Non-dict entries are dropped; every surviving entry is normalized by
    _parse_message_entry.
    """
    return [
        {
            "index": i,
            "role": msg.get("role", "?"),
            "messageId": msg.get("messageId", "?"),
            "entries": [
                _parse_message_entry(entry)
                for entry in msg.get("entries", [])
                if isinstance(entry, dict)
            ],
        }
        for i, msg in enumerate(messages)
    ]


def _parse_message_entry(entry: dict) -> dict:
    """Normalize a single message entry into its compact, readable form."""
    etype = entry.get("type", "unknown")
    if etype == "text":
        text = entry.get("text", "")
        if is_system_prompt(text):
            # Deduplicate: store the prompt once, keep only a reference.
            return {
                "type": "system_prompt_ref",
                "ref_file": save_system_prompt(text),
                "char_count": len(text),
            }
        if is_steering_block(text):
            # Collapse steering payloads to the referenced .md filenames.
            return {
                "type": "steering_ref",
                "files": re.findall(r'(\w[\w-]+\.md):', text) or ["(steering block)"],
                "char_count": len(text),
            }
        return {"type": "text", "text": resolve_ids_in_text(text)}
    if etype == "toolUse":
        return {
            "type": "toolUse",
            "id": entry.get("id"),
            "name": entry.get("name"),
            "args": resolve_tool_args(entry.get("name", ""), entry.get("args", {})),
        }
    if etype == "toolUseResponse":
        return {
            "type": "toolUseResponse",
            "id": entry.get("id"),
            "name": entry.get("name"),
            "message": resolve_ids_in_text(entry.get("message", "")),
            "success": entry.get("success"),
        }
    if etype == "document":
        doc = entry.get("document", {})
        if doc.get("type", "") == "steering":
            # steering documents: keep only their display name.
            return {
                "type": "steering_doc",
                "name": doc.get("displayName", "") or "steering",
            }
        target = doc.get("target", "")
        return {
            "type": "document",
            "doc_type": doc.get("type", ""),
            "target": resolve_ids_in_text(target) if target else "",
        }
    # Unknown entry type: record its shape for later inspection.
    return {"type": etype, "raw_keys": list(entry.keys())}
def parse_actions(actions: list) -> list[dict]:
    """Parse the raw actions list into a timeline of readable entries.

    Content-less model actions (no input and no output) collapse into a
    single compressed marker. write/create actions that inline file bodies
    get a "_file_change" entry and have the large bodies stripped from the
    kept input.
    """
    timeline = []
    for i, action in enumerate(actions):
        atype = action.get("actionType", "")
        astate = action.get("actionState", "")
        # A 'model' action carrying neither input nor output is noise:
        # keep only a one-line compressed marker.
        if atype == "model" and "output" not in action and "input" not in action:
            timeline.append({
                "index": i,
                "actionType": "model",
                "actionState": astate,
                "emittedAt": ts_fmt(action.get("emittedAt")),
                "_compressed": True,
            })
            continue
        entry = {
            "index": i,
            "actionId": action.get("actionId"),
            "actionType": atype,
            "actionState": astate,
            "emittedAt": ts_fmt(action.get("emittedAt")),
        }
        if action.get("subExecutionId"):
            entry["subExecutionId"] = action["subExecutionId"]
        if action.get("endTime"):
            entry["endTime"] = ts_fmt(action["endTime"])
        for k in ("intentResult", "input", "output"):
            if k in action:
                val = action[k]
                # Resolve IDs inside textual values of input/output dicts.
                if isinstance(val, dict):
                    val = dict(val)  # shallow copy so the raw data is never mutated
                    # File-change info: write/create actions inline
                    # originalContent/modifiedContent in their input.
                    if k == "input" and ("originalContent" in val or "modifiedContent" in val):
                        file_path = val.get("file", val.get("path", "?"))
                        entry["_file_change"] = {
                            "file": resolve_ids_in_text(str(file_path)),
                            "original": val.get("originalContent", ""),
                            "modified": val.get("modifiedContent", ""),
                        }
                        # Strip the large bodies from input; keep the metadata.
                        slim = {vk: resolve_ids_in_text(str(vv)) if isinstance(vv, str) else vv
                                for vk, vv in val.items()
                                if vk not in ("originalContent", "modifiedContent")}
                        entry[k] = slim
                        continue
                    for vk, vv in val.items():
                        if isinstance(vv, str):
                            val[vk] = resolve_ids_in_text(vv)
                # Non-dict values (e.g. plain strings) are kept as-is.
                entry[k] = val
        timeline.append(entry)
    return timeline
def extract_sub_execution_ids(actions: list) -> list[str]:
    """Return all distinct subExecutionIds from actions, in first-appearance order."""
    ordered: dict[str, None] = {}
    for action in actions:
        sid = action.get("subExecutionId")
        if sid:
            # dict insertion order preserves first appearance; duplicates are no-ops.
            ordered.setdefault(sid, None)
    return list(ordered)
# ═══════════════════════════════════════════════════════════
# Diff 快照收集
# ═══════════════════════════════════════════════════════════
def collect_diffs(agent_dir: str, hex_dir: str, execution: dict) -> dict[str, dict]:
    """Placeholder for diff-snapshot collection; always returns {}.

    Kiro inlines originalContent/modifiedContent in write/create action
    inputs, so file changes are harvested straight from the actions rather
    than from the 74a08cf8.../commitId/ snapshot directory — extracting from
    actions is the more reliable path. Parameters are kept for API symmetry.
    """
    return {}
# ═══════════════════════════════════════════════════════════
# Prompt Log 匹配
# ═══════════════════════════════════════════════════════════
PROMPT_LOG_DIR = os.path.join("docs", "audit", "prompt_logs")


def find_matching_prompt_log(start_time_ms: int) -> Optional[str]:
    """Find the prompt_log file closest to an execution's startTime.

    Filenames follow prompt_log_YYYYMMDD_HHMMSS.md; the nearest one within a
    five-minute window wins. Returns None when the directory is missing, the
    timestamp is invalid, or nothing falls inside the window.
    """
    if not os.path.isdir(PROMPT_LOG_DIR):
        return None
    try:
        exec_dt = datetime.fromtimestamp(start_time_ms / 1000, tz=CST)
    except Exception:
        return None
    name_re = re.compile(r"prompt_log_(\d{8}_\d{6})\.md$")
    best_path = None
    best_gap = float("inf")
    for entry in os.scandir(PROMPT_LOG_DIR):
        if not entry.is_file():
            continue
        m = name_re.match(entry.name)
        if not m:
            continue
        try:
            log_dt = datetime.strptime(m.group(1), "%Y%m%d_%H%M%S").replace(tzinfo=CST)
        except Exception:
            continue
        gap = abs((exec_dt - log_dt).total_seconds())
        if gap < 300 and gap < best_gap:  # five-minute window
            best_gap = gap
            best_path = entry.path
    return best_path
# ═══════════════════════════════════════════════════════════
# Markdown 生成
# ═══════════════════════════════════════════════════════════
def _msg_semantic_label(msg: dict) -> str:
    """Build a short semantic label for a conversation message, for quick scanning."""
    entries = msg.get("entries", [])
    if not entries:
        return ""
    parts = []
    for e in entries:
        et = e["type"]
        if et == "system_prompt_ref":
            parts.append("系统提示词")
        elif et == "steering_ref":
            parts.append(f"Steering({len(e.get('files', []))})")
        elif et == "steering_doc":
            parts.append(f"Steering:`{e.get('name', '?')}`")
        elif et == "toolUse":
            name = e.get("name", "?")
            # Pull the key argument out as context for the label.
            args = e.get("args", {})
            ctx = ""
            if name in ("readFile", "readCode", "readMultipleFiles"):
                ctx = args.get("path", "") or ",".join(args.get("paths", [])[:2])
            elif name in ("fsWrite", "strReplace", "editCode"):
                ctx = args.get("path", "")
            elif name == "grepSearch":
                ctx = args.get("query", "")[:30]
            elif name == "invokeSubAgent":
                ctx = args.get("name", "")
            elif name == "executePwsh":
                ctx = (args.get("command", "") or "")[:40]
            elif name == "taskStatus":
                ctx = args.get("status", "")
            if ctx:
                parts.append(f"调用 `{name}` → {ctx}")
            else:
                parts.append(f"调用 `{name}`")
        elif et == "toolUseResponse":
            name = e.get("name", "?")
            # NOTE(review): both branches yield the empty string — the
            # success/failure icons appear stripped from this copy of the
            # source; confirm against the original file.
            ok = "" if e.get("success") else ""
            parts.append(f"结果 `{name}` {ok}")
        elif et == "document":
            parts.append(f"文档:{e.get('doc_type', '?')}")
        elif et == "text":
            # Text entries: preview the first 50 chars (bot and human alike).
            role = msg.get("role", "")
            if role in ("bot", "human"):
                text = (e.get("text") or "").strip()
                if text:
                    preview = text[:50].replace("\n", " ")
                    if len(text) > 50:
                        # NOTE(review): appends an empty string — the ellipsis
                        # character looks stripped from this copy; confirm.
                        preview += ""
                    icon = "💬" if role == "bot" else "📝"
                    parts.append(f"{icon} `{preview}`")
    return ", ".join(parts) if parts else ""
def _step_semantic_label(step: dict) -> str:
    """Build an icon-decorated semantic label for a single action step."""
    at = step.get("actionType", "?")
    state = step.get("actionState", "?")
    fc = step.get("_file_change")
    sub_eid = step.get("subExecutionId")  # NOTE(review): read but never used below
    # Status icon.
    # NOTE(review): all three branches yield the empty string — the status
    # icons appear stripped from this copy of the source; confirm upstream.
    if state == "Error":
        state_icon = ""
    elif state in ("Success", "Accepted"):
        state_icon = ""
    else:
        state_icon = ""
    # Type icon + context.
    if at in ("write", "append") and fc:
        fname = fc.get("file", "?")
        short = fname.rsplit("/", 1)[-1] if "/" in fname else fname
        orig = fc.get("original", "")
        # Non-empty original body ⇒ an edit; empty ⇒ a new file.
        if orig:
            return f"⚡ `{at}` 修改 `{short}` {state_icon}"
        else:
            return f"⚡ `{at}` 新建 `{short}` {state_icon}"
    elif at == "invokeSubAgent":
        inp = step.get("input", {})
        # Kiro's raw log uses subAgentName; the tool schema uses name.
        agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
        return f"🔀 `invokeSubAgent` → {agent_name} {state_icon}"
    elif at == "subagentResponse":
        return f"🔀 `subagentResponse` {state_icon}"
    elif at in ("readFiles", "readCode"):
        inp = step.get("input", {})
        if isinstance(inp, dict):
            files = inp.get("files", [])
            # Entries may be dicts carrying a 'path' key, or plain strings.
            if files and isinstance(files[0], dict):
                paths = [f.get("path", "?") for f in files[:2]]
            else:
                paths = [str(f) for f in files[:2]]
            ctx = ", ".join(paths)
        else:
            ctx = ""
        return f"📖 `{at}` {ctx} {state_icon}" if ctx else f"📖 `{at}` {state_icon}"
    elif at == "search":
        return f"🔍 `search` {state_icon}"
    elif at == "say":
        return f"💬 `say` {state_icon}"
    elif at == "taskStatus":
        return f"📋 `taskStatus` {state_icon}"
    elif at == "steering":
        return f"📄 `steering` {state_icon}"
    elif at == "runCommand":
        return f"🖥️ `runCommand` {state_icon}"
    elif at == "getDiagnostics":
        return f"🩺 `getDiagnostics` {state_icon}"
    elif at == "ContextualHookInvoked":
        inp = step.get("input", {})
        hook_name = inp.get("name", "?") if isinstance(inp, dict) else "?"
        return f"🪝 Hook `{hook_name}` {state_icon}"
    elif at == "intentClassification":
        ir = step.get("intentResult", {})
        cls = ir.get("classification", "?") if isinstance(ir, dict) else "?"
        return f"🎯 意图: `{cls}` {state_icon}"
    elif at == "replace":
        inp = step.get("input", {})
        path = inp.get("file", inp.get("path", "?")) if isinstance(inp, dict) else "?"
        short = path.rsplit("/", 1)[-1] if "/" in str(path) else str(path)
        return f"✏️ `replace` `{short}` {state_icon}"
    else:
        # Unrecognized action type: fall back to a plain type/state label.
        return f"`{at}` [{state}]"
def _load_previous_actions_md(main_md_path: str) -> str:
    """Extract the raw Markdown of the '## 4. Actions 时间线' section of an existing main.md.

    Used in overwrite mode to prepend earlier rounds' actions ahead of the
    newest round. Returns '' when the file, the section heading, or the
    section body is missing.
    """
    if not os.path.isfile(main_md_path):
        return ""
    try:
        with open(main_md_path, "r", encoding="utf-8") as fh:
            content = fh.read()
    except Exception:
        return ""
    start_idx = content.find("## 4. Actions 时间线")
    if start_idx == -1:
        return ""
    # Skip past the heading line itself; the body starts on the next line.
    body_start = content.find("\n", start_idx)
    if body_start == -1:
        return ""
    body_start += 1
    # Section runs until the next '## 5.' heading, or to end of file.
    end_idx = content.find("## 5.", body_start)
    section = content[body_start:] if end_idx == -1 else content[body_start:end_idx]
    return section.strip()
def _merge_summaries(prev_summary: dict, cur_summary: dict) -> dict:
    """Incrementally merge two rounds' structured execution summaries.

    Rules:
    - duration_s / action_count: summed
    - msg_count: maximum (the latest context.messages is a cumulative superset)
    - files_modified / files_created / sub_agents: order-preserving union
    - errors: concatenated
    - description: the latest non-empty value
    - everything else (workflow / status / ...): latest round wins
    """
    if not prev_summary:
        return dict(cur_summary)

    def _ordered_union(first: list, second: list) -> list:
        seen = set()
        union = []
        for item in first + second:
            if item not in seen:
                seen.add(item)
                union.append(item)
        return union

    merged = dict(cur_summary)
    merged["duration_s"] = round(
        prev_summary.get("duration_s", 0) + cur_summary.get("duration_s", 0), 1
    )
    merged["action_count"] = (
        prev_summary.get("action_count", 0) + cur_summary.get("action_count", 0)
    )
    merged["msg_count"] = max(
        prev_summary.get("msg_count", 0), cur_summary.get("msg_count", 0)
    )
    for key in ("files_modified", "files_created", "sub_agents"):
        merged[key] = _ordered_union(
            prev_summary.get(key, []), cur_summary.get(key, [])
        )
    merged["errors"] = prev_summary.get("errors", []) + cur_summary.get("errors", [])
    # description: prefer the current round's, else fall back to the previous.
    if cur_summary.get("description"):
        merged["description"] = cur_summary["description"]
    elif prev_summary.get("description"):
        merged["description"] = prev_summary["description"]
    return merged
def _build_execution_summary(
    log: dict,
    conversation: list[dict],
    timeline: list[dict],
    sub_file_map: Optional[dict[str, str]] = None,
) -> dict:
    """Build the structured execution summary (pure rule-based, zero LLM cost).

    The returned dict feeds both the Markdown rendering and the index files.
    ``sub_file_map`` is accepted for signature parity but not used here.
    """
    files_modified: list = []
    files_created: list = []
    sub_agents: list = []
    errors: list = []
    for step in timeline:
        if step.get("_compressed"):
            continue
        idx = step.get("index", "?")
        at = step.get("actionType", "?")
        fc = step.get("_file_change")
        if fc:
            # Non-empty original body ⇒ an edit; empty ⇒ a newly created file.
            bucket = files_modified if fc.get("original") else files_created
            bucket.append(fc.get("file", "?"))
        if at == "invokeSubAgent":
            inp = step.get("input", {})
            agent = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
            sub_agents.append(agent)
        if step.get("actionState", "?") == "Error":
            errors.append(f"Step {idx}: `{at}`")
    # Failed tool responses inside the conversation also count as errors.
    for msg in conversation:
        for e in msg.get("entries", []):
            if e.get("type") == "toolUseResponse" and not e.get("success"):
                errors.append(f"Msg {msg['index']}: `{e.get('name', '?')}`")
    duration = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
    return {
        "workflow": log.get("workflowType", "?"),
        "status": log.get("status", "?"),
        "duration_s": round(duration, 1),
        "msg_count": len(conversation),
        "action_count": len(timeline),
        # dict.fromkeys deduplicates while keeping first-seen order.
        "files_modified": list(dict.fromkeys(files_modified)),
        "files_created": list(dict.fromkeys(files_created)),
        "sub_agents": sub_agents,
        # description is produced later by an external LLM (Bailian API),
        # never during extraction.
        "errors": errors,
        "description": "",
    }
def _render_summary_md(summary: dict, sub_file_map: Optional[dict[str, str]] = None) -> str:
    """Render the structured summary dict as the Markdown block at the top of the file.

    ``sub_file_map`` is accepted for signature parity but not used here.
    """
    out = []
    # One-line overview.
    # NOTE(review): both status-icon branches are empty strings — the icon
    # characters appear stripped from this copy of the source; confirm upstream.
    status_icon = "" if summary["status"] == "succeed" else ""
    out.append(f"{status_icon} `{summary['workflow']}` | {summary['duration_s']}s | "
               f"{summary['msg_count']} msgs | {summary['action_count']} actions")
    out.append("")
    desc = summary.get("description", "")
    if desc:
        out.append(f"> {desc}")
        out.append("")
    modified = summary["files_modified"]
    created = summary["files_created"]
    if modified or created:
        out.append(f"**文件变更** (修改 {len(modified)} / 新建 {len(created)})")
        out.extend(f"- ⚡ 修改 `{f}`" for f in modified)
        out.extend(f"- ✨ 新建 `{f}`" for f in created)
        out.append("")
    agents = summary["sub_agents"]
    if agents:
        out.append(f"**子代理** ({len(agents)}): {', '.join(f'`{a}`' for a in agents)}")
        out.append("")
    errs = summary["errors"]
    if errs:
        out.append(f"**错误** ({len(errs)})")
        out.extend(f"- ❌ {e}" for e in errs)
        out.append("")
    if not modified and not created and not agents and not errs:
        out.append("*(无文件变更、子代理调用或错误)*")
    return "\n".join(out)
def _build_nav_summary(
    conversation: list[dict],
    timeline: list[dict],
    sub_file_map: Optional[dict[str, str]] = None,
) -> str:
    """Build the quick-navigation summary: file changes, sub-agent calls, errors."""
    file_changes = []
    sub_agents = []
    errors = []
    for step in timeline:
        if step.get("_compressed"):
            continue
        idx = step.get("index", "?")
        at = step.get("actionType", "?")
        # File changes.
        fc = step.get("_file_change")
        if fc:
            verb = "修改" if fc.get("original") else "新建"
            file_changes.append(f"- Step {idx}: ⚡ {verb} `{fc.get('file', '?')}`")
        # Sub-agent invocations (link to the sub-log when we know its path).
        if at == "invokeSubAgent":
            inp = step.get("input", {})
            agent = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
            sub_eid = step.get("subExecutionId", "")
            link = ""
            if sub_file_map and sub_eid and sub_eid in sub_file_map:
                link = f" → `{sub_file_map[sub_eid].replace(chr(92), '/')}`"
            sub_agents.append(f"- Step {idx}: 🔀 `{agent}`{link}")
        # Errors at the action level.
        if step.get("actionState", "?") == "Error":
            errors.append(f"- Step {idx}: ❌ `{at}`")
    # Failed tool results inside the conversation also count as errors.
    for msg in conversation:
        for e in msg.get("entries", []):
            if e.get("type") == "toolUseResponse" and not e.get("success"):
                errors.append(f"- Msg {msg['index']}: ❌ `{e.get('name', '?')}`")
    sections = []
    if file_changes:
        sections.append(f"**文件变更** ({len(file_changes)})")
        sections.extend(file_changes)
        sections.append("")
    if sub_agents:
        sections.append(f"**子代理调用** ({len(sub_agents)})")
        sections.extend(sub_agents)
        sections.append("")
    if errors:
        sections.append(f"**错误** ({len(errors)})")
        sections.extend(errors)
        sections.append("")
    if not sections:
        sections.append("*(无文件变更、子代理调用或错误)*")
    return "\n".join(sections)
def generate_full_record(
log: dict,
conversation: list[dict],
timeline: list[dict],
diffs: dict[str, dict],
session_info: Optional[dict] = None,
prompt_log_path: Optional[str] = None,
is_sub: bool = False,
sub_index: int = 0,
sub_file_map: Optional[dict[str, str]] = None,
accumulated_actions_md: str = "",
merged_summary: Optional[dict] = None,
execution_round: int = 1,
) -> tuple[str, dict]:
"""生成单个 execution 的 Markdown 全量记录(覆盖模式)。
覆盖模式下:
- 对话记录:直接使用最新 execution 的 context.messages累积超集无需去重
- Actions 时间线:前几轮的 accumulated_actions_md + 本轮 timeline
- 执行摘要:使用 merged_summary已合并前几轮
Args:
log: 原始 execution log JSON
conversation: parse_messages 输出
timeline: parse_actions 输出
diffs: collect_diffs 输出
session_info: 会话配置(仅主 execution 有)
prompt_log_path: 匹配的 prompt_log 文件路径
is_sub: 是否为子代理 execution
sub_index: 子代理序号(从 1 开始)
accumulated_actions_md: 前几轮 execution 的 actions Markdown追加到本轮前面
merged_summary: 合并后的结构化摘要含前几轮None 时使用本轮单独摘要
execution_round: 当前是第几轮 execution用于标注
"""
L = []
exec_id = log.get("executionId", "?")
chat_id = log.get("chatSessionId", "?")
# 构建结构化摘要(供 md 和索引共用)
_summary = _build_execution_summary(log, conversation, timeline, sub_file_map)
# 覆盖模式:使用合并后的摘要(含前几轮累积)
display_summary = merged_summary if merged_summary else _summary
# 标题
if is_sub:
L.append(f"# 子代理 Execution #{sub_index}\n")
else:
L.append("# Kiro 会话全量记录\n")
L.append(f"> 生成时间: {datetime.now(CST).strftime('%Y-%m-%d %H:%M:%S')} CST\n")
# ── 0. 执行摘要文件最前面AI 读前 20 行即可掌握全貌)──
L.append("## 📋 执行摘要\n")
if execution_round > 1:
L.append(f"*(合并 {execution_round} 轮 execution*\n")
L.append(_render_summary_md(display_summary, sub_file_map))
L.append("")
# ── 1. 元数据 ──
L.append("## 1. 元数据\n")
L.append("| 字段 | 值 |")
L.append("|------|-----|")
L.append(f"| executionId | `{exec_id}` |")
L.append(f"| chatSessionId | `{chat_id}` |")
L.append(f"| workflowType | `{log.get('workflowType', '?')}` |")
L.append(f"| autonomyMode | `{log.get('autonomyMode', '?')}` |")
L.append(f"| status | `{log.get('status', '?')}` |")
L.append(f"| startTime | `{ts_fmt(log.get('startTime'))}` |")
L.append(f"| endTime | `{ts_fmt(log.get('endTime'))}` |")
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
L.append(f"| duration | `{dur:.1f}s` |")
L.append(f"| contextUsage | `{log.get('contextUsagePercentage', 0):.2f}%` |")
L.append("")
if session_info and not is_sub:
L.append(f"- 会话标题: `{session_info.get('title', '?')}`")
L.append(f"- 创建时间: `{ts_fmt(int(session_info.get('dateCreated', 0)))}`")
L.append(f"- 工作区: `{session_info.get('workspaceDirectory', '?')}`")
L.append("")
if prompt_log_path and not is_sub:
rel = os.path.relpath(prompt_log_path).replace("\\", "/")
L.append(f"- 关联 prompt_log: `{rel}`")
L.append("")
# ── 2. 用户输入 ──
L.append("## 2. 用户输入\n")
input_text = ""
for msg in log.get("input", {}).get("data", {}).get("messages", []):
for entry in msg.get("content", msg.get("entries", [])):
if isinstance(entry, dict) and entry.get("text"):
input_text += entry["text"] + "\n"
if input_text.strip():
L.append(fence(input_text.strip()) + "\n")
else:
L.append("*(无用户输入)*\n")
# ── 3. 对话记录 ──
L.append("## 3. 对话记录\n")
# 覆盖模式:直接使用全部消息(最新 execution 的 context.messages 是累积超集)
h = sum(1 for m in conversation if m["role"] == "human")
b = sum(1 for m in conversation if m["role"] == "bot")
t = sum(1 for m in conversation if m["role"] == "tool")
L.append(f"{len(conversation)} 条消息: human={h}, bot={b}, tool={t}\n")
for msg in conversation:
emoji = {"human": "👤", "bot": "🤖", "tool": "🔧"}.get(msg["role"], "")
# 生成语义标签
msg_label = _msg_semantic_label(msg)
label_suffix = f"{msg_label}" if msg_label else ""
# P0: 压缩 hook 输出的空消息特征HUMAN 消息只含 "Output:\nCommand executed..." 或 "Output:\nCommand timed out..."
if msg["role"] == "human" and len(msg["entries"]) == 1:
e0 = msg["entries"][0]
if e0["type"] == "text":
_txt = (e0.get("text") or "").strip()
if _txt.startswith("Output:") and ("Exit Code:" in _txt) and len(_txt) < 200:
# 提取 exit code
import re as _re
_ec_match = _re.search(r"Exit Code:\s*(-?\d+)", _txt)
_ec = _ec_match.group(1) if _ec_match else "?"
L.append(f"### Msg {msg['index']}: 👤 HUMAN — 🪝 Hook 输出 (exit={_ec})\n")
continue
L.append(f"### Msg {msg['index']}: {emoji} {msg['role'].upper()}{label_suffix}\n")
for entry in msg["entries"]:
et = entry["type"]
if et == "system_prompt_ref":
ref = entry["ref_file"]
chars = entry["char_count"]
sp_path = f"docs/audit/session_logs/_system_prompts/{ref}"
L.append(f"**[系统提示词]** → `{sp_path}` ({chars} 字符)\n")
elif et == "steering_ref":
files = ", ".join(entry["files"])
chars = entry["char_count"]
L.append(f"**[Steering]** 引用: {files} ({chars} 字符)\n")
elif et == "text":
text = entry.get("text", "")
if not text:
L.append("*(空)*\n")
else:
L.append(fence(text) + "\n")
elif et == "toolUse":
name = entry.get("name", "?")
args = entry.get("args", {})
L.append(f"**[🔧 调用]** `{name}`\n")
# P1: strReplace/editCode 的代码变更用 diff 格式展示
if name in ("strReplace", "editCode") and isinstance(args, dict):
_path = args.get("path", "?")
_lang = "python" if _path.endswith(".py") else "sql" if _path.endswith(".sql") else ""
L.append(f"- 文件: `{_path}`\n")
_old = args.get("oldStr", args.get("old_str", ""))
_new = args.get("newStr", args.get("new_str", ""))
_sel = args.get("selector", "")
_op = args.get("operation", "")
_repl = args.get("replacement", "")
if _sel:
L.append(f"- selector: `{_sel}`" + (f" ({_op})" if _op else ""))
if _old:
L.append(f"- 删除:\n" + fence(trunc(_old, 2000), _lang))
if _new:
L.append(f"- 插入:\n" + fence(trunc(_new, 2000), _lang))
if _repl:
L.append(f"- 替换为:\n" + fence(trunc(_repl, 2000), _lang))
L.append("")
else:
L.append(fence(safe_json(args, 5000), "json") + "\n")
elif et == "toolUseResponse":
ok = "" if entry.get("success") else ""
L.append(f"**[📋 结果]** `{entry.get('name', '?')}` {ok}\n")
msg_text = entry.get("message", "")
if msg_text:
L.append(fence(trunc(msg_text, 5000)) + "\n")
elif et == "document":
target = entry.get("target", "")
L.append(f"**[📄 文档]** type=`{entry.get('doc_type')}` target=`{target}`\n")
elif et == "steering_doc":
L.append(f"**[📄 Steering]** `{entry.get('name', 'steering')}`\n")
else:
L.append(f"**[{et}]** keys={entry.get('raw_keys')}\n")
# ── 4. Actions 时间线 ──
L.append("## 4. Actions 时间线\n")
# 覆盖模式:先输出前几轮累积的 actions再输出本轮
if accumulated_actions_md:
L.append(accumulated_actions_md)
L.append("")
L.append(f"---\n")
L.append(f"### 🔄 第 {execution_round} 轮 Execution ({ts_fmt(log.get('startTime'))})\n")
L.append(f"{len(timeline)}\n")
for step in timeline:
if step.get("_compressed"):
L.append(f"- `model` [{step.get('actionState')}] @ {step.get('emittedAt')}\n")
continue
at = step.get('actionType', '?')
state = step.get('actionState', '?')
# 生成语义标签
step_label = _step_semantic_label(step)
L.append(f"### Step {step['index']}: {step_label} @ {step.get('emittedAt','?')}\n")
if step.get("subExecutionId"):
sub_eid = step["subExecutionId"]
L.append(f"- subExecutionId: `{sub_eid}`")
# 标注子代理文件路径(如果有映射)
if sub_file_map and sub_eid in sub_file_map:
sub_path = sub_file_map[sub_eid].replace("\\", "/")
L.append(f"- 子代理记录: `{sub_path}`")
if step.get("endTime"):
L.append(f"- endTime: {step['endTime']}")
# 文件变更展示
if step.get("_file_change"):
fc = step["_file_change"]
fname = fc.get("file", "?")
orig = fc.get("original", "")
mod = fc.get("modified", "")
lang = "python" if fname.endswith(".py") else "sql" if fname.endswith(".sql") else ""
L.append(f"- 文件变更: `{fname}`")
if orig and mod:
L.append(f" - 修改前 ({len(orig)} 字符):\n" + fence(trunc(orig, 3000), lang))
L.append(f" - 修改后 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
elif mod:
L.append(f" - 新建 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
# 特殊处理各种 action type 的内容展示
_at = step.get("actionType", "")
if _at == "say":
_say_msg = (step.get("output") or {}).get("message", "")
if _say_msg:
L.append(f"- 💬 AI 回复:\n\n{_say_msg}\n")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "invokeSubAgent":
_sub_input = step.get("input") or {}
_sub_prompt = _sub_input.get("prompt", "")
_sub_name = _sub_input.get("name", "") or _sub_input.get("subAgentName", "") or "?"
if _sub_prompt:
L.append(f"- 📤 委托 `{_sub_name}`:\n\n{trunc(_sub_prompt, 3000)}\n")
_sub_output = step.get("output") or {}
_sub_resp = _sub_output.get("response", "")
if _sub_resp:
L.append(f"- 📥 子代理输出:\n\n{_sub_resp}\n")
elif not _sub_prompt:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "subagentResponse":
_sr_input = step.get("input") or {}
_sr_resp = _sr_input.get("response", "")
if _sr_resp:
L.append(f"- 📥 子代理返回:\n\n{_sr_resp}\n")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "ContextualHookInvoked":
# P1: hook 执行——提取名称、命令、exitCode
_hi = step.get("input") or {}
_ho = step.get("output") or {}
_h_name = _hi.get("name", "?")
_h_cmd = _ho.get("command", "")
_h_result = _ho.get("result", {})
_h_exit = _h_result.get("exitCode", "?") if isinstance(_h_result, dict) else "?"
_h_out = _h_result.get("output", "") if isinstance(_h_result, dict) else ""
if _h_cmd:
L.append(f"- `$ {_h_cmd}`")
L.append(f"- Exit: `{_h_exit}`")
if _h_out and _h_out != "Command executed successfully with no output.":
L.append(f"- 输出:\n" + fence(trunc(_h_out, 2000)))
elif _at == "intentClassification":
# P1: 意图分类——压缩为一行
_ir = step.get("intentResult", {})
_cls = _ir.get("classification", "?") if isinstance(_ir, dict) else "?"
L.append(f"- 分类结果: `{_cls}`")
elif _at == "runCommand":
# P0: 命令执行——提取命令、exitCode、输出
_rc_in = step.get("input") or {}
_rc_out = step.get("output") or {}
_rc_cmd = _rc_in.get("command", "") if isinstance(_rc_in, dict) else ""
_rc_result = _rc_out.get("result", {}) if isinstance(_rc_out, dict) else {}
_rc_exit = _rc_result.get("exitCode", "?") if isinstance(_rc_result, dict) else "?"
_rc_output = _rc_result.get("output", "") if isinstance(_rc_result, dict) else ""
if _rc_cmd:
L.append(f"- `$ {_rc_cmd}`")
L.append(f"- Exit: `{_rc_exit}`")
if _rc_output:
L.append(f"- 输出:\n" + fence(trunc(_rc_output, 3000)))
elif _at == "search":
# P2: 搜索——提取 query 和 why
_s_in = step.get("input") or {}
_s_query = _s_in.get("query", "") if isinstance(_s_in, dict) else ""
_s_why = _s_in.get("why", "") if isinstance(_s_in, dict) else ""
if _s_query:
L.append(f"- 🔍 `{trunc(_s_query, 100)}`")
if _s_why:
L.append(f"- 原因: {trunc(_s_why, 200)}")
# 展示搜索结果(如果有)
_s_out = step.get("output")
if _s_out and isinstance(_s_out, dict):
_s_files = _s_out.get("files", [])
if _s_files:
L.append(f"- 结果: {len(_s_files)} 个文件")
elif _at == "steering":
# P2: steering——提取文件名列表
_st_in = step.get("input") or {}
_st_docs = _st_in.get("documents", []) if isinstance(_st_in, dict) else []
if _st_docs:
import urllib.parse
names = []
for d in _st_docs[:10]:
if isinstance(d, str):
# URL 编码的路径,提取文件名
decoded = urllib.parse.unquote(d)
name = decoded.rsplit("/", 1)[-1] if "/" in decoded else decoded
names.append(name)
if names:
L.append(f"- 文件: {', '.join(names)}")
else:
for k in ("input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 3000), "json"))
elif _at == "getDiagnostics":
# P2: 诊断——提取路径和问题数
_gd_in = step.get("input") or {}
_gd_out = step.get("output") or {}
_gd_paths = _gd_in.get("paths", []) if isinstance(_gd_in, dict) else []
if _gd_paths:
L.append(f"- 文件: {', '.join(str(p) for p in _gd_paths[:5])}")
if isinstance(_gd_out, dict):
_gd_diags = _gd_out.get("diagnostics", [])
if isinstance(_gd_diags, list):
L.append(f"- 问题数: {len(_gd_diags)}")
for d in _gd_diags[:5]:
if isinstance(d, dict):
L.append(f" - {d.get('severity', '?')}: {d.get('message', '?')[:100]}")
elif not _gd_diags:
L.append("- ✅ 无问题")
elif _at in ("readFiles", "readCode"):
# P3: 文件读取——只展示路径,不展示内容
_rf_in = step.get("input") or {}
if isinstance(_rf_in, dict):
_rf_files = _rf_in.get("files", [])
paths = []
for f in _rf_files[:5]:
if isinstance(f, dict):
paths.append(f.get("path", "?"))
else:
paths.append(str(f))
if paths:
L.append(f"- 文件: {', '.join(paths)}")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
L.append("")
# ── 5. 资源消耗 ──
usage = log.get("usageSummary", [])
if usage:
L.append("## 5. 资源消耗\n")
L.append("| 工具 | 消耗 | 单位 |")
L.append("|------|------|------|")
total = 0
for u in usage:
tools = ", ".join(u.get("usedTools", ["-"]))
amt = u.get("usage", 0)
total += amt
L.append(f"| {tools} | {amt} | {u.get('unit', '?')} |")
L.append(f"| **合计** | **{total:.4f}** | |")
L.append("")
return "\n".join(L), _summary, display_summary
# ═══════════════════════════════════════════════════════════
# 索引管理
# ═══════════════════════════════════════════════════════════
def load_index() -> dict:
    """Load the slim session index (main conversations only) from INDEX_PATH.

    Best-effort: a missing, unreadable, or corrupt file yields a fresh empty
    index instead of raising.
    """
    empty = {"version": 2, "entries": {}}
    if not os.path.isfile(INDEX_PATH):
        return empty
    try:
        with open(INDEX_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return empty
def load_full_index() -> dict:
    """Load the full session index (main + sub-agents) from INDEX_FULL_PATH.

    Best-effort: a missing, unreadable, or corrupt file yields a fresh empty
    index instead of raising.
    """
    empty = {"version": 2, "entries": {}}
    if not os.path.isfile(INDEX_FULL_PATH):
        return empty
    try:
        with open(INDEX_FULL_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return empty
def save_index(index: dict):
    """Persist the slim index to INDEX_PATH and refresh its per-day indexes."""
    parent = os.path.dirname(INDEX_PATH)
    os.makedirs(parent, exist_ok=True)
    payload = json.dumps(index, ensure_ascii=False, indent=2)
    with open(INDEX_PATH, "w", encoding="utf-8") as fh:
        fh.write(payload)
    _save_day_indexes(index, "_day_index.json")
def save_full_index(index: dict):
    """Persist the full index to INDEX_FULL_PATH and refresh its per-day indexes."""
    parent = os.path.dirname(INDEX_FULL_PATH)
    os.makedirs(parent, exist_ok=True)
    payload = json.dumps(index, ensure_ascii=False, indent=2)
    with open(INDEX_FULL_PATH, "w", encoding="utf-8") as fh:
        fh.write(payload)
    _save_day_indexes(index, "_day_index_full.json")
def _save_day_indexes(index: dict, filename: str):
    """Split the root-level index into per-day index files.

    Each entry whose ``output_dir`` sits under
    ``docs/audit/session_logs/{ym}/{dd}/...`` is copied into a day-level
    index written as ``{day_dir}/{filename}``; entries outside the
    session-log tree are skipped.
    """
    all_entries = index.get("entries", {})
    if not all_entries:
        return
    root = SESSION_LOG_DIR.replace("\\", "/")
    # Bucket entries by their "{ym}/{dd}" day key.
    per_day: dict[str, dict[str, dict]] = {}
    for exec_id, entry in all_entries.items():
        normalized = entry.get("output_dir", "").replace("\\", "/")
        if not normalized.startswith(root):
            continue
        # e.g. "2026-03/03/01_abc12345_013337" after stripping the root prefix
        tail = normalized[len(root):].lstrip("/").split("/")
        if len(tail) < 2:
            continue
        day_key = f"{tail[0]}/{tail[1]}"  # "2026-03/03"
        per_day.setdefault(day_key, {})[exec_id] = entry
    for day_key, bucket in per_day.items():
        target_dir = os.path.join(SESSION_LOG_DIR, day_key.replace("/", os.sep))
        os.makedirs(target_dir, exist_ok=True)
        target_path = os.path.join(target_dir, filename)
        with open(target_path, "w", encoding="utf-8") as fh:
            json.dump({"version": 2, "entries": bucket}, fh, ensure_ascii=False, indent=2)
def update_index(index: dict, exec_id: str, output_dir: str, meta: dict,
                 summary: Optional[dict] = None, is_sub: bool = False,
                 parent_exec_id: str = "",
                 chat_session_id: str = "",
                 prev_exec_ids: Optional[list[str]] = None):
    """Add or refresh one index record, including a structured summary that
    lets an AI query the session at low cost.

    Overwrite mode:
    - only the latest execution of a chatSession keeps a full entry
    - old entries listed in prev_exec_ids are collapsed into placeholders
    - `summary` should already be merged (the caller merges rounds)

    is_sub=True marks a sub-agent entry and records parent_exec_id.

    Args:
        index: index dict, mutated in place.
        exec_id: executionId of the entry being written.
        output_dir: extraction output directory (stored with "/" separators).
        meta: execution metadata (chatSessionId/startTime/endTime/status/...).
        summary: merged structured summary, optional.
        is_sub: whether this is a sub-agent entry.
        parent_exec_id: parent executionId (meaningful for sub-agent entries).
        chat_session_id: fallback chatSessionId when meta lacks one.
        prev_exec_ids: older executionIds of the same chatSession to supersede.
    """
    # Overwrite mode: collapse older entries of the same chatSession into
    # "superseded" placeholders (kept in the index to prevent re-extraction).
    if prev_exec_ids:
        for old_eid in prev_exec_ids:
            if old_eid in index.get("entries", {}) and old_eid != exec_id:
                # Keep the entry but mark it superseded so extract_latest
                # does not pick it up again.
                index["entries"][old_eid] = {
                    "superseded_by": exec_id,
                    "chatSessionId": index["entries"][old_eid].get("chatSessionId", ""),
                }
    entry = {
        "output_dir": output_dir.replace("\\", "/"),
        "chatSessionId": meta.get("chatSessionId", "") or chat_session_id,
        "startTime": meta.get("startTime", ""),
        "endTime": meta.get("endTime", ""),
        "status": meta.get("status", ""),
        "workflowType": meta.get("workflowType", ""),
        "indexed_at": datetime.now(CST).isoformat(),
    }
    if is_sub:
        entry["is_sub"] = True
    if parent_exec_id:
        entry["parent_exec_id"] = parent_exec_id
    if summary:
        entry["summary"] = {
            "duration_s": summary.get("duration_s", 0),
            "msg_count": summary.get("msg_count", 0),
            "action_count": summary.get("action_count", 0),
            "files_modified": summary.get("files_modified", []),
            "files_created": summary.get("files_created", []),
            "sub_agents": summary.get("sub_agents", []),
            "errors": summary.get("errors", []),
        }
        if summary.get("description"):
            entry["description"] = summary["description"]
    # Robustness fix: don't assume "entries" already exists — the rest of the
    # function defensively uses index.get("entries", {}), but the final write
    # used index["entries"][...] and raised KeyError on a bare dict.
    index.setdefault("entries", {})[exec_id] = entry
# ═══════════════════════════════════════════════════════════
# 主提取逻辑
# ═══════════════════════════════════════════════════════════
def extract_single_execution(
    agent_dir: str,
    hex_dir: str,
    execution: dict,
    session_dir: Optional[str],
    index: dict,
    full_index: Optional[dict] = None,
    sub_mode: bool = False,
    sub_index: int = 0,
    output_base_dir: Optional[str] = None,
    parent_exec_id: str = "",
    force: bool = False,
) -> Optional[str]:
    """Extract a single execution and write it to disk (overwrite mode).

    Overwrite mode keeps exactly one main.md per chatSession:
    - conversation: uses the latest execution's context.messages (a
      cumulative superset of earlier rounds)
    - actions timeline: earlier rounds' actions + this round's (appended)
    - summary: merged incrementally (durations added, file lists unioned,
      msg_count taken from the latest round)
    - index: only one entry is kept per chatSession

    Args:
        agent_dir: kiro.kiroagent directory.
        hex_dir: 32-char hex directory containing the execution.
        execution: the execution entry from the manifest.
        session_dir: workspace-sessions subdirectory (for session info).
        index: slim index dict (main conversations only).
        full_index: full index dict (main + sub-agents); None = don't write it.
        sub_mode: True when extracting a sub-agent execution.
        sub_index: ordinal of the sub-agent.
        output_base_dir: output dir in sub mode (same dir as the parent).
        parent_exec_id: parent execution ID for sub-agents.
        force: re-extract even if already indexed.

    Returns:
        The output directory path, or None (failed / already indexed).
    """
    exec_id = execution.get("executionId", "")
    chat_id = execution.get("chatSessionId", "")
    # Skip executions already indexed whose files still exist (sub-agents are
    # not checked: they always follow their parent execution). `force`
    # bypasses this check (used by --all re-extraction).
    if not sub_mode and not force and exec_id in index.get("entries", {}):
        existing_entry = index["entries"][exec_id]
        # superseded/no_log placeholder entries: always skip, never retry
        if existing_entry.get("superseded_by") or existing_entry.get("no_log"):
            return None
        existing_dir = existing_entry.get("output_dir", "")
        if existing_dir and os.path.isdir(existing_dir):
            return None
        # Output files were cleaned up: drop the stale entry and re-extract.
        del index["entries"][exec_id]
    # Load the execution log
    log_path = find_execution_log(agent_dir, hex_dir, execution)
    if not log_path:
        # Tombstone: the log file no longer exists (cleaned up by Kiro);
        # write a placeholder entry so this execution is never retried.
        if not sub_mode and exec_id:
            index.setdefault("entries", {})[exec_id] = {
                "no_log": True,
                "chatSessionId": chat_id,
                "indexed_at": datetime.now(CST).isoformat(),
            }
            if full_index is not None:
                full_index.setdefault("entries", {})[exec_id] = {
                    "no_log": True,
                    "chatSessionId": chat_id,
                    "indexed_at": datetime.now(CST).isoformat(),
                }
        return None
    try:
        with open(log_path, "r", encoding="utf-8") as f:
            log = json.load(f)
    except Exception:
        return None
    # Backfill chatSessionId from the full log (newer manifest entries may
    # omit it).
    if not chat_id:
        chat_id = log.get("chatSessionId", "")
    # Parse the log payload
    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)
    diffs = collect_diffs(agent_dir, hex_dir, execution)
    # Session info (main executions only)
    session_info = None
    if not sub_mode and session_dir and chat_id:
        session_info = load_session_detail(session_dir, chat_id)
    # prompt_log matching (main executions only)
    prompt_log = None
    if not sub_mode:
        start_time = log.get("startTime", 0)
        prompt_log = find_matching_prompt_log(start_time)
    # Execution start time (drives directory/file-name time suffixes)
    _start_ms = log.get("startTime") or execution.get("startTime", 0)
    _ym, _dd, _hms = ts_date_parts(_start_ms)
    # Decide the output directory
    if sub_mode and output_base_dir:
        out_dir = output_base_dir
    else:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        out_dir = _resolve_chat_dir(day_dir, chat_short, _hms)
    os.makedirs(out_dir, exist_ok=True)
    # Cross-day pointer: when the chat directory lives under a different day
    # than this execution, drop a reference file in the execution's day dir.
    if not sub_mode:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        chat_day_dir = os.path.dirname(out_dir)
        exec_day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        if os.path.normpath(chat_day_dir) != os.path.normpath(exec_day_dir):
            _write_cross_day_ref(exec_day_dir, chat_short, out_dir)
    # Recursively extract sub-agents first, collecting sub_file_map
    # (subExecutionId -> file path) for cross-linking in the main record.
    sub_file_map: dict[str, str] = {}
    if not sub_mode:
        sub_exec_ids = extract_sub_execution_ids(actions)
        for si, sub_eid in enumerate(sub_exec_ids, 1):
            sub_execs = find_all_executions(agent_dir, execution_id=sub_eid)
            if sub_execs:
                sub_exec = sub_execs[0]
                extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=sub_exec["_hex_dir"],
                    execution=sub_exec,
                    session_dir=session_dir,
                    index=index,
                    full_index=full_index,
                    sub_mode=True,
                    sub_index=si,
                    output_base_dir=out_dir,
                    parent_exec_id=exec_id,
                    force=force,
                )
                sub_filename = f"sub_{si:02d}_{sub_eid[:8]}.md"
                sub_file_map[sub_eid] = os.path.join(out_dir, sub_filename)
    # ── Overwrite mode: load actions/summary accumulated by earlier rounds ──
    accumulated_actions_md = ""
    prev_summary: dict = {}
    execution_round = 1
    prev_exec_ids: list[str] = []  # older executionIds of the same chatSession
    if not sub_mode:
        # Find an existing main.md for this chatSession
        existing_main_path = ""
        if os.path.isdir(out_dir):
            for f in os.listdir(out_dir):
                if f.startswith("main_") and f.endswith(".md"):
                    existing_main_path = os.path.join(out_dir, f)
                    break  # overwrite mode keeps only one main file
        if existing_main_path:
            # Pull the actions markdown accumulated by previous rounds
            accumulated_actions_md = _load_previous_actions_md(existing_main_path)
        # Look up older entries of the same chatSession to build prev_summary
        if chat_id:
            for eid, ent in list(index.get("entries", {}).items()):
                if ent.get("chatSessionId") == chat_id and eid != exec_id and not ent.get("is_sub"):
                    # Already-superseded entries carry no summary/output_dir
                    if ent.get("superseded_by") or ent.get("no_log"):
                        prev_exec_ids.append(eid)
                        continue
                    prev_exec_ids.append(eid)
                    ent_summary = ent.get("summary", {})
                    # Keep the old entry's description (LLM-generated)
                    if ent.get("description"):
                        ent_summary["description"] = ent["description"]
                    prev_summary = _merge_summaries(prev_summary, ent_summary)
        # Which round this execution is within the chatSession
        execution_round = len(prev_exec_ids) + 1
    # Generate Markdown + structured summary.
    # Compute this round's summary first and merge it with previous rounds,
    # then hand the merged result to generate_full_record for rendering.
    cur_summary = _build_execution_summary(log, conversation, timeline, sub_file_map if not sub_mode else None)
    final_summary = _merge_summaries(prev_summary, cur_summary) if prev_summary else cur_summary
    md, _cur_summary_unused, _display_unused = generate_full_record(
        log=log,
        conversation=conversation,
        timeline=timeline,
        diffs=diffs,
        session_info=session_info,
        prompt_log_path=prompt_log,
        is_sub=sub_mode,
        sub_index=sub_index,
        sub_file_map=sub_file_map if not sub_mode else None,
        accumulated_actions_md=accumulated_actions_md,
        merged_summary=final_summary if prev_summary else None,
        execution_round=execution_round,
    )
    # ── Write the file (overwrite mode: delete old main, write new main) ──
    if sub_mode:
        filename = f"sub_{sub_index:02d}_{exec_id[:8]}.md"
    else:
        # Overwrite mode: remove every stale main_*.md in the directory
        if os.path.isdir(out_dir):
            for f in os.listdir(out_dir):
                if f.startswith("main_") and f.endswith(".md"):
                    try:
                        os.remove(os.path.join(out_dir, f))
                    except Exception:
                        pass
        # Fixed file name: main_01_{first 8 chars of the latest execId}.md
        filename = f"main_01_{exec_id[:8]}.md"
    filepath = os.path.join(out_dir, filename)
    # Round-trip through surrogateescape/replace so unpaired surrogates in
    # the generated markdown cannot make the write fail.
    md_safe = md.encode("utf-8", errors="surrogateescape").decode("utf-8", errors="replace")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(md_safe)
    # Update indexes (overwrite mode: supersede old entries, write new one)
    _meta = {
        "chatSessionId": chat_id,
        "startTime": ts_fmt(log.get("startTime")),
        "endTime": ts_fmt(log.get("endTime")),
        "status": log.get("status", ""),
        "workflowType": log.get("workflowType", ""),
    }
    if not sub_mode:
        update_index(index, exec_id, out_dir, _meta,
                     summary=final_summary,
                     chat_session_id=chat_id,
                     prev_exec_ids=prev_exec_ids)
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta,
                         summary=final_summary,
                         chat_session_id=chat_id,
                         prev_exec_ids=prev_exec_ids)
    else:
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta,
                         summary=cur_summary, is_sub=True,
                         parent_exec_id=parent_exec_id)
    return out_dir
# ═══════════════════════════════════════════════════════════
# 入口函数
# ═══════════════════════════════════════════════════════════
def extract_latest(global_storage: Optional[str] = None, workspace_path: Optional[str] = None):
    """Extract every unindexed terminal execution, plus partial/backfill
    handling (called by the agent_on_stop hook).

    Overwrite mode: executions of the same chatSession are extracted in
    ascending startTime order; each later round overwrites the previous
    round's main.md (conversation uses the latest superset, actions append).

    Partial/backfill:
    1. The current session's execution (possibly still running) is extracted
       with status="partial".
    2. On the next invocation, entries with status="partial" are found and
       re-extracted (overwritten) once they reach a terminal status.
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        return
    index = load_index()
    full_index = load_full_index()
    # ── Step 1: complete entries previously marked as partial ──
    partial_eids = [
        eid for eid, ent in index.get("entries", {}).items()
        if ent.get("status") == "partial"
    ]
    for p_eid in partial_eids:
        # Locate the matching execution in all_execs
        matched = [e for e in all_execs if e.get("executionId") == p_eid]
        if matched:
            p_exec = matched[0]
            # Re-extract (overwrite) once it has reached a terminal status
            if p_exec.get("status", "") in TERMINAL_STATUSES:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=p_exec["_hex_dir"],
                    execution=p_exec,
                    session_dir=session_dir,
                    index=index,
                    full_index=full_index,
                    force=True,  # force overwrite
                )
                if result:
                    print(f"[session-extract] completed partial: {result}")
    # ── Step 2: collect unindexed terminal executions ──
    # Consult both index (main) and full_index (incl. sub-agents) so
    # sub-agent executions are never extracted twice.
    indexed_eids = set(index.get("entries", {}).keys())
    if full_index:
        indexed_eids |= set(full_index.get("entries", {}).keys())
    ready = [e for e in all_execs
             if e.get("executionId", "") not in indexed_eids
             and e.get("status", "") in TERMINAL_STATUSES]
    # ── Step 3: current session's running execution -> extract as partial ──
    # When this hook fires, the current session's execution is usually still
    # running; identify it via the KIRO_CHAT_SESSION_ID env var (if set).
    current_chat_id = os.environ.get("KIRO_CHAT_SESSION_ID", "")
    if current_chat_id:
        running_execs = [
            e for e in all_execs
            if e.get("chatSessionId") == current_chat_id
            and e.get("status") not in TERMINAL_STATUSES
            and e.get("executionId", "") not in indexed_eids
        ]
        for r_exec in running_execs:
            # Tag for partial extraction
            r_exec["_is_partial"] = True
            ready.append(r_exec)
    if not ready:
        if partial_eids:
            save_index(index)
            save_full_index(full_index)
        return
    # Cap per-run workload in the agent_on_stop scenario to avoid processing
    # thousands of historical executions. Prefer the newest (ready comes
    # from find_all_executions ordered by startTime descending).
    MAX_PER_RUN = 50
    if len(ready) > MAX_PER_RUN:
        # Sort by startTime descending, keep the newest MAX_PER_RUN
        ready.sort(key=lambda x: x.get("startTime", 0), reverse=True)
        skipped = len(ready) - MAX_PER_RUN
        ready = ready[:MAX_PER_RUN]
        print(f"[session-extract] capped to {MAX_PER_RUN} most recent (skipped {skipped} older)")
    # Group by chatSessionId; within each group sort by startTime ascending
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in ready:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
    extracted_count = 0
    tombstone_count = 0
    for cid, group_execs in chat_groups.items():
        for execution in group_execs:
            eid = execution.get("executionId", "")
            result = extract_single_execution(
                agent_dir=agent_dir,
                hex_dir=execution["_hex_dir"],
                execution=execution,
                session_dir=session_dir,
                index=index,
                full_index=full_index,
            )
            if result:
                # For a partial extraction, tag the status in the indexes
                if execution.get("_is_partial"):
                    if eid in index.get("entries", {}):
                        index["entries"][eid]["status"] = "partial"
                    if full_index and eid in full_index.get("entries", {}):
                        full_index["entries"][eid]["status"] = "partial"
                extracted_count += 1
                print(f"[session-extract] extracted: {result}")
            elif eid and index.get("entries", {}).get(eid, {}).get("no_log"):
                # A tombstone was written; the index must be persisted
                tombstone_count += 1
    if extracted_count > 0 or partial_eids or tombstone_count > 0:
        save_index(index)
        save_full_index(full_index)
    if extracted_count > 1:
        print(f"[session-extract] total: {extracted_count} executions")
    if tombstone_count > 0:
        print(f"[session-extract] tombstoned: {tombstone_count} (no log found)")
def extract_all_unindexed(
    global_storage: Optional[str] = None,
    workspace_path: Optional[str] = None,
    limit: Optional[int] = None,
    workers: int = 8,
    rebuild: bool = False,
):
    """Extract executions (overwrite mode, multithreaded).

    rebuild=True clears the indexes and rebuilds everything from scratch
    (used when migrating to overwrite mode).
    rebuild=False extracts only unindexed executions (incremental mode).
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}
    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[session-extract] no executions found")
        return
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
    if rebuild:
        # Full rebuild: reset indexes, process every terminal execution
        print("[session-extract] REBUILD mode: clearing indexes, reprocessing all")
        index = {"version": 2, "entries": {}}
        full_index = {"version": 2, "entries": {}}
        # Old main_*.md files (sub_*.md and other files are kept) are simply
        # overwritten inside extract_single_execution; no pre-cleanup needed.
        todo = [e for e in all_execs
                if e.get("status", "") in TERMINAL_STATUSES]
    else:
        index = load_index()
        full_index = load_full_index()
        todo = [e for e in all_execs
                if e.get("executionId", "") not in index.get("entries", {})
                and e.get("status", "") in TERMINAL_STATUSES]
    if limit:
        todo = todo[:limit]
    if not todo:
        print("[session-extract] all indexed, nothing to do")
        return
    print(f"[session-extract] {len(todo)} executions to extract (workers={workers})")
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed
    # Group by chatSessionId; each group is extracted serially in startTime
    # order so that, in overwrite mode, actions are appended chronologically.
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in todo:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
    lock = threading.Lock()
    count = 0
    def _extract_group(group_execs):
        """Serially extract every execution of one chatSession (overwrite mode)."""
        local_index = {"version": 2, "entries": {}}
        local_full = {"version": 2, "entries": {}}
        results = []
        for execution in group_execs:
            try:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=execution["_hex_dir"],
                    execution=execution,
                    session_dir=session_dir,
                    index=local_index,
                    full_index=local_full,
                    force=rebuild,
                )
                if result:
                    results.append(result)
            except Exception as e:
                eid = execution.get("executionId", "?")[:8]
                print(f"[session-extract] ✗ {eid}: {e}")
        return results, local_index.get("entries", {}), local_full.get("entries", {})
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_extract_group, execs): cid
                   for cid, execs in chat_groups.items()}
        for future in as_completed(futures):
            results, idx_entries, full_entries = future.result()
            if results:
                with lock:
                    count += len(results)
                    index["entries"].update(idx_entries)
                    full_index["entries"].update(full_entries)
                    if count % 50 == 0:
                        save_index(index)
                        save_full_index(full_index)
                        print(f"[session-extract] [{count}/{len(todo)}] checkpoint saved")
                    elif count % 10 == 0:
                        print(f"[session-extract] [{count}/{len(todo)}]")
    # Post-pass dedup: multithreaded merging can still leave several entries
    # with the same chatSessionId (per-group serialization prevents
    # intra-group duplicates, but double-submit scenarios can introduce them).
    def _dedup_index_entries(idx: dict) -> int:
        """Dedup by chatSessionId, keeping the entry with the latest
        startTime and merging summaries. Returns the number removed."""
        entries = idx.get("entries", {})
        chat_groups_dedup: dict[str, list[tuple[str, dict]]] = defaultdict(list)
        keep: dict[str, dict] = {}
        for eid, ent in entries.items():
            if ent.get("is_sub"):
                keep[eid] = ent
                continue
            cid = ent.get("chatSessionId", "")
            if not cid:
                keep[eid] = ent
                continue
            chat_groups_dedup[cid].append((eid, ent))
        removed = 0
        for cid, group in chat_groups_dedup.items():
            if len(group) == 1:
                keep[group[0][0]] = group[0][1]
                continue
            group.sort(key=lambda x: x[1].get("startTime", ""))
            merged_summary: dict = {}
            for eid, ent in group:
                merged_summary = _merge_summaries(merged_summary, ent.get("summary", {}))
            last_eid, last_ent = group[-1]
            last_ent["summary"] = merged_summary
            # Carry over the most recent LLM-generated description, if any
            for eid, ent in reversed(group):
                if ent.get("description"):
                    last_ent["description"] = ent["description"]
                    break
            keep[last_eid] = last_ent
            removed += len(group) - 1
        idx["entries"] = keep
        return removed
    removed_main = _dedup_index_entries(index)
    removed_full = _dedup_index_entries(full_index)
    if removed_main or removed_full:
        print(f"[session-extract] dedup: removed {removed_main} main / {removed_full} full duplicates")
    # Final save
    save_index(index)
    save_full_index(full_index)
    print(f"[session-extract] done, extracted {count}/{len(todo)}, final entries: {len(index.get('entries', {}))}")
def extract_by_id(
    execution_id: str,
    global_storage: Optional[str] = None,
):
    """Extract the execution matching the given executionId (prefix match)."""
    storage = global_storage or DEFAULT_GLOBAL_STORAGE
    agent_dir = find_kiro_agent_dir(storage)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    candidates = find_all_executions(agent_dir, execution_id=execution_id)
    # find_all_executions matches by prefix; verify the first hit really
    # starts with the requested id before extracting anything.
    if not candidates or not candidates[0].get("executionId", "").startswith(execution_id):
        print(f"[session-extract] execution not found: {execution_id}")
        return
    target = candidates[0]
    index = load_index()
    full_index = load_full_index()
    out_dir = extract_single_execution(
        agent_dir=agent_dir,
        hex_dir=target["_hex_dir"],
        execution=target,
        session_dir=None,
        index=index,
        full_index=full_index,
        force=True,  # an explicit id always forces extraction
    )
    if out_dir:
        save_index(index)
        save_full_index(full_index)
        print(f"[session-extract] extracted: {out_dir}")
# ═══════════════════════════════════════════════════════════
# CLI 入口
# ═══════════════════════════════════════════════════════════
def main():
    """CLI entry point: parse arguments and dispatch to an extraction mode."""
    import argparse
    parser = argparse.ArgumentParser(description="Kiro 执行日志全量提取器 v3覆盖模式")
    parser.add_argument("--all", action="store_true", help="提取所有未索引的 execution")
    parser.add_argument("--rebuild", action="store_true", help="全量重建:清空索引,重新提取所有 execution覆盖模式迁移用")
    parser.add_argument("--recent", type=int, metavar="N", help="提取最近 N 条未索引的")
    parser.add_argument("--workers", type=int, default=8, help="并行线程数(默认 8")
    parser.add_argument("--execution-id", type=str, help="提取指定 executionId")
    parser.add_argument("--global-storage", type=str, help="globalStorage 路径")
    parser.add_argument("--workspace", type=str, help="workspace 路径")
    opts = parser.parse_args()
    storage = opts.global_storage
    workspace = opts.workspace
    # Dispatch with early returns: explicit id > rebuild > all > recent > latest.
    if opts.execution_id:
        extract_by_id(opts.execution_id, global_storage=storage)
        return
    if opts.rebuild:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              workers=opts.workers, rebuild=True)
        return
    if opts.all:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              workers=opts.workers)
        return
    if opts.recent:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              limit=opts.recent, workers=opts.workers)
        return
    extract_latest(global_storage=storage, workspace_path=workspace)
# Standard script guard: run the CLI only when executed directly.
if __name__ == "__main__":
    main()