"""Export the conversation content of every execution so an external LLM can
batch-generate summaries.

One file is exported per execution, containing:
- the execution ID
- each round of user input and assistant output
- the list of file changes

Output directory: export/session_summaries/
Output format:    {exec_id_short}.txt

Usage:
    python -B scripts/ops/export_session_conversations.py
    python -B scripts/ops/export_session_conversations.py --limit 50
    python -B scripts/ops/export_session_conversations.py --output-dir /path/to/dir
"""

import json
import os
import sys
from pathlib import Path

# Add scripts/ops to sys.path so the sibling helper modules below resolve.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from _env_paths import ensure_repo_root

# Must run before importing extract_kiro_session: chdir/paths depend on it.
ensure_repo_root()

from extract_kiro_session import (
    find_kiro_agent_dir,
    find_workspace_session_dir,
    find_all_executions,
    find_execution_log,
    load_sessions_json,
    parse_messages,
    parse_actions,
    DEFAULT_GLOBAL_STORAGE,
)

# Default output directory (relative to the repo root) for exported transcripts.
OUTPUT_DIR = Path("export/session_summaries")
def extract_conversation_text(log: dict) -> dict:
    """Extract the conversation text from an execution log.

    Args:
        log: Parsed execution-log JSON; reads ``context.messages``,
            ``actions``, and top-level metadata keys.

    Returns:
        {
            "exec_id": str,
            "workflow": str,
            "status": str,
            "start_time": str,
            "rounds": [{"role": "user"|"assistant", "text": str}, ...],
            "files_modified": [str],
            "files_created": [str],
        }
    """
    # Hoisted from the bottom of the function; deferred import mirrors the
    # other extract_kiro_session helpers loaded at module level.
    from extract_kiro_session import ts_fmt

    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)

    rounds = []
    for msg in conversation:
        role = msg.get("role", "")
        if role not in ("human", "bot"):
            continue

        texts = []
        for entry in msg.get("entries", []):
            et = entry.get("type", "")
            if et == "text":
                t = entry.get("text", "").strip()
                if t:
                    # Skip auto-injected steering / system-prompt text.
                    if t.startswith("## Included Rules") or t.startswith("<steering"):
                        continue
                    texts.append(t)
            elif et == "toolUse":
                # Record only the tool name, not its arguments (saves tokens).
                texts.append(f"[调用工具: {entry.get('name', '?')}]")
            elif et == "toolUseResponse":
                ok = "成功" if entry.get("success") else "失败"
                texts.append(f"[工具结果: {entry.get('name', '?')} {ok}]")

        if texts:
            combined = "\n".join(texts)
            # Truncate overly long single messages (saves tokens).
            if len(combined) > 2000:
                combined = combined[:2000] + "\n[...截断...]"
            rounds.append({
                "role": "user" if role == "human" else "assistant",
                "text": combined,
            })

    # File changes: a step carrying an "original" snapshot modified an
    # existing file; otherwise the file was newly created.
    files_modified = []
    files_created = []
    for step in timeline:
        fc = step.get("_file_change")
        if fc:
            fname = fc.get("file", "?")
            if fc.get("original"):
                if fname not in files_modified:
                    files_modified.append(fname)
            else:
                if fname not in files_created:
                    files_created.append(fname)

    return {
        "exec_id": log.get("executionId", "?"),
        "workflow": log.get("workflowType", "?"),
        "status": log.get("status", "?"),
        "start_time": ts_fmt(log.get("startTime")),
        "rounds": rounds,
        "files_modified": files_modified,
        "files_created": files_created,
    }
def render_conversation_file(data: dict) -> str:
    """Render one extracted conversation as plain text for LLM consumption.

    ``data`` is the dict produced by ``extract_conversation_text``; the
    result is a header section, a ``---`` separator, then one labelled
    section per round, terminated by ``---END---``.
    """
    lines = [
        f"EXECUTION_ID: {data['exec_id']}",
        f"WORKFLOW: {data['workflow']}",
        f"STATUS: {data['status']}",
        f"START_TIME: {data['start_time']}",
    ]

    # File-change lines appear only when non-empty; list at most 20 names.
    if data["files_modified"]:
        lines.append(f"FILES_MODIFIED: {', '.join(data['files_modified'][:20])}")
    if data["files_created"]:
        lines.append(f"FILES_CREATED: {', '.join(data['files_created'][:20])}")

    lines.append(f"ROUNDS: {len(data['rounds'])}")
    lines.append("---")

    for idx, turn in enumerate(data["rounds"], 1):
        label = "USER" if turn["role"] == "user" else "ASSISTANT"
        lines.append(f"\n[{label} #{idx}]")
        lines.append(turn["text"])

    lines.append("\n---END---")
    return "\n".join(lines)
def main():
    """Export per-execution conversation transcripts plus a manifest file.

    Walks every execution visible in the Kiro agent storage for the current
    workspace, writes one ``{exec_id[:8]}.txt`` per execution with chat
    content, then writes ``_manifest.json`` listing all exported IDs.
    """
    import argparse
    parser = argparse.ArgumentParser(description="导出 session 对话内容供 LLM 批量总结")
    parser.add_argument("--limit", type=int, help="最多导出 N 条")
    parser.add_argument("--output-dir", type=str, help="输出目录")
    parser.add_argument("--skip-existing", action="store_true", help="跳过已导出的")
    args = parser.parse_args()

    out_dir = Path(args.output_dir) if args.output_dir else OUTPUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)

    gs = DEFAULT_GLOBAL_STORAGE
    ws = os.getcwd()

    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[export] kiro.kiroagent dir not found")
        return

    # Restrict to this workspace's chat sessions when a session dir exists;
    # chat_ids stays None otherwise (= no filtering).
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}

    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[export] no executions found")
        return

    count = 0
    skipped = 0
    errors = 0

    for execution in all_execs:
        eid = execution.get("executionId", "")
        short = eid[:8]

        if args.skip_existing:
            out_file = out_dir / f"{short}.txt"
            if out_file.exists():
                skipped += 1
                continue

        log_path = find_execution_log(agent_dir, execution["_hex_dir"], execution)
        if not log_path:
            continue

        try:
            with open(log_path, "r", encoding="utf-8") as f:
                log = json.load(f)
        except Exception:
            # Unreadable/corrupt log: count it and move on (best-effort export).
            errors += 1
            continue

        # Skip sub-agent executions: they have no independent user input.
        if log.get("workflowType") in ("sub-agent",):
            continue

        data = extract_conversation_text(log)

        # Skip executions with no conversation content.
        if not data["rounds"]:
            continue

        out_file = out_dir / f"{short}.txt"
        out_file.write_text(render_conversation_file(data), encoding="utf-8")
        count += 1

        if count % 50 == 0:
            print(f"[export] {count} exported...")

        if args.limit and count >= args.limit:
            break

    # Write the manifest (ID list, used later to match imported summaries).
    # Built from all .txt files present, not just this run's exports.
    manifest = out_dir / "_manifest.json"
    existing = sorted([f.stem for f in out_dir.glob("*.txt") if f.stem != "_manifest"])
    manifest.write_text(
        json.dumps({"count": len(existing), "ids": existing}, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8"
    )

    print(f"[export] done: {count} exported, {skipped} skipped, {errors} errors")
    print(f"[export] output: {out_dir}")
    print(f"[export] manifest: {manifest}")
    print("\n下一步:")
    print(f" 1. 用 LLM 处理 {out_dir}/*.txt,为每个文件生成 50-200 字中文摘要")
    print(f" 2. 将结果写入 {out_dir}/_summaries.json,格式:")
    print('    {"exec_id_short": "摘要文本", ...}')
    print(" 3. 运行导入脚本将摘要写入索引")


if __name__ == "__main__":
    main()