# Source: Neo-ZQYY/scripts/ops/export_session_conversations.py
"""导出所有 execution 的对话内容,供外部 LLM 批量生成摘要。
每个 execution 导出一个文件,包含:
- execution ID
- 每轮 user input 和 assistant output
- 文件变更列表
输出目录: export/session_summaries/
输出格式: {exec_id_short}.txt
用法:
python -B scripts/ops/export_session_conversations.py
python -B scripts/ops/export_session_conversations.py --limit 50
python -B scripts/ops/export_session_conversations.py --output-dir /path/to/dir
"""
import json
import os
import sys
from pathlib import Path
# 添加 scripts/ops 到 path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _env_paths import ensure_repo_root
ensure_repo_root()
from extract_kiro_session import (
find_kiro_agent_dir,
find_workspace_session_dir,
find_all_executions,
find_execution_log,
load_sessions_json,
parse_messages,
parse_actions,
DEFAULT_GLOBAL_STORAGE,
)
OUTPUT_DIR = Path("export/session_summaries")
def extract_conversation_text(log: dict, *, max_chars: int = 2000) -> dict:
    """Extract readable conversation text from an execution log.

    Args:
        log: Parsed execution-log JSON dict.
        max_chars: Per-message length cap; longer combined messages are
            truncated to save tokens for downstream LLM summarization
            (default 2000, matching the original hard-coded limit).

    Returns:
        {
            "exec_id": str,
            "workflow": str,
            "status": str,
            "start_time": str,
            "rounds": [{"role": "user"|"assistant", "text": str}, ...],
            "files_modified": [str],
            "files_created": [str],
        }
    """
    # Hoisted from mid-function in the original; kept as a local import
    # because the module-level import list is defined elsewhere in the file.
    from extract_kiro_session import ts_fmt

    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)

    rounds = []
    for msg in conversation:
        role = msg.get("role", "")
        if role not in ("human", "bot"):
            continue
        texts = []
        for entry in msg.get("entries", []):
            et = entry.get("type", "")
            if et == "text":
                t = entry.get("text", "").strip()
                # Skip auto-injected steering/system prompt fragments.
                if t and not (t.startswith("## Included Rules")
                              or t.startswith("<steering")):
                    texts.append(t)
            elif et == "toolUse":
                # Record only the tool name, not its arguments (saves tokens).
                texts.append(f"[调用工具: {entry.get('name', '?')}]")
            elif et == "toolUseResponse":
                ok = "成功" if entry.get("success") else "失败"
                texts.append(f"[工具结果: {entry.get('name', '?')} {ok}]")
        if texts:
            combined = "\n".join(texts)
            # Truncate overly long single messages (saves tokens).
            if len(combined) > max_chars:
                combined = combined[:max_chars] + "\n[...截断...]"
            rounds.append({
                "role": "user" if role == "human" else "assistant",
                "text": combined,
            })

    # File changes: a non-empty "original" marks a modification, otherwise
    # a creation. Ordered dicts replace the original's O(n^2)
    # `name not in list` dedup while preserving first-seen order.
    modified: dict = {}
    created: dict = {}
    for step in timeline:
        fc = step.get("_file_change")
        if not fc:
            continue
        fname = fc.get("file", "?")
        if fc.get("original"):
            modified.setdefault(fname, None)
        else:
            created.setdefault(fname, None)

    return {
        "exec_id": log.get("executionId", "?"),
        "workflow": log.get("workflowType", "?"),
        "status": log.get("status", "?"),
        "start_time": ts_fmt(log.get("startTime")),
        "rounds": rounds,
        "files_modified": list(modified),
        "files_created": list(created),
    }
def render_conversation_file(data: dict) -> str:
    """Render extracted conversation data as plain text for LLM consumption.

    Args:
        data: Mapping produced by ``extract_conversation_text``.

    Returns:
        A newline-joined document: a metadata header (file lists capped at
        20 entries each), a ``---`` separator, numbered conversation
        rounds, and an ``---END---`` trailer.
    """
    lines = [
        f"EXECUTION_ID: {data['exec_id']}",
        f"WORKFLOW: {data['workflow']}",
        f"STATUS: {data['status']}",
        f"START_TIME: {data['start_time']}",
    ]
    if data["files_modified"]:
        lines.append(f"FILES_MODIFIED: {', '.join(data['files_modified'][:20])}")
    if data["files_created"]:
        lines.append(f"FILES_CREATED: {', '.join(data['files_created'][:20])}")
    lines.append(f"ROUNDS: {len(data['rounds'])}")
    lines.append("---")
    for idx, rnd in enumerate(data["rounds"], start=1):
        label = "ASSISTANT" if rnd["role"] != "user" else "USER"
        lines.append(f"\n[{label} #{idx}]")
        lines.append(rnd["text"])
    lines.append("\n---END---")
    return "\n".join(lines)
def main():
    """Export conversations for all matching executions to text files.

    Flags:
        --limit N         export at most N executions
        --output-dir DIR  override the default output directory
        --skip-existing   skip executions that already have an export file

    Side effects: creates the output directory, writes one ``.txt`` per
    exported execution plus a ``_manifest.json`` listing all exported IDs.
    """
    import argparse
    parser = argparse.ArgumentParser(description="导出 session 对话内容供 LLM 批量总结")
    parser.add_argument("--limit", type=int, help="最多导出 N 条")
    parser.add_argument("--output-dir", type=str, help="输出目录")
    parser.add_argument("--skip-existing", action="store_true", help="跳过已导出的")
    args = parser.parse_args()

    out_dir = Path(args.output_dir) if args.output_dir else OUTPUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)

    gs = DEFAULT_GLOBAL_STORAGE
    ws = os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[export] kiro.kiroagent dir not found")
        return

    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}

    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[export] no executions found")
        return

    # Only export top-level executions; sub-agent executions usually
    # carry no independent user input.
    count = 0
    skipped = 0
    errors = 0
    for execution in all_execs:
        eid = execution.get("executionId", "")
        short = eid[:8]
        # Computed once up front (the original built the same path twice).
        out_file = out_dir / f"{short}.txt"
        if args.skip_existing and out_file.exists():
            skipped += 1
            continue
        log_path = find_execution_log(agent_dir, execution["_hex_dir"], execution)
        if not log_path:
            continue
        try:
            with open(log_path, "r", encoding="utf-8") as f:
                log = json.load(f)
        # Narrowed from bare `except Exception`: ValueError covers both
        # JSONDecodeError and UnicodeDecodeError; OSError covers read failures.
        except (OSError, ValueError):
            errors += 1
            continue
        # Skip sub-agent executions (no independent user input).
        if log.get("workflowType") in ("sub-agent",):
            continue
        data = extract_conversation_text(log)
        # Skip executions with no conversational content.
        if not data["rounds"]:
            continue
        out_file.write_text(render_conversation_file(data), encoding="utf-8")
        count += 1
        if count % 50 == 0:
            print(f"[export] {count} exported...")
        if args.limit and count >= args.limit:
            break

    # Manifest of exported IDs, used later to match imported summaries.
    manifest = out_dir / "_manifest.json"
    existing = sorted(f.stem for f in out_dir.glob("*.txt") if f.stem != "_manifest")
    manifest.write_text(
        json.dumps({"count": len(existing), "ids": existing}, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )

    print(f"[export] done: {count} exported, {skipped} skipped, {errors} errors")
    print(f"[export] output: {out_dir}")
    print(f"[export] manifest: {manifest}")
    # Pointless f-prefixes removed below (no placeholders); output bytes
    # are identical to the original.
    print("\n下一步:")
    print(f" 1. 用 LLM 处理 {out_dir}/*.txt为每个文件生成 50-200 字中文摘要")
    print(f" 2. 将结果写入 {out_dir}/_summaries.json格式")
    print(' {"exec_id_short": "摘要文本", ...}')
    print(" 3. 运行导入脚本将摘要写入索引")


if __name__ == "__main__":
    main()