# Source: Neo-ZQYY/scripts/ops/export_session_conversations.py
"""导出所有 execution 的对话内容,供外部 LLM 批量生成摘要。
每个 execution 导出一个文件,包含:
- execution ID
- 每轮 user input 和 assistant output
- 文件变更列表
输出目录: export/session_summaries/
输出格式: {exec_id_short}.txt
用法:
python -B scripts/ops/export_session_conversations.py
python -B scripts/ops/export_session_conversations.py --limit 50
python -B scripts/ops/export_session_conversations.py --output-dir /path/to/dir
"""
import json
import os
import sys
from pathlib import Path
# 添加 scripts/ops 到 path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from _env_paths import ensure_repo_root
ensure_repo_root()
from extract_kiro_session import (
find_kiro_agent_dir,
find_workspace_session_dir,
find_all_executions,
find_execution_log,
load_sessions_json,
parse_messages,
parse_actions,
DEFAULT_GLOBAL_STORAGE,
)
OUTPUT_DIR = Path("export/session_summaries")
def extract_conversation_text(log: dict, *, max_chars: int = 2000) -> dict:
    """Extract readable conversation text from an execution log.

    Args:
        log: Parsed execution-log JSON dict.
        max_chars: Per-message length cap; longer combined messages are
            truncated to save tokens for downstream LLM summarization
            (default 2000, matching the original hard-coded limit).

    Returns:
        {
            "exec_id": str,
            "workflow": str,
            "status": str,
            "start_time": str,
            "rounds": [{"role": "user"|"assistant", "text": str}, ...],
            "files_modified": [str],
            "files_created": [str],
        }
    """
    # Hoisted from mid-function in the original; kept as a local import
    # because the module-level import list is defined elsewhere in the file.
    from extract_kiro_session import ts_fmt

    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)

    rounds = []
    for msg in conversation:
        role = msg.get("role", "")
        if role not in ("human", "bot"):
            continue
        texts = []
        for entry in msg.get("entries", []):
            et = entry.get("type", "")
            if et == "text":
                t = entry.get("text", "").strip()
                # Skip auto-injected steering/system prompt fragments.
                if t and not (t.startswith("## Included Rules")
                              or t.startswith("<steering")):
                    texts.append(t)
            elif et == "toolUse":
                # Record only the tool name, not its arguments (saves tokens).
                texts.append(f"[调用工具: {entry.get('name', '?')}]")
            elif et == "toolUseResponse":
                ok = "成功" if entry.get("success") else "失败"
                texts.append(f"[工具结果: {entry.get('name', '?')} {ok}]")
        if texts:
            combined = "\n".join(texts)
            # Truncate overly long single messages (saves tokens).
            if len(combined) > max_chars:
                combined = combined[:max_chars] + "\n[...截断...]"
            rounds.append({
                "role": "user" if role == "human" else "assistant",
                "text": combined,
            })

    # File changes: a non-empty "original" marks a modification, otherwise
    # a creation. Ordered dicts replace the original's O(n^2)
    # `name not in list` dedup while preserving first-seen order.
    modified: dict = {}
    created: dict = {}
    for step in timeline:
        fc = step.get("_file_change")
        if not fc:
            continue
        fname = fc.get("file", "?")
        if fc.get("original"):
            modified.setdefault(fname, None)
        else:
            created.setdefault(fname, None)

    return {
        "exec_id": log.get("executionId", "?"),
        "workflow": log.get("workflowType", "?"),
        "status": log.get("status", "?"),
        "start_time": ts_fmt(log.get("startTime")),
        "rounds": rounds,
        "files_modified": list(modified),
        "files_created": list(created),
    }
def render_conversation_file(data: dict) -> str:
    """Render extracted conversation data as plain text for LLM consumption.

    Args:
        data: Mapping produced by ``extract_conversation_text``.

    Returns:
        A newline-joined document: a metadata header (file lists capped at
        20 entries each), a ``---`` separator, numbered conversation
        rounds, and an ``---END---`` trailer.
    """
    lines = [
        f"EXECUTION_ID: {data['exec_id']}",
        f"WORKFLOW: {data['workflow']}",
        f"STATUS: {data['status']}",
        f"START_TIME: {data['start_time']}",
    ]
    if data["files_modified"]:
        lines.append(f"FILES_MODIFIED: {', '.join(data['files_modified'][:20])}")
    if data["files_created"]:
        lines.append(f"FILES_CREATED: {', '.join(data['files_created'][:20])}")
    lines.append(f"ROUNDS: {len(data['rounds'])}")
    lines.append("---")
    for idx, rnd in enumerate(data["rounds"], start=1):
        label = "ASSISTANT" if rnd["role"] != "user" else "USER"
        lines.append(f"\n[{label} #{idx}]")
        lines.append(rnd["text"])
    lines.append("\n---END---")
    return "\n".join(lines)
def main():
    """Export conversations for all matching executions to text files.

    Flags:
        --limit N         export at most N executions
        --output-dir DIR  override the default output directory
        --skip-existing   skip executions that already have an export file

    Side effects: creates the output directory, writes one ``.txt`` per
    exported execution plus a ``_manifest.json`` listing all exported IDs.
    """
    import argparse
    parser = argparse.ArgumentParser(description="导出 session 对话内容供 LLM 批量总结")
    parser.add_argument("--limit", type=int, help="最多导出 N 条")
    parser.add_argument("--output-dir", type=str, help="输出目录")
    parser.add_argument("--skip-existing", action="store_true", help="跳过已导出的")
    args = parser.parse_args()

    out_dir = Path(args.output_dir) if args.output_dir else OUTPUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)

    gs = DEFAULT_GLOBAL_STORAGE
    ws = os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[export] kiro.kiroagent dir not found")
        return

    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}

    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[export] no executions found")
        return

    # Only export top-level executions; sub-agent executions usually
    # carry no independent user input.
    count = 0
    skipped = 0
    errors = 0
    for execution in all_execs:
        eid = execution.get("executionId", "")
        short = eid[:8]
        # Computed once up front (the original built the same path twice).
        out_file = out_dir / f"{short}.txt"
        if args.skip_existing and out_file.exists():
            skipped += 1
            continue
        log_path = find_execution_log(agent_dir, execution["_hex_dir"], execution)
        if not log_path:
            continue
        try:
            with open(log_path, "r", encoding="utf-8") as f:
                log = json.load(f)
        # Narrowed from bare `except Exception`: ValueError covers both
        # JSONDecodeError and UnicodeDecodeError; OSError covers read failures.
        except (OSError, ValueError):
            errors += 1
            continue
        # Skip sub-agent executions (no independent user input).
        if log.get("workflowType") in ("sub-agent",):
            continue
        data = extract_conversation_text(log)
        # Skip executions with no conversational content.
        if not data["rounds"]:
            continue
        out_file.write_text(render_conversation_file(data), encoding="utf-8")
        count += 1
        if count % 50 == 0:
            print(f"[export] {count} exported...")
        if args.limit and count >= args.limit:
            break

    # Manifest of exported IDs, used later to match imported summaries.
    manifest = out_dir / "_manifest.json"
    existing = sorted(f.stem for f in out_dir.glob("*.txt") if f.stem != "_manifest")
    manifest.write_text(
        json.dumps({"count": len(existing), "ids": existing}, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )

    print(f"[export] done: {count} exported, {skipped} skipped, {errors} errors")
    print(f"[export] output: {out_dir}")
    print(f"[export] manifest: {manifest}")
    # Pointless f-prefixes removed below (no placeholders); output bytes
    # are identical to the original.
    print("\n下一步:")
    print(f" 1. 用 LLM 处理 {out_dir}/*.txt为每个文件生成 50-200 字中文摘要")
    print(f" 2. 将结果写入 {out_dir}/_summaries.json格式")
    print(' {"exec_id_short": "摘要文本", ...}')
    print(" 3. 运行导入脚本将摘要写入索引")


if __name__ == "__main__":
    main()