Files
Neo-ZQYY/scripts/ops/extract_kiro_session.py.bak

1896 lines
76 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""extract_kiro_session — Kiro 执行日志全量提取器 v2。
改进点(相比 v1
1. 系统提示词去重:首次保存到 _system_prompts/sp_{hash8}.md后续引用
2. 目录分层YYYY-MM/DD/{chatSessionId_short}/ 下按对话组织
3. 子代理递归提取:主 execution + 子 execution 放同一目录,按调用顺序编号
4. ID 替换kiro-diff URI → 真实文件路径terminalId → 进程描述
5. CONTEXT TRANSFER 中的 steering 内容折叠引用
6. 无内容的 model action 压缩为一行
用法:
python scripts/ops/extract_kiro_session.py # 提取最新 execution
python scripts/ops/extract_kiro_session.py --all # 提取所有未索引的
python scripts/ops/extract_kiro_session.py --recent 20 # 提取最近 N 个未索引的
python scripts/ops/extract_kiro_session.py --execution-id XX # 提取指定 execution
"""
import base64
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone, timedelta
from typing import Optional
from _env_paths import ensure_repo_root
ensure_repo_root()
CST = timezone(timedelta(hours=8))
# Kiro 固定的 execution manifest 文件名
MANIFEST_FILENAME = "f62de366d0006e17ea00a01f6624aabf"
# 输出路径
SESSION_LOG_DIR = os.path.join("docs", "audit", "session_logs")
INDEX_PATH = os.path.join(SESSION_LOG_DIR, "_session_index.json") # 精简版:仅主对话
INDEX_FULL_PATH = os.path.join(SESSION_LOG_DIR, "_session_index_full.json") # 完整版:主对话 + 子代理
SYSTEM_PROMPTS_DIR = os.path.join(SESSION_LOG_DIR, "_system_prompts")
# globalStorage 默认路径
DEFAULT_GLOBAL_STORAGE = os.path.join(
os.environ.get("APPDATA", ""),
"Kiro", "User", "globalStorage"
)
# ═══════════════════════════════════════════════════════════
# 工具函数
# ═══════════════════════════════════════════════════════════
def ts_fmt(ms) -> str:
    """Render an epoch-millisecond timestamp as 'YYYY-MM-DD HH:MM:SS' in CST.

    Falsy input yields "N/A"; unconvertible input falls back to str(ms).
    """
    if not ms:
        return "N/A"
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
        return moment.strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return str(ms)
def ts_iso(ms) -> str:
    """Epoch-ms → ISO-8601 string in CST; empty string for falsy or bad input."""
    if not ms:
        return ""
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
        return moment.isoformat()
    except Exception:
        return ""
def ts_date_parts(ms) -> tuple[str, str, str]:
    """Return (YYYY-MM, DD, HHMMSS) in CST for directory layout / file naming.

    Any conversion failure yields the sentinel ("unknown", "00", "000000").
    """
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
    except Exception:
        return "unknown", "00", "000000"
    return moment.strftime("%Y-%m"), moment.strftime("%d"), moment.strftime("%H%M%S")
def _resolve_chat_dir(day_dir: str, chat_short: str, first_hms: str) -> str:
    """Pick (or create) the per-chatSession output directory, numbered per day.

    Resolution order:
      1. Look in day_dir for an existing directory of the same chatSession
         (later rounds of a conversation on the same day).
      2. Search the whole SESSION_LOG_DIR tree for the cross-day case
         (the chatSession's first round happened on another date).
      3. Otherwise allocate the next two-digit sequence number in day_dir.
    Directory name format: {seq:02d}_{chat_short}_{first_hms}/
    """
    os.makedirs(day_dir, exist_ok=True)
    # 1. Same-day lookup: any existing dir whose name contains the chat id.
    for d in os.listdir(day_dir):
        if os.path.isdir(os.path.join(day_dir, d)) and chat_short in d:
            return os.path.join(day_dir, d)
    # 2. Cross-day search: walk every YYYY-MM/DD/ dir under the log root
    #    (underscore-prefixed dirs like _system_prompts are skipped).
    log_root = SESSION_LOG_DIR
    if os.path.isdir(log_root):
        for ym in os.listdir(log_root):
            ym_path = os.path.join(log_root, ym)
            if not os.path.isdir(ym_path) or ym.startswith("_"):
                continue
            for dd in os.listdir(ym_path):
                dd_path = os.path.join(ym_path, dd)
                if not os.path.isdir(dd_path):
                    continue
                for d in os.listdir(dd_path):
                    if os.path.isdir(os.path.join(dd_path, d)) and chat_short in d:
                        return os.path.join(dd_path, d)
    # 3. New chatSession: allocate one past the highest two-digit prefix
    #    already present in day_dir.
    existing_seqs = []
    for d in os.listdir(day_dir):
        if os.path.isdir(os.path.join(day_dir, d)) and len(d) >= 2 and d[:2].isdigit():
            existing_seqs.append(int(d[:2]))
    next_seq = max(existing_seqs, default=0) + 1
    new_dir = os.path.join(day_dir, f"{next_seq:02d}_{chat_short}_{first_hms}")
    os.makedirs(new_dir, exist_ok=True)
    return new_dir
def _write_cross_day_ref(exec_day_dir: str, chat_short: str, chat_dir: str):
"""在 execution 所在日期目录下生成跨天指引文件。
当一个 chatSession 跨天时,后续日期的 day_dir 下不会有该对话的目录,
生成 _ref_{chatShort}.md 告知该对话归在哪个目录。
"""
os.makedirs(exec_day_dir, exist_ok=True)
ref_path = os.path.join(exec_day_dir, f"_ref_{chat_short}.md")
if os.path.isfile(ref_path):
return # 已存在,不重复写
rel_target = os.path.relpath(chat_dir, exec_day_dir).replace("\\", "/")
with open(ref_path, "w", encoding="utf-8") as f:
f.write(f"# 跨天对话指引\n\n")
f.write(f"chatSession `{chat_short}` 的完整记录归档在:\n\n")
f.write(f"→ `{rel_target}`\n\n")
f.write(f"(绝对路径:`{chat_dir.replace(chr(92), '/')}`\n")
def trunc(s, n=3000) -> str:
    """Clamp string s to n chars, appending a truncation note; non-str → str(s)."""
    if not isinstance(s, str):
        return str(s)
    if len(s) <= n:
        return s
    return s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
def safe_json(obj, n=5000) -> str:
    """Pretty-print obj as JSON (non-ASCII kept); str() fallback; clamp to n chars."""
    try:
        rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception:
        rendered = str(obj)
    if len(rendered) <= n:
        return rendered
    return rendered[:n] + f"\n... [截断,原文共 {len(rendered)} 字符]"
def fence(content: str, lang: str = "") -> str:
    """Build a Markdown code fence that safely wraps arbitrary content.

    - Scans content for its longest run of backticks and uses one more
      backtick for the outer fence, so inner fences cannot close it.
    - Escapes line-leading '#' so lines are not parsed as headings.
    - If the content itself contains unclosed fences (truncated raw data),
      closing fences are appended before wrapping.
    """
    if not content:
        return f"```{lang}\n```"
    # Repair unclosed fences inside the content (caused by truncated raw data).
    fence_stack = []
    for line in content.split("\n"):
        stripped = line.strip()
        m = re.match(r"^(`{3,})", stripped)
        if m:
            ticks = len(m.group(1))
            # Pop only when the line is a bare closing fence whose tick count
            # matches the top of the stack; anything else opens a new fence.
            if fence_stack and fence_stack[-1] == ticks and stripped == '`' * ticks:
                fence_stack.pop()
            else:
                fence_stack.append(ticks)
    # Append closers for every fence still open (innermost first).
    if fence_stack:
        suffix_lines = ['`' * t for t in reversed(fence_stack)]
        content = content + "\n" + "\n".join(suffix_lines)
    # Longest backtick run in the content decides the outer fence width.
    max_ticks = 2
    cur = 0
    for ch in content:
        if ch == '`':
            cur += 1
            if cur > max_ticks:
                max_ticks = cur
        else:
            cur = 0
    outer = '`' * (max_ticks + 1)
    # Escape leading '#' — a zero-width space keeps it from becoming a heading.
    safe = _escape_heading(content)
    return f"{outer}{lang}\n{safe}\n{outer}"
def _escape_heading(text: str) -> str:
"""转义文本中行首的 # 符号,防止被 Markdown 解析为标题。
在 # 前插入零宽空格 (\\u200b)。
"""
lines = text.split('\n')
out = []
for line in lines:
if line.lstrip().startswith('#'):
# 找到第一个 # 的位置,在前面插入零宽空格
idx = 0
while idx < len(line) and line[idx] in (' ', '\t'):
idx += 1
out.append(line[:idx] + '\u200b' + line[idx:])
else:
out.append(line)
return '\n'.join(out)
def hash8(text: str) -> str:
    """First 8 hex chars of the SHA-256 digest of text (UTF-8 encoded)."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return digest[:8]
# ═══════════════════════════════════════════════════════════
# 系统提示词去重
# ═══════════════════════════════════════════════════════════
def save_system_prompt(text: str) -> str:
    """Persist a system prompt under _system_prompts/ and return its filename.

    Files are content-addressed as sp_{hash8}.md, so an identical prompt is
    stored once; later occurrences reuse the existing file.
    """
    h = hash8(text)
    filename = f"sp_{h}.md"
    filepath = os.path.join(SYSTEM_PROMPTS_DIR, filename)
    if not os.path.isfile(filepath):
        os.makedirs(SYSTEM_PROMPTS_DIR, exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"# 系统提示词 (hash: {h})\n\n")
            f.write(text)
    return filename
def is_system_prompt(text: str) -> bool:
    """Heuristic: does text look like a Kiro-injected system prompt?

    Matches <identity> in the first 200 chars, <capabilities> in the first
    500 chars, or a stripped prefix of "You are Kiro".
    """
    if not text:
        return False
    if "<identity>" in text[:200]:
        return True
    if "<capabilities>" in text[:500]:
        return True
    return text.strip().startswith("You are Kiro")
def is_steering_block(text: str) -> bool:
    """True when the text opens with a <steering-reminder> injection."""
    head = text[:100]
    return "<steering-reminder>" in head
# ═══════════════════════════════════════════════════════════
# ID 替换与路径还原
# ═══════════════════════════════════════════════════════════
# kiro-diff URI 模式kiro-diff:/path?commitId=xxx&executionId=yyy
KIRO_DIFF_PATTERN = re.compile(
r'kiro-diff:(/[^?]+)\?commitId(?:%3D|=)([^&]+)&executionId(?:%3D|=)([^"\'}\s]+)'
)
def resolve_kiro_diff_uri(uri: str) -> str:
    """Translate a kiro-diff: URI into '<path> (版本: <commit8>)'.

    Anything that does not match KIRO_DIFF_PATTERN passes through unchanged.
    """
    match = KIRO_DIFF_PATTERN.search(uri)
    if not match:
        return uri
    path, commit = match.group(1), match.group(2)
    return f"{path} (版本: {commit[:8]})"
def resolve_ids_in_text(text: str) -> str:
    """Replace known opaque ID patterns in text with human-readable forms.

    - kiro-diff URIs become '[文件快照] <path> (版本 <commit8>)'
    - 'file:///c%3A/'-style encoded drive prefixes become 'C:/'
    Falsy input → ""; other non-str input → str(text).
    """
    if not text or not isinstance(text, str):
        return str(text) if text else ""

    def _diff_repl(m):
        return f'[文件快照] {m.group(1)} (版本 {m.group(2)[:8]})'

    def _drive_repl(m):
        return f'{m.group(1).upper()}:/'

    text = KIRO_DIFF_PATTERN.sub(_diff_repl, text)
    text = re.sub(r'file:///([a-zA-Z])%3A/', _drive_repl, text)
    return text
def resolve_tool_args(name: str, args: dict) -> dict:
    """Return a copy of tool-call args with known ID patterns made readable."""
    readable = dict(args)
    # 'target' on document-type calls may be a kiro-diff URI.
    target = readable.get("target")
    if isinstance(target, str):
        readable["target"] = resolve_kiro_diff_uri(target)
    # Paths (editCode/strReplace) and inline document fields get generic
    # ID resolution.
    for key in ("path", "modified", "original", "local"):
        value = readable.get(key)
        if isinstance(value, str):
            readable[key] = resolve_ids_in_text(value)
    return readable
# ═══════════════════════════════════════════════════════════
# 定位逻辑
# ═══════════════════════════════════════════════════════════
def find_kiro_agent_dir(global_storage: str) -> Optional[str]:
    """Path to the kiro.kiroagent dir under global_storage, or None if absent."""
    candidate = os.path.join(global_storage, "kiro.kiroagent")
    if os.path.isdir(candidate):
        return candidate
    return None
def decode_base64url_dir(dirname: str) -> str:
    """Decode a Kiro workspace dir name (base64url, '__' stands for '==').

    Returns "" when the name cannot be decoded.
    """
    try:
        padded = dirname.replace("__", "==")
        raw = base64.urlsafe_b64decode(padded)
        return raw.decode("utf-8", errors="replace")
    except Exception:
        return ""
def find_workspace_session_dir(agent_dir: str, workspace_path: str) -> Optional[str]:
    """Find the workspace-sessions subdir whose encoded name is workspace_path.

    Both sides are normalized before comparison: separators to '/', trailing
    slashes stripped, lower-cased (Windows paths are case-insensitive).
    """
    ws_root = os.path.join(agent_dir, "workspace-sessions")
    if not os.path.isdir(ws_root):
        return None
    wanted = workspace_path.replace("\\", "/").rstrip("/").lower()
    for entry in os.scandir(ws_root):
        if not entry.is_dir():
            continue
        decoded = decode_base64url_dir(entry.name)
        if decoded.replace("\\", "/").rstrip("/").lower() == wanted:
            return entry.path
    return None
def load_sessions_json(session_dir: str) -> list[dict]:
    """Read session_dir/sessions.json; accept a bare list or {"sessions": [...]}.

    Returns [] when the file is missing, unreadable, or oddly shaped.
    """
    path = os.path.join(session_dir, "sessions.json")
    if not os.path.isfile(path):
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except Exception:
        return []
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        return payload.get("sessions", [])
    return []
def load_session_detail(session_dir: str, session_id: str) -> Optional[dict]:
    """Load session_dir/{session_id}.json; None when missing or unparsable."""
    detail_path = os.path.join(session_dir, f"{session_id}.json")
    if not os.path.isfile(detail_path):
        return None
    try:
        with open(detail_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return None
def find_all_manifests(agent_dir: str) -> list[tuple[str, str]]:
    """List (hex_dir_name, manifest_path) for every 32-hex-char directory in
    agent_dir that contains the fixed Kiro manifest file."""
    hex_name = re.compile(r"^[0-9a-f]{32}$")
    found = []
    for entry in os.scandir(agent_dir):
        if not entry.is_dir() or not hex_name.match(entry.name):
            continue
        manifest = os.path.join(entry.path, MANIFEST_FILENAME)
        if os.path.isfile(manifest):
            found.append((entry.name, manifest))
    return found
def load_manifest(manifest_path: str) -> list[dict]:
    """Parse a Kiro execution manifest; [] on any error.

    Accepts both the legacy bare-list format and the newer
    {"executions": [...], "version": ...} wrapper.
    """
    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except Exception:
        return []
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and "executions" in payload:
        return payload["executions"]
    return []
def find_execution_log(agent_dir: str, hex_dir: str, execution: dict) -> Optional[str]:
    """Locate the full execution-log JSON file for one execution.

    Scans every file two levels below agent_dir/hex_dir; a candidate must be
    at least 1000 bytes, mention the executionId within its first 500 chars,
    and parse as JSON whose executionId matches exactly.  None if not found.
    """
    exec_id = execution.get("executionId", "")
    root = os.path.join(agent_dir, hex_dir)
    for sub in os.scandir(root):
        if not sub.is_dir():
            continue
        for candidate in os.scandir(sub.path):
            # Tiny files can never hold a full execution log.
            if not candidate.is_file() or candidate.stat().st_size < 1000:
                continue
            try:
                with open(candidate.path, "r", encoding="utf-8") as f:
                    if exec_id not in f.read(500):
                        continue
                    f.seek(0)
                    data = json.load(f)
                if data.get("executionId") == exec_id:
                    return candidate.path
            except Exception:
                continue
    return None
def find_all_executions(
    agent_dir: str,
    chat_session_ids: Optional[set[str]] = None,
    execution_id: Optional[str] = None,
) -> list[dict]:
    """Collect matching executions from every manifest, newest endTime first.

    Args:
        agent_dir: the kiro.kiroagent directory.
        chat_session_ids: when given, executions carrying a chatSessionId
            outside this set are dropped (entries without one are kept).
        execution_id: when given, return exactly the first execution whose
            id equals or starts with it, or [] when none matches.

    Returns:
        List of manifest entries, each annotated with "_hex_dir" (the name
        of the manifest's containing hex directory).

    Bug fix: previously, when execution_id was supplied but never matched,
    the loop fell through and returned ALL executions sorted by endTime —
    silently selecting a wrong execution downstream.  Id-lookup mode now
    never accumulates and returns [] on a miss.
    """
    all_execs = []
    for hex_dir, manifest_path in find_all_manifests(agent_dir):
        for entry in load_manifest(manifest_path):
            entry["_hex_dir"] = hex_dir
            if execution_id:
                eid = entry.get("executionId", "")
                if eid == execution_id or eid.startswith(execution_id):
                    return [entry]
                continue  # id-lookup mode: never accumulate non-matches
            # Filter only when the execution has a chatSessionId that falls
            # outside the workspace's session set.
            csid = entry.get("chatSessionId")
            if chat_session_ids and csid and csid not in chat_session_ids:
                continue
            all_execs.append(entry)
    if execution_id:
        return []  # explicit miss instead of falling back to everything
    all_execs.sort(key=lambda e: e.get("endTime", 0), reverse=True)
    return all_execs
# ═══════════════════════════════════════════════════════════
# 解析逻辑
# ═══════════════════════════════════════════════════════════
def parse_messages(messages: list) -> list[dict]:
    """Parse context.messages into a normalized conversation list.

    Per entry:
      - system prompts are deduplicated into a file reference
        (via save_system_prompt), keeping only filename + char count
      - steering blocks are collapsed to their referenced .md file names
      - tool calls/responses and documents get ID patterns made readable
      - unknown entry types are kept as a type + key-list stub
    Non-dict entries are skipped.
    """
    conversation = []
    for i, msg in enumerate(messages):
        entries = msg.get("entries", [])
        parsed = []
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            etype = entry.get("type", "unknown")
            if etype == "text":
                text = entry.get("text", "")
                # System prompt → store once, keep only a reference.
                if is_system_prompt(text):
                    sp_file = save_system_prompt(text)
                    parsed.append({
                        "type": "system_prompt_ref",
                        "ref_file": sp_file,
                        "char_count": len(text),
                    })
                elif is_steering_block(text):
                    # Steering content is folded; keep just the file names.
                    steering_files = re.findall(r'(\w[\w-]+\.md):', text)
                    parsed.append({
                        "type": "steering_ref",
                        "files": steering_files or ["(steering block)"],
                        "char_count": len(text),
                    })
                else:
                    parsed.append({"type": "text", "text": resolve_ids_in_text(text)})
            elif etype == "toolUse":
                args = resolve_tool_args(entry.get("name", ""), entry.get("args", {}))
                parsed.append({
                    "type": "toolUse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "args": args,
                })
            elif etype == "toolUseResponse":
                msg_text = entry.get("message", "")
                parsed.append({
                    "type": "toolUseResponse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "message": resolve_ids_in_text(msg_text),
                    "success": entry.get("success"),
                })
            elif etype == "document":
                doc = entry.get("document", {})
                doc_type = doc.get("type", "")
                target = doc.get("target", "")
                # Steering documents keep only their display name.
                if doc_type == "steering":
                    display_name = doc.get("displayName", "")
                    parsed.append({
                        "type": "steering_doc",
                        "name": display_name or "steering",
                    })
                else:
                    parsed.append({
                        "type": "document",
                        "doc_type": doc_type,
                        "target": resolve_ids_in_text(target) if target else "",
                    })
            else:
                # Unknown entry type: record its shape for later inspection.
                parsed.append({"type": etype, "raw_keys": list(entry.keys())})
        conversation.append({
            "index": i,
            "role": msg.get("role", "?"),
            "messageId": msg.get("messageId", "?"),
            "entries": parsed,
        })
    return conversation
def parse_actions(actions: list) -> list[dict]:
    """Parse the actions array into a readable timeline.

    Model actions with neither input nor output are compressed into a
    one-line stub (_compressed=True).  write/create inputs that inline
    originalContent/modifiedContent get those blobs lifted into a
    "_file_change" sub-dict, with only slim metadata kept in `input`.
    String fields in intentResult/input/output dicts get ID resolution.
    """
    timeline = []
    for i, action in enumerate(actions):
        atype = action.get("actionType", "")
        astate = action.get("actionState", "")
        # Content-less model action → compressed one-line summary.
        if atype == "model" and "output" not in action and "input" not in action:
            timeline.append({
                "index": i,
                "actionType": "model",
                "actionState": astate,
                "emittedAt": ts_fmt(action.get("emittedAt")),
                "_compressed": True,
            })
            continue
        entry = {
            "index": i,
            "actionId": action.get("actionId"),
            "actionType": atype,
            "actionState": astate,
            "emittedAt": ts_fmt(action.get("emittedAt")),
        }
        if action.get("subExecutionId"):
            entry["subExecutionId"] = action["subExecutionId"]
        if action.get("endTime"):
            entry["endTime"] = ts_fmt(action["endTime"])
        for k in ("intentResult", "input", "output"):
            if k in action:
                val = action[k]
                # Apply ID resolution to string fields of dict payloads.
                if isinstance(val, dict):
                    val = dict(val)  # copy — never mutate the raw log data
                    # Inline file change (write/create): lift the content into
                    # _file_change, keep only slim metadata under `input`.
                    if k == "input" and ("originalContent" in val or "modifiedContent" in val):
                        file_path = val.get("file", val.get("path", "?"))
                        entry["_file_change"] = {
                            "file": resolve_ids_in_text(str(file_path)),
                            "original": val.get("originalContent", ""),
                            "modified": val.get("modifiedContent", ""),
                        }
                        # Strip the large text blobs from the stored input.
                        slim = {vk: resolve_ids_in_text(str(vv)) if isinstance(vv, str) else vv
                                for vk, vv in val.items()
                                if vk not in ("originalContent", "modifiedContent")}
                        entry[k] = slim
                        continue  # continues the (intentResult, input, output) loop
                    for vk, vv in val.items():
                        if isinstance(vv, str):
                            val[vk] = resolve_ids_in_text(vv)
                entry[k] = val
        timeline.append(entry)
    return timeline
def extract_sub_execution_ids(actions: list) -> list[str]:
    """Unique subExecutionIds from actions, in first-appearance order."""
    # dict.fromkeys preserves insertion order while deduplicating.
    ordered = dict.fromkeys(
        a.get("subExecutionId") for a in actions if a.get("subExecutionId")
    )
    return list(ordered)
# ═══════════════════════════════════════════════════════════
# Diff 快照收集
# ═══════════════════════════════════════════════════════════
def collect_diffs(agent_dir: str, hex_dir: str, execution: dict) -> dict[str, dict]:
    """Intentional no-op placeholder: always returns {}.

    Kiro inlines originalContent/modifiedContent directly in write/create
    action inputs, which is more reliable than reading the diff-snapshot
    store (74a08cf8.../commitId/), so diffs are taken from actions instead.
    """
    return {}
# ═══════════════════════════════════════════════════════════
# Prompt Log 匹配
# ═══════════════════════════════════════════════════════════
PROMPT_LOG_DIR = os.path.join("docs", "audit", "prompt_logs")
def find_matching_prompt_log(start_time_ms: int) -> Optional[str]:
    """Match an execution startTime to the nearest prompt_log file.

    File names look like prompt_log_YYYYMMDD_HHMMSS.md; the closest file
    within a ±5-minute window wins.  None when no log qualifies.
    """
    if not os.path.isdir(PROMPT_LOG_DIR):
        return None
    try:
        exec_dt = datetime.fromtimestamp(start_time_ms / 1000, tz=CST)
    except Exception:
        return None
    name_re = re.compile(r"prompt_log_(\d{8}_\d{6})\.md$")
    best_path = None
    best_delta = float("inf")
    for f in os.scandir(PROMPT_LOG_DIR):
        if not f.is_file():
            continue
        m = name_re.match(f.name)
        if not m:
            continue
        try:
            log_dt = datetime.strptime(m.group(1), "%Y%m%d_%H%M%S").replace(tzinfo=CST)
        except Exception:
            continue
        delta = abs((exec_dt - log_dt).total_seconds())
        # Keep the closest log inside the 5-minute window.
        if delta < 300 and delta < best_delta:
            best_delta = delta
            best_path = f.path
    return best_path
# ═══════════════════════════════════════════════════════════
# Markdown 生成
# ═══════════════════════════════════════════════════════════
def _msg_semantic_label(msg: dict) -> str:
"""为对话消息生成语义标签,用于快速定位。"""
entries = msg.get("entries", [])
if not entries:
return ""
parts = []
for e in entries:
et = e["type"]
if et == "system_prompt_ref":
parts.append("系统提示词")
elif et == "steering_ref":
parts.append(f"Steering({len(e.get('files', []))})")
elif et == "steering_doc":
parts.append(f"Steering:`{e.get('name', '?')}`")
elif et == "toolUse":
name = e.get("name", "?")
# 提取关键参数作为上下文
args = e.get("args", {})
ctx = ""
if name in ("readFile", "readCode", "readMultipleFiles"):
ctx = args.get("path", "") or ",".join(args.get("paths", [])[:2])
elif name in ("fsWrite", "strReplace", "editCode"):
ctx = args.get("path", "")
elif name == "grepSearch":
ctx = args.get("query", "")[:30]
elif name == "invokeSubAgent":
ctx = args.get("name", "")
elif name == "executePwsh":
ctx = (args.get("command", "") or "")[:40]
elif name == "taskStatus":
ctx = args.get("status", "")
if ctx:
parts.append(f"调用 `{name}` → {ctx}")
else:
parts.append(f"调用 `{name}`")
elif et == "toolUseResponse":
name = e.get("name", "?")
ok = "" if e.get("success") else ""
parts.append(f"结果 `{name}` {ok}")
elif et == "document":
parts.append(f"文档:{e.get('doc_type', '?')}")
elif et == "text":
# 文本内容:提取前 50 字符作为预览bot 和 human 都加)
role = msg.get("role", "")
if role in ("bot", "human"):
text = (e.get("text") or "").strip()
if text:
preview = text[:50].replace("\n", " ")
if len(text) > 50:
preview += ""
icon = "💬" if role == "bot" else "📝"
parts.append(f"{icon} `{preview}`")
return ", ".join(parts) if parts else ""
def _step_semantic_label(step: dict) -> str:
"""为 action step 生成带图标的语义标签。"""
at = step.get("actionType", "?")
state = step.get("actionState", "?")
fc = step.get("_file_change")
sub_eid = step.get("subExecutionId")
# 状态图标
if state == "Error":
state_icon = ""
elif state in ("Success", "Accepted"):
state_icon = ""
else:
state_icon = ""
# 类型图标 + 上下文
if at in ("write", "append") and fc:
fname = fc.get("file", "?")
short = fname.rsplit("/", 1)[-1] if "/" in fname else fname
orig = fc.get("original", "")
if orig:
return f"⚡ `{at}` 修改 `{short}` {state_icon}"
else:
return f"⚡ `{at}` 新建 `{short}` {state_icon}"
elif at == "invokeSubAgent":
inp = step.get("input", {})
# Kiro 原始 log 用 subAgentName工具 schema 用 name
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
return f"🔀 `invokeSubAgent` → {agent_name} {state_icon}"
elif at == "subagentResponse":
return f"🔀 `subagentResponse` {state_icon}"
elif at in ("readFiles", "readCode"):
inp = step.get("input", {})
if isinstance(inp, dict):
files = inp.get("files", [])
if files and isinstance(files[0], dict):
paths = [f.get("path", "?") for f in files[:2]]
else:
paths = [str(f) for f in files[:2]]
ctx = ", ".join(paths)
else:
ctx = ""
return f"📖 `{at}` {ctx} {state_icon}" if ctx else f"📖 `{at}` {state_icon}"
elif at == "search":
return f"🔍 `search` {state_icon}"
elif at == "say":
return f"💬 `say` {state_icon}"
elif at == "taskStatus":
return f"📋 `taskStatus` {state_icon}"
elif at == "steering":
return f"📄 `steering` {state_icon}"
elif at == "runCommand":
return f"🖥️ `runCommand` {state_icon}"
elif at == "getDiagnostics":
return f"🩺 `getDiagnostics` {state_icon}"
elif at == "ContextualHookInvoked":
inp = step.get("input", {})
hook_name = inp.get("name", "?") if isinstance(inp, dict) else "?"
return f"🪝 Hook `{hook_name}` {state_icon}"
elif at == "intentClassification":
ir = step.get("intentResult", {})
cls = ir.get("classification", "?") if isinstance(ir, dict) else "?"
return f"🎯 意图: `{cls}` {state_icon}"
elif at == "replace":
inp = step.get("input", {})
path = inp.get("file", inp.get("path", "?")) if isinstance(inp, dict) else "?"
short = path.rsplit("/", 1)[-1] if "/" in str(path) else str(path)
return f"✏️ `replace` `{short}` {state_icon}"
else:
return f"`{at}` [{state}]"
def _build_execution_summary(
log: dict,
conversation: list[dict],
timeline: list[dict],
sub_file_map: Optional[dict[str, str]] = None,
) -> dict:
"""构建结构化执行摘要(零 LLM 成本,纯规则化提取)。
返回 dict 供 md 渲染和索引存储共用。
"""
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
files_modified = []
files_created = []
sub_agents = []
errors = []
for step in timeline:
if step.get("_compressed"):
continue
idx = step.get("index", "?")
at = step.get("actionType", "?")
state = step.get("actionState", "?")
fc = step.get("_file_change")
if fc:
fname = fc.get("file", "?")
if fc.get("original"):
files_modified.append(fname)
else:
files_created.append(fname)
if at == "invokeSubAgent":
inp = step.get("input", {})
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
sub_agents.append(agent_name)
if state == "Error":
errors.append(f"Step {idx}: `{at}`")
for msg in conversation:
for e in msg.get("entries", []):
if e.get("type") == "toolUseResponse" and not e.get("success"):
errors.append(f"Msg {msg['index']}: `{e.get('name', '?')}`")
# 去重文件名
files_modified = list(dict.fromkeys(files_modified))
files_created = list(dict.fromkeys(files_created))
# description 由外部 LLM 生成(百炼 API提取阶段不生成
description = ""
return {
"workflow": log.get("workflowType", "?"),
"status": log.get("status", "?"),
"duration_s": round(dur, 1),
"msg_count": len(conversation),
"action_count": len(timeline),
"files_modified": files_modified,
"files_created": files_created,
"sub_agents": sub_agents,
"errors": errors,
"description": description,
}
def _render_summary_md(summary: dict, sub_file_map: Optional[dict[str, str]] = None) -> str:
"""将结构化摘要渲染为 Markdown 文本(放在文件最前面)。"""
L = []
# 一句话概览
status_icon = "" if summary["status"] == "succeed" else ""
L.append(f"{status_icon} `{summary['workflow']}` | {summary['duration_s']}s | "
f"{summary['msg_count']} msgs | {summary['action_count']} actions")
L.append("")
desc = summary.get("description", "")
if desc:
L.append(f"> {desc}")
L.append("")
fm = summary["files_modified"]
fc = summary["files_created"]
if fm or fc:
L.append(f"**文件变更** (修改 {len(fm)} / 新建 {len(fc)})")
for f in fm:
L.append(f"- ⚡ 修改 `{f}`")
for f in fc:
L.append(f"- ✨ 新建 `{f}`")
L.append("")
sa = summary["sub_agents"]
if sa:
L.append(f"**子代理** ({len(sa)}): {', '.join(f'`{a}`' for a in sa)}")
L.append("")
errs = summary["errors"]
if errs:
L.append(f"**错误** ({len(errs)})")
for e in errs:
L.append(f"- ❌ {e}")
L.append("")
if not fm and not fc and not sa and not errs:
L.append("*(无文件变更、子代理调用或错误)*")
return "\n".join(L)
def _build_nav_summary(
conversation: list[dict],
timeline: list[dict],
sub_file_map: Optional[dict[str, str]] = None,
) -> str:
"""生成快速导航摘要:文件变更、子代理、错误。"""
file_changes = []
sub_agents = []
errors = []
for step in timeline:
if step.get("_compressed"):
continue
idx = step.get("index", "?")
at = step.get("actionType", "?")
state = step.get("actionState", "?")
# 文件变更
fc = step.get("_file_change")
if fc:
fname = fc.get("file", "?")
orig = fc.get("original", "")
action = "修改" if orig else "新建"
file_changes.append(f"- Step {idx}: ⚡ {action} `{fname}`")
# 子代理
if at == "invokeSubAgent":
inp = step.get("input", {})
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
sub_eid = step.get("subExecutionId", "")
sub_path = ""
if sub_file_map and sub_eid and sub_eid in sub_file_map:
sub_path = f" → `{sub_file_map[sub_eid].replace(chr(92), '/')}`"
sub_agents.append(f"- Step {idx}: 🔀 `{agent_name}`{sub_path}")
# 错误
if state == "Error":
errors.append(f"- Step {idx}: ❌ `{at}`")
# 对话中的错误工具结果
for msg in conversation:
for e in msg.get("entries", []):
if e.get("type") == "toolUseResponse" and not e.get("success"):
errors.append(f"- Msg {msg['index']}: ❌ `{e.get('name', '?')}`")
lines = []
if file_changes:
lines.append(f"**文件变更** ({len(file_changes)})")
lines.extend(file_changes)
lines.append("")
if sub_agents:
lines.append(f"**子代理调用** ({len(sub_agents)})")
lines.extend(sub_agents)
lines.append("")
if errors:
lines.append(f"**错误** ({len(errors)})")
lines.extend(errors)
lines.append("")
if not lines:
lines.append("*(无文件变更、子代理调用或错误)*")
return "\n".join(lines)
def generate_full_record(
log: dict,
conversation: list[dict],
timeline: list[dict],
diffs: dict[str, dict],
session_info: Optional[dict] = None,
prompt_log_path: Optional[str] = None,
is_sub: bool = False,
sub_index: int = 0,
sub_file_map: Optional[dict[str, str]] = None,
prev_msg_count: int = 0,
) -> tuple[str, dict]:
"""生成单个 execution 的 Markdown 全量记录。
Args:
log: 原始 execution log JSON
conversation: parse_messages 输出
timeline: parse_actions 输出
diffs: collect_diffs 输出
session_info: 会话配置(仅主 execution 有)
prompt_log_path: 匹配的 prompt_log 文件路径
is_sub: 是否为子代理 execution
sub_index: 子代理序号(从 1 开始)
prev_msg_count: 前一轮 execution 的消息数,用于去重(跳过累积的历史消息)
"""
L = []
exec_id = log.get("executionId", "?")
chat_id = log.get("chatSessionId", "?")
# 构建结构化摘要(供 md 和索引共用)
_summary = _build_execution_summary(log, conversation, timeline, sub_file_map)
# 标题
if is_sub:
L.append(f"# 子代理 Execution #{sub_index}\n")
else:
L.append("# Kiro 会话全量记录\n")
L.append(f"> 生成时间: {datetime.now(CST).strftime('%Y-%m-%d %H:%M:%S')} CST\n")
# ── 0. 执行摘要文件最前面AI 读前 20 行即可掌握全貌)──
L.append("## 📋 执行摘要\n")
L.append(_render_summary_md(_summary, sub_file_map))
L.append("")
# ── 1. 元数据 ──
L.append("## 1. 元数据\n")
L.append("| 字段 | 值 |")
L.append("|------|-----|")
L.append(f"| executionId | `{exec_id}` |")
L.append(f"| chatSessionId | `{chat_id}` |")
L.append(f"| workflowType | `{log.get('workflowType', '?')}` |")
L.append(f"| autonomyMode | `{log.get('autonomyMode', '?')}` |")
L.append(f"| status | `{log.get('status', '?')}` |")
L.append(f"| startTime | `{ts_fmt(log.get('startTime'))}` |")
L.append(f"| endTime | `{ts_fmt(log.get('endTime'))}` |")
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
L.append(f"| duration | `{dur:.1f}s` |")
L.append(f"| contextUsage | `{log.get('contextUsagePercentage', 0):.2f}%` |")
L.append("")
if session_info and not is_sub:
L.append(f"- 会话标题: `{session_info.get('title', '?')}`")
L.append(f"- 创建时间: `{ts_fmt(int(session_info.get('dateCreated', 0)))}`")
L.append(f"- 工作区: `{session_info.get('workspaceDirectory', '?')}`")
L.append("")
if prompt_log_path and not is_sub:
rel = os.path.relpath(prompt_log_path).replace("\\", "/")
L.append(f"- 关联 prompt_log: `{rel}`")
L.append("")
# ── 2. 用户输入 ──
L.append("## 2. 用户输入\n")
input_text = ""
for msg in log.get("input", {}).get("data", {}).get("messages", []):
for entry in msg.get("content", msg.get("entries", [])):
if isinstance(entry, dict) and entry.get("text"):
input_text += entry["text"] + "\n"
if input_text.strip():
L.append(fence(input_text.strip()) + "\n")
else:
L.append("*(无用户输入)*\n")
# ── 3. 对话记录 ──
L.append("## 3. 对话记录\n")
# 去重:同一 chatSession 的非首轮 executioncontext.messages 包含前几轮的累积历史
# prev_msg_count > 0 时跳过前 N 条,只渲染本轮新增的消息
new_msgs = conversation[prev_msg_count:] if prev_msg_count > 0 else conversation
h = sum(1 for m in new_msgs if m["role"] == "human")
b = sum(1 for m in new_msgs if m["role"] == "bot")
t = sum(1 for m in new_msgs if m["role"] == "tool")
if prev_msg_count > 0:
L.append(f"{len(new_msgs)} 条新增消息 (跳过前 {prev_msg_count} 条历史): human={h}, bot={b}, tool={t}\n")
else:
L.append(f"{len(new_msgs)} 条消息: human={h}, bot={b}, tool={t}\n")
for msg in new_msgs:
emoji = {"human": "👤", "bot": "🤖", "tool": "🔧"}.get(msg["role"], "")
# 生成语义标签
msg_label = _msg_semantic_label(msg)
label_suffix = f"{msg_label}" if msg_label else ""
# P0: 压缩 hook 输出的空消息特征HUMAN 消息只含 "Output:\nCommand executed..." 或 "Output:\nCommand timed out..."
if msg["role"] == "human" and len(msg["entries"]) == 1:
e0 = msg["entries"][0]
if e0["type"] == "text":
_txt = (e0.get("text") or "").strip()
if _txt.startswith("Output:") and ("Exit Code:" in _txt) and len(_txt) < 200:
# 提取 exit code
import re as _re
_ec_match = _re.search(r"Exit Code:\s*(-?\d+)", _txt)
_ec = _ec_match.group(1) if _ec_match else "?"
L.append(f"### Msg {msg['index']}: 👤 HUMAN — 🪝 Hook 输出 (exit={_ec})\n")
continue
L.append(f"### Msg {msg['index']}: {emoji} {msg['role'].upper()}{label_suffix}\n")
for entry in msg["entries"]:
et = entry["type"]
if et == "system_prompt_ref":
ref = entry["ref_file"]
chars = entry["char_count"]
sp_path = f"docs/audit/session_logs/_system_prompts/{ref}"
L.append(f"**[系统提示词]** → `{sp_path}` ({chars} 字符)\n")
elif et == "steering_ref":
files = ", ".join(entry["files"])
chars = entry["char_count"]
L.append(f"**[Steering]** 引用: {files} ({chars} 字符)\n")
elif et == "text":
text = entry.get("text", "")
if not text:
L.append("*(空)*\n")
else:
L.append(fence(text) + "\n")
elif et == "toolUse":
name = entry.get("name", "?")
args = entry.get("args", {})
L.append(f"**[🔧 调用]** `{name}`\n")
# P1: strReplace/editCode 的代码变更用 diff 格式展示
if name in ("strReplace", "editCode") and isinstance(args, dict):
_path = args.get("path", "?")
_lang = "python" if _path.endswith(".py") else "sql" if _path.endswith(".sql") else ""
L.append(f"- 文件: `{_path}`\n")
_old = args.get("oldStr", args.get("old_str", ""))
_new = args.get("newStr", args.get("new_str", ""))
_sel = args.get("selector", "")
_op = args.get("operation", "")
_repl = args.get("replacement", "")
if _sel:
L.append(f"- selector: `{_sel}`" + (f" ({_op})" if _op else ""))
if _old:
L.append(f"- 删除:\n" + fence(trunc(_old, 2000), _lang))
if _new:
L.append(f"- 插入:\n" + fence(trunc(_new, 2000), _lang))
if _repl:
L.append(f"- 替换为:\n" + fence(trunc(_repl, 2000), _lang))
L.append("")
else:
L.append(fence(safe_json(args, 5000), "json") + "\n")
elif et == "toolUseResponse":
ok = "" if entry.get("success") else ""
L.append(f"**[📋 结果]** `{entry.get('name', '?')}` {ok}\n")
msg_text = entry.get("message", "")
if msg_text:
L.append(fence(trunc(msg_text, 5000)) + "\n")
elif et == "document":
target = entry.get("target", "")
L.append(f"**[📄 文档]** type=`{entry.get('doc_type')}` target=`{target}`\n")
elif et == "steering_doc":
L.append(f"**[📄 Steering]** `{entry.get('name', 'steering')}`\n")
else:
L.append(f"**[{et}]** keys={entry.get('raw_keys')}\n")
# ── 4. Actions 时间线 ──
L.append("## 4. Actions 时间线\n")
L.append(f"{len(timeline)}\n")
for step in timeline:
if step.get("_compressed"):
L.append(f"- `model` [{step.get('actionState')}] @ {step.get('emittedAt')}\n")
continue
at = step.get('actionType', '?')
state = step.get('actionState', '?')
# 生成语义标签
step_label = _step_semantic_label(step)
L.append(f"### Step {step['index']}: {step_label} @ {step.get('emittedAt','?')}\n")
if step.get("subExecutionId"):
sub_eid = step["subExecutionId"]
L.append(f"- subExecutionId: `{sub_eid}`")
# 标注子代理文件路径(如果有映射)
if sub_file_map and sub_eid in sub_file_map:
sub_path = sub_file_map[sub_eid].replace("\\", "/")
L.append(f"- 子代理记录: `{sub_path}`")
if step.get("endTime"):
L.append(f"- endTime: {step['endTime']}")
# 文件变更展示
if step.get("_file_change"):
fc = step["_file_change"]
fname = fc.get("file", "?")
orig = fc.get("original", "")
mod = fc.get("modified", "")
lang = "python" if fname.endswith(".py") else "sql" if fname.endswith(".sql") else ""
L.append(f"- 文件变更: `{fname}`")
if orig and mod:
L.append(f" - 修改前 ({len(orig)} 字符):\n" + fence(trunc(orig, 3000), lang))
L.append(f" - 修改后 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
elif mod:
L.append(f" - 新建 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
# 特殊处理各种 action type 的内容展示
_at = step.get("actionType", "")
if _at == "say":
_say_msg = (step.get("output") or {}).get("message", "")
if _say_msg:
L.append(f"- 💬 AI 回复:\n\n{_say_msg}\n")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "invokeSubAgent":
_sub_input = step.get("input") or {}
_sub_prompt = _sub_input.get("prompt", "")
_sub_name = _sub_input.get("name", "") or _sub_input.get("subAgentName", "") or "?"
if _sub_prompt:
L.append(f"- 📤 委托 `{_sub_name}`:\n\n{trunc(_sub_prompt, 3000)}\n")
_sub_output = step.get("output") or {}
_sub_resp = _sub_output.get("response", "")
if _sub_resp:
L.append(f"- 📥 子代理输出:\n\n{_sub_resp}\n")
elif not _sub_prompt:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "subagentResponse":
_sr_input = step.get("input") or {}
_sr_resp = _sr_input.get("response", "")
if _sr_resp:
L.append(f"- 📥 子代理返回:\n\n{_sr_resp}\n")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
elif _at == "ContextualHookInvoked":
# P1: hook 执行——提取名称、命令、exitCode
_hi = step.get("input") or {}
_ho = step.get("output") or {}
_h_name = _hi.get("name", "?")
_h_cmd = _ho.get("command", "")
_h_result = _ho.get("result", {})
_h_exit = _h_result.get("exitCode", "?") if isinstance(_h_result, dict) else "?"
_h_out = _h_result.get("output", "") if isinstance(_h_result, dict) else ""
if _h_cmd:
L.append(f"- `$ {_h_cmd}`")
L.append(f"- Exit: `{_h_exit}`")
if _h_out and _h_out != "Command executed successfully with no output.":
L.append(f"- 输出:\n" + fence(trunc(_h_out, 2000)))
elif _at == "intentClassification":
# P1: 意图分类——压缩为一行
_ir = step.get("intentResult", {})
_cls = _ir.get("classification", "?") if isinstance(_ir, dict) else "?"
L.append(f"- 分类结果: `{_cls}`")
elif _at == "runCommand":
# P0: 命令执行——提取命令、exitCode、输出
_rc_in = step.get("input") or {}
_rc_out = step.get("output") or {}
_rc_cmd = _rc_in.get("command", "") if isinstance(_rc_in, dict) else ""
_rc_result = _rc_out.get("result", {}) if isinstance(_rc_out, dict) else {}
_rc_exit = _rc_result.get("exitCode", "?") if isinstance(_rc_result, dict) else "?"
_rc_output = _rc_result.get("output", "") if isinstance(_rc_result, dict) else ""
if _rc_cmd:
L.append(f"- `$ {_rc_cmd}`")
L.append(f"- Exit: `{_rc_exit}`")
if _rc_output:
L.append(f"- 输出:\n" + fence(trunc(_rc_output, 3000)))
elif _at == "search":
# P2: 搜索——提取 query 和 why
_s_in = step.get("input") or {}
_s_query = _s_in.get("query", "") if isinstance(_s_in, dict) else ""
_s_why = _s_in.get("why", "") if isinstance(_s_in, dict) else ""
if _s_query:
L.append(f"- 🔍 `{trunc(_s_query, 100)}`")
if _s_why:
L.append(f"- 原因: {trunc(_s_why, 200)}")
# 展示搜索结果(如果有)
_s_out = step.get("output")
if _s_out and isinstance(_s_out, dict):
_s_files = _s_out.get("files", [])
if _s_files:
L.append(f"- 结果: {len(_s_files)} 个文件")
elif _at == "steering":
# P2: steering——提取文件名列表
_st_in = step.get("input") or {}
_st_docs = _st_in.get("documents", []) if isinstance(_st_in, dict) else []
if _st_docs:
import urllib.parse
names = []
for d in _st_docs[:10]:
if isinstance(d, str):
# URL 编码的路径,提取文件名
decoded = urllib.parse.unquote(d)
name = decoded.rsplit("/", 1)[-1] if "/" in decoded else decoded
names.append(name)
if names:
L.append(f"- 文件: {', '.join(names)}")
else:
for k in ("input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 3000), "json"))
elif _at == "getDiagnostics":
# P2: 诊断——提取路径和问题数
_gd_in = step.get("input") or {}
_gd_out = step.get("output") or {}
_gd_paths = _gd_in.get("paths", []) if isinstance(_gd_in, dict) else []
if _gd_paths:
L.append(f"- 文件: {', '.join(str(p) for p in _gd_paths[:5])}")
if isinstance(_gd_out, dict):
_gd_diags = _gd_out.get("diagnostics", [])
if isinstance(_gd_diags, list):
L.append(f"- 问题数: {len(_gd_diags)}")
for d in _gd_diags[:5]:
if isinstance(d, dict):
L.append(f" - {d.get('severity', '?')}: {d.get('message', '?')[:100]}")
elif not _gd_diags:
L.append("- ✅ 无问题")
elif _at in ("readFiles", "readCode"):
# P3: 文件读取——只展示路径,不展示内容
_rf_in = step.get("input") or {}
if isinstance(_rf_in, dict):
_rf_files = _rf_in.get("files", [])
paths = []
for f in _rf_files[:5]:
if isinstance(f, dict):
paths.append(f.get("path", "?"))
else:
paths.append(str(f))
if paths:
L.append(f"- 文件: {', '.join(paths)}")
else:
for k in ("intentResult", "input", "output"):
if k in step:
L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
L.append("")
# ── 5. 资源消耗 ──
usage = log.get("usageSummary", [])
if usage:
L.append("## 5. 资源消耗\n")
L.append("| 工具 | 消耗 | 单位 |")
L.append("|------|------|------|")
total = 0
for u in usage:
tools = ", ".join(u.get("usedTools", ["-"]))
amt = u.get("usage", 0)
total += amt
L.append(f"| {tools} | {amt} | {u.get('unit', '?')} |")
L.append(f"| **合计** | **{total:.4f}** | |")
L.append("")
return "\n".join(L), _summary
# ═══════════════════════════════════════════════════════════
# 索引管理
# ═══════════════════════════════════════════════════════════
def load_index() -> dict:
    """Load the trimmed session index (main conversations only).

    Returns a fresh, empty v2 index structure when the file is missing
    or cannot be parsed.
    """
    if not os.path.isfile(INDEX_PATH):
        return {"version": 2, "entries": {}}
    try:
        with open(INDEX_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return {"version": 2, "entries": {}}
def load_full_index() -> dict:
    """Load the full session index (main conversations + sub-agents).

    Returns a fresh, empty v2 index structure when the file is missing
    or cannot be parsed.
    """
    if not os.path.isfile(INDEX_FULL_PATH):
        return {"version": 2, "entries": {}}
    try:
        with open(INDEX_FULL_PATH, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return {"version": 2, "entries": {}}
def save_index(index: dict):
    """Persist the trimmed index to disk, then refresh the per-day indexes."""
    os.makedirs(os.path.dirname(INDEX_PATH), exist_ok=True)
    payload = json.dumps(index, ensure_ascii=False, indent=2)
    with open(INDEX_PATH, "w", encoding="utf-8") as fh:
        fh.write(payload)
    _save_day_indexes(index, "_day_index.json")
def save_full_index(index: dict):
    """Persist the full index to disk, then refresh the per-day full indexes."""
    os.makedirs(os.path.dirname(INDEX_FULL_PATH), exist_ok=True)
    payload = json.dumps(index, ensure_ascii=False, indent=2)
    with open(INDEX_FULL_PATH, "w", encoding="utf-8") as fh:
        fh.write(payload)
    _save_day_indexes(index, "_day_index_full.json")
def _save_day_indexes(index: dict, filename: str):
    """Split the root-level index by {ym}/{dd} from output_dir and write one
    index file per day directory.

    Path layout: docs/audit/session_logs/{ym}/{dd}/{filename}
    Each day-level index contains only the entries whose output_dir lives
    under that day's directory.
    """
    entries = index.get("entries", {})
    if not entries:
        return
    prefix = SESSION_LOG_DIR.replace("\\", "/")
    # Bucket entries by their "{ym}/{dd}" day key.
    buckets: dict[str, dict[str, dict]] = {}
    for exec_id, entry in entries.items():
        out_dir = entry.get("output_dir", "").replace("\\", "/")
        if not out_dir.startswith(prefix):
            continue
        # out_dir looks like docs/audit/session_logs/2026-03/03/01_abc12345_013337;
        # its day dir is docs/audit/session_logs/2026-03/03.
        segments = out_dir[len(prefix):].lstrip("/").split("/")
        if len(segments) < 2:
            continue
        day_key = "/".join(segments[:2])  # e.g. "2026-03/03"
        buckets.setdefault(day_key, {})[exec_id] = entry
    for day_key, day_entries in buckets.items():
        day_dir = os.path.join(SESSION_LOG_DIR, *day_key.split("/"))
        os.makedirs(day_dir, exist_ok=True)
        day_payload = {"version": 2, "entries": day_entries}
        with open(os.path.join(day_dir, filename), "w", encoding="utf-8") as fh:
            json.dump(day_payload, fh, ensure_ascii=False, indent=2)
def update_index(index: dict, exec_id: str, output_dir: str, meta: dict,
                 summary: Optional[dict] = None, is_sub: bool = False,
                 parent_exec_id: str = ""):
    """Record one index entry, with a structured summary for cheap AI lookup.

    When is_sub=True the entry is flagged as a sub-agent and, if provided,
    parent_exec_id is recorded alongside it.
    """
    record = {
        "output_dir": output_dir.replace("\\", "/"),
        "chatSessionId": meta.get("chatSessionId", ""),
        "startTime": meta.get("startTime", ""),
        "endTime": meta.get("endTime", ""),
        "status": meta.get("status", ""),
        "workflowType": meta.get("workflowType", ""),
        "indexed_at": datetime.now(CST).isoformat(),
    }
    if is_sub:
        record["is_sub"] = True
        if parent_exec_id:
            record["parent_exec_id"] = parent_exec_id
    if summary:
        record["summary"] = {
            "duration_s": summary.get("duration_s", 0),
            "msg_count": summary.get("msg_count", 0),
            "action_count": summary.get("action_count", 0),
            "files_modified": summary.get("files_modified", []),
            "files_created": summary.get("files_created", []),
            "sub_agents": summary.get("sub_agents", []),
            "errors": summary.get("errors", []),
        }
        description = summary.get("description")
        if description:
            record["description"] = description
    index["entries"][exec_id] = record
# ═══════════════════════════════════════════════════════════
# 主提取逻辑
# ═══════════════════════════════════════════════════════════
def extract_single_execution(
    agent_dir: str,
    hex_dir: str,
    execution: dict,
    session_dir: Optional[str],
    index: dict,
    full_index: Optional[dict] = None,
    sub_mode: bool = False,
    sub_index: int = 0,
    output_base_dir: Optional[str] = None,
    parent_exec_id: str = "",
) -> Optional[str]:
    """Extract a single execution and write its Markdown record to disk.

    Args:
        agent_dir: the kiro.kiroagent directory.
        hex_dir: the 32-char hex directory this execution lives in.
        execution: the execution entry from the manifest.
        session_dir: workspace-sessions subdirectory (used to load session info).
        index: trimmed index dict (main conversations only).
        full_index: full index dict (main + sub-agents); None disables writing it.
        sub_mode: True when extracting a sub-agent execution.
        sub_index: ordinal number of the sub-agent within its parent.
        output_base_dir: output directory in sub-agent mode (same directory
            as the parent execution).
        parent_exec_id: parent execution ID, for sub-agent entries.

    Returns:
        The output directory path, or None (on failure or if already indexed).
    """
    exec_id = execution.get("executionId", "")
    chat_id = execution.get("chatSessionId", "")
    # Skip entries already indexed whose output still exists (sub-agents are
    # not checked: they always follow their parent execution).
    if not sub_mode and exec_id in index.get("entries", {}):
        existing_dir = index["entries"][exec_id].get("output_dir", "")
        if existing_dir and os.path.isdir(existing_dir):
            return None
        # Output was cleaned up: drop the stale index entry and re-extract.
        del index["entries"][exec_id]
    # Load the execution log.
    log_path = find_execution_log(agent_dir, hex_dir, execution)
    if not log_path:
        return None
    try:
        with open(log_path, "r", encoding="utf-8") as f:
            log = json.load(f)
    except Exception:
        return None
    # Backfill chatSessionId from the full log (newer manifest entries may omit it).
    if not chat_id:
        chat_id = log.get("chatSessionId", "")
    # Parse messages, actions and collected diffs.
    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)
    diffs = collect_diffs(agent_dir, hex_dir, execution)
    # Session info (main execution only).
    session_info = None
    if not sub_mode and session_dir and chat_id:
        session_info = load_session_detail(session_dir, chat_id)
    # prompt_log matching (main execution only).
    prompt_log = None
    if not sub_mode:
        start_time = log.get("startTime", 0)
        prompt_log = find_matching_prompt_log(start_time)
    # Execution start time (used as the time suffix in directory/file names).
    _start_ms = log.get("startTime") or execution.get("startTime", 0)
    _ym, _dd, _hms = ts_date_parts(_start_ms)
    # Resolve the output directory.
    if sub_mode and output_base_dir:
        out_dir = output_base_dir
    else:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        out_dir = _resolve_chat_dir(day_dir, chat_short, _hms)
    os.makedirs(out_dir, exist_ok=True)
    # Cross-day pointer: when the execution's date differs from the date of
    # the chatSession directory, drop a _ref_{chatShort}.md into the
    # execution-date day directory.
    if not sub_mode:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        # out_dir's parent is the day dir of the chatSession's first round.
        chat_day_dir = os.path.dirname(out_dir)
        exec_day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        if os.path.normpath(chat_day_dir) != os.path.normpath(exec_day_dir):
            _write_cross_day_ref(exec_day_dir, chat_short, out_dir)
    # Recursively extract sub-agents first, collecting sub_file_map
    # (subExecutionId -> file path).
    sub_file_map: dict[str, str] = {}
    if not sub_mode:
        sub_exec_ids = extract_sub_execution_ids(actions)
        for si, sub_eid in enumerate(sub_exec_ids, 1):
            sub_execs = find_all_executions(agent_dir, execution_id=sub_eid)
            if sub_execs:
                sub_exec = sub_execs[0]
                extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=sub_exec["_hex_dir"],
                    execution=sub_exec,
                    session_dir=session_dir,
                    index=index,
                    full_index=full_index,
                    sub_mode=True,
                    sub_index=si,
                    output_base_dir=out_dir,
                    parent_exec_id=exec_id,
                )
                sub_filename = f"sub_{si:02d}_{sub_eid[:8]}.md"
                sub_file_map[sub_eid] = os.path.join(out_dir, sub_filename)
    # Compute prev_msg_count: the message total of the previous execution
    # round within the same chatSession. Used for dedup — skips the
    # accumulated history inside context.messages.
    # Strategy: parse msg counts from the summaries of the main_*.md files
    # already present in the same directory (the index is not used because
    # during parallel extraction it may not be merged yet).
    prev_msg_count = 0
    existing_mains = []
    cur_msg_count = len(conversation)
    if not sub_mode and out_dir and os.path.isdir(out_dir):
        existing_mains = sorted(
            f for f in os.listdir(out_dir)
            if f.startswith("main_") and f.endswith(".md")
        )
        if existing_mains:
            # Parse the msg count from each existing main file's summary line.
            # Summary format: ✅ `chat-agent` | 1406.6s | 44 msgs | 266 actions
            # Also check the conversation section's "skip first N history" line
            # to recover the cumulative message count:
            # cumulative = that round's prev_msg_count + that round's new msgs.
            import re as _re_prev
            for _mf in existing_mains:
                try:
                    _mf_path = os.path.join(out_dir, _mf)
                    _mf_msgs = 0
                    _mf_skipped = 0
                    with open(_mf_path, "r", encoding="utf-8") as _fh:
                        for _ln_idx, _ln in enumerate(_fh):
                            if _ln_idx > 200:
                                break
                            # Summary line: e.g. "44 msgs"
                            _mc_match = _re_prev.search(r'\|\s*(\d+)\s*msgs\s*\|', _ln)
                            if _mc_match:
                                _mf_msgs = int(_mc_match.group(1))
                            # Conversation line (later rounds): new msgs + skipped history
                            _skip_match = _re_prev.search(r'\s*(\d+)\s*条新增消息\s*\(跳过前\s*(\d+)\s*条', _ln)
                            if _skip_match:
                                _mf_msgs = int(_skip_match.group(1))
                                _mf_skipped = int(_skip_match.group(2))
                                break
                            # Conversation line (first round): total message count
                            _full_match = _re_prev.search(r'\s*(\d+)\s*条消息:', _ln)
                            if _full_match and _mf_skipped == 0:
                                _mf_msgs = int(_full_match.group(1))
                    # Cumulative count = skipped history + that round's new msgs.
                    _cumulative = _mf_skipped + _mf_msgs
                    if _cumulative > prev_msg_count and _cumulative < cur_msg_count:
                        prev_msg_count = _cumulative
                except Exception:
                    pass
    # Generate Markdown + structured summary (main executions get sub_file_map).
    md, summary = generate_full_record(
        log=log,
        conversation=conversation,
        timeline=timeline,
        diffs=diffs,
        session_info=session_info,
        prompt_log_path=prompt_log,
        is_sub=sub_mode,
        sub_index=sub_index,
        sub_file_map=sub_file_map if not sub_mode else None,
        prev_msg_count=prev_msg_count,
    )
    # Write the output file.
    if sub_mode:
        filename = f"sub_{sub_index:02d}_{exec_id[:8]}.md"
    else:
        # Auto-numbering: reuse existing_mains scanned above (prev_msg_count
        # branch); rescan if that branch was not taken.
        if not existing_mains and os.path.isdir(out_dir):
            existing_mains = sorted(
                f for f in os.listdir(out_dir)
                if f.startswith("main_") and f.endswith(".md")
            )
        main_idx = len(existing_mains) + 1
        filename = f"main_{main_idx:02d}_{exec_id[:8]}.md"
    filepath = os.path.join(out_dir, filename)
    # Surrogate chars (e.g. \udccb) cannot be utf-8 encoded; replace with U+FFFD.
    md_safe = md.encode("utf-8", errors="surrogateescape").decode("utf-8", errors="replace")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(md_safe)
    # Update the indexes.
    _meta = {
        "chatSessionId": chat_id,
        "startTime": ts_fmt(log.get("startTime")),
        "endTime": ts_fmt(log.get("endTime")),
        "status": log.get("status", ""),
        "workflowType": log.get("workflowType", ""),
    }
    if not sub_mode:
        # Main conversation: write to both the trimmed and the full index.
        update_index(index, exec_id, out_dir, _meta, summary=summary)
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta, summary=summary)
    else:
        # Sub-agent: write to the full index only.
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta,
                         summary=summary, is_sub=True,
                         parent_exec_id=parent_exec_id)
    return out_dir
# ═══════════════════════════════════════════════════════════
# 入口函数
# ═══════════════════════════════════════════════════════════
def extract_latest(global_storage: Optional[str] = None, workspace_path: Optional[str] = None):
    """Extract every un-indexed terminal execution (called by agent_on_stop).

    Executions of the same chatSession are processed in ascending startTime
    order so prev_msg_count deduplication is computed correctly. The round
    in progress is usually still "running" when the on-stop hook fires; it
    is picked up by the next invocation.
    """
    storage_root = global_storage or DEFAULT_GLOBAL_STORAGE
    workspace = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(storage_root)
    if not agent_dir:
        return
    session_dir = find_workspace_session_dir(agent_dir, workspace)
    chat_ids = None
    if session_dir:
        chat_ids = {
            s.get("chatSessionId") or s.get("sessionId")
            for s in load_sessions_json(session_dir)
            if s.get("chatSessionId") or s.get("sessionId")
        }
    terminal_statuses = ("succeed", "failed", "stopped", "aborted")
    executions = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not executions:
        return
    index = load_index()
    full_index = load_full_index()
    # Keep only un-indexed executions that reached a terminal state.
    indexed = index.get("entries", {})
    pending = [
        e for e in executions
        if e.get("executionId", "") not in indexed
        and e.get("status", "") in terminal_statuses
    ]
    if not pending:
        return
    # Bucket by chatSessionId; each bucket is processed oldest-first so the
    # cumulative message history dedups correctly (prev_msg_count).
    buckets: dict[str, list[dict]] = {}
    for e in pending:
        buckets.setdefault(e.get("chatSessionId", "") or "unknown", []).append(e)
    extracted_count = 0
    for bucket in buckets.values():
        bucket.sort(key=lambda x: x.get("startTime", 0))
        for execution in bucket:
            out_dir = extract_single_execution(
                agent_dir=agent_dir,
                hex_dir=execution["_hex_dir"],
                execution=execution,
                session_dir=session_dir,
                index=index,
                full_index=full_index,
            )
            if out_dir:
                extracted_count += 1
                print(f"[session-extract] extracted: {out_dir}")
    if extracted_count > 0:
        save_index(index)
        save_full_index(full_index)
    if extracted_count > 1:
        print(f"[session-extract] total: {extracted_count} executions")
def extract_all_unindexed(
    global_storage: Optional[str] = None,
    workspace_path: Optional[str] = None,
    limit: Optional[int] = None,
    workers: int = 8,
):
    """Extract every un-indexed execution (multi-threaded).

    Work is grouped by chatSessionId: groups run in parallel across a thread
    pool, while executions inside a group run serially in startTime order
    (their context.messages are cumulative, so in-order extraction is needed
    for deduplication).
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()
    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        # Accept both field names: chatSessionId (old) and sessionId (new).
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}
    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[session-extract] no executions found")
        return
    index = load_index()
    full_index = load_full_index()
    # Keep only un-indexed executions in a terminal state (skip "running" etc.).
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
    todo = [e for e in all_execs
            if e.get("executionId", "") not in index.get("entries", {})
            and e.get("status", "") in TERMINAL_STATUSES]
    if limit:
        todo = todo[:limit]
    if not todo:
        print("[session-extract] all indexed, nothing to do")
        return
    print(f"[session-extract] {len(todo)} executions to extract (workers={workers})")
    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed
    # Group by chatSessionId; each group is extracted serially in startTime
    # order (the same chatSession's context.messages accumulate, so ordered
    # extraction is required for dedup).
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in todo:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
    lock = threading.Lock()
    count = 0
    def _extract_group(group_execs):
        """Serially extract every execution of one chatSession."""
        local_index = {"version": 2, "entries": {}}
        local_full = {"version": 2, "entries": {}}
        results = []
        for execution in group_execs:
            try:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=execution["_hex_dir"],
                    execution=execution,
                    session_dir=session_dir,
                    index=local_index,
                    full_index=local_full,
                )
                if result:
                    results.append(result)
            except Exception as e:
                eid = execution.get("executionId", "?")[:8]
                print(f"[session-extract] ✗ {eid}: {e}")
        return results, local_index.get("entries", {}), local_full.get("entries", {})
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_extract_group, execs): cid
                   for cid, execs in chat_groups.items()}
        for future in as_completed(futures):
            results, idx_entries, full_entries = future.result()
            if results:
                with lock:
                    count += len(results)
                    index["entries"].update(idx_entries)
                    full_index["entries"].update(full_entries)
                    if count % 50 == 0:
                        save_index(index)
                        save_full_index(full_index)
                        print(f"[session-extract] [{count}/{len(todo)}] checkpoint saved")
                    elif count % 10 == 0:
                        print(f"[session-extract] [{count}/{len(todo)}]")
    # Final save.
    save_index(index)
    save_full_index(full_index)
    print(f"[session-extract] done, extracted {count}/{len(todo)}")
def extract_by_id(
    execution_id: str,
    global_storage: Optional[str] = None,
):
    """Extract the execution matching *execution_id* (prefix match allowed)."""
    agent_dir = find_kiro_agent_dir(global_storage or DEFAULT_GLOBAL_STORAGE)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return
    candidates = find_all_executions(agent_dir, execution_id=execution_id)
    target = candidates[0] if candidates else None
    # Confirm the lookup really matched the requested ID (prefix match).
    if target is None or not target.get("executionId", "").startswith(execution_id):
        print(f"[session-extract] execution not found: {execution_id}")
        return
    index = load_index()
    full_index = load_full_index()
    out_dir = extract_single_execution(
        agent_dir=agent_dir,
        hex_dir=target["_hex_dir"],
        execution=target,
        session_dir=None,
        index=index,
        full_index=full_index,
    )
    if out_dir:
        save_index(index)
        save_full_index(full_index)
        print(f"[session-extract] extracted: {out_dir}")
# ═══════════════════════════════════════════════════════════
# CLI 入口
# ═══════════════════════════════════════════════════════════
def main():
    """CLI entry point: parse arguments and dispatch to an extraction mode."""
    import argparse
    parser = argparse.ArgumentParser(description="Kiro 执行日志全量提取器 v2")
    parser.add_argument("--all", action="store_true", help="提取所有未索引的 execution")
    parser.add_argument("--recent", type=int, metavar="N", help="提取最近 N 条未索引的")
    parser.add_argument("--workers", type=int, default=8, help="并行线程数(默认 8")
    parser.add_argument("--execution-id", type=str, help="提取指定 executionId")
    parser.add_argument("--global-storage", type=str, help="globalStorage 路径")
    parser.add_argument("--workspace", type=str, help="workspace 路径")
    opts = parser.parse_args()
    storage = opts.global_storage
    workspace = opts.workspace
    if opts.execution_id:
        extract_by_id(opts.execution_id, global_storage=storage)
    elif opts.all:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              workers=opts.workers)
    elif opts.recent:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              limit=opts.recent, workers=opts.workers)
    else:
        extract_latest(global_storage=storage, workspace_path=workspace)


if __name__ == "__main__":
    main()