#!/usr/bin/env python3
"""extract_kiro_session — Kiro 执行日志全量提取器 v2。

改进点(相比 v1):
1. 系统提示词去重:首次保存到 _system_prompts/sp_{hash8}.md,后续引用
2. 目录分层:YYYY-MM/DD/{chatSessionId_short}/ 下按对话组织
3. 子代理递归提取:主 execution + 子 execution 放同一目录,按调用顺序编号
4. ID 替换:kiro-diff URI → 真实文件路径,terminalId → 进程描述
5. CONTEXT TRANSFER 中的 steering 内容折叠引用
6. 无内容的 model action 压缩为一行

用法:
    python scripts/ops/extract_kiro_session.py                     # 提取最新 execution
    python scripts/ops/extract_kiro_session.py --all               # 提取所有未索引的
    python scripts/ops/extract_kiro_session.py --recent 20         # 提取最近 N 个未索引的
    python scripts/ops/extract_kiro_session.py --execution-id XX   # 提取指定 execution
"""
|
||
|
||
import base64
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone, timedelta
from typing import Optional

from _env_paths import ensure_repo_root

ensure_repo_root()

# Timezone used for every rendered timestamp (UTC+8).
CST = timezone(timedelta(hours=8))
|
||
|
||
# Fixed filename Kiro uses for its execution manifest.
MANIFEST_FILENAME = "f62de366d0006e17ea00a01f6624aabf"

# Output locations.
SESSION_LOG_DIR = os.path.join("docs", "audit", "session_logs")
# Slim index: main conversations only.
INDEX_PATH = os.path.join(SESSION_LOG_DIR, "_session_index.json")
# Full index: main conversations plus sub-agents.
INDEX_FULL_PATH = os.path.join(SESSION_LOG_DIR, "_session_index_full.json")
SYSTEM_PROMPTS_DIR = os.path.join(SESSION_LOG_DIR, "_system_prompts")

# Default globalStorage path (Windows %APPDATA% layout).
DEFAULT_GLOBAL_STORAGE = os.path.join(
    os.environ.get("APPDATA", ""),
    "Kiro", "User", "globalStorage",
)
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 工具函数
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def ts_fmt(ms) -> str:
    """Format epoch milliseconds as 'YYYY-MM-DD HH:MM:SS' in CST.

    Returns "N/A" for falsy input and str(ms) when conversion fails.
    """
    if not ms:
        return "N/A"
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
        return moment.strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return str(ms)
|
||
|
||
def ts_iso(ms) -> str:
    """Format epoch milliseconds as an ISO-8601 string in CST ('' on failure/falsy)."""
    if not ms:
        return ""
    try:
        return datetime.fromtimestamp(ms / 1000, tz=CST).isoformat()
    except Exception:
        return ""
|
||
|
||
def ts_date_parts(ms) -> tuple[str, str, str]:
    """Return (YYYY-MM, DD, HHMMSS) in CST, for directory layering and file naming.

    Falls back to ("unknown", "00", "000000") when *ms* cannot be converted.
    """
    try:
        moment = datetime.fromtimestamp(ms / 1000, tz=CST)
        return moment.strftime("%Y-%m"), moment.strftime("%d"), moment.strftime("%H%M%S")
    except Exception:
        return "unknown", "00", "000000"
|
||
|
||
|
||
def _resolve_chat_dir(day_dir: str, chat_short: str, first_hms: str) -> str:
    """Pick (or create) the sequenced output directory for a chatSession.

    Lookup order:
    1. an existing directory for the same chatSession under *day_dir*
       (later rounds on the same day);
    2. a directory anywhere under SESSION_LOG_DIR (cross-day sessions whose
       first round happened on another date);
    3. otherwise allocate the next sequence number under *day_dir* and create
       a new directory named {seq:02d}_{chat_short}_{first_hms}/.
    """
    os.makedirs(day_dir, exist_ok=True)

    # 1. Same-day lookup.
    for name in os.listdir(day_dir):
        if chat_short in name and os.path.isdir(os.path.join(day_dir, name)):
            return os.path.join(day_dir, name)

    # 2. Cross-day lookup: walk every YYYY-MM/DD/ directory under the log root.
    if os.path.isdir(SESSION_LOG_DIR):
        for ym in os.listdir(SESSION_LOG_DIR):
            ym_path = os.path.join(SESSION_LOG_DIR, ym)
            # Skip helper directories like _system_prompts.
            if ym.startswith("_") or not os.path.isdir(ym_path):
                continue
            for dd in os.listdir(ym_path):
                dd_path = os.path.join(ym_path, dd)
                if not os.path.isdir(dd_path):
                    continue
                for name in os.listdir(dd_path):
                    if chat_short in name and os.path.isdir(os.path.join(dd_path, name)):
                        return os.path.join(dd_path, name)

    # 3. New chatSession: allocate the next two-digit sequence number.
    taken = [
        int(name[:2])
        for name in os.listdir(day_dir)
        if os.path.isdir(os.path.join(day_dir, name)) and len(name) >= 2 and name[:2].isdigit()
    ]
    new_dir = os.path.join(day_dir, f"{max(taken, default=0) + 1:02d}_{chat_short}_{first_hms}")
    os.makedirs(new_dir, exist_ok=True)
    return new_dir
|
||
|
||
|
||
def _write_cross_day_ref(exec_day_dir: str, chat_short: str, chat_dir: str):
|
||
"""在 execution 所在日期目录下生成跨天指引文件。
|
||
|
||
当一个 chatSession 跨天时,后续日期的 day_dir 下不会有该对话的目录,
|
||
生成 _ref_{chatShort}.md 告知该对话归在哪个目录。
|
||
"""
|
||
os.makedirs(exec_day_dir, exist_ok=True)
|
||
ref_path = os.path.join(exec_day_dir, f"_ref_{chat_short}.md")
|
||
if os.path.isfile(ref_path):
|
||
return # 已存在,不重复写
|
||
rel_target = os.path.relpath(chat_dir, exec_day_dir).replace("\\", "/")
|
||
with open(ref_path, "w", encoding="utf-8") as f:
|
||
f.write(f"# 跨天对话指引\n\n")
|
||
f.write(f"chatSession `{chat_short}` 的完整记录归档在:\n\n")
|
||
f.write(f"→ `{rel_target}`\n\n")
|
||
f.write(f"(绝对路径:`{chat_dir.replace(chr(92), '/')}`)\n")
|
||
|
||
|
||
def trunc(s, n=3000) -> str:
    """Coerce *s* to str and truncate to *n* chars, noting the original length."""
    if not isinstance(s, str):
        return str(s)
    if len(s) <= n:
        return s
    return s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
|
||
|
||
import re as _re
|
||
_SURROGATE_RE = _re.compile(r'[\ud800-\udfff]')
|
||
|
||
def _sanitize_surrogates(obj):
|
||
"""递归清洗 dict/list/str 中的 surrogate 字符,替换为 U+FFFD。"""
|
||
if isinstance(obj, str):
|
||
return _SURROGATE_RE.sub('\ufffd', obj)
|
||
if isinstance(obj, dict):
|
||
return {_sanitize_surrogates(k): _sanitize_surrogates(v) for k, v in obj.items()}
|
||
if isinstance(obj, list):
|
||
return [_sanitize_surrogates(i) for i in obj]
|
||
return obj
|
||
|
||
|
||
def safe_json(obj, n=5000) -> str:
    """Pretty-print *obj* as JSON (falling back to str()) and truncate to *n* chars."""
    try:
        rendered = json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception:
        rendered = str(obj)
    if len(rendered) <= n:
        return rendered
    return rendered[:n] + f"\n... [截断,原文共 {len(rendered)} 字符]"
|
||
|
||
def fence(content: str, lang: str = "") -> str:
    """Wrap *content* in a Markdown code fence that the content cannot break.

    - Unclosed inner fences (caused by truncated raw data) are closed at the end.
    - The outer fence uses one more backtick than the longest backtick run
      found inside the content (minimum of three).
    - Line-leading '#' characters are escaped so they do not render as headings.
    """
    if not content:
        return f"```{lang}\n```"

    # Track inner fences line by line; anything left open gets closed below.
    open_ticks: list[int] = []
    for raw_line in content.split("\n"):
        stripped = raw_line.strip()
        m = re.match(r"^(`{3,})", stripped)
        if not m:
            continue
        ticks = len(m.group(1))
        # A bare backtick run matching the innermost open fence closes it;
        # anything else opens a new fence.
        if open_ticks and open_ticks[-1] == ticks and stripped == '`' * ticks:
            open_ticks.pop()
        else:
            open_ticks.append(ticks)
    if open_ticks:
        closers = "\n".join('`' * t for t in reversed(open_ticks))
        content = f"{content}\n{closers}"

    # Outer fence must exceed the longest backtick run in the content.
    longest = max((len(run.group(0)) for run in re.finditer(r"`+", content)), default=0)
    outer = '`' * (max(longest, 2) + 1)

    # Escape line-leading '#' with a zero-width space.
    safe = _escape_heading(content)
    return f"{outer}{lang}\n{safe}\n{outer}"
|
||
|
||
|
||
|
||
def _escape_heading(text: str) -> str:
|
||
"""转义文本中行首的 # 符号,防止被 Markdown 解析为标题。
|
||
在 # 前插入零宽空格 (\\u200b)。
|
||
"""
|
||
lines = text.split('\n')
|
||
out = []
|
||
for line in lines:
|
||
if line.lstrip().startswith('#'):
|
||
# 找到第一个 # 的位置,在前面插入零宽空格
|
||
idx = 0
|
||
while idx < len(line) and line[idx] in (' ', '\t'):
|
||
idx += 1
|
||
out.append(line[:idx] + '\u200b' + line[idx:])
|
||
else:
|
||
out.append(line)
|
||
return '\n'.join(out)
|
||
|
||
|
||
def hash8(text: str) -> str:
    """Return the first 8 hex chars of the SHA-256 digest of *text* (UTF-8)."""
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return digest[:8]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 系统提示词去重
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def save_system_prompt(text: str) -> str:
    """Persist a system prompt under _system_prompts/ and return its filename.

    Content-addressed by an 8-char SHA-256 prefix, so an identical prompt is
    written to disk only once; later calls just return the reference name.
    """
    digest = hash8(text)
    filename = f"sp_{digest}.md"
    filepath = os.path.join(SYSTEM_PROMPTS_DIR, filename)
    if not os.path.isfile(filepath):
        os.makedirs(SYSTEM_PROMPTS_DIR, exist_ok=True)
        with open(filepath, "w", encoding="utf-8") as fh:
            fh.write(f"# 系统提示词 (hash: {digest})\n\n")
            fh.write(text)
    return filename
|
||
|
||
|
||
def is_system_prompt(text: str) -> bool:
    """Detect a Kiro-injected system prompt (<identity>/<capabilities> blocks).

    Heuristic: <identity> near the very start, <capabilities> within the first
    500 chars, or the canonical "You are Kiro" opener.
    """
    if not text:
        return False
    return (
        "<identity>" in text[:200]
        or "<capabilities>" in text[:500]
        or text.strip().startswith("You are Kiro")
    )
|
||
|
||
|
||
def is_steering_block(text: str) -> bool:
    """Return True when *text* is a steering-reminder injection block.

    The marker must appear within the first 100 characters, matching how
    Kiro prefixes injected steering content.
    """
    if not text:
        # Guard: the original sliced unconditionally, so a None input raised
        # TypeError; empty/None input is simply "not a steering block".
        return False
    return "<steering-reminder>" in text[:100]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# ID 替换与路径还原
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
# kiro-diff URI shape: kiro-diff:/path?commitId=xxx&executionId=yyy
# ('=' may appear percent-encoded as %3D).
KIRO_DIFF_PATTERN = re.compile(
    r'kiro-diff:(/[^?]+)\?commitId(?:%3D|=)([^&]+)&executionId(?:%3D|=)([^"\'}\s]+)'
)


def resolve_kiro_diff_uri(uri: str) -> str:
    """Rewrite a kiro-diff: URI as a readable file-path description.

    Non-matching input is returned unchanged.
    """
    m = KIRO_DIFF_PATTERN.search(uri)
    if not m:
        return uri
    path, commit = m.group(1), m.group(2)
    return f"{path} (版本: {commit[:8]})"


def resolve_ids_in_text(text: str) -> str:
    """Replace known opaque ID patterns in *text* with readable equivalents."""
    if not text or not isinstance(text, str):
        return str(text) if text else ""
    # kiro-diff URIs → file-snapshot annotations.
    text = KIRO_DIFF_PATTERN.sub(
        lambda m: f'[文件快照] {m.group(1)} (版本 {m.group(2)[:8]})',
        text,
    )
    # file:///c%3A/ percent-encoded drive letters → plain 'C:/'.
    text = re.sub(
        r'file:///([a-zA-Z])%3A/',
        lambda m: f'{m.group(1).upper()}:/',
        text,
    )
    return text


def resolve_tool_args(name: str, args: dict) -> dict:
    """Return a copy of tool-call *args* with IDs/URIs made human-readable."""
    readable = dict(args)
    # 'target' of document-type tools may be a kiro-diff URI.
    if isinstance(readable.get("target"), str):
        readable["target"] = resolve_kiro_diff_uri(readable["target"])
    # 'path' from editCode / strReplace.
    if isinstance(readable.get("path"), str):
        readable["path"] = resolve_ids_in_text(readable["path"])
    # Document-entry bodies.
    for key in ("modified", "original", "local"):
        if isinstance(readable.get(key), str):
            readable[key] = resolve_ids_in_text(readable[key])
    return readable
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 定位逻辑
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def find_kiro_agent_dir(global_storage: str) -> Optional[str]:
    """Return the kiro.kiroagent directory under *global_storage*, or None."""
    candidate = os.path.join(global_storage, "kiro.kiroagent")
    if os.path.isdir(candidate):
        return candidate
    return None
|
||
|
||
|
||
def decode_base64url_dir(dirname: str) -> str:
    """Decode a base64url workspace-session directory name back to a path.

    Directory names store the '==' padding as '__'; restore it before
    decoding.  Returns "" on any decode failure.
    """
    try:
        padded = dirname.replace("__", "==")
        return base64.urlsafe_b64decode(padded).decode("utf-8", errors="replace")
    except Exception:
        return ""
|
||
|
||
|
||
def find_workspace_session_dir(agent_dir: str, workspace_path: str) -> Optional[str]:
    """Find the workspace-sessions entry whose decoded name equals *workspace_path*.

    Comparison is case-insensitive with separators normalized to '/'.
    """
    ws_root = os.path.join(agent_dir, "workspace-sessions")
    if not os.path.isdir(ws_root):
        return None

    def _norm(p: str) -> str:
        return p.replace("\\", "/").rstrip("/").lower()

    wanted = _norm(workspace_path)
    for entry in os.scandir(ws_root):
        if entry.is_dir() and _norm(decode_base64url_dir(entry.name)) == wanted:
            return entry.path
    return None
|
||
|
||
|
||
def load_sessions_json(session_dir: str) -> list[dict]:
    """Load sessions.json from *session_dir*.

    Supports both layouts: a bare list, or {"sessions": [...]}.  Returns []
    on a missing file, a parse error, or an unexpected shape — the original
    returned `data["sessions"]` unchecked, so a malformed value (e.g. a dict)
    leaked out and broke callers expecting a list.
    """
    sessions_file = os.path.join(session_dir, "sessions.json")
    if not os.path.isfile(sessions_file):
        return []
    try:
        with open(sessions_file, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except Exception:
        return []
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        sessions = data.get("sessions")
        # Guard: honor the declared list[dict] return contract.
        if isinstance(sessions, list):
            return sessions
    return []
|
||
|
||
|
||
def load_session_detail(session_dir: str, session_id: str) -> Optional[dict]:
    """Load {session_id}.json from *session_dir*; None when missing or unparsable."""
    detail_path = os.path.join(session_dir, f"{session_id}.json")
    if not os.path.isfile(detail_path):
        return None
    try:
        with open(detail_path, "r", encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return None
|
||
|
||
|
||
def find_all_manifests(agent_dir: str) -> list[tuple[str, str]]:
    """Scan *agent_dir* for 32-hex subdirectories that contain a manifest.

    Returns (hex_dir_name, manifest_path) pairs.
    """
    hex_name = re.compile(r"^[0-9a-f]{32}$")
    found = []
    for entry in os.scandir(agent_dir):
        if not (entry.is_dir() and hex_name.match(entry.name)):
            continue
        manifest = os.path.join(entry.path, MANIFEST_FILENAME)
        if os.path.isfile(manifest):
            found.append((entry.name, manifest))
    return found
|
||
|
||
|
||
def load_manifest(manifest_path: str) -> list[dict]:
    """Load an execution manifest file.

    Supports the legacy bare-list layout and the newer Kiro layout
    {"executions": [...], "version": ...}.  Returns [] on a missing file,
    a parse error, or an unexpected shape — the original returned
    `data["executions"]` unchecked, so a malformed value escaped the
    declared list[dict] contract.
    """
    try:
        with open(manifest_path, "r", encoding="utf-8") as fh:
            data = json.load(fh)
    except Exception:
        return []
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        executions = data.get("executions")
        # Guard: only hand back an actual list.
        if isinstance(executions, list):
            return executions
    return []
|
||
|
||
|
||
|
||
def find_execution_log(agent_dir: str, hex_dir: str, execution: dict) -> Optional[str]:
    """Locate the full execution-log file for *execution* under agent_dir/hex_dir.

    Scans two directory levels deep for files of at least 1000 bytes whose
    JSON executionId matches.  The id must appear in the first 500 chars
    before the (potentially expensive) full JSON parse is attempted.
    """
    exec_id = execution.get("executionId", "")
    base = os.path.join(agent_dir, hex_dir)
    for sub_dir in os.scandir(base):
        if not sub_dir.is_dir():
            continue
        for candidate in os.scandir(sub_dir.path):
            # Real logs are sizeable; skip stubs to avoid needless parsing.
            if not candidate.is_file() or candidate.stat().st_size < 1000:
                continue
            try:
                with open(candidate.path, "r", encoding="utf-8") as fh:
                    if exec_id not in fh.read(500):
                        continue
                    fh.seek(0)
                    if json.load(fh).get("executionId") == exec_id:
                        return candidate.path
            except Exception:
                continue
    return None
|
||
|
||
|
||
def find_all_executions(
    agent_dir: str,
    chat_session_ids: Optional[set[str]] = None,
    execution_id: Optional[str] = None,
) -> list[dict]:
    """Collect matching executions from every manifest, newest endTime first.

    When *execution_id* is given, the first exact or prefix match is returned
    alone.  When *chat_session_ids* is given, executions tagged with a
    chatSessionId outside that set are skipped (untagged ones are kept).
    Each returned entry is annotated with its source directory in "_hex_dir".
    """
    matched = []
    for hex_dir, manifest_path in find_all_manifests(agent_dir):
        for entry in load_manifest(manifest_path):
            entry["_hex_dir"] = hex_dir
            if execution_id:
                eid = entry.get("executionId", "")
                if eid == execution_id or eid.startswith(execution_id):
                    return [entry]
            # Filter only when the execution carries a chatSessionId that is
            # not part of this workspace's session set.
            csid = entry.get("chatSessionId")
            if chat_session_ids and csid and csid not in chat_session_ids:
                continue
            matched.append(entry)
    matched.sort(key=lambda e: e.get("endTime", 0), reverse=True)
    return matched
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 解析逻辑
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def parse_messages(messages: list) -> list[dict]:
    """Parse context.messages, deduplicating system prompts and resolving IDs.

    Returns one dict per message with index/role/messageId and a list of
    normalized entry dicts.
    """
    conversation = []
    for i, msg in enumerate(messages):
        parsed = []
        for entry in msg.get("entries", []):
            if not isinstance(entry, dict):
                continue
            etype = entry.get("type", "unknown")
            if etype == "text":
                text = entry.get("text", "")
                if is_system_prompt(text):
                    # System prompts are saved once and referenced by filename.
                    parsed.append({
                        "type": "system_prompt_ref",
                        "ref_file": save_system_prompt(text),
                        "char_count": len(text),
                    })
                elif is_steering_block(text):
                    # Collapse steering content, keeping only the file names.
                    steering_files = re.findall(r'(\w[\w-]+\.md):', text)
                    parsed.append({
                        "type": "steering_ref",
                        "files": steering_files or ["(steering block)"],
                        "char_count": len(text),
                    })
                else:
                    parsed.append({"type": "text", "text": resolve_ids_in_text(text)})
            elif etype == "toolUse":
                parsed.append({
                    "type": "toolUse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "args": resolve_tool_args(entry.get("name", ""), entry.get("args", {})),
                })
            elif etype == "toolUseResponse":
                parsed.append({
                    "type": "toolUseResponse",
                    "id": entry.get("id"),
                    "name": entry.get("name"),
                    "message": resolve_ids_in_text(entry.get("message", "")),
                    "success": entry.get("success"),
                })
            elif etype == "document":
                doc = entry.get("document", {})
                doc_type = doc.get("type", "")
                if doc_type == "steering":
                    # Steering documents: keep just the display name.
                    parsed.append({
                        "type": "steering_doc",
                        "name": doc.get("displayName", "") or "steering",
                    })
                else:
                    target = doc.get("target", "")
                    parsed.append({
                        "type": "document",
                        "doc_type": doc_type,
                        "target": resolve_ids_in_text(target) if target else "",
                    })
            else:
                # Unknown entry kinds: record the type and key names only.
                parsed.append({"type": etype, "raw_keys": list(entry.keys())})
        conversation.append({
            "index": i,
            "role": msg.get("role", "?"),
            "messageId": msg.get("messageId", "?"),
            "entries": parsed,
        })
    return conversation
|
||
|
||
|
||
def parse_actions(actions: list) -> list[dict]:
    """Parse the actions list into a timeline, compressing content-less model actions."""
    timeline = []
    for i, action in enumerate(actions):
        atype = action.get("actionType", "")
        astate = action.get("actionState", "")

        # A model action with neither input nor output carries no content:
        # keep only a one-line summary.
        if atype == "model" and "output" not in action and "input" not in action:
            timeline.append({
                "index": i,
                "actionType": "model",
                "actionState": astate,
                "emittedAt": ts_fmt(action.get("emittedAt")),
                "_compressed": True,
            })
            continue

        entry = {
            "index": i,
            "actionId": action.get("actionId"),
            "actionType": atype,
            "actionState": astate,
            "emittedAt": ts_fmt(action.get("emittedAt")),
        }
        if action.get("subExecutionId"):
            entry["subExecutionId"] = action["subExecutionId"]
        if action.get("endTime"):
            entry["endTime"] = ts_fmt(action["endTime"])

        for k in ("intentResult", "input", "output"):
            if k not in action:
                continue
            val = action[k]
            if isinstance(val, dict):
                val = dict(val)  # work on a copy; never mutate the raw log
                # write/create actions inline the file change as
                # originalContent/modifiedContent — lift it into _file_change ...
                if k == "input" and ("originalContent" in val or "modifiedContent" in val):
                    file_path = val.get("file", val.get("path", "?"))
                    entry["_file_change"] = {
                        "file": resolve_ids_in_text(str(file_path)),
                        "original": val.get("originalContent", ""),
                        "modified": val.get("modifiedContent", ""),
                    }
                    # ... and keep only the slim metadata under entry[k].
                    entry[k] = {
                        vk: resolve_ids_in_text(str(vv)) if isinstance(vv, str) else vv
                        for vk, vv in val.items()
                        if vk not in ("originalContent", "modifiedContent")
                    }
                    continue
                # Resolve IDs in every string value of the payload.
                for vk, vv in val.items():
                    if isinstance(vv, str):
                        val[vk] = resolve_ids_in_text(vv)
            entry[k] = val
        timeline.append(entry)
    return timeline
|
||
|
||
|
||
def extract_sub_execution_ids(actions: list) -> list[str]:
    """Return the unique subExecutionIds from *actions*, in first-appearance order."""
    ids = (action.get("subExecutionId") for action in actions)
    return list(dict.fromkeys(sid for sid in ids if sid))
|
||
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# Diff 快照收集
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def collect_diffs(agent_dir: str, hex_dir: str, execution: dict) -> dict[str, dict]:
    """Placeholder for diff-snapshot collection; always returns an empty mapping.

    Kiro inlines originalContent/modifiedContent directly in write/create
    action inputs, so file diffs are extracted from actions instead of from
    the on-disk snapshot directory.
    """
    return {}
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# Prompt Log 匹配
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
PROMPT_LOG_DIR = os.path.join("docs", "audit", "prompt_logs")


def find_matching_prompt_log(start_time_ms: int) -> Optional[str]:
    """Match an execution startTime to the closest prompt_log file.

    Files are named prompt_log_YYYYMMDD_HHMMSS.md; the closest one within a
    five-minute window of *start_time_ms* wins.  Returns None when the log
    directory is missing, the timestamp is invalid, or nothing is in range.
    """
    if not os.path.isdir(PROMPT_LOG_DIR):
        return None
    try:
        exec_dt = datetime.fromtimestamp(start_time_ms / 1000, tz=CST)
    except Exception:
        return None

    best_match = None
    best_delta = float("inf")
    name_re = re.compile(r"prompt_log_(\d{8}_\d{6})\.md$")

    for candidate in os.scandir(PROMPT_LOG_DIR):
        if not candidate.is_file():
            continue
        m = name_re.match(candidate.name)
        if not m:
            continue
        try:
            log_dt = datetime.strptime(m.group(1), "%Y%m%d_%H%M%S").replace(tzinfo=CST)
        except Exception:
            continue
        delta = abs((exec_dt - log_dt).total_seconds())
        # Five-minute window; keep the closest candidate.
        if delta < 300 and delta < best_delta:
            best_delta = delta
            best_match = candidate.path
    return best_match
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# Markdown 生成
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def _msg_semantic_label(msg: dict) -> str:
|
||
"""为对话消息生成语义标签,用于快速定位。"""
|
||
entries = msg.get("entries", [])
|
||
if not entries:
|
||
return ""
|
||
parts = []
|
||
for e in entries:
|
||
et = e["type"]
|
||
if et == "system_prompt_ref":
|
||
parts.append("系统提示词")
|
||
elif et == "steering_ref":
|
||
parts.append(f"Steering({len(e.get('files', []))})")
|
||
elif et == "steering_doc":
|
||
parts.append(f"Steering:`{e.get('name', '?')}`")
|
||
elif et == "toolUse":
|
||
name = e.get("name", "?")
|
||
# 提取关键参数作为上下文
|
||
args = e.get("args", {})
|
||
ctx = ""
|
||
if name in ("readFile", "readCode", "readMultipleFiles"):
|
||
ctx = args.get("path", "") or ",".join(args.get("paths", [])[:2])
|
||
elif name in ("fsWrite", "strReplace", "editCode"):
|
||
ctx = args.get("path", "")
|
||
elif name == "grepSearch":
|
||
ctx = args.get("query", "")[:30]
|
||
elif name == "invokeSubAgent":
|
||
ctx = args.get("name", "")
|
||
elif name == "executePwsh":
|
||
ctx = (args.get("command", "") or "")[:40]
|
||
elif name == "taskStatus":
|
||
ctx = args.get("status", "")
|
||
if ctx:
|
||
parts.append(f"调用 `{name}` → {ctx}")
|
||
else:
|
||
parts.append(f"调用 `{name}`")
|
||
elif et == "toolUseResponse":
|
||
name = e.get("name", "?")
|
||
ok = "✅" if e.get("success") else "❌"
|
||
parts.append(f"结果 `{name}` {ok}")
|
||
elif et == "document":
|
||
parts.append(f"文档:{e.get('doc_type', '?')}")
|
||
elif et == "text":
|
||
# 文本内容:提取前 50 字符作为预览(bot 和 human 都加)
|
||
role = msg.get("role", "")
|
||
if role in ("bot", "human"):
|
||
text = (e.get("text") or "").strip()
|
||
if text:
|
||
preview = text[:50].replace("\n", " ")
|
||
if len(text) > 50:
|
||
preview += "…"
|
||
icon = "💬" if role == "bot" else "📝"
|
||
parts.append(f"{icon} `{preview}`")
|
||
return ", ".join(parts) if parts else ""
|
||
|
||
|
||
def _step_semantic_label(step: dict) -> str:
|
||
"""为 action step 生成带图标的语义标签。"""
|
||
at = step.get("actionType", "?")
|
||
state = step.get("actionState", "?")
|
||
fc = step.get("_file_change")
|
||
sub_eid = step.get("subExecutionId")
|
||
|
||
# 状态图标
|
||
if state == "Error":
|
||
state_icon = "❌"
|
||
elif state in ("Success", "Accepted"):
|
||
state_icon = "✅"
|
||
else:
|
||
state_icon = "⏳"
|
||
|
||
# 类型图标 + 上下文
|
||
if at in ("write", "append") and fc:
|
||
fname = fc.get("file", "?")
|
||
short = fname.rsplit("/", 1)[-1] if "/" in fname else fname
|
||
orig = fc.get("original", "")
|
||
if orig:
|
||
return f"⚡ `{at}` 修改 `{short}` {state_icon}"
|
||
else:
|
||
return f"⚡ `{at}` 新建 `{short}` {state_icon}"
|
||
elif at == "invokeSubAgent":
|
||
inp = step.get("input", {})
|
||
# Kiro 原始 log 用 subAgentName,工具 schema 用 name
|
||
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
|
||
return f"🔀 `invokeSubAgent` → {agent_name} {state_icon}"
|
||
elif at == "subagentResponse":
|
||
return f"🔀 `subagentResponse` {state_icon}"
|
||
elif at in ("readFiles", "readCode"):
|
||
inp = step.get("input", {})
|
||
if isinstance(inp, dict):
|
||
files = inp.get("files", [])
|
||
if files and isinstance(files[0], dict):
|
||
paths = [f.get("path", "?") for f in files[:2]]
|
||
else:
|
||
paths = [str(f) for f in files[:2]]
|
||
ctx = ", ".join(paths)
|
||
else:
|
||
ctx = ""
|
||
return f"📖 `{at}` {ctx} {state_icon}" if ctx else f"📖 `{at}` {state_icon}"
|
||
elif at == "search":
|
||
return f"🔍 `search` {state_icon}"
|
||
elif at == "say":
|
||
return f"💬 `say` {state_icon}"
|
||
elif at == "taskStatus":
|
||
return f"📋 `taskStatus` {state_icon}"
|
||
elif at == "steering":
|
||
return f"📄 `steering` {state_icon}"
|
||
elif at == "runCommand":
|
||
return f"🖥️ `runCommand` {state_icon}"
|
||
elif at == "getDiagnostics":
|
||
return f"🩺 `getDiagnostics` {state_icon}"
|
||
elif at == "ContextualHookInvoked":
|
||
inp = step.get("input", {})
|
||
hook_name = inp.get("name", "?") if isinstance(inp, dict) else "?"
|
||
return f"🪝 Hook `{hook_name}` {state_icon}"
|
||
elif at == "intentClassification":
|
||
ir = step.get("intentResult", {})
|
||
cls = ir.get("classification", "?") if isinstance(ir, dict) else "?"
|
||
return f"🎯 意图: `{cls}` {state_icon}"
|
||
elif at == "replace":
|
||
inp = step.get("input", {})
|
||
path = inp.get("file", inp.get("path", "?")) if isinstance(inp, dict) else "?"
|
||
short = path.rsplit("/", 1)[-1] if "/" in str(path) else str(path)
|
||
return f"✏️ `replace` `{short}` {state_icon}"
|
||
else:
|
||
return f"`{at}` [{state}]"
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def _load_previous_actions_md(main_md_path: str) -> str:
|
||
"""从已有的 main.md 文件中提取 '## 4. Actions 时间线' 部分的原始 Markdown。
|
||
|
||
用于覆盖模式下,将前几轮 execution 的 actions 追加到新一轮前面。
|
||
返回空字符串表示无历史 actions。
|
||
"""
|
||
if not os.path.isfile(main_md_path):
|
||
return ""
|
||
try:
|
||
with open(main_md_path, "r", encoding="utf-8") as f:
|
||
content = f.read()
|
||
except Exception:
|
||
return ""
|
||
|
||
# 定位 "## 4. Actions 时间线" 到下一个 "## 5." 或文件末尾
|
||
start_marker = "## 4. Actions 时间线"
|
||
end_marker = "## 5."
|
||
start_idx = content.find(start_marker)
|
||
if start_idx == -1:
|
||
return ""
|
||
# 跳过标题行本身,从下一行开始
|
||
body_start = content.find("\n", start_idx)
|
||
if body_start == -1:
|
||
return ""
|
||
body_start += 1
|
||
|
||
end_idx = content.find(end_marker, body_start)
|
||
if end_idx == -1:
|
||
actions_md = content[body_start:]
|
||
else:
|
||
actions_md = content[body_start:end_idx]
|
||
|
||
return actions_md.strip()
|
||
|
||
|
||
def _merge_summaries(prev_summary: dict, cur_summary: dict) -> dict:
|
||
"""增量合并两轮 execution 的结构化摘要。
|
||
|
||
规则:
|
||
- duration_s: 累加
|
||
- action_count: 累加
|
||
- msg_count: 取最新(context.messages 是累积超集)
|
||
- files_modified / files_created: 并集(保序)
|
||
- sub_agents: 并集
|
||
- errors: 拼接
|
||
- description: 取最新非空值
|
||
- workflow / status: 取最新
|
||
"""
|
||
if not prev_summary:
|
||
return dict(cur_summary)
|
||
|
||
merged = dict(cur_summary)
|
||
merged["duration_s"] = round(
|
||
prev_summary.get("duration_s", 0) + cur_summary.get("duration_s", 0), 1
|
||
)
|
||
merged["action_count"] = (
|
||
prev_summary.get("action_count", 0) + cur_summary.get("action_count", 0)
|
||
)
|
||
# msg_count: 取较大值(最新轮次的 context.messages 是累积超集)
|
||
merged["msg_count"] = max(
|
||
prev_summary.get("msg_count", 0), cur_summary.get("msg_count", 0)
|
||
)
|
||
# 文件列表:并集,保序
|
||
def _union_lists(a: list, b: list) -> list:
|
||
seen = set()
|
||
result = []
|
||
for item in a + b:
|
||
if item not in seen:
|
||
seen.add(item)
|
||
result.append(item)
|
||
return result
|
||
|
||
merged["files_modified"] = _union_lists(
|
||
prev_summary.get("files_modified", []),
|
||
cur_summary.get("files_modified", []),
|
||
)
|
||
merged["files_created"] = _union_lists(
|
||
prev_summary.get("files_created", []),
|
||
cur_summary.get("files_created", []),
|
||
)
|
||
merged["sub_agents"] = _union_lists(
|
||
prev_summary.get("sub_agents", []),
|
||
cur_summary.get("sub_agents", []),
|
||
)
|
||
merged["errors"] = (
|
||
prev_summary.get("errors", []) + cur_summary.get("errors", [])
|
||
)
|
||
# description: 取最新非空值
|
||
if cur_summary.get("description"):
|
||
merged["description"] = cur_summary["description"]
|
||
elif prev_summary.get("description"):
|
||
merged["description"] = prev_summary["description"]
|
||
|
||
return merged
|
||
|
||
|
||
def _build_execution_summary(
|
||
log: dict,
|
||
conversation: list[dict],
|
||
timeline: list[dict],
|
||
sub_file_map: Optional[dict[str, str]] = None,
|
||
) -> dict:
|
||
"""构建结构化执行摘要(零 LLM 成本,纯规则化提取)。
|
||
返回 dict 供 md 渲染和索引存储共用。
|
||
"""
|
||
dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
|
||
|
||
files_modified = []
|
||
files_created = []
|
||
sub_agents = []
|
||
errors = []
|
||
|
||
for step in timeline:
|
||
if step.get("_compressed"):
|
||
continue
|
||
idx = step.get("index", "?")
|
||
at = step.get("actionType", "?")
|
||
state = step.get("actionState", "?")
|
||
|
||
fc = step.get("_file_change")
|
||
if fc:
|
||
fname = fc.get("file", "?")
|
||
if fc.get("original"):
|
||
files_modified.append(fname)
|
||
else:
|
||
files_created.append(fname)
|
||
|
||
if at == "invokeSubAgent":
|
||
inp = step.get("input", {})
|
||
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
|
||
sub_agents.append(agent_name)
|
||
|
||
if state == "Error":
|
||
errors.append(f"Step {idx}: `{at}`")
|
||
|
||
for msg in conversation:
|
||
for e in msg.get("entries", []):
|
||
if e.get("type") == "toolUseResponse" and not e.get("success"):
|
||
errors.append(f"Msg {msg['index']}: `{e.get('name', '?')}`")
|
||
|
||
# 去重文件名
|
||
files_modified = list(dict.fromkeys(files_modified))
|
||
files_created = list(dict.fromkeys(files_created))
|
||
|
||
# description 由外部 LLM 生成(百炼 API),提取阶段不生成
|
||
description = ""
|
||
|
||
return {
|
||
"workflow": log.get("workflowType", "?"),
|
||
"status": log.get("status", "?"),
|
||
"duration_s": round(dur, 1),
|
||
"msg_count": len(conversation),
|
||
"action_count": len(timeline),
|
||
"files_modified": files_modified,
|
||
"files_created": files_created,
|
||
"sub_agents": sub_agents,
|
||
"errors": errors,
|
||
"description": description,
|
||
}
|
||
|
||
|
||
def _render_summary_md(summary: dict, sub_file_map: Optional[dict[str, str]] = None) -> str:
|
||
"""将结构化摘要渲染为 Markdown 文本(放在文件最前面)。"""
|
||
L = []
|
||
# 一句话概览
|
||
status_icon = "✅" if summary["status"] == "succeed" else "❌"
|
||
L.append(f"{status_icon} `{summary['workflow']}` | {summary['duration_s']}s | "
|
||
f"{summary['msg_count']} msgs | {summary['action_count']} actions")
|
||
L.append("")
|
||
|
||
desc = summary.get("description", "")
|
||
if desc:
|
||
L.append(f"> {desc}")
|
||
L.append("")
|
||
|
||
fm = summary["files_modified"]
|
||
fc = summary["files_created"]
|
||
if fm or fc:
|
||
L.append(f"**文件变更** (修改 {len(fm)} / 新建 {len(fc)})")
|
||
for f in fm:
|
||
L.append(f"- ⚡ 修改 `{f}`")
|
||
for f in fc:
|
||
L.append(f"- ✨ 新建 `{f}`")
|
||
L.append("")
|
||
|
||
sa = summary["sub_agents"]
|
||
if sa:
|
||
L.append(f"**子代理** ({len(sa)}): {', '.join(f'`{a}`' for a in sa)}")
|
||
L.append("")
|
||
|
||
errs = summary["errors"]
|
||
if errs:
|
||
L.append(f"**错误** ({len(errs)})")
|
||
for e in errs:
|
||
L.append(f"- ❌ {e}")
|
||
L.append("")
|
||
|
||
if not fm and not fc and not sa and not errs:
|
||
L.append("*(无文件变更、子代理调用或错误)*")
|
||
|
||
return "\n".join(L)
|
||
|
||
|
||
def _build_nav_summary(
|
||
conversation: list[dict],
|
||
timeline: list[dict],
|
||
sub_file_map: Optional[dict[str, str]] = None,
|
||
) -> str:
|
||
"""生成快速导航摘要:文件变更、子代理、错误。"""
|
||
file_changes = []
|
||
sub_agents = []
|
||
errors = []
|
||
|
||
for step in timeline:
|
||
if step.get("_compressed"):
|
||
continue
|
||
idx = step.get("index", "?")
|
||
at = step.get("actionType", "?")
|
||
state = step.get("actionState", "?")
|
||
|
||
# 文件变更
|
||
fc = step.get("_file_change")
|
||
if fc:
|
||
fname = fc.get("file", "?")
|
||
orig = fc.get("original", "")
|
||
action = "修改" if orig else "新建"
|
||
file_changes.append(f"- Step {idx}: ⚡ {action} `{fname}`")
|
||
|
||
# 子代理
|
||
if at == "invokeSubAgent":
|
||
inp = step.get("input", {})
|
||
agent_name = (inp.get("subAgentName") or inp.get("name", "?")) if isinstance(inp, dict) else "?"
|
||
sub_eid = step.get("subExecutionId", "")
|
||
sub_path = ""
|
||
if sub_file_map and sub_eid and sub_eid in sub_file_map:
|
||
sub_path = f" → `{sub_file_map[sub_eid].replace(chr(92), '/')}`"
|
||
sub_agents.append(f"- Step {idx}: 🔀 `{agent_name}`{sub_path}")
|
||
|
||
# 错误
|
||
if state == "Error":
|
||
errors.append(f"- Step {idx}: ❌ `{at}`")
|
||
|
||
# 对话中的错误工具结果
|
||
for msg in conversation:
|
||
for e in msg.get("entries", []):
|
||
if e.get("type") == "toolUseResponse" and not e.get("success"):
|
||
errors.append(f"- Msg {msg['index']}: ❌ `{e.get('name', '?')}`")
|
||
|
||
lines = []
|
||
if file_changes:
|
||
lines.append(f"**文件变更** ({len(file_changes)})")
|
||
lines.extend(file_changes)
|
||
lines.append("")
|
||
if sub_agents:
|
||
lines.append(f"**子代理调用** ({len(sub_agents)})")
|
||
lines.extend(sub_agents)
|
||
lines.append("")
|
||
if errors:
|
||
lines.append(f"**错误** ({len(errors)})")
|
||
lines.extend(errors)
|
||
lines.append("")
|
||
if not lines:
|
||
lines.append("*(无文件变更、子代理调用或错误)*")
|
||
|
||
return "\n".join(lines)
|
||
|
||
|
||
def generate_full_record(
    log: dict,
    conversation: list[dict],
    timeline: list[dict],
    diffs: dict[str, dict],
    session_info: Optional[dict] = None,
    prompt_log_path: Optional[str] = None,
    is_sub: bool = False,
    sub_index: int = 0,
    sub_file_map: Optional[dict[str, str]] = None,
    accumulated_actions_md: str = "",
    merged_summary: Optional[dict] = None,
    execution_round: int = 1,
) -> tuple[str, dict, dict]:
    """Render the full Markdown record for a single execution (overwrite mode).

    In overwrite mode:
    - Conversation section: taken directly from the latest execution's
      context.messages (a cumulative superset, so no dedup needed)
    - Actions timeline: accumulated_actions_md from earlier rounds, then this
      round's timeline appended after it
    - Execution summary: merged_summary (pre-merged with earlier rounds)

    Args:
        log: raw execution-log JSON
        conversation: output of parse_messages
        timeline: output of parse_actions
        diffs: output of collect_diffs
            # NOTE(review): `diffs` is not referenced anywhere in this body —
            # presumably kept for signature stability; confirm before removing.
        session_info: session configuration (main execution only)
        prompt_log_path: path of the matched prompt_log file
        is_sub: whether this is a sub-agent execution
        sub_index: sub-agent ordinal (1-based)
        sub_file_map: subExecutionId -> extracted sub-agent file path
        accumulated_actions_md: actions Markdown from earlier execution rounds
            (prepended before this round's actions)
        merged_summary: merged structured summary (including earlier rounds);
            when None, this round's own summary is displayed
        execution_round: which execution round this is (used for labelling)

    Returns:
        3-tuple ``(markdown_text, this_round_summary, displayed_summary)``.
        (Annotation fixed: the function returns three values, not two.)
    """
    L = []
    exec_id = log.get("executionId", "?")
    chat_id = log.get("chatSessionId", "?")

    # Build the structured summary (shared by the md body and the index).
    _summary = _build_execution_summary(log, conversation, timeline, sub_file_map)
    # Overwrite mode: prefer the merged summary (accumulates earlier rounds).
    display_summary = merged_summary if merged_summary else _summary

    # Title.
    if is_sub:
        L.append(f"# 子代理 Execution #{sub_index}\n")
    else:
        L.append("# Kiro 会话全量记录\n")
    L.append(f"> 生成时间: {datetime.now(CST).strftime('%Y-%m-%d %H:%M:%S')} CST\n")

    # ── 0. Execution summary (top of file: an AI reader gets the gist from the first ~20 lines) ──
    L.append("## 📋 执行摘要\n")
    if execution_round > 1:
        L.append(f"*(合并 {execution_round} 轮 execution)*\n")
    L.append(_render_summary_md(display_summary, sub_file_map))
    L.append("")

    # ── 1. Metadata ──
    L.append("## 1. 元数据\n")
    L.append("| 字段 | 值 |")
    L.append("|------|-----|")
    L.append(f"| executionId | `{exec_id}` |")
    L.append(f"| chatSessionId | `{chat_id}` |")
    L.append(f"| workflowType | `{log.get('workflowType', '?')}` |")
    L.append(f"| autonomyMode | `{log.get('autonomyMode', '?')}` |")
    L.append(f"| status | `{log.get('status', '?')}` |")
    L.append(f"| startTime | `{ts_fmt(log.get('startTime'))}` |")
    L.append(f"| endTime | `{ts_fmt(log.get('endTime'))}` |")
    dur = (log.get("endTime", 0) - log.get("startTime", 0)) / 1000
    L.append(f"| duration | `{dur:.1f}s` |")
    L.append(f"| contextUsage | `{log.get('contextUsagePercentage', 0):.2f}%` |")
    L.append("")

    # Session info is only attached to the main execution.
    if session_info and not is_sub:
        L.append(f"- 会话标题: `{session_info.get('title', '?')}`")
        L.append(f"- 创建时间: `{ts_fmt(int(session_info.get('dateCreated', 0)))}`")
        L.append(f"- 工作区: `{session_info.get('workspaceDirectory', '?')}`")
        L.append("")

    if prompt_log_path and not is_sub:
        rel = os.path.relpath(prompt_log_path).replace("\\", "/")
        L.append(f"- 关联 prompt_log: `{rel}`")
        L.append("")

    # ── 2. User input ──
    L.append("## 2. 用户输入\n")
    input_text = ""
    for msg in log.get("input", {}).get("data", {}).get("messages", []):
        for entry in msg.get("content", msg.get("entries", [])):
            if isinstance(entry, dict) and entry.get("text"):
                input_text += entry["text"] + "\n"
    if input_text.strip():
        L.append(fence(input_text.strip()) + "\n")
    else:
        L.append("*(无用户输入)*\n")

    # ── 3. Conversation ──
    L.append("## 3. 对话记录\n")

    # Overwrite mode: use every message (the latest execution's
    # context.messages is a cumulative superset of earlier rounds).
    h = sum(1 for m in conversation if m["role"] == "human")
    b = sum(1 for m in conversation if m["role"] == "bot")
    t = sum(1 for m in conversation if m["role"] == "tool")
    L.append(f"共 {len(conversation)} 条消息: human={h}, bot={b}, tool={t}\n")

    for msg in conversation:
        emoji = {"human": "👤", "bot": "🤖", "tool": "🔧"}.get(msg["role"], "❓")
        # Semantic label for the message header.
        msg_label = _msg_semantic_label(msg)
        label_suffix = f" — {msg_label}" if msg_label else ""

        # P0: compress empty hook-output messages (signature: a HUMAN message
        # whose only entry is short "Output:\nCommand executed..." text).
        if msg["role"] == "human" and len(msg["entries"]) == 1:
            e0 = msg["entries"][0]
            if e0["type"] == "text":
                _txt = (e0.get("text") or "").strip()
                if _txt.startswith("Output:") and ("Exit Code:" in _txt) and len(_txt) < 200:
                    # Extract the exit code and emit a single header line.
                    import re as _re
                    _ec_match = _re.search(r"Exit Code:\s*(-?\d+)", _txt)
                    _ec = _ec_match.group(1) if _ec_match else "?"
                    L.append(f"### Msg {msg['index']}: 👤 HUMAN — 🪝 Hook 输出 (exit={_ec})\n")
                    continue

        L.append(f"### Msg {msg['index']}: {emoji} {msg['role'].upper()}{label_suffix}\n")

        for entry in msg["entries"]:
            et = entry["type"]

            if et == "system_prompt_ref":
                # Deduplicated system prompt: reference the shared file.
                ref = entry["ref_file"]
                chars = entry["char_count"]
                sp_path = f"docs/audit/session_logs/_system_prompts/{ref}"
                L.append(f"**[系统提示词]** → `{sp_path}` ({chars} 字符)\n")

            elif et == "steering_ref":
                files = ", ".join(entry["files"])
                chars = entry["char_count"]
                L.append(f"**[Steering]** 引用: {files} ({chars} 字符)\n")

            elif et == "text":
                text = entry.get("text", "")
                if not text:
                    L.append("*(空)*\n")
                else:
                    L.append(fence(text) + "\n")

            elif et == "toolUse":
                name = entry.get("name", "?")
                args = entry.get("args", {})
                L.append(f"**[🔧 调用]** `{name}`\n")
                # P1: render strReplace/editCode changes diff-style.
                if name in ("strReplace", "editCode") and isinstance(args, dict):
                    _path = args.get("path", "?")
                    _lang = "python" if _path.endswith(".py") else "sql" if _path.endswith(".sql") else ""
                    L.append(f"- 文件: `{_path}`\n")
                    _old = args.get("oldStr", args.get("old_str", ""))
                    _new = args.get("newStr", args.get("new_str", ""))
                    _sel = args.get("selector", "")
                    _op = args.get("operation", "")
                    _repl = args.get("replacement", "")
                    if _sel:
                        L.append(f"- selector: `{_sel}`" + (f" ({_op})" if _op else ""))
                    if _old:
                        L.append(f"- 删除:\n" + fence(trunc(_old, 2000), _lang))
                    if _new:
                        L.append(f"- 插入:\n" + fence(trunc(_new, 2000), _lang))
                    if _repl:
                        L.append(f"- 替换为:\n" + fence(trunc(_repl, 2000), _lang))
                    L.append("")
                else:
                    L.append(fence(safe_json(args, 5000), "json") + "\n")

            elif et == "toolUseResponse":
                ok = "✅" if entry.get("success") else "❌"
                L.append(f"**[📋 结果]** `{entry.get('name', '?')}` {ok}\n")
                msg_text = entry.get("message", "")
                if msg_text:
                    L.append(fence(trunc(msg_text, 5000)) + "\n")

            elif et == "document":
                target = entry.get("target", "")
                L.append(f"**[📄 文档]** type=`{entry.get('doc_type')}` target=`{target}`\n")

            elif et == "steering_doc":
                L.append(f"**[📄 Steering]** `{entry.get('name', 'steering')}`\n")

            else:
                # Unknown entry type: dump the raw keys for inspection.
                L.append(f"**[{et}]** keys={entry.get('raw_keys')}\n")

    # ── 4. Actions timeline ──
    L.append("## 4. Actions 时间线\n")

    # Overwrite mode: earlier rounds' accumulated actions first, then this round.
    if accumulated_actions_md:
        L.append(accumulated_actions_md)
        L.append("")
        L.append(f"---\n")
        L.append(f"### 🔄 第 {execution_round} 轮 Execution ({ts_fmt(log.get('startTime'))})\n")

    L.append(f"共 {len(timeline)} 个\n")

    for step in timeline:
        # Compressed no-content model actions become a single line.
        if step.get("_compressed"):
            L.append(f"- `model` [{step.get('actionState')}] @ {step.get('emittedAt')}\n")
            continue

        at = step.get('actionType', '?')
        state = step.get('actionState', '?')
        # Semantic label for the step header.
        step_label = _step_semantic_label(step)
        L.append(f"### Step {step['index']}: {step_label} @ {step.get('emittedAt','?')}\n")
        if step.get("subExecutionId"):
            sub_eid = step["subExecutionId"]
            L.append(f"- subExecutionId: `{sub_eid}`")
            # Link to the extracted sub-agent record when we have a mapping.
            if sub_file_map and sub_eid in sub_file_map:
                sub_path = sub_file_map[sub_eid].replace("\\", "/")
                L.append(f"- 子代理记录: `{sub_path}`")
        if step.get("endTime"):
            L.append(f"- endTime: {step['endTime']}")
        # File-change rendering (before/after snippets).
        if step.get("_file_change"):
            fc = step["_file_change"]
            fname = fc.get("file", "?")
            orig = fc.get("original", "")
            mod = fc.get("modified", "")
            lang = "python" if fname.endswith(".py") else "sql" if fname.endswith(".sql") else ""
            L.append(f"- 文件变更: `{fname}`")
            if orig and mod:
                L.append(f"  - 修改前 ({len(orig)} 字符):\n" + fence(trunc(orig, 3000), lang))
                L.append(f"  - 修改后 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
            elif mod:
                L.append(f"  - 新建 ({len(mod)} 字符):\n" + fence(trunc(mod, 3000), lang))
        # Per-actionType specialised rendering.
        _at = step.get("actionType", "")
        if _at == "say":
            _say_msg = (step.get("output") or {}).get("message", "")
            if _say_msg:
                L.append(f"- 💬 AI 回复:\n\n{_say_msg}\n")
            else:
                for k in ("intentResult", "input", "output"):
                    if k in step:
                        L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
        elif _at == "invokeSubAgent":
            _sub_input = step.get("input") or {}
            _sub_prompt = _sub_input.get("prompt", "")
            _sub_name = _sub_input.get("name", "") or _sub_input.get("subAgentName", "") or "?"
            if _sub_prompt:
                L.append(f"- 📤 委托 `{_sub_name}`:\n\n{trunc(_sub_prompt, 3000)}\n")
            _sub_output = step.get("output") or {}
            _sub_resp = _sub_output.get("response", "")
            if _sub_resp:
                L.append(f"- 📥 子代理输出:\n\n{_sub_resp}\n")
            elif not _sub_prompt:
                # Neither prompt nor response available: raw dump.
                for k in ("intentResult", "input", "output"):
                    if k in step:
                        L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
        elif _at == "subagentResponse":
            _sr_input = step.get("input") or {}
            _sr_resp = _sr_input.get("response", "")
            if _sr_resp:
                L.append(f"- 📥 子代理返回:\n\n{_sr_resp}\n")
            else:
                for k in ("intentResult", "input", "output"):
                    if k in step:
                        L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
        elif _at == "ContextualHookInvoked":
            # P1: hook run — extract name, command, exitCode.
            _hi = step.get("input") or {}
            _ho = step.get("output") or {}
            _h_name = _hi.get("name", "?")
            _h_cmd = _ho.get("command", "")
            _h_result = _ho.get("result", {})
            _h_exit = _h_result.get("exitCode", "?") if isinstance(_h_result, dict) else "?"
            _h_out = _h_result.get("output", "") if isinstance(_h_result, dict) else ""
            if _h_cmd:
                L.append(f"- `$ {_h_cmd}`")
            L.append(f"- Exit: `{_h_exit}`")
            if _h_out and _h_out != "Command executed successfully with no output.":
                L.append(f"- 输出:\n" + fence(trunc(_h_out, 2000)))
        elif _at == "intentClassification":
            # P1: intent classification — compress to one line.
            _ir = step.get("intentResult", {})
            _cls = _ir.get("classification", "?") if isinstance(_ir, dict) else "?"
            L.append(f"- 分类结果: `{_cls}`")
        elif _at == "runCommand":
            # P0: command execution — extract command, exitCode, output.
            _rc_in = step.get("input") or {}
            _rc_out = step.get("output") or {}
            _rc_cmd = _rc_in.get("command", "") if isinstance(_rc_in, dict) else ""
            _rc_result = _rc_out.get("result", {}) if isinstance(_rc_out, dict) else {}
            _rc_exit = _rc_result.get("exitCode", "?") if isinstance(_rc_result, dict) else "?"
            _rc_output = _rc_result.get("output", "") if isinstance(_rc_result, dict) else ""
            if _rc_cmd:
                L.append(f"- `$ {_rc_cmd}`")
            L.append(f"- Exit: `{_rc_exit}`")
            if _rc_output:
                L.append(f"- 输出:\n" + fence(trunc(_rc_output, 3000)))
        elif _at == "search":
            # P2: search — extract query and why.
            _s_in = step.get("input") or {}
            _s_query = _s_in.get("query", "") if isinstance(_s_in, dict) else ""
            _s_why = _s_in.get("why", "") if isinstance(_s_in, dict) else ""
            if _s_query:
                L.append(f"- 🔍 `{trunc(_s_query, 100)}`")
            if _s_why:
                L.append(f"- 原因: {trunc(_s_why, 200)}")
            # Show search results when present.
            _s_out = step.get("output")
            if _s_out and isinstance(_s_out, dict):
                _s_files = _s_out.get("files", [])
                if _s_files:
                    L.append(f"- 结果: {len(_s_files)} 个文件")
        elif _at == "steering":
            # P2: steering — extract the list of file names.
            _st_in = step.get("input") or {}
            _st_docs = _st_in.get("documents", []) if isinstance(_st_in, dict) else []
            if _st_docs:
                import urllib.parse
                names = []
                for d in _st_docs[:10]:
                    if isinstance(d, str):
                        # URL-encoded path: keep only the file name.
                        decoded = urllib.parse.unquote(d)
                        name = decoded.rsplit("/", 1)[-1] if "/" in decoded else decoded
                        names.append(name)
                if names:
                    L.append(f"- 文件: {', '.join(names)}")
            else:
                for k in ("input", "output"):
                    if k in step:
                        L.append(f"- {k}:\n" + fence(safe_json(step[k], 3000), "json"))
        elif _at == "getDiagnostics":
            # P2: diagnostics — extract paths and problem count.
            _gd_in = step.get("input") or {}
            _gd_out = step.get("output") or {}
            _gd_paths = _gd_in.get("paths", []) if isinstance(_gd_in, dict) else []
            if _gd_paths:
                L.append(f"- 文件: {', '.join(str(p) for p in _gd_paths[:5])}")
            if isinstance(_gd_out, dict):
                _gd_diags = _gd_out.get("diagnostics", [])
                if isinstance(_gd_diags, list):
                    L.append(f"- 问题数: {len(_gd_diags)}")
                    for d in _gd_diags[:5]:
                        if isinstance(d, dict):
                            L.append(f"  - {d.get('severity', '?')}: {d.get('message', '?')[:100]}")
                elif not _gd_diags:
                    L.append("- ✅ 无问题")
        elif _at in ("readFiles", "readCode"):
            # P3: file reads — show paths only, never contents.
            _rf_in = step.get("input") or {}
            if isinstance(_rf_in, dict):
                _rf_files = _rf_in.get("files", [])
                paths = []
                for f in _rf_files[:5]:
                    if isinstance(f, dict):
                        paths.append(f.get("path", "?"))
                    else:
                        paths.append(str(f))
                if paths:
                    L.append(f"- 文件: {', '.join(paths)}")
        else:
            # Fallback: raw JSON dump of whatever keys the step carries.
            for k in ("intentResult", "input", "output"):
                if k in step:
                    L.append(f"- {k}:\n" + fence(safe_json(step[k], 5000), "json"))
        L.append("")

    # ── 5. Resource usage ──
    usage = log.get("usageSummary", [])
    if usage:
        L.append("## 5. 资源消耗\n")
        L.append("| 工具 | 消耗 | 单位 |")
        L.append("|------|------|------|")
        total = 0
        for u in usage:
            tools = ", ".join(u.get("usedTools", ["-"]))
            amt = u.get("usage", 0)
            total += amt
            L.append(f"| {tools} | {amt} | {u.get('unit', '?')} |")
        L.append(f"| **合计** | **{total:.4f}** | |")
        L.append("")

    return "\n".join(L), _summary, display_summary
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 索引管理
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def load_index() -> dict:
    """Load the slim session index (main conversations only) from INDEX_PATH.

    Returns:
        The parsed index dict, or a fresh v2 skeleton when the file is
        missing, unreadable, or contains invalid JSON. Loading is
        best-effort by design: a corrupt index must not abort extraction.
    """
    if os.path.isfile(INDEX_PATH):
        try:
            with open(INDEX_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError):
            # Narrowed from a blanket `except Exception`: only I/O failures
            # and malformed JSON (JSONDecodeError is a ValueError) fall back;
            # genuine bugs now surface instead of being swallowed.
            pass
    return {"version": 2, "entries": {}}
|
||
|
||
|
||
def load_full_index() -> dict:
    """Load the full session index (main conversations + sub-agents) from INDEX_FULL_PATH.

    Returns:
        The parsed index dict, or a fresh v2 skeleton when the file is
        missing, unreadable, or contains invalid JSON. Loading is
        best-effort by design: a corrupt index must not abort extraction.
    """
    if os.path.isfile(INDEX_FULL_PATH):
        try:
            with open(INDEX_FULL_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError):
            # Narrowed from a blanket `except Exception`: only I/O failures
            # and malformed JSON (JSONDecodeError is a ValueError) fall back;
            # genuine bugs now surface instead of being swallowed.
            pass
    return {"version": 2, "entries": {}}
|
||
|
||
|
||
def save_index(index: dict):
    """Write the slim index to INDEX_PATH and refresh its per-day index files."""
    target_dir = os.path.dirname(INDEX_PATH)
    os.makedirs(target_dir, exist_ok=True)
    with open(INDEX_PATH, "w", encoding="utf-8") as fp:
        json.dump(index, fp, ensure_ascii=False, indent=2)
    # Keep the day-level views in sync with the root index.
    _save_day_indexes(index, "_day_index.json")
|
||
|
||
|
||
def save_full_index(index: dict):
    """Write the full index to INDEX_FULL_PATH and refresh its per-day index files."""
    target_dir = os.path.dirname(INDEX_FULL_PATH)
    os.makedirs(target_dir, exist_ok=True)
    with open(INDEX_FULL_PATH, "w", encoding="utf-8") as fp:
        json.dump(index, fp, ensure_ascii=False, indent=2)
    # Keep the day-level views in sync with the root index.
    _save_day_indexes(index, "_day_index_full.json")
|
||
|
||
|
||
def _save_day_indexes(index: dict, filename: str):
    """Split the root index by the {ym}/{dd} component of each entry's output_dir
    and write one index file per day directory.

    Output path: docs/audit/session_logs/{ym}/{dd}/{filename}
    Each day-level index contains only the entries whose output_dir lives
    under that day's directory.
    """
    entries = index.get("entries", {})
    if not entries:
        return

    # Group entries by their day directory key ("YYYY-MM/DD").
    prefix = SESSION_LOG_DIR.replace("\\", "/")
    per_day: dict[str, dict[str, dict]] = {}
    for exec_id, entry in entries.items():
        out_dir = entry.get("output_dir", "").replace("\\", "/")
        if not out_dir.startswith(prefix):
            continue
        # out_dir looks like: docs/audit/session_logs/2026-03/03/01_abc12345_013337
        # -> day key is "2026-03/03".
        remainder = out_dir[len(prefix):].lstrip("/").split("/")
        if len(remainder) >= 2:
            key = f"{remainder[0]}/{remainder[1]}"
            per_day.setdefault(key, {})[exec_id] = entry

    # One JSON file per day directory.
    for key, bucket in per_day.items():
        directory = os.path.join(SESSION_LOG_DIR, key.replace("/", os.sep))
        os.makedirs(directory, exist_ok=True)
        payload = {"version": 2, "entries": bucket}
        with open(os.path.join(directory, filename), "w", encoding="utf-8") as fp:
            json.dump(payload, fp, ensure_ascii=False, indent=2)
|
||
|
||
|
||
def update_index(index: dict, exec_id: str, output_dir: str, meta: dict,
                 summary: Optional[dict] = None, is_sub: bool = False,
                 parent_exec_id: str = "",
                 chat_session_id: str = "",
                 prev_exec_ids: Optional[list[str]] = None):
    """Add or refresh one index entry, with a structured summary for cheap AI queries.

    Overwrite mode:
    - Only the latest execution of a chatSession keeps a full entry.
    - Old entries listed in prev_exec_ids are collapsed into "superseded" stubs
      (kept in the index so they are never re-extracted).
    - `summary` is expected to be pre-merged by the caller.

    When is_sub=True the entry is tagged as a sub-agent record and
    parent_exec_id is stored alongside it.

    Args:
        index: index dict mutated in place.
        exec_id: executionId keying the new entry.
        output_dir: directory the record was written to.
        meta: basic metadata (chatSessionId/startTime/endTime/status/workflowType).
        summary: structured summary to embed, or None.
        is_sub: mark entry as a sub-agent record.
        parent_exec_id: parent executionId for sub-agent entries.
        chat_session_id: fallback chatSessionId when meta lacks one.
        prev_exec_ids: executionIds of earlier rounds of the same chatSession.
    """
    # Fix: use setdefault so an index without an "entries" key cannot raise
    # KeyError at the final assignment (matches the setdefault pattern used
    # elsewhere in this module for tombstone entries).
    entries = index.setdefault("entries", {})

    # Overwrite mode: collapse earlier entries of this chatSession into
    # "superseded" stubs so extract_latest does not re-extract them.
    if prev_exec_ids:
        for old_eid in prev_exec_ids:
            if old_eid in entries and old_eid != exec_id:
                entries[old_eid] = {
                    "superseded_by": exec_id,
                    "chatSessionId": entries[old_eid].get("chatSessionId", ""),
                }

    entry = {
        "output_dir": output_dir.replace("\\", "/"),
        "chatSessionId": meta.get("chatSessionId", "") or chat_session_id,
        "startTime": meta.get("startTime", ""),
        "endTime": meta.get("endTime", ""),
        "status": meta.get("status", ""),
        "workflowType": meta.get("workflowType", ""),
        "indexed_at": datetime.now(CST).isoformat(),
    }
    if is_sub:
        entry["is_sub"] = True
        if parent_exec_id:
            entry["parent_exec_id"] = parent_exec_id
    if summary:
        entry["summary"] = {
            "duration_s": summary.get("duration_s", 0),
            "msg_count": summary.get("msg_count", 0),
            "action_count": summary.get("action_count", 0),
            "files_modified": summary.get("files_modified", []),
            "files_created": summary.get("files_created", []),
            "sub_agents": summary.get("sub_agents", []),
            "errors": summary.get("errors", []),
        }
        # description comes from an external LLM; surface it at entry level.
        if summary.get("description"):
            entry["description"] = summary["description"]
    entries[exec_id] = entry
|
||
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 主提取逻辑
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def extract_single_execution(
    agent_dir: str,
    hex_dir: str,
    execution: dict,
    session_dir: Optional[str],
    index: dict,
    full_index: Optional[dict] = None,
    sub_mode: bool = False,
    sub_index: int = 0,
    output_base_dir: Optional[str] = None,
    parent_exec_id: str = "",
    force: bool = False,
) -> Optional[str]:
    """Extract a single execution and write it to disk (overwrite mode).

    Overwrite mode keeps exactly one main.md per chatSession:
    - Conversation: the latest execution's context.messages (cumulative superset)
    - Actions timeline: earlier rounds' actions + this round's actions (appended)
    - Execution summary: incrementally merged (durations added, file sets
      unioned, msg_count taken from the latest round)
    - Index: one entry per chatSession; older rounds become "superseded" stubs

    Args:
        agent_dir: the kiro.kiroagent directory
        hex_dir: the 32-char hex directory containing this execution
        execution: the execution entry from the manifest
        session_dir: workspace-sessions subdirectory (for session details)
        index: slim index dict (main conversations only)
        full_index: full index dict (main + sub-agents); None to skip
        sub_mode: whether this is a sub-agent extraction
        sub_index: sub-agent ordinal (1-based)
        output_base_dir: in sub mode, the output dir (shared with the parent)
        parent_exec_id: parent execution ID for sub-agent records
        force: re-extract even when already indexed

    Returns:
        The output directory path, or None (failure / already indexed).
    """
    exec_id = execution.get("executionId", "")
    chat_id = execution.get("chatSessionId", "")

    # Skip already-indexed executions whose files still exist (sub-agents are
    # not checked: they always follow their parent). `force` bypasses this
    # (used by --all re-extraction).
    if not sub_mode and not force and exec_id in index.get("entries", {}):
        existing_entry = index["entries"][exec_id]
        # superseded / no_log tombstones are permanent: never retried.
        if existing_entry.get("superseded_by") or existing_entry.get("no_log"):
            return None
        existing_dir = existing_entry.get("output_dir", "")
        if existing_dir and os.path.isdir(existing_dir):
            return None
        # The files were cleaned up: drop the stale entry and re-extract.
        del index["entries"][exec_id]

    # Load the execution log.
    log_path = find_execution_log(agent_dir, hex_dir, execution)
    if not log_path:
        # Tombstone: the log file is gone (cleaned up by Kiro). Record a
        # placeholder entry so we never retry this execution.
        if not sub_mode and exec_id:
            index.setdefault("entries", {})[exec_id] = {
                "no_log": True,
                "chatSessionId": chat_id,
                "indexed_at": datetime.now(CST).isoformat(),
            }
            if full_index is not None:
                full_index.setdefault("entries", {})[exec_id] = {
                    "no_log": True,
                    "chatSessionId": chat_id,
                    "indexed_at": datetime.now(CST).isoformat(),
                }
        return None

    try:
        with open(log_path, "rb") as f:
            raw = f.read()
        # Scrub surrogate characters (emoji in Kiro logs may be stored as
        # unpaired surrogate pairs that plain UTF-8 decoding rejects).
        text = raw.decode("utf-8", errors="surrogatepass").encode("utf-8", errors="replace").decode("utf-8")
        log = _sanitize_surrogates(json.loads(text))
    except Exception:
        return None

    # Backfill chatSessionId from the full log (newer manifests may omit it).
    if not chat_id:
        chat_id = log.get("chatSessionId", "")

    # Parse.
    messages = log.get("context", {}).get("messages", [])
    actions = log.get("actions", [])
    conversation = parse_messages(messages)
    timeline = parse_actions(actions)
    diffs = collect_diffs(agent_dir, hex_dir, execution)

    # Session info (main execution only).
    session_info = None
    if not sub_mode and session_dir and chat_id:
        session_info = load_session_detail(session_dir, chat_id)

    # prompt_log matching (main execution only).
    prompt_log = None
    if not sub_mode:
        start_time = log.get("startTime", 0)
        prompt_log = find_matching_prompt_log(start_time)

    # Execution start time drives directory/file-name time suffixes.
    _start_ms = log.get("startTime") or execution.get("startTime", 0)
    _ym, _dd, _hms = ts_date_parts(_start_ms)

    # Resolve the output directory.
    if sub_mode and output_base_dir:
        out_dir = output_base_dir
    else:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        out_dir = _resolve_chat_dir(day_dir, chat_short, _hms)

    os.makedirs(out_dir, exist_ok=True)

    # Cross-day pointer: when the chat dir lives under a different day than
    # this execution's start date, leave a reference in the execution's day dir.
    if not sub_mode:
        chat_short = chat_id[:8] if chat_id else hash8(exec_id)
        chat_day_dir = os.path.dirname(out_dir)
        exec_day_dir = os.path.join(SESSION_LOG_DIR, _ym, _dd)
        if os.path.normpath(chat_day_dir) != os.path.normpath(exec_day_dir):
            _write_cross_day_ref(exec_day_dir, chat_short, out_dir)

    # Recursively extract sub-agents first and collect the
    # subExecutionId -> file-path mapping for the main record.
    sub_file_map: dict[str, str] = {}
    if not sub_mode:
        sub_exec_ids = extract_sub_execution_ids(actions)
        for si, sub_eid in enumerate(sub_exec_ids, 1):
            sub_execs = find_all_executions(agent_dir, execution_id=sub_eid)
            if sub_execs:
                sub_exec = sub_execs[0]
                extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=sub_exec["_hex_dir"],
                    execution=sub_exec,
                    session_dir=session_dir,
                    index=index,
                    full_index=full_index,
                    sub_mode=True,
                    sub_index=si,
                    output_base_dir=out_dir,
                    parent_exec_id=exec_id,
                    force=force,
                )
                sub_filename = f"sub_{si:02d}_{sub_eid[:8]}.md"
                sub_file_map[sub_eid] = os.path.join(out_dir, sub_filename)

    # ── Overwrite mode: load earlier rounds' actions and summary ──
    accumulated_actions_md = ""
    prev_summary: dict = {}
    execution_round = 1
    prev_exec_ids: list[str] = []  # older executionIds of the same chatSession

    if not sub_mode:
        # Find the existing main.md of this chatSession, if any.
        existing_main_path = ""
        if os.path.isdir(out_dir):
            for f in os.listdir(out_dir):
                if f.startswith("main_") and f.endswith(".md"):
                    existing_main_path = os.path.join(out_dir, f)
                    break  # overwrite mode keeps only one main file

        if existing_main_path:
            # Pull the accumulated actions of earlier rounds.
            accumulated_actions_md = _load_previous_actions_md(existing_main_path)

        # Locate older entries of this chatSession in the index to build prev_summary.
        if chat_id:
            for eid, ent in list(index.get("entries", {}).items()):
                if ent.get("chatSessionId") == chat_id and eid != exec_id and not ent.get("is_sub"):
                    # Tombstones carry no summary/output_dir: record id only.
                    if ent.get("superseded_by") or ent.get("no_log"):
                        prev_exec_ids.append(eid)
                        continue
                    prev_exec_ids.append(eid)
                    ent_summary = ent.get("summary", {})
                    # Preserve the old entry's LLM-generated description.
                    if ent.get("description"):
                        ent_summary["description"] = ent["description"]
                    prev_summary = _merge_summaries(prev_summary, ent_summary)

        # Which round this is.
        execution_round = len(prev_exec_ids) + 1

    # Build this round's summary, merge with earlier rounds, then render.
    cur_summary = _build_execution_summary(log, conversation, timeline, sub_file_map if not sub_mode else None)
    final_summary = _merge_summaries(prev_summary, cur_summary) if prev_summary else cur_summary

    md, _cur_summary_unused, _display_unused = generate_full_record(
        log=log,
        conversation=conversation,
        timeline=timeline,
        diffs=diffs,
        session_info=session_info,
        prompt_log_path=prompt_log,
        is_sub=sub_mode,
        sub_index=sub_index,
        sub_file_map=sub_file_map if not sub_mode else None,
        accumulated_actions_md=accumulated_actions_md,
        merged_summary=final_summary if prev_summary else None,
        execution_round=execution_round,
    )

    # ── Write the file (overwrite mode: drop old main, write new main) ──
    if sub_mode:
        filename = f"sub_{sub_index:02d}_{exec_id[:8]}.md"
    else:
        # Overwrite mode: remove every old main_*.md in this directory.
        if os.path.isdir(out_dir):
            for f in os.listdir(out_dir):
                if f.startswith("main_") and f.endswith(".md"):
                    try:
                        os.remove(os.path.join(out_dir, f))
                    except Exception:
                        pass
        # Fixed name: main_01_{first 8 chars of latest execId}.md
        filename = f"main_01_{exec_id[:8]}.md"

    filepath = os.path.join(out_dir, filename)
    md_safe = md.encode("utf-8", errors="surrogateescape").decode("utf-8", errors="replace")
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(md_safe)

    # Update the indexes (overwrite mode: old entries replaced by the new one).
    _meta = {
        "chatSessionId": chat_id,
        "startTime": ts_fmt(log.get("startTime")),
        "endTime": ts_fmt(log.get("endTime")),
        "status": log.get("status", ""),
        "workflowType": log.get("workflowType", ""),
    }
    if not sub_mode:
        update_index(index, exec_id, out_dir, _meta,
                     summary=final_summary,
                     chat_session_id=chat_id,
                     prev_exec_ids=prev_exec_ids)
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta,
                         summary=final_summary,
                         chat_session_id=chat_id,
                         prev_exec_ids=prev_exec_ids)
    else:
        if full_index is not None:
            update_index(full_index, exec_id, out_dir, _meta,
                         summary=cur_summary, is_sub=True,
                         parent_exec_id=parent_exec_id)

    return out_dir
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# 入口函数
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def extract_latest(global_storage: Optional[str] = None, workspace_path: Optional[str] = None):
|
||
"""提取所有未索引的终态 execution + partial/补全逻辑(供 agent_on_stop 调用)。
|
||
|
||
覆盖模式:同一 chatSession 的 execution 按 startTime 升序提取,
|
||
后续轮次覆盖前一轮的 main.md(对话用最新超集,actions 追加)。
|
||
|
||
Partial/补全:
|
||
1. 检测当前 session 的 execution(可能还在 running),提取为 status="partial"
|
||
2. 下次调用时,找到 status="partial" 的 entry,重新提取覆盖
|
||
"""
|
||
gs = global_storage or DEFAULT_GLOBAL_STORAGE
|
||
ws = workspace_path or os.getcwd()
|
||
|
||
agent_dir = find_kiro_agent_dir(gs)
|
||
if not agent_dir:
|
||
return
|
||
|
||
session_dir = find_workspace_session_dir(agent_dir, ws)
|
||
chat_ids = None
|
||
if session_dir:
|
||
sessions = load_sessions_json(session_dir)
|
||
chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
|
||
if s.get("chatSessionId") or s.get("sessionId")}
|
||
|
||
TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")
|
||
|
||
all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
|
||
if not all_execs:
|
||
return
|
||
|
||
index = load_index()
|
||
full_index = load_full_index()
|
||
|
||
# ── 步骤 1:补全之前标记为 partial 的 entry ──
|
||
partial_eids = [
|
||
eid for eid, ent in index.get("entries", {}).items()
|
||
if ent.get("status") == "partial"
|
||
]
|
||
for p_eid in partial_eids:
|
||
# 在 all_execs 中找到对应的 execution
|
||
matched = [e for e in all_execs if e.get("executionId") == p_eid]
|
||
if matched:
|
||
p_exec = matched[0]
|
||
# 如果已变为终态,重新提取覆盖
|
||
if p_exec.get("status", "") in TERMINAL_STATUSES:
|
||
result = extract_single_execution(
|
||
agent_dir=agent_dir,
|
||
hex_dir=p_exec["_hex_dir"],
|
||
execution=p_exec,
|
||
session_dir=session_dir,
|
||
index=index,
|
||
full_index=full_index,
|
||
force=True, # 强制覆盖
|
||
)
|
||
if result:
|
||
# 逐条持久化,避免中途超时导致下次重复处理
|
||
save_index(index)
|
||
save_full_index(full_index)
|
||
print(f"[session-extract] completed partial: {result}")
|
||
|
||
# ── 步骤 2:提取未索引的终态 execution ──
|
||
# 同时检查 index(主对话)和 full_index(含子代理),避免重复提取子代理 execution
|
||
indexed_eids = set(index.get("entries", {}).keys())
|
||
if full_index:
|
||
indexed_eids |= set(full_index.get("entries", {}).keys())
|
||
ready = [e for e in all_execs
|
||
if e.get("executionId", "") not in indexed_eids
|
||
and e.get("status", "") in TERMINAL_STATUSES]
|
||
|
||
# ── 步骤 3:检测当前 session 的 running execution,提取为 partial ──
|
||
# 当前 hook 触发时,本 session 的 execution 通常还在 running
|
||
# 通过环境变量 KIRO_CHAT_SESSION_ID 识别(如果有的话)
|
||
current_chat_id = os.environ.get("KIRO_CHAT_SESSION_ID", "")
|
||
if current_chat_id:
|
||
running_execs = [
|
||
e for e in all_execs
|
||
if e.get("chatSessionId") == current_chat_id
|
||
and e.get("status") not in TERMINAL_STATUSES
|
||
and e.get("executionId", "") not in indexed_eids
|
||
]
|
||
for r_exec in running_execs:
|
||
# 标记为 partial 提取
|
||
r_exec["_is_partial"] = True
|
||
ready.append(r_exec)
|
||
|
||
if not ready:
|
||
return
|
||
|
||
# agent_on_stop 场景下限制单次处理量,避免处理数千个历史 execution
|
||
# 优先处理最新的(按 startTime 降序排列,ready 已经是降序的来自 find_all_executions)
|
||
MAX_PER_RUN = 50
|
||
if len(ready) > MAX_PER_RUN:
|
||
# 按 startTime 降序排序,取最新的 MAX_PER_RUN 个
|
||
ready.sort(key=lambda x: x.get("startTime", 0), reverse=True)
|
||
skipped = len(ready) - MAX_PER_RUN
|
||
ready = ready[:MAX_PER_RUN]
|
||
print(f"[session-extract] capped to {MAX_PER_RUN} most recent (skipped {skipped} older)")
|
||
|
||
# 按 chatSessionId 分组,组内按 startTime 升序
|
||
from collections import defaultdict
|
||
chat_groups: dict[str, list[dict]] = defaultdict(list)
|
||
for e in ready:
|
||
cid = e.get("chatSessionId", "") or "unknown"
|
||
chat_groups[cid].append(e)
|
||
for cid in chat_groups:
|
||
chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
|
||
|
||
import threading
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
||
lock = threading.Lock()
|
||
extracted_count = 0
|
||
tombstone_count = 0
|
||
|
||
    def _extract_group(group_execs: list[dict]) -> tuple[list[str], dict, dict]:
        """Serially extract every execution belonging to one chatSession.

        Returns (results, idx_entries, full_entries): the extracted output
        identifiers plus the entries accumulated in thread-local indexes,
        which the caller merges into the shared indexes under a lock.
        """
        # Thread-local scratch indexes: no locking needed during extraction.
        local_index: dict = {"version": 2, "entries": {}}
        local_full: dict = {"version": 2, "entries": {}}
        results: list[str] = []
        for execution in group_execs:
            eid = execution.get("executionId", "")
            try:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=execution["_hex_dir"],
                    execution=execution,
                    session_dir=session_dir,
                    index=local_index,
                    full_index=local_full,
                )
                if result:
                    # Executions flagged still-running are marked "partial" in
                    # both indexes so a later pass re-extracts them (with
                    # force=True) once they reach a terminal state.
                    if execution.get("_is_partial"):
                        if eid in local_index.get("entries", {}):
                            local_index["entries"][eid]["status"] = "partial"
                        if eid in local_full.get("entries", {}):
                            local_full["entries"][eid]["status"] = "partial"
                    results.append(result)
            except Exception as exc:
                # Best-effort: one failing execution must not abort the group.
                print(f"[session-extract] ✗ {eid[:8]}: {exc}")
        return results, local_index.get("entries", {}), local_full.get("entries", {})
|
||
|
||
workers = min(4, len(chat_groups))
|
||
with ThreadPoolExecutor(max_workers=workers) as pool:
|
||
futures = {pool.submit(_extract_group, execs): cid
|
||
for cid, execs in chat_groups.items()}
|
||
for future in as_completed(futures):
|
||
results, idx_entries, full_entries = future.result()
|
||
with lock:
|
||
# 合并到主索引
|
||
index.setdefault("entries", {}).update(idx_entries)
|
||
if full_index is not None:
|
||
full_index.setdefault("entries", {}).update(full_entries)
|
||
extracted_count += len(results)
|
||
tombstone_count += sum(
|
||
1 for ent in idx_entries.values() if ent.get("no_log")
|
||
)
|
||
# 逐组持久化,避免中途超时导致下次重复处理
|
||
if idx_entries:
|
||
save_index(index)
|
||
save_full_index(full_index)
|
||
for r in results:
|
||
print(f"[session-extract] extracted: {r}")
|
||
|
||
if extracted_count > 1:
|
||
print(f"[session-extract] total: {extracted_count} executions")
|
||
if tombstone_count > 0:
|
||
print(f"[session-extract] tombstoned: {tombstone_count} (no log found)")
|
||
|
||
|
||
def extract_all_unindexed(
    global_storage: Optional[str] = None,
    workspace_path: Optional[str] = None,
    limit: Optional[int] = None,
    workers: int = 8,
    rebuild: bool = False,
):
    """Extract executions (overwrite mode, multi-threaded).

    rebuild=True clears both indexes and reprocesses every terminal execution
    (full rebuild, used when migrating to overwrite mode).
    rebuild=False extracts only executions not yet present in the main index
    (incremental mode).

    Args:
        global_storage: Kiro globalStorage path; defaults to DEFAULT_GLOBAL_STORAGE.
        workspace_path: workspace used to scope chat sessions; defaults to os.getcwd().
        limit: cap on how many executions to process this run (None/0 = no cap).
        workers: thread-pool size; each worker handles one chatSession group.
        rebuild: full rebuild vs. incremental extraction (see above).
    """
    gs = global_storage or DEFAULT_GLOBAL_STORAGE
    ws = workspace_path or os.getcwd()

    agent_dir = find_kiro_agent_dir(gs)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return

    # Restrict to chat sessions belonging to this workspace, when discoverable.
    session_dir = find_workspace_session_dir(agent_dir, ws)
    chat_ids = None
    if session_dir:
        sessions = load_sessions_json(session_dir)
        chat_ids = {s.get("chatSessionId") or s.get("sessionId") for s in sessions
                    if s.get("chatSessionId") or s.get("sessionId")}

    all_execs = find_all_executions(agent_dir, chat_session_ids=chat_ids)
    if not all_execs:
        print("[session-extract] no executions found")
        return

    # Only executions in one of these terminal states are extracted here.
    TERMINAL_STATUSES = ("succeed", "failed", "stopped", "aborted")

    if rebuild:
        # Full rebuild: start from empty indexes, reprocess all terminal executions.
        print("[session-extract] REBUILD mode: clearing indexes, reprocessing all")
        index = {"version": 2, "entries": {}}
        full_index = {"version": 2, "entries": {}}
        # Old main_*.md files get overwritten inside extract_single_execution,
        # so no pre-cleanup is needed (sub_*.md and other files are kept).
        todo = [e for e in all_execs
                if e.get("status", "") in TERMINAL_STATUSES]
    else:
        index = load_index()
        full_index = load_full_index()
        todo = [e for e in all_execs
                if e.get("executionId", "") not in index.get("entries", {})
                and e.get("status", "") in TERMINAL_STATUSES]

    if limit:
        todo = todo[:limit]
    if not todo:
        print("[session-extract] all indexed, nothing to do")
        return

    print(f"[session-extract] {len(todo)} executions to extract (workers={workers})")

    import threading
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Group by chatSessionId; within a group, extract serially in startTime order.
    # Serial-per-group keeps actions appended chronologically in overwrite mode.
    from collections import defaultdict
    chat_groups: dict[str, list[dict]] = defaultdict(list)
    for e in todo:
        cid = e.get("chatSessionId", "") or "unknown"
        chat_groups[cid].append(e)
    for cid in chat_groups:
        chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))

    lock = threading.Lock()
    count = 0

    def _extract_group(group_execs):
        """Serially extract every execution of one chatSession (overwrite mode).

        Returns (results, local index entries, local full-index entries);
        entries are accumulated thread-locally and merged by the caller.
        """
        local_index = {"version": 2, "entries": {}}
        local_full = {"version": 2, "entries": {}}
        results = []
        for execution in group_execs:
            try:
                result = extract_single_execution(
                    agent_dir=agent_dir,
                    hex_dir=execution["_hex_dir"],
                    execution=execution,
                    session_dir=session_dir,
                    index=local_index,
                    full_index=local_full,
                    force=rebuild,  # overwrite existing output only on full rebuild
                )
                if result:
                    results.append(result)
            except Exception as e:
                # Best-effort: one failing execution must not abort the group.
                eid = execution.get("executionId", "?")[:8]
                print(f"[session-extract] ✗ {eid}: {e}")
        return results, local_index.get("entries", {}), local_full.get("entries", {})

    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_extract_group, execs): cid
                   for cid, execs in chat_groups.items()}
        for future in as_completed(futures):
            results, idx_entries, full_entries = future.result()
            if results:
                with lock:
                    count += len(results)
                    index["entries"].update(idx_entries)
                    full_index["entries"].update(full_entries)
                    # Checkpoint every 50 extractions so an interruption loses
                    # at most one batch of work.
                    if count % 50 == 0:
                        save_index(index)
                        save_full_index(full_index)
                        print(f"[session-extract] [{count}/{len(todo)}] checkpoint saved")
                    elif count % 10 == 0:
                        print(f"[session-extract] [{count}/{len(todo)}]")

    # Post-pass dedup: parallel merging can produce multiple entries sharing a
    # chatSessionId (serial-per-group prevents in-group duplicates, but cases
    # such as double-submit can still introduce them).
    def _dedup_index_entries(idx: dict) -> int:
        """Dedup main-conversation entries by chatSessionId.

        Keeps the entry with the latest startTime, merges all summaries into
        it, and carries over the most recent non-empty description.
        Returns the number of entries removed.
        """
        entries = idx.get("entries", {})
        chat_groups_dedup: dict[str, list[tuple[str, dict]]] = defaultdict(list)
        keep: dict[str, dict] = {}
        for eid, ent in entries.items():
            # Sub-agent entries and entries lacking a chatSessionId pass through untouched.
            if ent.get("is_sub"):
                keep[eid] = ent
                continue
            cid = ent.get("chatSessionId", "")
            if not cid:
                keep[eid] = ent
                continue
            chat_groups_dedup[cid].append((eid, ent))
        removed = 0
        for cid, group in chat_groups_dedup.items():
            if len(group) == 1:
                keep[group[0][0]] = group[0][1]
                continue
            # NOTE(review): default "" here vs. the numeric default 0 used when
            # sorting raw executions above — assumes the indexed startTime is a
            # consistently comparable (string?) value; confirm against the
            # format save_index writes.
            group.sort(key=lambda x: x[1].get("startTime", ""))
            merged_summary: dict = {}
            for eid, ent in group:
                merged_summary = _merge_summaries(merged_summary, ent.get("summary", {}))
            last_eid, last_ent = group[-1]
            last_ent["summary"] = merged_summary
            # Carry over the newest non-empty description.
            for eid, ent in reversed(group):
                if ent.get("description"):
                    last_ent["description"] = ent["description"]
                    break
            keep[last_eid] = last_ent
            removed += len(group) - 1
        idx["entries"] = keep
        return removed

    removed_main = _dedup_index_entries(index)
    removed_full = _dedup_index_entries(full_index)
    if removed_main or removed_full:
        print(f"[session-extract] dedup: removed {removed_main} main / {removed_full} full duplicates")

    # Final save.
    save_index(index)
    save_full_index(full_index)
    print(f"[session-extract] done, extracted {count}/{len(todo)}, final entries: {len(index.get('entries', {}))}")
|
||
|
||
|
||
def extract_by_id(
    execution_id: str,
    global_storage: Optional[str] = None,
):
    """Extract the execution matching the given executionId (prefix match)."""
    storage_root = global_storage or DEFAULT_GLOBAL_STORAGE
    agent_dir = find_kiro_agent_dir(storage_root)
    if not agent_dir:
        print("[session-extract] kiro.kiroagent dir not found")
        return

    # Locate the target execution and verify the first hit really carries the
    # requested ID as a prefix of its executionId.
    candidates = find_all_executions(agent_dir, execution_id=execution_id)
    target = candidates[0] if candidates else None
    if target is None or not target.get("executionId", "").startswith(execution_id):
        print(f"[session-extract] execution not found: {execution_id}")
        return

    main_idx = load_index()
    full_idx = load_full_index()
    outcome = extract_single_execution(
        agent_dir=agent_dir,
        hex_dir=target["_hex_dir"],
        execution=target,
        session_dir=None,
        index=main_idx,
        full_index=full_idx,
        force=True,  # an explicitly requested ID is always (re-)extracted
    )
    if not outcome:
        return
    save_index(main_idx)
    save_full_index(full_idx)
    print(f"[session-extract] extracted: {outcome}")
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════
|
||
# CLI 入口
|
||
# ═══════════════════════════════════════════════════════════
|
||
|
||
def main():
    """CLI entry point: parse flags and dispatch to the matching extractor."""
    import argparse

    parser = argparse.ArgumentParser(description="Kiro 执行日志全量提取器 v3(覆盖模式)")
    parser.add_argument("--all", action="store_true", help="提取所有未索引的 execution")
    parser.add_argument("--rebuild", action="store_true", help="全量重建:清空索引,重新提取所有 execution(覆盖模式迁移用)")
    parser.add_argument("--recent", type=int, metavar="N", help="提取最近 N 条未索引的")
    parser.add_argument("--workers", type=int, default=8, help="并行线程数(默认 8)")
    parser.add_argument("--execution-id", type=str, help="提取指定 executionId")
    parser.add_argument("--global-storage", type=str, help="globalStorage 路径")
    parser.add_argument("--workspace", type=str, help="workspace 路径")
    args = parser.parse_args()

    storage = args.global_storage
    workspace = args.workspace

    # Dispatch with guard clauses; precedence: execution-id > rebuild > all > recent.
    if args.execution_id:
        extract_by_id(args.execution_id, global_storage=storage)
        return
    if args.rebuild:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              workers=args.workers, rebuild=True)
        return
    if args.all:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              workers=args.workers)
        return
    if args.recent:
        extract_all_unindexed(global_storage=storage, workspace_path=workspace,
                              limit=args.recent, workers=args.workers)
        return
    # Default with no flags: extract only the latest execution.
    extract_latest(global_storage=storage, workspace_path=workspace)
|
||
|
||
|
||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|