Neo-ZQYY/scripts/ops/migrate_session_dirs.py

#!/usr/bin/env python3
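"""migrate_session_dirs — migrate legacy session_logs directories to the new layout.

Old layout: {chatShort}_{HHMMSS}/main_{seq}_{execShort}_{HHMMSS}.md
New layout: {seq:02d}_{chatShort}_{HHMMSS}/main_{seq}_{execShort}.md

Migration rules:
1. Within one day, old directories are grouped by chatSessionId; several old
   directories of the same chat session are merged into one new directory.
2. New directories are numbered in order of first appearance that day
   (01_, 02_, ...).
3. The time suffix (_HHMMSS) is dropped from file names.
4. output_dir is updated for every entry in both index files.
5. Cross-day conversations get a _ref_{chatShort}.md pointer file.
   NOTE(review): nothing in this script appears to generate these pointer
   files — confirm whether that is handled elsewhere or still to be done.

Usage:
  python scripts/ops/migrate_session_dirs.py --dry-run    # preview the changes
  python scripts/ops/migrate_session_dirs.py              # run the migration
"""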

import json
import os
import re
import shutil
import sys
from collections import defaultdict

from _env_paths import ensure_repo_root

ensure_repo_root()

# Relative location of the session logs and of the two index files stored in it.
SESSION_LOG_DIR = os.path.join("docs", "audit", "session_logs")
INDEX_PATH = os.path.join(SESSION_LOG_DIR, "_session_index.json")
INDEX_FULL_PATH = os.path.join(SESSION_LOG_DIR, "_session_index_full.json")

# Old directory name: {hex8}_{HHMMSS} (no sequence prefix)
OLD_DIR_PATTERN = re.compile(r"^([0-9a-f]{8})_(\d{6})$")
# New directory name: {seq:02d}_{hex8}_{HHMMSS} (with sequence prefix)
NEW_DIR_PATTERN = re.compile(r"^(\d{2})_([0-9a-f]{8})_(\d{6})$")
# Old file name: main_{seq}_{hash8}_{HHMMSS}.md or sub_{seq}_{hash8}_{HHMMSS}.md
OLD_FILE_PATTERN = re.compile(r"^(main|sub)_(\d{2})_([0-9a-f]{8})_(\d{6})\.md$")


def load_json(path):
    """Read and decode the UTF-8 JSON document stored at ``path``.

    Returns:
        The decoded JSON value, or an empty dict when ``path`` is not an
        existing regular file.
    """
    if os.path.isfile(path):
        with open(path, "r", encoding="utf-8") as fh:
            return json.loads(fh.read())
    return {}


def save_json(path, data):
    """Write ``data`` to ``path`` as pretty-printed UTF-8 JSON.

    The document is first written to a sibling ``<path>.tmp`` file and then
    moved over ``path`` with ``os.replace``. If serialization or writing fails
    (unserializable value, full disk, ...), an existing file at ``path`` is
    left untouched instead of being truncated.

    Args:
        path: Destination file path.
        data: JSON-serializable value.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is already gone; on failure
        # this removes the half-written leftover. Cleanup problems must not
        # mask the original exception.
        try:
            os.remove(tmp_path)
        except OSError:
            pass


def find_all_day_dirs(root=None):
    """Return every ``YYYY-MM/DD`` day directory below the session-log root.

    Args:
        root: Directory to scan. When omitted or ``None`` the module-level
            ``SESSION_LOG_DIR`` is used, which is what existing callers get.

    Returns:
        A list of ``<root>/<YYYY-MM>/<DD>`` paths, ordered by month name and
        then by day name. Entries that are not directories, or whose names are
        not exactly ``YYYY-MM`` / ``DD`` digits, are ignored. The list is empty
        when ``root`` is not an existing directory.
    """
    if root is None:
        root = SESSION_LOG_DIR
    if not os.path.isdir(root):
        return []

    results = []
    for ym in sorted(os.listdir(root)):
        ym_path = os.path.join(root, ym)
        # fullmatch: the whole name must be YYYY-MM, nothing trailing.
        if not re.fullmatch(r"\d{4}-\d{2}", ym) or not os.path.isdir(ym_path):
            continue
        for dd in sorted(os.listdir(ym_path)):
            dd_path = os.path.join(ym_path, dd)
            if re.fullmatch(r"\d{2}", dd) and os.path.isdir(dd_path):
                results.append(dd_path)
    return results


def scan_day_dir(day_dir):
    """List the legacy-format session directories directly inside ``day_dir``.

    Entries are visited in sorted name order. Anything that is not a
    directory, is already named in the new format, or matches neither
    pattern is ignored.

    Returns:
        A list of ``(dir_name, chat_short, hms, full_path)`` tuples, one per
        directory still to be migrated.
    """
    legacy = []
    for name in sorted(os.listdir(day_dir)):
        path = os.path.join(day_dir, name)
        # Skip plain files and directories that are already migrated.
        if not os.path.isdir(path) or NEW_DIR_PATTERN.match(name):
            continue
        matched = OLD_DIR_PATTERN.match(name)
        if matched is None:
            continue
        chat_short, hms = matched.groups()
        legacy.append((name, chat_short, hms, path))
    return legacy


def rename_file(old_name):
    """Map a legacy file name to its new name by dropping the time suffix.

    main_01_abc12345_013337.md -> main_01_abc12345.md
    sub_02_def67890_013337.md  -> sub_02_def67890.md

    Names that do not match the legacy pattern are returned unchanged.
    """
    matched = OLD_FILE_PATTERN.match(old_name)
    if matched is None:
        return old_name
    kind, seq, hash8 = matched.group(1, 2, 3)
    return "_".join((kind, seq, hash8)) + ".md"


def build_migration_plan(day_dir):
    """Build the migration plan for one day directory.

    Legacy directories found by ``scan_day_dir`` are grouped by their
    chat-short prefix, and every group is merged into one new-format
    directory. Groups are numbered in order of their earliest HHMMSS,
    continuing after the highest sequence number already used by a
    new-format directory in ``day_dir``. If a new-format directory for the
    same chat-short already exists (earlier partial migration) it is reused
    and no new number is allocated; when several exist, the one that sorts
    first by name is chosen.

    Only ``.md`` files are planned; other files are left where they are.

    Returns:
        A list with one dict per chat group (empty when there is nothing to
        migrate). Each dict has the keys:

        - ``"old_dirs"``: ``[(dir_name, full_path), ...]`` sorted by HHMMSS
        - ``"chat_short"``: the 8-hex chat prefix
        - ``"first_hms"``: earliest HHMMSS in the group
        - ``"new_dir_name"``: ``{seq:02d}_{chatShort}_{firstHms}`` or the
          reused existing directory name
        - ``"new_dir_path"``: ``new_dir_name`` joined onto ``day_dir``
        - ``"file_renames"``: ``[(old_full_path, old_name, new_name), ...]``
    """
    old_dirs = scan_day_dir(day_dir)
    if not old_dirs:
        return []

    # Group legacy directories by chat prefix, each group ordered by HHMMSS.
    groups = defaultdict(list)
    for dir_name, chat_short, hms, full_path in old_dirs:
        groups[chat_short].append((dir_name, hms, full_path))
    for dirs in groups.values():
        dirs.sort(key=lambda entry: entry[1])

    # Number the groups by their earliest HHMMSS (first element after sorting).
    sorted_groups = sorted(groups.items(), key=lambda group: group[1][0][1])

    # One pass over the day directory collects both the highest sequence number
    # in use and the already-migrated directory (if any) for each chat prefix.
    existing_new = {}
    highest_seq = 0
    for name in sorted(os.listdir(day_dir)):
        m = NEW_DIR_PATTERN.match(name)
        if m is None or not os.path.isdir(os.path.join(day_dir, name)):
            continue
        highest_seq = max(highest_seq, int(m.group(1)))
        existing_new.setdefault(m.group(2), name)
    next_seq = highest_seq + 1

    plan = []
    for chat_short, dirs in sorted_groups:
        first_hms = dirs[0][1]

        new_dir_name = existing_new.get(chat_short)
        if new_dir_name is None:
            new_dir_name = f"{next_seq:02d}_{chat_short}_{first_hms}"
            next_seq += 1

        # (old full path, old file name, new file name) for every .md file.
        file_renames = [
            (os.path.join(full_path, fname), fname, rename_file(fname))
            for _, _, full_path in dirs
            for fname in sorted(os.listdir(full_path))
            if fname.endswith(".md")
        ]

        plan.append({
            "old_dirs": [(d, p) for d, _, p in dirs],
            "chat_short": chat_short,
            "first_hms": first_hms,
            "new_dir_name": new_dir_name,
            "new_dir_path": os.path.join(day_dir, new_dir_name),
            "file_renames": file_renames,
        })

    return plan


def execute_migration(plan, day_dir, dry_run=False):
    """Apply, or with ``dry_run`` only print, the migration plan of one day.

    For every plan item the target directory is created, each planned file
    is moved to its new name unless a file already exists there, and every
    legacy directory that ended up empty is removed. Failures are collected
    as messages rather than raised.

    Args:
        plan: Plan items as produced by ``build_migration_plan``.
        day_dir: Day directory the plan belongs to (not used by this function).
        dry_run: When true, print the MOVE/RMDIR actions and change nothing.

    Returns:
        ``(moved_dirs, moved_files, errors)``: number of legacy directories
        removed, number of files moved, and the list of error messages.
    """
    dirs_removed = 0
    files_moved = 0
    problems = []

    for entry in plan:
        target_dir = entry["new_dir_path"]
        legacy_dirs = entry["old_dirs"]

        if not dry_run:
            os.makedirs(target_dir, exist_ok=True)

        # Move the files into the target directory.
        for src_path, _old_name, new_name in entry["file_renames"]:
            dst_path = os.path.join(target_dir, new_name)
            if os.path.isfile(dst_path):
                # Already present (e.g. from a partial earlier run): leave it.
                continue
            if dry_run:
                print(f"  MOVE {src_path} → {dst_path}")
                continue
            try:
                shutil.move(src_path, dst_path)
            except Exception as exc:
                problems.append(f"move {src_path}: {exc}")
            else:
                files_moved += 1

        # Remove the legacy directories, but only the ones that are now empty.
        for legacy_name, legacy_path in legacy_dirs:
            if legacy_name == entry["new_dir_name"]:
                # Source and target are the same directory; nothing to delete.
                continue
            if dry_run:
                print(f"  RMDIR {legacy_path}")
                continue
            try:
                leftovers = os.listdir(legacy_path)
                if leftovers:
                    problems.append(f"rmdir {legacy_path}: not empty ({leftovers})")
                else:
                    os.rmdir(legacy_path)
                    dirs_removed += 1
            except Exception as exc:
                problems.append(f"rmdir {legacy_path}: {exc}")

    return dirs_removed, files_moved, problems


def update_indexes(all_plans):
    """Rewrite ``output_dir`` in both index files according to the plans.

    Old output_dir form: docs/audit/session_logs/2026-03/03/b6b5e1fd_013337
    New output_dir form: docs/audit/session_logs/2026-03/03/01_b6b5e1fd_013337

    Paths are compared with forward slashes on both sides, so an index entry
    that was stored with backslashes still matches its legacy directory.
    Entries whose ``output_dir`` is not a string are left alone. An index
    file is only rewritten when at least one of its entries changed.

    Args:
        all_plans: ``[(plan_list, day_dir), ...]`` where ``plan_list`` is the
            result of ``build_migration_plan`` for ``day_dir``.

    Returns:
        Total number of index entries updated across both index files.
    """
    # Map every legacy directory path to the new directory it was merged into.
    dir_map = {}
    for plan_list, _day_dir in all_plans:
        for item in plan_list:
            new_path = item["new_dir_path"].replace("\\", "/")
            for _dir_name, dir_path in item["old_dirs"]:
                dir_map[dir_path.replace("\\", "/")] = new_path

    if not dir_map:
        return 0

    updated = 0
    for idx_path in (INDEX_PATH, INDEX_FULL_PATH):
        data = load_json(idx_path)
        changed = False
        for ent in data.get("entries", {}).values():
            old_dir = ent.get("output_dir", "")
            if not isinstance(old_dir, str):
                continue
            # Normalize the stored value the same way as the map keys.
            new_dir = dir_map.get(old_dir.replace("\\", "/"))
            if new_dir is not None:
                ent["output_dir"] = new_dir
                changed = True
                updated += 1
        if changed:
            save_json(idx_path, data)

    return updated


def main():
    """Command-line entry point.

    Builds a migration plan for every day directory. With ``--dry-run`` the
    planned directory merges and file renames are printed and nothing is
    changed; otherwise the plans are executed, the index files are updated,
    and a summary (including up to 20 error messages) is printed.
    """
    import argparse

    cli = argparse.ArgumentParser(description="迁移 session_logs 目录到新格式")
    cli.add_argument("--dry-run", action="store_true", help="预览变更，不实际执行")
    options = cli.parse_args()

    day_dirs = find_all_day_dirs()
    if not day_dirs:
        print("[migrate] 未找到任何 day_dir")
        return

    # Keep only the day directories that actually have something to migrate.
    planned = []
    for day_dir in day_dirs:
        day_plan = build_migration_plan(day_dir)
        if day_plan:
            planned.append((day_plan, day_dir))

    if not planned:
        print("[migrate] 所有目录已是新格式，无需迁移")
        return

    group_count = sum(len(day_plan) for day_plan, _ in planned)
    file_count = sum(
        len(item["file_renames"]) for day_plan, _ in planned for item in day_plan
    )

    print(f"[migrate] 共 {len(day_dirs)} 个 day_dir，{group_count} 个对话组，{file_count} 个文件待迁移")

    if options.dry_run:
        print("\n=== DRY RUN ===\n")
        for day_plan, day_dir in planned:
            print(f"\n--- {os.path.relpath(day_dir)} ---")
            for item in day_plan:
                merged_from = " + ".join(name for name, _ in item["old_dirs"])
                print(f"  {merged_from} → {item['new_dir_name']}/")
                for _, old_fname, new_fname in item["file_renames"]:
                    if old_fname == new_fname:
                        print(f"    {old_fname} (不变)")
                    else:
                        print(f"    {old_fname} → {new_fname}")
        print(f"\n[dry-run] 共 {group_count} 个对话组，{file_count} 个文件")
        return

    # Run the migration day by day, accumulating totals and error messages.
    removed_total = 0
    moved_total = 0
    failures = []
    for day_plan, day_dir in planned:
        removed, moved, errs = execute_migration(day_plan, day_dir, dry_run=False)
        removed_total += removed
        moved_total += moved
        failures += errs
        print(f"[migrate] {os.path.relpath(day_dir)}: {moved} files moved, {removed} dirs removed")

    # Point the index entries at the new directories.
    idx_updated = update_indexes(planned)
    print(f"[migrate] 索引更新: {idx_updated} entries")

    if failures:
        print(f"\n[migrate] {len(failures)} 个错误:")
        for message in failures[:20]:
            print(f"  ✗ {message}")

    print(f"\n[migrate] 完成: {moved_total} files, {removed_total} dirs removed, {idx_updated} index entries updated")


# Run the migration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()