Files
Neo-ZQYY/scripts/ops/gen_dataflow_report.py

788 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
数据流结构分析报告生成器v3
读取 analyze_dataflow.py 采集的数据,生成带锚点链接、上下游映射列、
业务描述、多示例值、字段差异报告的 Markdown 报告。
增强内容v3
- 总览表增加 API JSON 字段数列
- 覆盖率表增加业务描述列
- 逐表详情增加业务描述列(来自 BD_manual 文档)
- 说明+示例值合并,多示例展示,枚举值解释
- 总览章节增加 API↔ODS↔DWD 字段对比差异报告
用法:
python scripts/ops/gen_dataflow_report.py
python scripts/ops/gen_dataflow_report.py --output-dir export/dataflow_analysis
"""
from __future__ import annotations
import argparse
import json
import os
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
def load_json(path: Path) -> dict | list | None:
if not path.exists():
return None
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for the report generator.

    Returns:
        Parser with a single optional ``--output-dir`` flag (default None);
        when omitted, the data directory is resolved elsewhere from the
        SYSTEM_ANALYZE_ROOT environment variable or the built-in default.
    """
    parser = argparse.ArgumentParser(description="生成数据流结构分析 Markdown 报告")
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        # Fix: the help text previously ended with an unbalanced "(".
        help="输出目录(默认读取 SYSTEM_ANALYZE_ROOT 或 export/dataflow_analysis)",
    )
    return parser
def resolve_data_dir(override: str | None = None) -> Path:
if override:
return Path(override)
env_root = os.environ.get("SYSTEM_ANALYZE_ROOT")
if env_root:
return Path(env_root)
return Path("export/dataflow_analysis")
def _esc(s: str) -> str:
"""转义 Markdown 表格中的管道符"""
return str(s).replace("|", "\\|").replace("\n", " ") if s else ""
# ── Field-purpose guessing rules ──
# Infer a field's likely business meaning from name patterns plus table context.
# Confidence tiers: high (>=80%) / mid (50-79%) / low (<50%)
import re as _re
# Ordered rule table: (field-name regex, guessed purpose, confidence).
# First matching rule wins (see _guess_field_purpose), so specific patterns
# are deliberately placed before generic ones (e.g. site_id before _id$).
# NOTE(review): every confidence value below is the empty string — the tier
# labels look like they were stripped along with other non-ASCII characters
# (cf. the "ambiguous Unicode" warning on this file); confirm against history.
_FIELD_GUESS_RULES: list[tuple[str, str, str]] = [
    # ── SCD2 / ETL metadata ──
    (r"^scd2_", "SCD2 缓慢变化维度元数据", ""),
    (r"^etl_", "ETL 流程元数据", ""),
    (r"^dw_insert", "数仓装载时间戳", ""),
    (r"^content_hash$", "数据变更检测哈希", ""),
    (r"^source_file$", "ETL 来源文件标识", ""),
    (r"^source_endpoint$", "ETL 来源接口标识", ""),
    (r"^fetched_at$", "ETL 抓取时间", ""),
    (r"^payload$", "原始 JSON 全量存储", ""),
    # ── Primary / foreign keys ──
    (r"^id$", "主键标识", ""),
    # ── Store / organization (before the generic _id$ rule) ──
    (r"^(site_id|shop_id|store_id)$", "门店标识", ""),
    (r"^(tenant_id|org_id)$", "租户/组织标识", ""),
    (r"(shop_name|site_name|store_name)", "门店名称", ""),
    # ── Timestamps / dates ──
    (r"(^|_)(create|created)(_at|_time|_date)$", "记录创建时间", ""),
    (r"(^|_)(update|updated|modify)(_at|_time|_date)$", "记录更新时间", ""),
    (r"(^|_)(delete|deleted)(_at|_time|_date)$", "逻辑删除时间", ""),
    (r"(^|_)(start|begin)(_at|_time|_date)$", "起始时间", ""),
    (r"(^|_)(end|expire)(_at|_time|_date)$", "结束/过期时间", ""),
    (r"(^|_)entry_time$", "入职/入场时间", ""),
    (r"(^|_)resign_time$", "离职时间", ""),
    (r"_time$", "时间戳字段", ""),
    (r"_date$", "日期字段", ""),
    # ── Generic derived columns (before flags, so derived_flag etc. match here) ──
    (r"^derived_", "ETL 派生计算列", ""),
    (r"^calc_", "计算字段", ""),
    # ── Status / flags ──
    (r"(^|_)is_delete$", "逻辑删除标志", ""),
    (r"^is_", "布尔标志位", ""),
    (r"(^|_)status$", "状态码", ""),
    (r"_status$", "状态字段", ""),
    (r"_enabled$", "启用/禁用开关", ""),
    (r"_flag$", "标志位", ""),
    # ── Money / price ──
    (r"(price|amount|fee|cost|money|balance|total)", "金额/价格相关", ""),
    (r"(discount|coupon|refund)", "优惠/退款相关", ""),
    # ── People ──
    (r"(real_name|nickname|^name$)", "姓名/昵称", ""),
    (r"(mobile|phone|tel)", "联系电话", ""),
    (r"(avatar|photo|image)", "头像/图片 URL", ""),
    (r"(gender|sex)", "性别", ""),
    (r"(birth|birthday)", "出生日期", ""),
    (r"(height|weight)", "身高/体重", ""),
    # ── Common nested-object prefixes ──
    (r"^siteProfile\.", "门店档案嵌套属性", ""),
    (r"^memberInfo\.", "会员信息嵌套属性", ""),
    (r"^assistantInfo\.", "助教信息嵌套属性", ""),
    (r"^tableInfo\.", "台桌信息嵌套属性", ""),
    (r"^orderInfo\.", "订单信息嵌套属性", ""),
    (r"^payInfo\.", "支付信息嵌套属性", ""),
    # ── Sorting / display ──
    (r"(sort|order|rank|seq)", "排序/序号", ""),
    (r"(remark|memo|note|comment|introduce)", "备注/说明文本", ""),
    (r"(url|link|qrcode|qr_code)", "链接/二维码", ""),
    # ── Generic ID suffix (after the specific ID rules above) ──
    (r"_id$", "关联实体 ID外键", ""),
]
def _guess_field_purpose(field_name: str, table_name: str, layer: str) -> tuple[str, str]:
    """Guess a field's business purpose from its (lowercased) name.

    Scans _FIELD_GUESS_RULES in order and returns the first matching
    (purpose, confidence) pair; falls back to a layer-tagged placeholder.
    ``table_name`` is accepted for context but not currently consulted.
    """
    lowered = field_name.lower()
    hit = next(
        (
            (purpose, confidence)
            for pattern, purpose, confidence in _FIELD_GUESS_RULES
            if _re.search(pattern, lowered)
        ),
        None,
    )
    if hit is not None:
        return hit
    return f"待分析({layer}层字段)", ""
def _format_samples(samples: list[str], max_show: int = 5) -> str:
    """Render up to ``max_show`` sample values as inline code spans.

    Each value is pipe-escaped and clipped to 30 characters; when more
    samples exist than are shown, a total-count suffix is appended.
    """
    if not samples:
        return ""

    def clip(raw: str) -> str:
        # Escape first, then truncate on the escaped form (matches column width).
        escaped = _esc(raw)
        return escaped[:27] + "..." if len(escaped) > 30 else escaped

    rendered = ", ".join(f"`{clip(s)}`" for s in samples[:max_show])
    if len(samples) > max_show:
        rendered += f" …共{len(samples)}"
    return rendered
def _is_enum_like(samples: list[str], total_records: int) -> bool:
"""判断字段是否像枚举(不同值少且记录数足够多)"""
if total_records < 5:
return False
return 1 < len(samples) <= 8
def generate_report(data_dir: Path) -> str:
    """Build the full Markdown report from the artifacts under ``data_dir``.

    Reads collection_manifest.json (required), plus per-table files under
    field_mappings/, json_trees/, db_schemas/ and bd_descriptions/ (each
    optional — missing files degrade to empty sections).

    Raises:
        FileNotFoundError: if collection_manifest.json is absent.
    """
    manifest = load_json(data_dir / "collection_manifest.json")
    if not manifest:
        raise FileNotFoundError(f"找不到 collection_manifest.json: {data_dir}")
    tables = manifest["tables"]
    now = datetime.now()
    lines: list[str] = []

    def w(s: str = ""):
        # Local writer: append one output line (default: blank separator line).
        lines.append(s)

    # ── Report header ──
    w("# 飞球连接器 — 数据流结构分析报告")
    w()
    w(f"> 生成时间:{now.strftime('%Y-%m-%d %H:%M:%S')} CST")
    w(f"> 分析范围飞球feiqiu连接器{len(tables)} 张 ODS 表")
    w("> 数据来源API JSON 采样 + PostgreSQL ODS/DWD 表结构 + 三层字段映射 + BD_manual 业务文档")
    w()
    # ── 1. Overview table (includes the API JSON field-count column) ──
    w("## 1. 总览")
    w()
    w("| # | ODS 表名 | 业务描述 | 采样记录数 | API JSON 字段数 | ODS 列数 | DWD 目标表 | DWD 总列数 |")
    w("|---|---------|---------|-----------|---------------|---------|-----------|-----------|")
    total_records = 0
    total_ods_cols = 0
    total_dwd_cols = 0
    total_json_fields = 0
    for i, t in enumerate(tables, 1):
        dwd_names = ", ".join(t["dwd_tables"]) if t["dwd_tables"] else ""
        json_fc = t.get("json_field_count", 0)
        w(f"| {i} | `{t['table']}` | {t['description']} | {t['record_count']} | {json_fc} | {t['ods_column_count']} | {dwd_names} | {t['dwd_column_count']} |")
        total_records += t["record_count"]
        total_ods_cols += t["ods_column_count"]
        total_dwd_cols += t["dwd_column_count"]
        total_json_fields += json_fc
    # Totals row of the overview table.
    w(f"| | **合计** | | **{total_records}** | **{total_json_fields}** | **{total_ods_cols}** | | **{total_dwd_cols}** |")
    w()
    # ── 1.1 Field-difference report (API↔ODS↔DWD) ──
    _write_field_diff_report(w, data_dir, tables)
    # ── 2. Global statistics ──
    w("## 2. 全局统计")
    w()
    # 2.1 JSON→ODS mapping coverage, accumulated per table.
    total_json = 0
    total_mapped = 0
    per_table_stats: list[dict] = []
    for t in tables:
        fm = load_json(data_dir / "field_mappings" / f"{t['table']}.json")
        if not fm or "json_to_ods" not in fm:
            # No mapping file: record a zeroed row so the table stays complete.
            per_table_stats.append({
                "table": t["table"], "description": t["description"],
                "json_count": 0, "mapped": 0, "unmapped": 0, "pct": "",
            })
            continue
        j2o = fm["json_to_ods"]
        json_count = len(j2o)
        mapped = sum(1 for m in j2o if m.get("ods_col") is not None)
        unmapped = json_count - mapped
        pct = f"{mapped / json_count * 100:.1f}%" if json_count > 0 else ""
        per_table_stats.append({
            "table": t["table"], "description": t["description"],
            "json_count": json_count, "mapped": mapped, "unmapped": unmapped, "pct": pct,
        })
        total_json += json_count
        total_mapped += mapped
    total_unmapped = total_json - total_mapped
    w("### 2.1 JSON→ODS 映射覆盖")
    w()
    w(f"- JSON 字段总数:{total_json}")
    if total_json > 0:
        w(f"- 已映射到 ODS 列:{total_mapped}{total_mapped / total_json * 100:.1f}%")
        w(f"- 仅存于 payload{total_unmapped}{total_unmapped / total_json * 100:.1f}%")
    else:
        # Guard against division by zero when nothing was mapped at all.
        w("- 已映射到 ODS 列0")
        w("- 仅存于 payload0")
    w()
    # 2.2 ODS→DWD mapping coverage (total DWD column count only).
    w("### 2.2 ODS→DWD 映射覆盖")
    w()
    w(f"- DWD 列总数:{total_dwd_cols}")
    w()
    # 2.3 Per-table coverage table (includes the business-description column).
    w("### 2.3 各表 JSON→ODS 映射覆盖率")
    w()
    w("| ODS 表名 | 业务描述 | JSON 字段数 | 已映射 | 仅 payload | 覆盖率 |")
    w("|---------|---------|-----------|-------|-----------|-------|")
    # Sort by coverage descending; tables without a pct ("" → key 0) sort last,
    # since real coverages map to negative keys.
    sorted_stats = sorted(per_table_stats, key=lambda x: (0 if x["pct"] == "" else -float(x["pct"].rstrip("%"))))
    for s in sorted_stats:
        w(f"| `{s['table']}` | {s['description']} | {s['json_count']} | {s['mapped']} | {s['unmapped']} | {s['pct']} |")
    w()
    # ── 3. Per-table detail sections ──
    w("## 3. 逐表详情")
    w()
    for idx, t in enumerate(tables, 1):
        table_name = t["table"]
        fm = load_json(data_dir / "field_mappings" / f"{table_name}.json")
        jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
        ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
        bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
        # Anchor IDs for intra-document links; fall back to derived names.
        anchors = fm.get("anchors", {}) if fm else {}
        api_anchor = anchors.get("api", f"api-{table_name}")
        ods_anchor = anchors.get("ods", f"ods-{table_name}")
        dwd_anchors = anchors.get("dwd", {})
        dwd_tables_list = t.get("dwd_tables", [])
        json_fc = t.get("json_field_count", 0)
        w(f"### 3.{idx} {table_name}{t['description']}")
        w()
        w(f"- 任务代码:`{t['task_code']}`")
        w(f"- 采样记录数:{t['record_count']}")
        w(f"- API JSON 字段数:{json_fc}")
        w(f"- ODS 列数:{t['ods_column_count']}")
        if dwd_tables_list:
            w(f"- DWD 目标表:{', '.join(dwd_tables_list)}")
        else:
            w("- DWD 目标表:—(仅 ODS 落地)")
        w()
        # ── API source-field section ──
        _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor)
        # ── ODS table-structure section ──
        _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors)
        # ── DWD table-structure section(s), one per downstream table ──
        for dwd_name in dwd_tables_list:
            dwd_anchor = dwd_anchors.get(dwd_name, f"dwd-{dwd_name}")
            dwd_schema = load_json(data_dir / "db_schemas" / f"dwd_{dwd_name}.json")
            _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name)
    return "\n".join(lines)
def _write_field_diff_report(w, data_dir: Path, tables: list[dict]):
    """Emit the API↔ODS↔DWD field-difference report (summary + per-table detail).

    First writes a summary table with one row per ODS table, where non-zero
    counts link to per-table detail subsections; then writes those detail
    subsections for every table that has at least one difference.

    NOTE(review): several string literals below contain empty "" where a
    visible character (e.g. an em dash or a list separator) seems to have
    been stripped with the file's non-ASCII characters — confirm against
    version history before relying on the rendered output.
    """
    w("### 1.1 API↔ODS↔DWD 字段对比差异")
    w()
    w("以下汇总各表在三层之间的字段差异(点击数字跳转至分表详情):")
    w()
    w("| ODS 表名 | API→ODS 未映射 | ODS 无 JSON 源 | ODS→DWD 未映射 | DWD 无 ODS 源 | 主要差异原因 |")
    w("|---------|--------------|--------------|--------------|-------------|------------|")
    # Per-table diff data, collected for both the summary row and the
    # detail subsection. ETL bookkeeping columns are excluded from diffs.
    etl_meta_cols = {"source_file", "source_endpoint", "fetched_at", "payload", "content_hash"}
    diff_rows: list[dict] = []
    for t in tables:
        table_name = t["table"]
        fm = load_json(data_dir / "field_mappings" / f"{table_name}.json")
        if not fm:
            # No mapping file: emit a placeholder row and skip the detail pass.
            w(f"| `{table_name}` | — | — | — | — | 无映射数据 |")
            diff_rows.append(None)
            continue
        anchors = fm.get("anchors", {})
        api_anchor = anchors.get("api", f"api-{table_name.replace('_', '-')}")
        ods_anchor = anchors.get("ods", f"ods-{table_name.replace('_', '-')}")
        dwd_anchors = anchors.get("dwd", {})
        diff_anchor = f"diff-{table_name.replace('_', '-')}"
        j2o = fm.get("json_to_ods", [])
        o2d = fm.get("ods_to_dwd", {})
        d2o = fm.get("dwd_to_ods", {})
        # ── API→ODS unmapped fields, split flat vs nested (dotted paths) ──
        api_unmapped_flat: list[str] = []
        api_unmapped_nested: list[str] = []
        for m in j2o:
            if m.get("ods_col") is None:
                jp = m.get("json_path", "")
                if "." in jp:
                    api_unmapped_nested.append(jp)
                else:
                    api_unmapped_flat.append(jp)
        api_unmapped_total = len(api_unmapped_flat) + len(api_unmapped_nested)
        # ── ODS columns with no JSON source (and not ETL metadata) ──
        ods_schema = load_json(data_dir / "db_schemas" / f"ods_{table_name}.json")
        ods_mapped_cols = {m["ods_col"] for m in j2o if m.get("ods_col")}
        ods_no_json_fields: list[str] = []
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_mapped_cols and col["name"] not in etl_meta_cols:
                    ods_no_json_fields.append(col["name"])
        # ── ODS columns with no DWD target ──
        ods_cols_with_dwd = set(o2d.keys())
        ods_no_dwd_fields: list[str] = []
        if ods_schema and "columns" in ods_schema:
            for col in ods_schema["columns"]:
                if col["name"] not in ods_cols_with_dwd and col["name"] not in etl_meta_cols:
                    ods_no_dwd_fields.append(col["name"])
        # ── DWD columns with no ODS source (e.g. SCD2 / derived columns) ──
        # NOTE(review): the "" compared against ods_source was probably a
        # placeholder character (e.g. "—") before Unicode stripping — confirm.
        dwd_no_ods_fields: list[tuple[str, str]] = []  # (dwd_table, dwd_col)
        for dwd_name, entries in d2o.items():
            for entry in entries:
                if entry.get("ods_source") == "":
                    dwd_no_ods_fields.append((dwd_name, entry["dwd_col"]))
        # Short human-readable summary of why differences exist.
        reasons: list[str] = []
        if api_unmapped_nested:
            reasons.append(f"嵌套对象 {len(api_unmapped_nested)}")
        if api_unmapped_flat:
            reasons.append(f"平层未映射 {len(api_unmapped_flat)}")
        if dwd_no_ods_fields:
            reasons.append(f"SCD2/派生列 {len(dwd_no_ods_fields)}")
        reason_str = "".join(reasons) if reasons else ""

        def _cell(count: int) -> str:
            # Summary cell: "0" or a count linking to this table's detail anchor.
            if count == 0:
                return "0"
            return f"[{count}](#{diff_anchor})"

        w(f"| `{table_name}` | {_cell(api_unmapped_total)} | {_cell(len(ods_no_json_fields))} | {_cell(len(ods_no_dwd_fields))} | {_cell(len(dwd_no_ods_fields))} | {reason_str} |")
        diff_rows.append({
            "table_name": table_name,
            "diff_anchor": diff_anchor,
            "api_anchor": api_anchor,
            "ods_anchor": ods_anchor,
            "dwd_anchors": dwd_anchors,
            "api_unmapped_flat": api_unmapped_flat,
            "api_unmapped_nested": api_unmapped_nested,
            "ods_no_json_fields": ods_no_json_fields,
            "ods_no_dwd_fields": ods_no_dwd_fields,
            "dwd_no_ods_fields": dwd_no_ods_fields,
        })
    w()
    # ── Per-table detail subsections (only tables with differences) ──
    sub_idx = 0
    for row in diff_rows:
        if row is None:
            continue
        has_any = (row["api_unmapped_flat"] or row["api_unmapped_nested"]
                   or row["ods_no_json_fields"] or row["ods_no_dwd_fields"]
                   or row["dwd_no_ods_fields"])
        if not has_any:
            continue
        sub_idx += 1
        table_name = row["table_name"]
        w(f'<a id="{row["diff_anchor"]}"></a>')
        w()
        w(f"#### 1.1.{sub_idx} {table_name} 字段差异明细")
        w()
        api_anchor = row["api_anchor"]
        ods_anchor = row["ods_anchor"]
        dwd_anchors = row["dwd_anchors"]
        # Auxiliary data: json_trees (sample values), bd_descriptions (docs).
        jt = load_json(data_dir / "json_trees" / f"{table_name}.json")
        bd = load_json(data_dir / "bd_descriptions" / f"{table_name}.json")
        jt_lookup: dict[str, dict] = {}
        if jt and "fields" in jt:
            for fld in jt["fields"]:
                jt_lookup[fld["path"]] = fld
        ods_descs = bd.get("ods_fields", {}) if bd else {}
        dwd_descs_all = bd.get("dwd_fields", {}) if bd else {}

        def _sample_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Render sample values from json_trees for an API-layer field.

            Returns "" for non-API layers (``dwd_tbl`` is currently unused).
            """
            if layer == "API":
                entry = jt_lookup.get(field_name, {})
                samples = entry.get("samples", [])
                total_recs = entry.get("total_records", 0)
                if not samples:
                    # Fall back to the legacy single-sample key.
                    single = entry.get("sample", "")
                    if single:
                        samples = [str(single)]
                if _is_enum_like(samples, total_recs):
                    return ", ".join(f"`{_esc(s)}`" for s in samples[:5])
                if samples:
                    return _format_samples(samples, max_show=3)
            return ""

        def _desc_str(field_name: str, layer: str, dwd_tbl: str = "") -> str:
            """Look up a business description in bd_descriptions, clipped to 40 chars."""
            # Normalize: last dotted segment, array markers removed, lowercased.
            key = field_name.split(".")[-1].replace("[]", "").lower()
            if layer in ("ODS", "API"):
                desc = ods_descs.get(key, "")
            elif layer == "DWD" and dwd_tbl:
                desc = dwd_descs_all.get(dwd_tbl, {}).get(key, "")
            else:
                desc = ""
            if desc and len(desc) > 40:
                desc = desc[:37] + "..."
            return _esc(desc)

        # ── API→ODS unmapped (flat fields) ──
        if row["api_unmapped_flat"]:
            w(f"**API→ODS 未映射(平层)** — {len(row['api_unmapped_flat'])}")
            w()
            w("| # | JSON 字段 | 推测用途 | 置信度 | 示例值 | 说明 | 状态 |")
            w("|---|----------|---------|-------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_flat"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "API")
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | **[`{_esc(f)}`](#{api_anchor})** | {_esc(purpose)} | {conf} | {sample} | {desc} | **⚠️ 未映射** |")
            w()
        # ── API→ODS unmapped (nested objects), collapsed in a <details> block ──
        if row["api_unmapped_nested"]:
            w(f"<details><summary>API→ODS 未映射(嵌套对象)— {len(row['api_unmapped_nested'])} 个</summary>")
            w()
            w("| # | JSON 字段 | 推测用途 | 置信度 | 示例值 | 说明 | 状态 |")
            w("|---|----------|---------|-------|-------|------|------|")
            for i, f in enumerate(row["api_unmapped_nested"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "API")
                sample = _sample_str(f, "API")
                desc = _desc_str(f, "API")
                w(f"| {i} | [`{_esc(f)}`](#{api_anchor}) | {_esc(purpose)} | {conf} | {sample} | {desc} | 📦 嵌套 |")
            w()
            w("</details>")
            w()
        # ── ODS columns with no JSON source ──
        if row["ods_no_json_fields"]:
            w(f"**ODS 无 JSON 源** — {len(row['ods_no_json_fields'])}")
            w()
            w("| # | ODS 列 | 推测用途 | 置信度 | 说明 | 状态 |")
            w("|---|-------|---------|-------|------|------|")
            for i, f in enumerate(row["ods_no_json_fields"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "ODS")
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {_esc(purpose)} | {conf} | {desc} | **⚠️ 无 JSON 源** |")
            w()
        # ── ODS columns with no DWD target ──
        if row["ods_no_dwd_fields"]:
            w(f"**ODS→DWD 未映射** — {len(row['ods_no_dwd_fields'])}")
            w()
            w("| # | ODS 列 | 推测用途 | 置信度 | 说明 | 状态 |")
            w("|---|-------|---------|-------|------|------|")
            for i, f in enumerate(row["ods_no_dwd_fields"], 1):
                purpose, conf = _guess_field_purpose(f, table_name, "ODS")
                desc = _desc_str(f, "ODS")
                w(f"| {i} | **[`{_esc(f)}`](#{ods_anchor})** | {_esc(purpose)} | {conf} | {desc} | **⚠️ 无 DWD 目标** |")
            w()
        # ── DWD columns with no ODS source ──
        if row["dwd_no_ods_fields"]:
            w(f"**DWD 无 ODS 源** — {len(row['dwd_no_ods_fields'])}")
            w()
            w("| # | DWD 表 | DWD 列 | 推测用途 | 置信度 | 说明 | 状态 |")
            w("|---|-------|-------|---------|-------|------|------|")
            for i, (dwd_name, dwd_col) in enumerate(row["dwd_no_ods_fields"], 1):
                dwd_a = dwd_anchors.get(dwd_name, f"dwd-{dwd_name.replace('_', '-')}")
                purpose, conf = _guess_field_purpose(dwd_col, table_name, "DWD")
                desc = _desc_str(dwd_col, "DWD", dwd_tbl=dwd_name)
                w(f"| {i} | {dwd_name} | **[`{_esc(dwd_col)}`](#{dwd_a})** | {_esc(purpose)} | {conf} | {desc} | **⚠️ 无 ODS 源** |")
            w()
        # Trailing blank line after each table's detail subsection.
        w()
def _write_api_section(w, fm, jt, bd, table_name, api_anchor, ods_anchor):
    """Emit the API source-field section for one table.

    Writes an anchor, a coverage summary, and one table row per JSON field
    with its ODS mapping link, business description (from bd_descriptions),
    and merged notes + sample values (from json_trees).
    """
    w(f'<a id="{api_anchor}"></a>')
    w()
    w(f"#### API 源字段 — {table_name} [🔗 ODS](#{ods_anchor})")
    w()
    if not fm or "json_to_ods" not in fm:
        w("_无 field_mappings 数据_")
        w()
        return
    j2o = fm["json_to_ods"]
    # json_tree lookup keyed by JSON path (entries may carry multi-value samples).
    jt_lookup: dict[str, dict] = {}
    if jt and "fields" in jt:
        for f in jt["fields"]:
            jt_lookup[f["path"]] = f
    # BD_manual ODS descriptions, cross-referenced for JSON fields by leaf name.
    ods_descs = bd.get("ods_fields", {}) if bd else {}
    mapped_count = sum(1 for m in j2o if m.get("ods_col") is not None)
    total_count = len(j2o)
    if total_count > 0:
        w(f"已映射 {mapped_count}/{total_count},覆盖率 {mapped_count / total_count * 100:.1f}%")
    else:
        w("无字段")
    w()
    w("| # | JSON 字段 | 类型 | → ODS 列 | 业务描述 | 示例值与说明 |")
    w("|---|----------|------|---------|---------|------------|")
    for i, m in enumerate(j2o, 1):
        json_path = m["json_path"]
        json_type = m.get("json_type", "")
        ods_col = m.get("ods_col")
        match_type = m.get("match_type", "")
        occurrence_pct = m.get("occurrence_pct", 0)
        # Sample values from json_tree (prefer the multi-sample list).
        jt_entry = jt_lookup.get(json_path, {})
        samples = jt_entry.get("samples", [])
        total_recs = jt_entry.get("total_records", 0)
        if not samples:
            single = jt_entry.get("sample", "")
            if single:
                samples = [str(single)]
        # ODS column cell: anchor link, or an unmapped warning.
        if ods_col:
            ods_link = f"[`{ods_col}`](#{ods_anchor})"
        else:
            ods_link = "⚠️ 未映射"
        # Business description looked up by the JSON leaf name (lowercased,
        # array markers stripped), clipped to 60 chars.
        leaf = json_path.split(".")[-1].replace("[]", "").lower()
        biz_desc = ods_descs.get(leaf, "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        # Merge notes (nesting, match quality, occurrence) with sample values.
        notes_parts: list[str] = []
        if json_path.startswith("siteProfile.") or ("." in json_path and match_type == "unmapped"):
            notes_parts.append("📦 嵌套对象")
        if match_type == "case_insensitive":
            notes_parts.append("大小写匹配")
        if occurrence_pct < 100:
            notes_parts.append(f"出现率 {occurrence_pct:.0f}%")
        # Sample rendering: enum-like fields show their value set, others a few samples.
        if _is_enum_like(samples, total_recs):
            notes_parts.append(f"枚举值: {', '.join(f'`{_esc(s)}`' for s in samples[:8])}")
        elif samples:
            notes_parts.append(f"示例: {_format_samples(samples)}")
        # NOTE(review): joining on "" leaves no separator between note parts —
        # likely a stripped non-ASCII separator character; confirm.
        note_str = "".join(notes_parts) if notes_parts else ""
        w(f"| {i} | `{_esc(json_path)}` | {json_type} | {ods_link} | {biz_desc} | {note_str} |")
    w()
def _write_ods_section(w, fm, ods_schema, bd, table_name, ods_anchor, api_anchor, dwd_anchors):
    """Emit the ODS table-structure section for one table.

    Each column row shows its upstream JSON source and downstream DWD
    target(s) as anchor links, plus the BD_manual business description.
    """
    w(f'<a id="{ods_anchor}"></a>')
    w()
    w(f"#### ODS 表结构 — ods.{table_name} [🔗 API](#{api_anchor})")
    w()
    if not ods_schema or "columns" not in ods_schema:
        w("_无 DB schema 数据_")
        w()
        return
    # Reverse lookup from json_to_ods: ods_col → first json_path that maps to it.
    ods_to_json: dict[str, str] = {}
    if fm and "json_to_ods" in fm:
        for m in fm["json_to_ods"]:
            if m.get("ods_col"):
                # setdefault keeps the first mapping when several paths share a column.
                ods_to_json.setdefault(m["ods_col"], m["json_path"])
    # Forward lookup: ods_col → list of DWD targets.
    ods_to_dwd: dict[str, list[dict]] = {}
    if fm and "ods_to_dwd" in fm:
        ods_to_dwd = fm["ods_to_dwd"]
    # BD_manual ODS descriptions.
    ods_descs = bd.get("ods_fields", {}) if bd else {}
    cols = ods_schema["columns"]
    # NOTE(review): this line renders the bare column count — surrounding label
    # text may have been stripped with the file's non-ASCII characters; confirm.
    w(f"{len(cols)}")
    w()
    w("| # | ODS 列名 | 类型 | ← JSON 源 | → DWD 目标 | 业务描述 |")
    w("|---|---------|------|----------|-----------|---------|")
    for i, col in enumerate(cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]
        # ← JSON source link (empty cell when no JSON field maps here).
        json_src = ods_to_json.get(col_name)
        if json_src:
            json_link = f"[`{_esc(json_src)}`](#{api_anchor})"
        else:
            json_link = ""
        # → DWD target links, one per downstream column.
        dwd_targets = ods_to_dwd.get(col_name, [])
        if dwd_targets:
            dwd_links = []
            for dt in dwd_targets:
                dwd_tbl = dt["dwd_table"]
                dwd_col = dt["dwd_col"]
                dwd_anc = dwd_anchors.get(dwd_tbl, f"dwd-{dwd_tbl}")
                dwd_links.append(f"[`{dwd_tbl}.{dwd_col}`](#{dwd_anc})")
            dwd_link = ", ".join(dwd_links)
        else:
            dwd_link = ""
        # Business description, clipped to 60 chars.
        biz_desc = ods_descs.get(col_name.lower(), "")
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        w(f"| {i} | `{col_name}` | {col_type} | {json_link} | {dwd_link} | {biz_desc} |")
    w()
def _write_dwd_section(w, fm, dwd_schema, bd, dwd_name, dwd_anchor, ods_anchor, table_name):
    """Emit the DWD table-structure section for one downstream table.

    Each column row shows its ODS source link, the transform/mapping type,
    and a business description resolved in priority order:
    BD_manual > mapping note > DB column comment.
    """
    w(f'<a id="{dwd_anchor}"></a>')
    w()
    w(f"#### DWD 表结构 — dwd.{dwd_name} [🔗 ODS](#{ods_anchor})")
    w()
    if not dwd_schema or "columns" not in dwd_schema:
        w("_无 DB schema 数据_")
        w()
        return
    # dwd_to_ods lookup: dwd_col → mapping entry for this DWD table.
    dwd_to_ods_map: dict[str, dict] = {}
    if fm and "dwd_to_ods" in fm and dwd_name in fm["dwd_to_ods"]:
        for entry in fm["dwd_to_ods"][dwd_name]:
            dwd_to_ods_map[entry["dwd_col"]] = entry
    # BD_manual DWD descriptions for this table.
    dwd_descs = {}
    if bd and "dwd_fields" in bd:
        dwd_descs = bd["dwd_fields"].get(dwd_name, {})
    cols = dwd_schema["columns"]
    # NOTE(review): renders the bare column count — label text may have been
    # stripped with the file's non-ASCII characters; confirm.
    w(f"{len(cols)}")
    w()
    w("| # | DWD 列名 | 类型 | ← ODS 来源 | 转换 | 业务描述 |")
    w("|---|---------|------|----------|------|---------|")
    for i, col in enumerate(cols, 1):
        col_name = col["name"]
        col_type = col["data_type"]
        mapping = dwd_to_ods_map.get(col_name)
        if mapping:
            ods_src = mapping.get("ods_source", "")
            # NOTE(review): `ods_src != ""` is redundant after `ods_src` — the
            # comparison likely targeted a placeholder (e.g. "—") before
            # Unicode stripping; confirm against history.
            ods_link = f"[`{ods_src}`](#{ods_anchor})" if ods_src and ods_src != "" else ""
            transform = mapping.get("mapping_type", "")
            note = mapping.get("note", "")
        else:
            ods_link = ""
            transform = ""
            note = ""
        # SCD2 / ETL bookkeeping columns are labelled as ETL-generated.
        if col_name in ("valid_from", "valid_to", "is_current", "etl_loaded_at", "etl_batch_id"):
            transform = "ETL 生成"
        # Business description: BD_manual first, then mapping note, then DB comment.
        biz_desc = dwd_descs.get(col_name.lower(), "")
        if not biz_desc and note:
            biz_desc = note
        if not biz_desc:
            db_comment = col.get("comment", "")
            if db_comment:
                # Extract the text following the 【说明】 marker in DB comments.
                if "【说明】" in db_comment:
                    desc_part = db_comment.split("【说明】")[1]
                    # NOTE(review): `"" in desc_part` is always True and
                    # `split("")` raises ValueError — the separator character
                    # was almost certainly stripped with the file's non-ASCII
                    # characters. This branch is broken as-is; restore the
                    # original separator from history.
                    if "" in desc_part:
                        desc_part = desc_part.split("")[0]
                    biz_desc = desc_part.strip().rstrip("").strip()
                else:
                    biz_desc = db_comment
        if biz_desc and len(biz_desc) > 60:
            biz_desc = biz_desc[:57] + "..."
        biz_desc = _esc(biz_desc)
        w(f"| {i} | `{col_name}` | {col_type} | {ods_link} | {_esc(transform)} | {biz_desc} |")
    w()
def main() -> None:
    """CLI entry point: load .env, resolve the data directory, write the report.

    Exits early (with a message, no exception) when the data directory does
    not exist. The report file name embeds a timestamp and is written into
    the same directory the data was read from.
    """
    # .env values must not override variables already set in the environment.
    load_dotenv(Path(".env"), override=False)
    parser = build_parser()
    args = parser.parse_args()
    data_dir = resolve_data_dir(args.output_dir)
    if not data_dir.exists():
        print(f"错误:数据目录不存在: {data_dir}")
        return
    print(f"读取数据目录: {data_dir}")
    report = generate_report(data_dir)
    now = datetime.now()
    filename = f"dataflow_{now.strftime('%Y-%m-%d_%H%M%S')}.md"
    output_path = data_dir / filename
    # pathlib handles open/close; equivalent to open(..., "w", encoding="utf-8").
    output_path.write_text(report, encoding="utf-8")
    separator = "=" * 60
    print(f"\n{separator}")
    print("报告生成完成")  # fixed: was an f-string with no placeholders
    print(separator)
    print(f" 输出路径: {output_path}")
    print(f" 文件大小: {output_path.stat().st_size / 1024:.1f} KB")
    print(separator)


if __name__ == "__main__":
    main()