Files
Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/compare_api_ods_v2.py

462 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
API reference docs vs actual ODS table structure comparison script (v2).

Extracts field names from the JSON samples in docs/api-reference/*.md,
queries the actual columns of the PostgreSQL ``billiards_ods`` schema,
and writes a diff report (JSON + Markdown) plus ALTER SQL.

Usage: python scripts/compare_api_ods_v2.py
"""
import json
import os
import re
import sys
from datetime import datetime

# Project root (two levels above this script); prepended to sys.path so
# sibling project modules are importable when run as a script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

from dotenv import load_dotenv

# Load DB credentials (PG_DSN etc.) from the project .env before psycopg2 use.
load_dotenv(os.path.join(ROOT, ".env"))

import psycopg2

# ODS meta columns (ETL bookkeeping), not sourced from the API payloads —
# excluded from the business-column comparison.
ODS_META_COLS = {
    "source_file", "source_endpoint", "fetched_at",
    "payload", "content_hash",
}
def load_registry():
    """Load and return the API registry from docs/api-reference/api_registry.json."""
    registry_path = os.path.join(ROOT, "docs", "api-reference", "api_registry.json")
    with open(registry_path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)
def extract_fields_from_md(md_path, api_id):
    """Extract field names (lowercased) from the JSON samples in an API doc.

    Scans every ```json fenced block in *md_path*, keeps the largest
    parseable JSON object (the response sample is normally the biggest),
    and returns a ``(fields, has_nested, error)`` tuple.

    Two special layouts are recognized:

    * settlement docs shaped like ``{"siteProfile": ..., "settleList": {...}}``
      -> only the inner ``settleList`` key names are returned;
    * ``{"goodsCategoryList": [...]}`` array wrappers -> the key names of the
      first element (a flattened category node) are returned.

    NOTE: *api_id* is accepted for caller symmetry but not used here.
    """
    with open(md_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # All fenced ```json ... ``` code blocks in the document.
    raw_blocks = re.findall(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
    if not raw_blocks:
        return None, None, "无 JSON 样例"

    # Keep the largest parseable dict: the response sample usually wins.
    best = None
    best_size = -1
    for raw in raw_blocks:
        try:
            candidate = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(candidate, dict):
            continue
        size = len(str(candidate))
        if size > best_size:
            best, best_size = candidate, size

    if best is None:
        return None, None, "无法解析 JSON 样例"

    fields = set()

    # Nested settlement layout: only settleList's inner keys become columns.
    if "siteProfile" in best and "settleList" in best:
        inner = best.get("settleList", {})
        if isinstance(inner, dict):
            fields.update(key.lower() for key in inner)
        return fields, True, None

    # Array-wrapped category tree: take the first node's key names.
    if "goodsCategoryList" in best and isinstance(best["goodsCategoryList"], list):
        nodes = best["goodsCategoryList"]
        if nodes and isinstance(nodes[0], dict):
            _extract_flat(nodes[0], fields)
        return fields, True, None

    # Standard flat response: every top-level key is a column candidate.
    fields.update(key.lower() for key in best)
    return fields, False, None
def _extract_flat(obj, fields):
"""递归提取字典的标量字段名(跳过数组/嵌套对象值,但保留键名)"""
if not isinstance(obj, dict):
return
for k, v in obj.items():
fields.add(k.lower())
def get_all_ods_columns(conn):
    """Fetch column metadata for every table in the ``billiards_ods`` schema.

    Returns ``{table_name: {column_name: {"data_type": str,
    "ordinal_position": int}}}`` built from information_schema.columns.
    """
    cursor = conn.cursor()
    cursor.execute("""
        SELECT table_name, column_name, data_type, ordinal_position
        FROM information_schema.columns
        WHERE table_schema = 'billiards_ods'
        ORDER BY table_name, ordinal_position
    """)
    records = cursor.fetchall()
    cursor.close()

    catalog = {}
    for table, column, dtype, position in records:
        catalog.setdefault(table, {})[column] = {
            "data_type": dtype,
            "ordinal_position": position,
        }
    return catalog
def guess_pg_type(name):
    """Guess a PostgreSQL column type from a field name.

    Used to build ``ALTER TABLE ... ADD COLUMN`` suggestions; the result is
    a name-based heuristic and must be reviewed by a human before execution.

    Precedence: id-like -> numeric(18,2) -> timestamp -> boolean -> integer
    -> text (fallback).
    """
    n = name.lower()
    # Identifier-like names (id, *_id, *id) map to bigint.
    if n == "id" or n.endswith("_id") or n.endswith("id"):
        return "bigint"
    money_kw = ["amount", "money", "price", "cost", "fee", "discount",
                "deduct", "balance", "charge", "sale", "refund",
                "promotion", "adjust", "rounding", "prepay", "income",
                "royalty", "grade", "point", "stock", "num"]
    for kw in money_kw:
        if kw in n:
            return "numeric(18,2)"
    if "time" in n or "date" in n:
        return "timestamp without time zone"
    # BUG FIX: the camelCase check previously tested n[2].isupper() on the
    # already-lowercased name, so it could never match and fields like
    # "isVip" fell through (usually to text). Test the ORIGINAL name's
    # third character instead.
    if n.startswith("is_") or (n.startswith("is") and len(name) > 2 and name[2].isupper()):
        return "boolean"
    if n.startswith("able_") or n.startswith("can"):
        return "boolean"
    int_kw = ["status", "type", "sort", "count", "seconds", "level",
              "channel", "method", "way", "enabled", "switch", "delete",
              "first", "single", "trash", "confirm", "clock", "cycle",
              "delay", "free", "virtual", "online", "show", "audit",
              "freeze", "send", "required", "scene", "range", "tag",
              "on", "minutes", "number", "duration"]
    for kw in int_kw:
        if kw in n:
            return "integer"
    return "text"
def compare_one(api_entry, md_path, ods_tables):
    """Compare one API's documented JSON fields with its ODS table columns.

    Returns a result dict whose ``status`` is ``skip`` (no backing table or
    flagged endpoint), ``error`` (doc/table problem), ``ok`` (aligned) or
    ``drift`` (the API has fields the ODS table lacks).
    """
    table = api_entry.get("ods_table")
    result = {
        "api_id": api_entry["id"],
        "name_zh": api_entry.get("name_zh", ""),
        "ods_table": table,
    }

    # Endpoints without a backing table, or explicitly flagged, are skipped.
    if not table:
        result.update(status="skip", reason="无对应 ODS 表ods_table=null")
        return result
    if api_entry.get("skip"):
        result.update(status="skip", reason="接口标记为 skip暂不可用")
        return result

    # Pull the documented field set out of the markdown JSON samples.
    api_fields, has_nested, err = extract_fields_from_md(md_path, api_entry["id"])
    if err:
        result.update(status="error", reason=err)
        return result
    if table not in ods_tables:
        result.update(status="error", reason=f"ODS 表 {table} 不存在")
        return result

    columns = ods_tables[table]
    # Business columns only: ETL bookkeeping columns are not API-sourced.
    business_cols = {c for c in columns if c not in ODS_META_COLS}

    api_lower = {f.lower() for f in api_fields}
    ods_lower = {c.lower() for c in business_cols}

    api_only = sorted(api_lower - ods_lower)  # documented but missing in ODS
    ods_only = sorted(ods_lower - api_lower)  # present only in ODS (non-meta)
    matched = sorted(api_lower & ods_lower)

    result.update(
        status="drift" if api_only else "ok",
        has_nested_structure=has_nested,
        api_field_count=len(api_lower),
        ods_biz_col_count=len(business_cols),
        ods_total_col_count=len(columns),
        matched_count=len(matched),
        api_only=api_only,
        api_only_count=len(api_only),
        ods_only=ods_only,
        ods_only_count=len(ods_only),
        matched=matched,
    )
    return result
def generate_alter_sql(results, ods_tables):
    """Build ALTER TABLE statements for every API-only field of drifted tables.

    *ods_tables* is accepted for signature compatibility with the caller but
    is not consulted; everything needed is already present in *results*.
    """
    statements = []
    drifted = (r for r in results if r.get("status") == "drift" and r.get("api_only"))
    for entry in drifted:
        table = entry["ods_table"]
        statements.extend(
            f"ALTER TABLE billiards_ods.{table} "
            f"ADD COLUMN IF NOT EXISTS {column} {guess_pg_type(column)};"
            for column in entry["api_only"]
        )
    return statements
def generate_markdown_report(results, alter_sqls):
    """Generate the Markdown comparison report as a single string.

    Builds a summary table over all *results*, then detail sections for
    drifted tables, ODS-only columns, and the generated ALTER SQL
    (*alter_sqls*, already-formatted statements).
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Report header plus the summary-table header rows.
    lines = [
        "# API 参考文档 vs ODS 实际表结构 对比报告 (v2)",
        "",
        f"> 生成时间:{now}",
        "> 数据来源:`docs/api-reference/*.md` JSON 样例 vs `billiards_ods` 实际列",
        "",
        "---",
        "",
        "## 一、汇总",
        "",
        "| API 接口 | 中文名 | ODS 表 | 状态 | API 字段数 | ODS 业务列数 | 匹配 | API 独有 | ODS 独有 |",
        "|----------|--------|--------|------|-----------|-------------|------|---------|---------|",
    ]
    # Aggregate counters for the summary footer.
    total_api_only = 0
    total_ods_only = 0
    ok_count = 0
    drift_count = 0
    skip_count = 0
    error_count = 0
    for r in results:
        status = r.get("status", "?")
        # skip/error rows render with dashes instead of counts.
        if status == "skip":
            skip_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table', '-')} "
                f"| ⏭️ 跳过 | - | - | - | - | - |"
            )
            continue
        if status == "error":
            error_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table', '-')} "
                f"| ❌ 错误 | - | - | - | - | - |"
            )
            continue
        api_only_n = r.get("api_only_count", 0)
        ods_only_n = r.get("ods_only_count", 0)
        total_api_only += api_only_n
        total_ods_only += ods_only_n
        if status == "ok":
            ok_count += 1
            badge = "✅ 对齐"
        else:
            drift_count += 1
            badge = "⚠️ 漂移"
        lines.append(
            f"| {r['api_id']} | {r['name_zh']} | {r['ods_table']} "
            f"| {badge} | {r['api_field_count']} | {r['ods_biz_col_count']} "
            f"| {r['matched_count']} | {api_only_n} | {ods_only_n} |"
        )
    lines.extend([
        "",
        f"**统计**:对齐 {ok_count} / 漂移 {drift_count} / 跳过 {skip_count} / 错误 {error_count}",
        f"**API 独有字段总计**{total_api_only}(需要 ALTER TABLE ADD COLUMN",
        f"**ODS 独有列总计**{total_ods_only}API 中不存在,可能是历史遗留或 ETL 派生列)",
        "",
    ])
    # Detail section: per-table field differences for each drifted table.
    drift_results = [r for r in results if r.get("status") == "drift"]
    if drift_results:
        lines.extend(["---", "", "## 二、漂移详情", ""])
        for r in drift_results:
            lines.extend([
                f"### {r['api_id']}{r['name_zh']})→ `{r['ods_table']}`",
                "",
            ])
            if r["api_only"]:
                lines.append("**API 有 / ODS 缺**")
                for f in r["api_only"]:
                    pg_type = guess_pg_type(f)
                    lines.append(f"- `{f}` → 建议类型 `{pg_type}`")
                lines.append("")
            if r["ods_only"]:
                lines.append("**ODS 有 / API 无**(非元列):")
                for f in r["ods_only"]:
                    lines.append(f"- `{f}`")
                lines.append("")
    # ODS-only column details (all tables with status ok or drift).
    ods_only_results = [r for r in results if r.get("ods_only") and r.get("status") in ("ok", "drift")]
    if ods_only_results:
        lines.extend(["---", "", "## 三、ODS 独有列详情API 中不存在)", ""])
        for r in ods_only_results:
            if not r["ods_only"]:
                continue
            lines.extend([
                f"### `{r['ods_table']}`{r['name_zh']}",
                "",
                "| 列名 | 说明 |",
                "|------|------|",
            ])
            for f in r["ods_only"]:
                lines.append(f"| `{f}` | ODS 独有API JSON 样例中不存在 |")
            lines.append("")
    # Generated ALTER statements, fenced as a sql code block.
    if alter_sqls:
        lines.extend([
            "---", "",
            "## 四、ALTER SQL对齐 ODS 表结构)", "",
            "```sql",
            "-- 自动生成的 ALTER TABLE 语句",
            f"-- 生成时间:{now}",
            "-- 注意:类型为根据字段名猜测,请人工复核后执行",
            "",
        ])
        lines.extend(alter_sqls)
        lines.extend(["", "```", ""])
    return "\n".join(lines)
def main():
    """Run the full comparison and write the JSON / Markdown / SQL outputs.

    Requires the PG_DSN environment variable (loaded from .env at import
    time). Exits with status 1 if it is missing.
    """
    dsn = os.environ.get("PG_DSN")
    if not dsn:
        print("错误:未设置 PG_DSN 环境变量", file=sys.stderr)
        sys.exit(1)
    print("连接数据库...")
    conn = psycopg2.connect(dsn)
    print("查询 ODS 表结构...")
    ods_tables = get_all_ods_columns(conn)
    print(f"{len(ods_tables)} 张 ODS 表")
    print("加载 API 注册表...")
    registry = load_registry()
    print(f"{len(registry)} 个 API 端点")
    results = []
    for entry in registry:
        api_id = entry["id"]
        ods_table = entry.get("ods_table")
        md_path = os.path.join(ROOT, "docs", "api-reference", f"{api_id}.md")
        # A missing doc file is recorded as an error result, not a crash.
        if not os.path.exists(md_path):
            results.append({
                "api_id": api_id,
                "name_zh": entry.get("name_zh", ""),
                "ods_table": ods_table,
                "status": "error",
                "reason": f"文档不存在: {md_path}",
            })
            continue
        r = compare_one(entry, md_path, ods_tables)
        results.append(r)
        # Per-endpoint progress line with a status icon and drift counts.
        status_icon = {"ok": "", "drift": "⚠️", "skip": "⏭️", "error": ""}.get(r["status"], "?")
        extra = ""
        if r.get("api_only_count"):
            extra = f" (API独有: {r['api_only_count']})"
        if r.get("ods_only_count"):
            extra += f" (ODS独有: {r['ods_only_count']})"
        print(f" {status_icon} {api_id}{ods_table or '-'}{extra}")
    conn.close()
    # Build ALTER statements from the drifted results.
    alter_sqls = generate_alter_sql(results, ods_tables)
    # Write the machine-readable JSON report.
    json_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nJSON 报告: {json_path}")
    # Write the human-readable Markdown report.
    md_report = generate_markdown_report(results, alter_sqls)
    md_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_report)
    print(f"Markdown 报告: {md_path}")
    # Write the migration SQL file (wrapped in a transaction) if anything drifted.
    if alter_sqls:
        sql_path = os.path.join(ROOT, "database", "migrations",
                                "20260213_align_ods_with_api_v2.sql")
        os.makedirs(os.path.dirname(sql_path), exist_ok=True)
        with open(sql_path, "w", encoding="utf-8") as f:
            f.write("-- API vs ODS 对齐迁移脚本 (v2)\n")
            f.write(f"-- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("-- 注意:类型为根据字段名猜测,请人工复核后执行\n\n")
            f.write("BEGIN;\n\n")
            for sql in alter_sqls:
                f.write(sql + "\n")
            f.write("\nCOMMIT;\n")
        print(f"ALTER SQL: {sql_path}")
    else:
        print("无需 ALTER SQL所有表已对齐")
    # Final console summary of result statuses.
    ok_n = sum(1 for r in results if r.get("status") == "ok")
    drift_n = sum(1 for r in results if r.get("status") == "drift")
    skip_n = sum(1 for r in results if r.get("status") == "skip")
    err_n = sum(1 for r in results if r.get("status") == "error")
    print(f"\n汇总:对齐 {ok_n} / 漂移 {drift_n} / 跳过 {skip_n} / 错误 {err_n}")
    print(f"ALTER SQL 语句数:{len(alter_sqls)}")
# Script entry point: run the comparison only when executed directly.
if __name__ == "__main__":
    main()
# ──────────────────────────────────────────────
# AI_CHANGELOG:
# - 日期: 2026-02-13
# Prompt: P20260213-223000 — 用 API 参考文档比对数据库 ODS 实际表结构(重做,不依赖 DDL
# 直接原因: 前次比对脚本 stock_goods_category_tree 嵌套结构解析 bug需重写脚本
# 变更摘要: 完整重写脚本,从 api-reference/*.md JSON 样例提取字段,查询 PG billiards_ods 实际列,
# 处理三种特殊结构(标准/settleList 嵌套/goodsCategoryList 数组包装),输出 JSON+MD 报告
# 风险与验证: 纯分析脚本,不修改数据库;验证方式:运行脚本确认 "对齐 22 / 漂移 0"
# ──────────────────────────────────────────────