init: 项目初始提交 - NeoZQYY Monorepo 完整代码

This commit is contained in:
Neo
2026-02-15 14:58:14 +08:00
commit ded6dfb9d8
769 changed files with 182616 additions and 0 deletions

View File

@@ -0,0 +1,461 @@
# -*- coding: utf-8 -*-
"""
API 参考文档 vs ODS 实际表结构 对比脚本 (v2)
从 docs/api-reference/*.md 的 JSON 样例中提取字段,
查询 PostgreSQL billiards_ods 的实际列,
输出差异报告 JSON 和 Markdown + ALTER SQL。
用法: python scripts/compare_api_ods_v2.py
"""
import json
import os
import re
import sys
from datetime import datetime
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)
from dotenv import load_dotenv
load_dotenv(os.path.join(ROOT, ".env"))
import psycopg2
# ODS metadata columns: ETL bookkeeping columns added by the pipeline,
# not sourced from the API payloads, so they are excluded from the diff.
ODS_META_COLS = {
    "source_file", "source_endpoint", "fetched_at",
    "payload", "content_hash",
}
def load_registry():
    """Load the API registry from docs/api-reference/api_registry.json."""
    registry_path = os.path.join(ROOT, "docs", "api-reference", "api_registry.json")
    with open(registry_path, "r", encoding="utf-8") as fh:
        return json.load(fh)
def extract_fields_from_md(md_path, api_id):
    """
    Extract field names (lowercased) from the JSON samples embedded in an
    API reference markdown file.

    Three response shapes are handled:
      * standard flat response  -> top-level keys
      * settlement responses    -> keys of the nested ``settleList`` object
        (``siteProfile`` is detected but its keys are NOT extracted)
      * category-tree responses -> keys of the first ``goodsCategoryList`` item

    Returns ``(fields, has_nested, error)`` where *fields* is a set (or
    None on failure) and *error* is a message string when extraction fails.
    NOTE: *api_id* is accepted for signature compatibility but unused here.
    """
    with open(md_path, "r", encoding="utf-8") as fh:
        text = fh.read()
    # Collect every fenced ```json ... ``` code block.
    blocks = re.findall(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
    if not blocks:
        return None, None, "无 JSON 样例"
    # The response sample is assumed to be the largest parseable JSON object.
    best = None
    for raw in blocks:
        try:
            candidate = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(candidate, dict):
            continue
        if best is None or len(str(candidate)) > len(str(best)):
            best = candidate
    if best is None:
        return None, None, "无法解析 JSON 样例"
    names = set()
    # Nested settlement shape: {"siteProfile": {...}, "settleList": {...}} —
    # the ODS columns come from the inner settleList object only.
    if "siteProfile" in best and "settleList" in best:
        inner = best.get("settleList", {})
        if isinstance(inner, dict):
            names.update(key.lower() for key in inner)
        return names, True, None
    # Category-tree shape: goodsCategoryList wraps an array of category
    # nodes; ODS stores the flattened node fields, so sample the first node.
    wrapped = best.get("goodsCategoryList")
    if isinstance(wrapped, list):
        if wrapped and isinstance(wrapped[0], dict):
            _extract_flat(wrapped[0], names)
        return names, True, None
    # Standard flat response: every top-level key is a column candidate.
    names.update(key.lower() for key in best)
    return names, False, None
def _extract_flat(obj, fields):
    """Add the lowercased top-level key names of *obj* (a dict) to *fields*.

    Values are not descended into: a key whose value is a list or dict still
    contributes its own name, but nothing from inside that value is taken.
    Non-dict inputs are silently ignored.
    """
    if isinstance(obj, dict):
        for key in obj:
            fields.add(key.lower())
def get_all_ods_columns(conn):
    """Query column metadata for every table in the ``billiards_ods`` schema.

    Returns ``{table_name: {column_name: {"data_type", "ordinal_position"}}}``,
    with columns kept in their declared order (ordinal_position).
    """
    cursor = conn.cursor()
    cursor.execute("""
        SELECT table_name, column_name, data_type, ordinal_position
        FROM information_schema.columns
        WHERE table_schema = 'billiards_ods'
        ORDER BY table_name, ordinal_position
    """)
    records = cursor.fetchall()
    cursor.close()
    schema = {}
    for table, column, dtype, position in records:
        schema.setdefault(table, {})[column] = {
            "data_type": dtype,
            "ordinal_position": position,
        }
    return schema
def guess_pg_type(name):
    """Guess a PostgreSQL column type from an API field name.

    Used only to draft ``ALTER TABLE ... ADD COLUMN`` statements; the result
    is a heuristic and is expected to be reviewed by a human before running.

    Precedence: id-like -> bigint, money/quantity-like -> numeric(18,2),
    time/date -> timestamp, is*/able_*/can* -> boolean, enum/counter-like
    keywords -> integer, otherwise text.
    """
    n = name.lower()
    # Anything ending in "id" (id, member_id, tableId, ...) is treated as a
    # numeric identifier. (This subsumes the exact-"id" and "_id" cases.)
    if n.endswith("id"):
        return "bigint"
    money_kw = ["amount", "money", "price", "cost", "fee", "discount",
                "deduct", "balance", "charge", "sale", "refund",
                "promotion", "adjust", "rounding", "prepay", "income",
                "royalty", "grade", "point", "stock", "num"]
    for kw in money_kw:
        if kw in n:
            return "numeric(18,2)"
    if "time" in n or "date" in n:
        return "timestamp without time zone"
    # Boolean flags: snake_case "is_xxx" or camelCase "isXxx".  The camelCase
    # probe must inspect the ORIGINAL name — the previous code tested
    # n[2].isupper() on the lowercased copy, which can never be true.
    if n.startswith("is_") or (n.startswith("is") and len(name) > 2 and name[2].isupper()):
        return "boolean"
    if n.startswith("able_") or n.startswith("can"):
        return "boolean"
    int_kw = ["status", "type", "sort", "count", "seconds", "level",
              "channel", "method", "way", "enabled", "switch", "delete",
              "first", "single", "trash", "confirm", "clock", "cycle",
              "delay", "free", "virtual", "online", "show", "audit",
              "freeze", "send", "required", "scene", "range", "tag",
              "on", "minutes", "number", "duration"]
    for kw in int_kw:
        if kw in n:
            return "integer"
    return "text"
def compare_one(api_entry, md_path, ods_tables):
    """Diff one API's documented sample fields against its ODS table columns.

    Returns a result dict whose ``status`` is one of:
      skip  – no ODS table mapped, or the endpoint is flagged skip
      error – sample extraction failed, or the table does not exist
      ok    – every documented API field has a matching ODS column
      drift – at least one API field has no ODS column
    """
    result = {
        "api_id": api_entry["id"],
        "name_zh": api_entry.get("name_zh", ""),
        "ods_table": api_entry.get("ods_table"),
    }
    table = result["ods_table"]
    if not table:
        result["status"] = "skip"
        result["reason"] = "无对应 ODS 表ods_table=null"
        return result
    if api_entry.get("skip"):
        result["status"] = "skip"
        result["reason"] = "接口标记为 skip暂不可用"
        return result
    # Pull the documented field names out of the markdown JSON sample.
    api_fields, has_nested, err = extract_fields_from_md(md_path, api_entry["id"])
    if err:
        result["status"] = "error"
        result["reason"] = err
        return result
    if table not in ods_tables:
        result["status"] = "error"
        result["reason"] = f"ODS 表 {table} 不存在"
        return result
    columns = ods_tables[table]
    business_cols = {c for c in columns if c not in ODS_META_COLS}
    # Compare lowercased name sets in both directions.
    api_lower = {f.lower() for f in api_fields}
    ods_lower = {c.lower() for c in business_cols}
    api_only = sorted(api_lower - ods_lower)   # documented but missing in ODS
    ods_only = sorted(ods_lower - api_lower)   # ODS-only business columns
    matched = sorted(api_lower & ods_lower)
    result.update({
        "status": "drift" if api_only else "ok",
        "has_nested_structure": has_nested,
        "api_field_count": len(api_lower),
        "ods_biz_col_count": len(business_cols),
        "ods_total_col_count": len(columns),
        "matched_count": len(matched),
        "api_only": api_only,
        "api_only_count": len(api_only),
        "ods_only": ods_only,
        "ods_only_count": len(ods_only),
        "matched": matched,
    })
    return result
def generate_alter_sql(results, ods_tables):
    """Build ``ALTER TABLE ... ADD COLUMN`` statements for drifted tables.

    One statement per API-only field, with the type guessed from the field
    name.  NOTE: *ods_tables* is accepted for signature compatibility but is
    not consulted — the drift info carried by *results* is sufficient.
    """
    statements = []
    drifted = (r for r in results
               if r.get("status") == "drift" and r.get("api_only"))
    for report in drifted:
        table = report["ods_table"]
        statements.extend(
            f"ALTER TABLE billiards_ods.{table} "
            f"ADD COLUMN IF NOT EXISTS {field} {guess_pg_type(field)};"
            for field in report["api_only"]
        )
    return statements
def generate_markdown_report(results, alter_sqls):
    """Render the comparison results as a Markdown report string.

    Sections: (1) summary table with per-API counts, (2) per-table drift
    details, (3) ODS-only column details, (4) the generated ALTER SQL in a
    fenced block (only when *alter_sqls* is non-empty).
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        "# API 参考文档 vs ODS 实际表结构 对比报告 (v2)",
        "",
        f"> 生成时间:{now}",
        "> 数据来源:`docs/api-reference/*.md` JSON 样例 vs `billiards_ods` 实际列",
        "",
        "---",
        "",
        "## 一、汇总",
        "",
        "| API 接口 | 中文名 | ODS 表 | 状态 | API 字段数 | ODS 业务列数 | 匹配 | API 独有 | ODS 独有 |",
        "|----------|--------|--------|------|-----------|-------------|------|---------|---------|",
    ]
    total_api_only = 0
    total_ods_only = 0
    ok_count = 0
    drift_count = 0
    skip_count = 0
    error_count = 0
    for r in results:
        status = r.get("status", "?")
        if status == "skip":
            skip_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table', '-')} "
                f"| ⏭️ 跳过 | - | - | - | - | - |"
            )
            continue
        if status == "error":
            error_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table', '-')} "
                f"| ❌ 错误 | - | - | - | - | - |"
            )
            continue
        api_only_n = r.get("api_only_count", 0)
        ods_only_n = r.get("ods_only_count", 0)
        total_api_only += api_only_n
        total_ods_only += ods_only_n
        if status == "ok":
            ok_count += 1
            badge = "✅ 对齐"
        else:
            drift_count += 1
            badge = "⚠️ 漂移"
        lines.append(
            f"| {r['api_id']} | {r['name_zh']} | {r['ods_table']} "
            f"| {badge} | {r['api_field_count']} | {r['ods_biz_col_count']} "
            f"| {r['matched_count']} | {api_only_n} | {ods_only_n} |"
        )
    lines.extend([
        "",
        f"**统计**:对齐 {ok_count} / 漂移 {drift_count} / 跳过 {skip_count} / 错误 {error_count}",
        f"**API 独有字段总计**{total_api_only}(需要 ALTER TABLE ADD COLUMN",
        f"**ODS 独有列总计**{total_ods_only}API 中不存在,可能是历史遗留或 ETL 派生列)",
        "",
    ])
    # Details: per-table field differences for every drifted table.
    drift_results = [r for r in results if r.get("status") == "drift"]
    if drift_results:
        lines.extend(["---", "", "## 二、漂移详情", ""])
        for r in drift_results:
            lines.extend([
                f"### {r['api_id']}{r['name_zh']})→ `{r['ods_table']}`",
                "",
            ])
            if r["api_only"]:
                lines.append("**API 有 / ODS 缺**")
                for f in r["api_only"]:
                    pg_type = guess_pg_type(f)
                    lines.append(f"- `{f}` → 建议类型 `{pg_type}`")
                lines.append("")
            if r["ods_only"]:
                lines.append("**ODS 有 / API 无**(非元列):")
                for f in r["ods_only"]:
                    lines.append(f"- `{f}`")
                lines.append("")
    # ODS-only column details, across all non-skip/non-error tables.
    ods_only_results = [r for r in results if r.get("ods_only") and r.get("status") in ("ok", "drift")]
    if ods_only_results:
        lines.extend(["---", "", "## 三、ODS 独有列详情API 中不存在)", ""])
        for r in ods_only_results:
            if not r["ods_only"]:
                continue
            lines.extend([
                f"### `{r['ods_table']}`{r['name_zh']}",
                "",
                "| 列名 | 说明 |",
                "|------|------|",
            ])
            for f in r["ods_only"]:
                lines.append(f"| `{f}` | ODS 独有API JSON 样例中不存在 |")
            lines.append("")
    # Generated ALTER SQL, appended as a fenced sql block.
    if alter_sqls:
        lines.extend([
            "---", "",
            "## 四、ALTER SQL对齐 ODS 表结构)", "",
            "```sql",
            "-- 自动生成的 ALTER TABLE 语句",
            f"-- 生成时间:{now}",
            "-- 注意:类型为根据字段名猜测,请人工复核后执行",
            "",
        ])
        lines.extend(alter_sqls)
        lines.extend(["", "```", ""])
    return "\n".join(lines)
def main():
    """Entry point.

    Connects to PostgreSQL (``PG_DSN`` env var), compares every registered
    API endpoint's documented JSON sample against its ODS table, prints a
    per-endpoint progress line, and writes three artifacts: a JSON report,
    a Markdown report, and (only when drift exists) an ALTER-TABLE
    migration script wrapped in a transaction.
    """
    dsn = os.environ.get("PG_DSN")
    if not dsn:
        print("错误:未设置 PG_DSN 环境变量", file=sys.stderr)
        sys.exit(1)
    print("连接数据库...")
    conn = psycopg2.connect(dsn)
    print("查询 ODS 表结构...")
    ods_tables = get_all_ods_columns(conn)
    print(f"{len(ods_tables)} 张 ODS 表")
    print("加载 API 注册表...")
    registry = load_registry()
    print(f"{len(registry)} 个 API 端点")
    results = []
    for entry in registry:
        api_id = entry["id"]
        ods_table = entry.get("ods_table")
        md_path = os.path.join(ROOT, "docs", "api-reference", f"{api_id}.md")
        if not os.path.exists(md_path):
            # Registry entry without a matching reference doc: record as error.
            results.append({
                "api_id": api_id,
                "name_zh": entry.get("name_zh", ""),
                "ods_table": ods_table,
                "status": "error",
                "reason": f"文档不存在: {md_path}",
            })
            continue
        r = compare_one(entry, md_path, ods_tables)
        results.append(r)
        # NOTE(review): the "ok"/"error" icons are empty strings here while
        # other statuses have emoji — possibly lost in transcription; confirm.
        status_icon = {"ok": "", "drift": "⚠️", "skip": "⏭️", "error": ""}.get(r["status"], "?")
        extra = ""
        if r.get("api_only_count"):
            extra = f" (API独有: {r['api_only_count']})"
        if r.get("ods_only_count"):
            extra += f" (ODS独有: {r['ods_only_count']})"
        print(f" {status_icon} {api_id}{ods_table or '-'}{extra}")
    conn.close()
    # Draft ALTER statements for drifted tables.
    alter_sqls = generate_alter_sql(results, ods_tables)
    # JSON report (machine-readable).
    json_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nJSON 报告: {json_path}")
    # Markdown report (human-readable).
    md_report = generate_markdown_report(results, alter_sqls)
    md_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_report)
    print(f"Markdown 报告: {md_path}")
    # Migration script; only written when there is something to apply.
    if alter_sqls:
        sql_path = os.path.join(ROOT, "database", "migrations",
                                "20260213_align_ods_with_api_v2.sql")
        os.makedirs(os.path.dirname(sql_path), exist_ok=True)
        with open(sql_path, "w", encoding="utf-8") as f:
            f.write("-- API vs ODS 对齐迁移脚本 (v2)\n")
            f.write(f"-- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("-- 注意:类型为根据字段名猜测,请人工复核后执行\n\n")
            f.write("BEGIN;\n\n")
            for sql in alter_sqls:
                f.write(sql + "\n")
            f.write("\nCOMMIT;\n")
        print(f"ALTER SQL: {sql_path}")
    else:
        print("无需 ALTER SQL所有表已对齐")
    # Final summary counters.
    ok_n = sum(1 for r in results if r.get("status") == "ok")
    drift_n = sum(1 for r in results if r.get("status") == "drift")
    skip_n = sum(1 for r in results if r.get("status") == "skip")
    err_n = sum(1 for r in results if r.get("status") == "error")
    print(f"\n汇总:对齐 {ok_n} / 漂移 {drift_n} / 跳过 {skip_n} / 错误 {err_n}")
    print(f"ALTER SQL 语句数:{len(alter_sqls)}")
if __name__ == "__main__":
    main()
# ──────────────────────────────────────────────
# AI_CHANGELOG:
# - 日期: 2026-02-13
# Prompt: P20260213-223000 — 用 API 参考文档比对数据库 ODS 实际表结构(重做,不依赖 DDL
# 直接原因: 前次比对脚本 stock_goods_category_tree 嵌套结构解析 bug需重写脚本
# 变更摘要: 完整重写脚本,从 api-reference/*.md JSON 样例提取字段,查询 PG billiards_ods 实际列,
# 处理三种特殊结构(标准/settleList 嵌套/goodsCategoryList 数组包装),输出 JSON+MD 报告
# 风险与验证: 纯分析脚本,不修改数据库;验证方式:运行脚本确认 "对齐 22 / 漂移 0"
# ──────────────────────────────────────────────