在准备环境前提交此次全部更改。

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

View File

@@ -0,0 +1,398 @@
# -*- coding: utf-8 -*-
"""
从数据库 payload 字段提取 API 原始 JSON 字段,生成 API 源字段 → ODS 映射文档。
直接从 API 返回的 JSON 分析,不依赖处理代码。
用法: python scripts/ops/gen_api_field_mapping.py
输出: 在 docs/reports/dataflow_api_ods_dwd.md 的每个表章节中插入 API 源字段小节
"""
import json
import os
import re
import sys
from collections import OrderedDict
from pathlib import Path
import psycopg2
# Repository root: two levels up from scripts/ops/gen_api_field_mapping.py.
ROOT = Path(__file__).resolve().parents[2]
INPUT_DOC = ROOT / "docs" / "reports" / "dataflow_api_ods_dwd.md"
OUTPUT_DOC = INPUT_DOC  # the document is updated in place
# ODS schema name; detected from the database at runtime by detect_ods_schema().
ODS_SCHEMA = None  # assigned in main()
# ODS tables to process (order matches the chapter order in the target document).
ODS_TABLES = [
    "assistant_accounts_master",
    "assistant_cancellation_records",
    "assistant_service_records",
    "goods_stock_movements",
    "goods_stock_summary",
    "group_buy_packages",
    "group_buy_redemption_records",
    "member_balance_changes",
    "member_profiles",
    "member_stored_value_cards",
    "payment_transactions",
    "platform_coupon_redemption_records",
    "recharge_settlements",
    "refund_transactions",
    "settlement_records",
    "settlement_ticket_details",
    "site_tables_master",
    "stock_goods_category_tree",
    "store_goods_master",
    "store_goods_sales_records",
    "table_fee_discount_records",
    "table_fee_transactions",
    "tenant_goods_master",
]
# ETL metadata columns (populated by the pipeline itself, not sourced from the API).
ETL_META_COLS = {
    "content_hash", "source_file", "source_endpoint",
    "fetched_at", "payload", "record_index",
}
# Nested layers flattened by the ETL's merge_record_layers logic.
# NOTE(review): not referenced anywhere in this script — confirm before removing.
FLATTEN_KEYS = {"data", "settleList"}
def get_db_dsn() -> str:
    """Read the database DSN from .env files.

    Loads the pipeline-local .env first (if present, overriding the current
    environment), then the repo-root .env, and resolves the DSN from
    PG_DSN / DB_DSN / DATABASE_URL — first non-empty value wins.
    Exits the process with status 1 when none of them is set.
    """
    from dotenv import load_dotenv
    pipeline_env = ROOT / "apps" / "etl" / "pipelines" / "feiqiu" / ".env"
    if pipeline_env.exists():
        load_dotenv(pipeline_env, override=True)
    load_dotenv(ROOT / ".env")
    for var in ("PG_DSN", "DB_DSN", "DATABASE_URL"):
        dsn = os.environ.get(var)
        if dsn:
            return dsn
    print("错误: 未找到 PG_DSN / DB_DSN / DATABASE_URL 环境变量", file=sys.stderr)
    sys.exit(1)
def flatten_json_keys(obj: dict, prefix: str = "") -> list[tuple[str, str]]:
    """Recursively collect every key of a JSON object with its value's JSON type.

    Returns a list of ``(key_path, json_type)`` tuples. Nested object keys are
    joined with ``.``; a list value is reported as ``"array"`` and the first
    dict element found in it (if any) is expanded under ``"<path>[]"``.
    Non-dict input yields an empty list.
    """
    if not isinstance(obj, dict):
        return []
    collected: list[tuple[str, str]] = []
    for key, value in obj.items():
        path = f"{prefix}.{key}" if prefix else key
        if value is None:
            collected.append((path, "null"))
        elif isinstance(value, bool):
            # bool must be tested before int: bool is an int subclass.
            collected.append((path, "boolean"))
        elif isinstance(value, int):
            collected.append((path, "integer"))
        elif isinstance(value, float):
            collected.append((path, "number"))
        elif isinstance(value, str):
            collected.append((path, "string"))
        elif isinstance(value, list):
            collected.append((path, "array"))
            # Expand only the first dict element of the array.
            first_dict = next((e for e in value if isinstance(e, dict)), None)
            if first_dict is not None:
                collected.extend(flatten_json_keys(first_dict, f"{path}[]"))
        elif isinstance(value, dict):
            collected.append((path, "object"))
            collected.extend(flatten_json_keys(value, path))
    return collected
def get_top_level_keys(obj: dict) -> list[tuple[str, str]]:
    """Return the top-level keys of *obj* with their JSON types.

    Simulates the ETL's ``merge_record_layers`` view: nested ``data`` layers
    are flattened repeatedly and a dict-valued ``settleList`` layer once,
    with outer keys winning on collision.
    """
    merged = dict(obj)
    # Flatten nested "data" layers (outer keys take precedence).
    inner = merged.get("data")
    while isinstance(inner, dict):
        merged = {**inner, **merged}
        inner = inner.get("data")
    # Flatten a dict-valued "settleList" layer.
    settle = merged.get("settleList")
    if isinstance(settle, dict):
        merged = {**settle, **merged}

    def json_type(value) -> str:
        """Map a Python value to its JSON type name."""
        if value is None:
            return "null"
        if isinstance(value, bool):  # before int: bool subclasses int
            return "boolean"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        if isinstance(value, str):
            return "string"
        if isinstance(value, list):
            return "array"
        if isinstance(value, dict):
            return "object"
        return type(value).__name__

    return [(key, json_type(value)) for key, value in merged.items()]
def fetch_sample_payloads(conn, table: str, sample_count: int = 5) -> list[dict]:
    """Fetch up to *sample_count* recent ``payload`` samples from an ODS table.

    Payloads stored as JSON strings are parsed; anything that is not a dict
    after parsing is dropped. The table name is interpolated into the SQL
    directly — safe here because callers pass names from the fixed
    ODS_TABLES allowlist only.
    """
    sql = f"""
    SELECT payload
    FROM {ODS_SCHEMA}.{table}
    WHERE payload IS NOT NULL
    ORDER BY fetched_at DESC
    LIMIT {sample_count}
    """
    with conn.cursor() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
    samples: list[dict] = []
    for (raw,) in rows:
        parsed = json.loads(raw) if isinstance(raw, str) else raw
        if isinstance(parsed, dict):
            samples.append(parsed)
    return samples
def merge_payloads_keys(payloads: list[dict]) -> OrderedDict[str, str]:
    """Union the top-level keys of several payloads.

    Keeps first-seen key order; a key first observed with type ``"null"`` is
    upgraded to the first non-null type seen in a later payload.
    """
    combined: OrderedDict[str, str] = OrderedDict()
    for payload in payloads:
        for key, vtype in get_top_level_keys(payload):
            known = combined.get(key)
            if known is None:
                combined[key] = vtype
            elif known == "null" and vtype != "null":
                combined[key] = vtype
    return combined
def get_ods_columns(conn, table: str) -> list[tuple[str, str]]:
    """Return ``(column_name, data_type)`` pairs for an ODS table.

    Reads information_schema.columns under the detected ODS_SCHEMA,
    preserving ordinal column order.
    """
    sql = """
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = %s AND table_name = %s
    ORDER BY ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(sql, (ODS_SCHEMA, table))
        rows = cur.fetchall()
    return [(name, dtype) for name, dtype in rows]
def compute_mapping(api_keys: OrderedDict, ods_cols: list[tuple[str, str]]) -> list[dict]:
    """Compute the API-field → ODS-column mapping rows.

    Mirrors the ETL's case-insensitive matching (_get_value_case_insensitive).
    Emits one row per API field, then one trailing row per ODS column that
    neither matched an API field nor is an ETL metadata column.
    """
    # Case-insensitive lookup of ODS column names.
    lookup = {name.lower(): (name, dtype) for name, dtype in ods_cols}
    rows: list[dict] = []
    consumed: set[str] = set()
    for field, ftype in api_keys.items():
        lowered = field.lower()
        # Known nested-object keys are skipped: their contents map individually.
        if ftype == "object" and lowered in ("siteprofile", "tableprofile", "data", "settlelist"):
            note = (
                "嵌套对象,展平后各字段独立映射"
                if lowered in ("data", "settlelist")
                else "嵌套对象,不直接映射到列"
            )
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "",
                "ods_type": "",
                "mapping": note,
            })
            continue
        hit = lookup.get(lowered)
        if hit is not None:
            col_name, col_type = hit
            consumed.add(col_name.lower())
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "同名映射" if field == col_name else "大小写不敏感匹配",
            })
        else:
            # NOTE(review): this label (it lacks a closing paren) is matched
            # verbatim in generate_api_section — keep the two in sync.
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "",
                "ods_type": "",
                "mapping": "未入 ODS 列(仅存于 payload",
            })
    # ODS columns never matched by an API field and not ETL metadata:
    # presumably extracted from nested objects by the ETL.
    for col_name, col_type in ods_cols:
        low = col_name.lower()
        if low not in consumed and low not in ETL_META_COLS:
            rows.append({
                "api_field": "",
                "api_type": "",
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "ETL 派生/嵌套提取",
            })
    return rows
def generate_api_section(table: str, api_keys: OrderedDict, ods_cols: list[tuple[str, str]], mappings: list[dict]) -> str:
    """Render the "### API 源字段" Markdown subsection for one table.

    Args:
        table: ODS table name (not rendered; kept for interface stability).
        api_keys: merged API top-level keys — only its length is used.
        ods_cols: ODS columns (unused here; kept for interface stability).
        mappings: rows produced by compute_mapping().

    Returns:
        Markdown text: a field table followed by mapping statistics.

    Fixes vs. previous version: removed the dead ``unmapped_count`` variable,
    and ``payload_only`` no longer relies on exact equality with the full
    free-text mapping label (which is easy to break when the label wording
    changes) — it now classifies rows structurally, excluding nested-object
    rows by their "嵌套对象" label prefix.
    """
    lines = []
    lines.append(f"### API 源字段({len(api_keys)} 个)")
    lines.append("")
    lines.append("> 以下字段从 `payload` JSONB 中提取,展示 API 返回 JSON 的顶层结构(经 `merge_record_layers` 展平后)。")
    lines.append("")
    lines.append("| # | API 字段名 | JSON 类型 | 映射到 ODS 列 | 说明 |")
    lines.append("|---|-----------|-----------|--------------|------|")
    for idx, m in enumerate(mappings, 1):
        if m["api_field"] == "":
            continue  # ETL-derived columns are summarized in the notes, not listed per-field
        ods_display = f"`{m['ods_column']}`" if m["ods_column"] != "" else ""
        lines.append(f"| {idx} | `{m['api_field']}` | {m['api_type']} | {ods_display} | {m['mapping']} |")
    # Statistics: an API field counts as "payload only" when it has no ODS
    # column and is not one of the skipped nested-object rows.
    mapped_count = sum(1 for m in mappings if m["ods_column"] != "" and m["api_field"] != "")
    payload_only = [
        m["api_field"]
        for m in mappings
        if m["api_field"] != "" and m["ods_column"] == ""
        and not m["mapping"].startswith("嵌套对象")
    ]
    lines.append("")
    if payload_only:
        lines.append(f"> 映射统计:{mapped_count} 个字段映射到 ODS 列,{len(payload_only)} 个字段仅存于 `payload` JSONB 中。")
        lines.append(f"> 仅存于 payload 的字段:{', '.join(f'`{f}`' for f in payload_only)}")
    else:
        lines.append(f"> 映射统计:{mapped_count} 个字段全部映射到 ODS 列。")
    lines.append("")
    return "\n".join(lines)
def insert_sections_into_doc(doc_text: str, sections: dict[str, str]) -> str:
    """Splice the generated "### API 源字段" subsection into each table chapter.

    For every "## <table>" heading whose table has an entry in *sections*, the
    new subsection is inserted right after the heading (and any blank lines
    following it). An already-present "### API 源字段" subsection is removed
    first, so the operation is idempotent across reruns.

    Args:
        doc_text: full Markdown document text.
        sections: table name -> rendered subsection Markdown.

    Returns:
        The updated document text.
    """
    lines = doc_text.split("\n")
    result = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # Detect a "## table_name" chapter heading (deeper headings like
        # "### x" do not match because their third char is not a word char).
        m = re.match(r"^## (\w+)\s*$", line)
        if m:
            table_name = m.group(1)
            result.append(line)
            i += 1
            if table_name in sections:
                # Keep the blank lines that follow the heading.
                while i < len(lines) and lines[i].strip() == "":
                    result.append(lines[i])
                    i += 1
                # Drop a pre-existing "### API 源字段" subsection: skip its
                # lines until the next "### " or "## " heading.
                if i < len(lines) and lines[i].startswith("### API 源字段"):
                    i += 1
                    while i < len(lines):
                        if lines[i].startswith("### ") or lines[i].startswith("## "):
                            break
                        i += 1
                # Insert the freshly generated subsection.
                result.append(sections[table_name])
                result.append("")
            continue
        result.append(line)
        i += 1
    return "\n".join(result)
def detect_ods_schema(conn) -> str:
    """Detect the ODS schema name, preferring ``ods`` over ``billiards_ods``.

    Exits the process with status 1 when neither schema exists.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT schema_name FROM information_schema.schemata
            WHERE schema_name IN ('ods', 'billiards_ods')
            ORDER BY schema_name
        """)
        found = {row[0] for row in cur.fetchall()}
    for candidate in ("ods", "billiards_ods"):
        if candidate in found:
            return candidate
    print("错误: 未找到 ods 或 billiards_ods schema", file=sys.stderr)
    sys.exit(1)
def main():
    """Script entry point: build per-table API-field sections from DB payload
    samples and splice them into the dataflow document in place."""
    global ODS_SCHEMA
    conn = psycopg2.connect(get_db_dsn())
    conn.set_client_encoding("UTF8")
    ODS_SCHEMA = detect_ods_schema(conn)
    print(f"检测到 ODS schema: {ODS_SCHEMA}")
    print("正在从数据库提取 API 原始字段...")
    sections: dict[str, str] = {}
    for table in ODS_TABLES:
        print(f" 处理: {table}")
        payloads = fetch_sample_payloads(conn, table, sample_count=10)
        if not payloads:
            print(f" 警告: {table} 无 payload 数据,跳过")
            continue
        api_keys = merge_payloads_keys(payloads)
        ods_cols = get_ods_columns(conn, table)
        sections[table] = generate_api_section(
            table, api_keys, ods_cols, compute_mapping(api_keys, ods_cols)
        )
    conn.close()
    print(f"\n读取现有文档: {INPUT_DOC}")
    doc_text = INPUT_DOC.read_text(encoding="utf-8")
    print("插入 API 源字段小节...")
    OUTPUT_DOC.write_text(insert_sections_into_doc(doc_text, sections), encoding="utf-8")
    print(f"文档已更新: {OUTPUT_DOC}")
    print(f" 处理了 {len(sections)} 个表的 API 源字段映射")
if __name__ == "__main__":  # script entry point
    main()