# -*- coding: utf-8 -*- """ 从数据库 payload 字段提取 API 原始 JSON 字段,生成 API 源字段 → ODS 映射文档。 直接从 API 返回的 JSON 分析,不依赖处理代码。 用法: python scripts/ops/gen_api_field_mapping.py 输出: 在 $FULL_DATAFLOW_DOC_ROOT/dataflow_api_ods_dwd.md 的每个表章节中插入 API 源字段小节 """ import json import os import re import sys from collections import OrderedDict from pathlib import Path import psycopg2 ROOT = Path(__file__).resolve().parents[2] from _env_paths import get_output_path as _get_path INPUT_DOC = _get_path("FULL_DATAFLOW_DOC_ROOT") / "dataflow_api_ods_dwd.md" OUTPUT_DOC = INPUT_DOC # 原地更新 # ODS schema 名(从数据库动态检测) ODS_SCHEMA = None # 运行时自动检测 # ODS 表列表(与文档中的顺序一致) ODS_TABLES = [ "assistant_accounts_master", "assistant_service_records", "goods_stock_movements", "goods_stock_summary", "group_buy_packages", "group_buy_redemption_records", "member_balance_changes", "member_profiles", "member_stored_value_cards", "payment_transactions", "platform_coupon_redemption_records", "recharge_settlements", "refund_transactions", "settlement_records", "site_tables_master", "stock_goods_category_tree", "store_goods_master", "store_goods_sales_records", "table_fee_discount_records", "table_fee_transactions", "tenant_goods_master", ] # ETL 元数据列(不来自 API) ETL_META_COLS = { "content_hash", "source_file", "source_endpoint", "fetched_at", "payload", "record_index", } # 需要展平的嵌套层(merge_record_layers 逻辑) FLATTEN_KEYS = {"data", "settleList"} def get_db_dsn() -> str: """从 .env 文件读取数据库连接串。""" from dotenv import load_dotenv env_path = ROOT / "apps" / "etl" / "pipelines" / "feiqiu" / ".env" if env_path.exists(): load_dotenv(env_path, override=True) load_dotenv(ROOT / ".env") dsn = os.environ.get("PG_DSN") or os.environ.get("DB_DSN") or os.environ.get("DATABASE_URL") if not dsn: print("错误: 未找到 PG_DSN / DB_DSN / DATABASE_URL 环境变量", file=sys.stderr) sys.exit(1) return dsn def flatten_json_keys(obj: dict, prefix: str = "") -> list[tuple[str, str]]: """ 递归提取 JSON 对象的所有叶子键及其值类型。 返回 [(key_path, value_type), ...] 对于嵌套对象,用 "." 连接路径。 对于数组,标记为 array 并递归展开元素。 """ results = [] if not isinstance(obj, dict): return results for k, v in obj.items(): full_key = f"{prefix}.{k}" if prefix else k if v is None: results.append((full_key, "null")) elif isinstance(v, bool): results.append((full_key, "boolean")) elif isinstance(v, int): results.append((full_key, "integer")) elif isinstance(v, float): results.append((full_key, "number")) elif isinstance(v, str): results.append((full_key, "string")) elif isinstance(v, list): results.append((full_key, "array")) # 递归展开数组中的第一个对象元素 for item in v: if isinstance(item, dict): results.extend(flatten_json_keys(item, f"{full_key}[]")) break elif isinstance(v, dict): results.append((full_key, "object")) results.extend(flatten_json_keys(v, full_key)) return results def get_top_level_keys(obj: dict) -> list[tuple[str, str]]: """ 提取 JSON 对象的顶层键及其值类型(merge_record_layers 展平后的视角)。 模拟 ETL 的 merge_record_layers:展平 data 和 settleList 嵌套层。 """ merged = dict(obj) # 展平 data 层 data_part = merged.get("data") while isinstance(data_part, dict): merged = {**data_part, **merged} data_part = data_part.get("data") # 展平 settleList 层 settle_inner = merged.get("settleList") if isinstance(settle_inner, dict): merged = {**settle_inner, **merged} results = [] for k, v in merged.items(): if v is None: vtype = "null" elif isinstance(v, bool): vtype = "boolean" elif isinstance(v, int): vtype = "integer" elif isinstance(v, float): vtype = "number" elif isinstance(v, str): vtype = "string" elif isinstance(v, list): vtype = "array" elif isinstance(v, dict): vtype = "object" else: vtype = type(v).__name__ results.append((k, vtype)) return results def fetch_sample_payloads(conn, table: str, sample_count: int = 5) -> list[dict]: """从 ODS 表获取多条 payload 样本,合并字段以覆盖更多字段。""" sql = f""" SELECT payload FROM {ODS_SCHEMA}.{table} WHERE payload IS NOT NULL ORDER BY fetched_at DESC LIMIT {sample_count} """ with conn.cursor() as cur: cur.execute(sql) rows = cur.fetchall() payloads = [] for row in rows: p = row[0] if isinstance(p, str): p = json.loads(p) if isinstance(p, dict): payloads.append(p) return payloads def merge_payloads_keys(payloads: list[dict]) -> OrderedDict[str, str]: """合并多条 payload 的键,保留第一次出现的顺序和非 null 类型。""" merged = OrderedDict() for p in payloads: keys = get_top_level_keys(p) for k, vtype in keys: if k not in merged: merged[k] = vtype elif merged[k] == "null" and vtype != "null": merged[k] = vtype return merged def get_ods_columns(conn, table: str) -> list[tuple[str, str]]: """从数据库获取 ODS 表的列名和类型。""" sql = """ SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = %s AND table_name = %s ORDER BY ordinal_position """ with conn.cursor() as cur: cur.execute(sql, (ODS_SCHEMA, table)) return [(r[0], r[1]) for r in cur.fetchall()] def compute_mapping(api_keys: OrderedDict, ods_cols: list[tuple[str, str]]) -> list[dict]: """ 计算 API 字段 → ODS 列的映射关系。 ETL 使用大小写不敏感匹配(_get_value_case_insensitive)。 """ # 构建 ODS 列名的小写查找表 ods_by_lower = {} for col_name, col_type in ods_cols: ods_by_lower[col_name.lower()] = (col_name, col_type) mappings = [] matched_ods = set() for api_key, api_type in api_keys.items(): api_lower = api_key.lower() # 跳过嵌套对象键(siteProfile, tableProfile 等) if api_type == "object" and api_lower in ("siteprofile", "tableprofile", "data", "settlelist"): mappings.append({ "api_field": api_key, "api_type": api_type, "ods_column": "—", "ods_type": "—", "mapping": "嵌套对象,展平后各字段独立映射" if api_lower in ("data", "settlelist") else "嵌套对象,不直接映射到列", }) continue if api_lower in ods_by_lower: ods_col, ods_type = ods_by_lower[api_lower] matched_ods.add(ods_col.lower()) note = "同名映射" if api_key == ods_col else "大小写不敏感匹配" mappings.append({ "api_field": api_key, "api_type": api_type, "ods_column": ods_col, "ods_type": ods_type, "mapping": note, }) else: mappings.append({ "api_field": api_key, "api_type": api_type, "ods_column": "—", "ods_type": "—", "mapping": "未入 ODS 列(仅存于 payload)", }) # 找出 ODS 中有但 API 中没有的列(ETL 元数据列) for col_name, col_type in ods_cols: if col_name.lower() not in matched_ods and col_name.lower() not in ETL_META_COLS: # 可能是从嵌套对象中提取的 mappings.append({ "api_field": "—", "api_type": "—", "ods_column": col_name, "ods_type": col_type, "mapping": "ETL 派生/嵌套提取", }) return mappings def generate_api_section(table: str, api_keys: OrderedDict, ods_cols: list[tuple[str, str]], mappings: list[dict]) -> str: """生成单个表的 API 源字段小节 Markdown。""" lines = [] # API 源字段列表 lines.append(f"### API 源字段({len(api_keys)} 个)") lines.append("") lines.append("> 以下字段从 `payload` JSONB 中提取,展示 API 返回 JSON 的顶层结构(经 `merge_record_layers` 展平后)。") lines.append("") lines.append("| # | API 字段名 | JSON 类型 | 映射到 ODS 列 | 说明 |") lines.append("|---|-----------|-----------|--------------|------|") for idx, m in enumerate(mappings, 1): api_f = m["api_field"] api_t = m["api_type"] ods_c = m["ods_column"] note = m["mapping"] if api_f == "—": continue # 跳过 ETL 派生列,在下面单独说明 ods_display = f"`{ods_c}`" if ods_c != "—" else "—" lines.append(f"| {idx} | `{api_f}` | {api_t} | {ods_display} | {note} |") # 统计 mapped_count = sum(1 for m in mappings if m["ods_column"] != "—" and m["api_field"] != "—") unmapped_count = sum(1 for m in mappings if m["ods_column"] == "—" and m["api_field"] != "—" and m["api_type"] not in ("object",)) payload_only = [m["api_field"] for m in mappings if m["mapping"] == "未入 ODS 列(仅存于 payload)"] lines.append("") if payload_only: lines.append(f"> 映射统计:{mapped_count} 个字段映射到 ODS 列,{len(payload_only)} 个字段仅存于 `payload` JSONB 中。") lines.append(f"> 仅存于 payload 的字段:{', '.join(f'`{f}`' for f in payload_only)}") else: lines.append(f"> 映射统计:{mapped_count} 个字段全部映射到 ODS 列。") lines.append("") return "\n".join(lines) def insert_sections_into_doc(doc_text: str, sections: dict[str, str]) -> str: """ 在现有文档的每个表章节中,在 "### ODS 表" 之前插入 API 源字段小节。 如果已存在 "### API 源字段" 则替换。 """ lines = doc_text.split("\n") result = [] i = 0 while i < len(lines): line = lines[i] # 检测 "## table_name" 章节标题 m = re.match(r"^## (\w+)\s*$", line) if m: table_name = m.group(1) result.append(line) i += 1 if table_name in sections: # 跳过空行 while i < len(lines) and lines[i].strip() == "": result.append(lines[i]) i += 1 # 如果已存在 "### API 源字段",跳过旧内容直到下一个 ### 或 ## if i < len(lines) and lines[i].startswith("### API 源字段"): # 跳过旧的 API 源字段小节 i += 1 while i < len(lines): if lines[i].startswith("### ") or lines[i].startswith("## "): break i += 1 # 插入新的 API 源字段小节 result.append(sections[table_name]) result.append("") continue result.append(line) i += 1 return "\n".join(result) def detect_ods_schema(conn) -> str: """自动检测 ODS schema 名(可能是 ods 或 billiards_ods)。""" with conn.cursor() as cur: cur.execute(""" SELECT schema_name FROM information_schema.schemata WHERE schema_name IN ('ods', 'billiards_ods') ORDER BY schema_name """) rows = cur.fetchall() for row in rows: if row[0] == "ods": return "ods" for row in rows: if row[0] == "billiards_ods": return "billiards_ods" print("错误: 未找到 ods 或 billiards_ods schema", file=sys.stderr) sys.exit(1) def main(): global ODS_SCHEMA dsn = get_db_dsn() conn = psycopg2.connect(dsn) conn.set_client_encoding("UTF8") ODS_SCHEMA = detect_ods_schema(conn) print(f"检测到 ODS schema: {ODS_SCHEMA}") print("正在从数据库提取 API 原始字段...") sections = {} for table in ODS_TABLES: print(f" 处理: {table}") payloads = fetch_sample_payloads(conn, table, sample_count=10) if not payloads: print(f" 警告: {table} 无 payload 数据,跳过") continue api_keys = merge_payloads_keys(payloads) ods_cols = get_ods_columns(conn, table) mappings = compute_mapping(api_keys, ods_cols) section_text = generate_api_section(table, api_keys, ods_cols, mappings) sections[table] = section_text conn.close() print(f"\n读取现有文档: {INPUT_DOC}") doc_text = INPUT_DOC.read_text(encoding="utf-8") print("插入 API 源字段小节...") new_doc = insert_sections_into_doc(doc_text, sections) OUTPUT_DOC.write_text(new_doc, encoding="utf-8") print(f"文档已更新: {OUTPUT_DOC}") print(f" 处理了 {len(sections)} 个表的 API 源字段映射") if __name__ == "__main__": main()