Files
Neo-ZQYY/scripts/ops/gen_api_field_mapping.py

399 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
从数据库 payload 字段提取 API 原始 JSON 字段,生成 API 源字段 → ODS 映射文档。
直接从 API 返回的 JSON 分析,不依赖处理代码。
用法: python scripts/ops/gen_api_field_mapping.py
输出: 在 docs/reports/dataflow_api_ods_dwd.md 的每个表章节中插入 API 源字段小节
"""
import json
import os
import re
import sys
from collections import OrderedDict
from pathlib import Path
import psycopg2
# Repository root: this script lives at <root>/scripts/ops/.
ROOT = Path(__file__).resolve().parents[2]

# The report that receives the generated sections; it is rewritten in place.
INPUT_DOC = ROOT / "docs" / "reports" / "dataflow_api_ods_dwd.md"
OUTPUT_DOC = INPUT_DOC

# ODS schema name; filled in at runtime by detect_ods_schema().
ODS_SCHEMA = None

# ODS tables to document, in the same order as the chapters in the report.
ODS_TABLES = [
    "assistant_accounts_master",
    "assistant_cancellation_records",
    "assistant_service_records",
    "goods_stock_movements",
    "goods_stock_summary",
    "group_buy_packages",
    "group_buy_redemption_records",
    "member_balance_changes",
    "member_profiles",
    "member_stored_value_cards",
    "payment_transactions",
    "platform_coupon_redemption_records",
    "recharge_settlements",
    "refund_transactions",
    "settlement_records",
    "settlement_ticket_details",
    "site_tables_master",
    "stock_goods_category_tree",
    "store_goods_master",
    "store_goods_sales_records",
    "table_fee_discount_records",
    "table_fee_transactions",
    "tenant_goods_master",
]

# Columns written by the ETL itself (not sourced from the API payload).
ETL_META_COLS = {
    "content_hash",
    "source_file",
    "source_endpoint",
    "fetched_at",
    "payload",
    "record_index",
}

# Nested layers that the ETL's merge_record_layers flattens into the top level.
FLATTEN_KEYS = {"data", "settleList"}
def get_db_dsn() -> str:
    """Resolve the PostgreSQL DSN from .env files / environment variables.

    Loads the pipeline-local .env (if present) and then the repo-root .env
    via python-dotenv when that package is available. When python-dotenv is
    not installed, the .env step is skipped and the already-set process
    environment is used as-is (previously this raised ImportError even when
    PG_DSN was already exported).

    Returns:
        The DSN string from PG_DSN, DB_DSN, or DATABASE_URL (first hit wins).

    Exits:
        Status 1 with an error on stderr when no DSN variable is set.
    """
    try:
        from dotenv import load_dotenv  # optional dependency
    except ImportError:
        load_dotenv = None
    if load_dotenv is not None:
        env_path = ROOT / "apps" / "etl" / "pipelines" / "feiqiu" / ".env"
        if env_path.exists():
            load_dotenv(env_path, override=True)
        load_dotenv(ROOT / ".env")
    dsn = (
        os.environ.get("PG_DSN")
        or os.environ.get("DB_DSN")
        or os.environ.get("DATABASE_URL")
    )
    if not dsn:
        print("错误: 未找到 PG_DSN / DB_DSN / DATABASE_URL 环境变量", file=sys.stderr)
        sys.exit(1)
    return dsn
def flatten_json_keys(obj: dict, prefix: str = "") -> list[tuple[str, str]]:
    """Recursively list every key path in a JSON object with its value's JSON type.

    Nested object keys are joined with ".", array elements are marked with
    "[]"; for an array, only the first dict element is expanded.

    Args:
        obj: parsed JSON object (anything else yields an empty list).
        prefix: dotted path of the enclosing keys, "" at the top level.

    Returns:
        [(key_path, json_type), ...] in source order.
    """
    out: list[tuple[str, str]] = []
    if not isinstance(obj, dict):
        return out
    for key, value in obj.items():
        path = f"{prefix}.{key}" if prefix else key
        if value is None:
            out.append((path, "null"))
        elif isinstance(value, bool):  # bool before int: bool subclasses int
            out.append((path, "boolean"))
        elif isinstance(value, int):
            out.append((path, "integer"))
        elif isinstance(value, float):
            out.append((path, "number"))
        elif isinstance(value, str):
            out.append((path, "string"))
        elif isinstance(value, list):
            out.append((path, "array"))
            # Expand only the first object element found in the array.
            first_obj = next((e for e in value if isinstance(e, dict)), None)
            if first_obj is not None:
                out.extend(flatten_json_keys(first_obj, f"{path}[]"))
        elif isinstance(value, dict):
            out.append((path, "object"))
            out.extend(flatten_json_keys(value, path))
    return out
def get_top_level_keys(obj: dict) -> list[tuple[str, str]]:
    """List the top-level keys and JSON types after merge_record_layers-style flattening.

    Mirrors the ETL's merge_record_layers: successive "data" layers and a
    single "settleList" layer are folded into the top level, with outer keys
    winning on conflict (``{**inner, **outer}``).
    """
    flat = dict(obj)
    # Fold nested "data" layers, outermost values taking precedence.
    layer = flat.get("data")
    while isinstance(layer, dict):
        flat = {**layer, **flat}
        layer = layer.get("data")
    # Fold one "settleList" layer the same way.
    settle = flat.get("settleList")
    if isinstance(settle, dict):
        flat = {**settle, **flat}

    def _json_type(value) -> str:
        """Map a Python value to its JSON type name (bool checked before int)."""
        if value is None:
            return "null"
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        if isinstance(value, str):
            return "string"
        if isinstance(value, list):
            return "array"
        if isinstance(value, dict):
            return "object"
        return type(value).__name__

    return [(key, _json_type(value)) for key, value in flat.items()]
def fetch_sample_payloads(conn, table: str, sample_count: int = 5) -> list[dict]:
    """Pull up to *sample_count* recent payload samples from one ODS table.

    Sampling several rows lets the caller union their keys, covering optional
    fields that any single record might omit.

    NOTE(review): identifiers are interpolated into the SQL directly; this is
    safe only because `table` comes from the hard-coded ODS_TABLES list and
    ODS_SCHEMA is detected from information_schema.
    """
    query = f"""
        SELECT payload
        FROM {ODS_SCHEMA}.{table}
        WHERE payload IS NOT NULL
        ORDER BY fetched_at DESC
        LIMIT {sample_count}
    """
    with conn.cursor() as cur:
        cur.execute(query)
        fetched = cur.fetchall()
    samples: list[dict] = []
    for (raw,) in fetched:
        # payload may arrive as a JSON string or an already-decoded dict.
        decoded = json.loads(raw) if isinstance(raw, str) else raw
        if isinstance(decoded, dict):
            samples.append(decoded)
    return samples
def merge_payloads_keys(payloads: list[dict]) -> OrderedDict[str, str]:
    """Union the flattened top-level keys of several payloads.

    Keeps first-seen key order; a key recorded as "null" is upgraded to the
    first non-null type observed in a later payload.
    """
    combined: OrderedDict[str, str] = OrderedDict()
    for payload in payloads:
        for key, vtype in get_top_level_keys(payload):
            known = combined.get(key)
            if known is None:
                combined[key] = vtype
            elif known == "null" and vtype != "null":
                combined[key] = vtype
    return combined
def get_ods_columns(conn, table: str) -> list[tuple[str, str]]:
    """Return (column_name, data_type) for every column of an ODS table, in ordinal order."""
    query = """
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(query, (ODS_SCHEMA, table))
        return [(name, dtype) for name, dtype in cur.fetchall()]
def compute_mapping(api_keys: OrderedDict, ods_cols: list[tuple[str, str]]) -> list[dict]:
    """Pair API fields with ODS columns the way the ETL does (case-insensitive).

    Produces one row per API field (in order), then one row per ODS column
    that no API field matched and that is not ETL metadata (those are
    presumed derived / extracted from nested objects).
    """
    # Known nested-container keys that never map to a column directly.
    nested_containers = ("siteprofile", "tableprofile", "data", "settlelist")
    # Lowercased column-name lookup, mirroring _get_value_case_insensitive.
    lookup = {name.lower(): (name, dtype) for name, dtype in ods_cols}
    rows: list[dict] = []
    consumed: set[str] = set()
    for field, ftype in api_keys.items():
        lowered = field.lower()
        if ftype == "object" and lowered in nested_containers:
            note = (
                "嵌套对象,展平后各字段独立映射"
                if lowered in ("data", "settlelist")
                else "嵌套对象,不直接映射到列"
            )
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "",
                "ods_type": "",
                "mapping": note,
            })
            continue
        hit = lookup.get(lowered)
        if hit is not None:
            col_name, col_type = hit
            consumed.add(col_name.lower())
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "同名映射" if field == col_name else "大小写不敏感匹配",
            })
        else:
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "",
                "ods_type": "",
                "mapping": "未入 ODS 列(仅存于 payload",
            })
    # Remaining ODS columns (minus ETL metadata) have no direct API source.
    for col_name, col_type in ods_cols:
        lowered = col_name.lower()
        if lowered not in consumed and lowered not in ETL_META_COLS:
            rows.append({
                "api_field": "",
                "api_type": "",
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "ETL 派生/嵌套提取",
            })
    return rows
def generate_api_section(table: str, api_keys: OrderedDict, ods_cols: list[tuple[str, str]], mappings: list[dict]) -> str:
"""生成单个表的 API 源字段小节 Markdown。"""
lines = []
# API 源字段列表
lines.append(f"### API 源字段({len(api_keys)} 个)")
lines.append("")
lines.append("> 以下字段从 `payload` JSONB 中提取,展示 API 返回 JSON 的顶层结构(经 `merge_record_layers` 展平后)。")
lines.append("")
lines.append("| # | API 字段名 | JSON 类型 | 映射到 ODS 列 | 说明 |")
lines.append("|---|-----------|-----------|--------------|------|")
for idx, m in enumerate(mappings, 1):
api_f = m["api_field"]
api_t = m["api_type"]
ods_c = m["ods_column"]
note = m["mapping"]
if api_f == "":
continue # 跳过 ETL 派生列,在下面单独说明
ods_display = f"`{ods_c}`" if ods_c != "" else ""
lines.append(f"| {idx} | `{api_f}` | {api_t} | {ods_display} | {note} |")
# 统计
mapped_count = sum(1 for m in mappings if m["ods_column"] != "" and m["api_field"] != "")
unmapped_count = sum(1 for m in mappings if m["ods_column"] == "" and m["api_field"] != "" and m["api_type"] not in ("object",))
payload_only = [m["api_field"] for m in mappings if m["mapping"] == "未入 ODS 列(仅存于 payload"]
lines.append("")
if payload_only:
lines.append(f"> 映射统计:{mapped_count} 个字段映射到 ODS 列,{len(payload_only)} 个字段仅存于 `payload` JSONB 中。")
lines.append(f"> 仅存于 payload 的字段:{', '.join(f'`{f}`' for f in payload_only)}")
else:
lines.append(f"> 映射统计:{mapped_count} 个字段全部映射到 ODS 列。")
lines.append("")
return "\n".join(lines)
def insert_sections_into_doc(doc_text: str, sections: dict[str, str]) -> str:
    """Insert (or replace) each table's "### API 源字段" subsection in the report.

    For every "## <table>" chapter with an entry in *sections*, the new
    subsection is placed right after the title's trailing blank lines; a
    previously generated "### API 源字段" subsection is dropped first.
    Chapters without an entry, and all other lines, pass through untouched.
    """
    src = doc_text.split("\n")
    out: list[str] = []
    header_re = re.compile(r"^## (\w+)\s*$")
    pos = 0
    total = len(src)
    while pos < total:
        current = src[pos]
        header = header_re.match(current)
        if header:
            out.append(current)
            pos += 1
            section_body = sections.get(header.group(1))
            if section_body is not None:
                # Carry over the blank lines that follow the chapter title.
                while pos < total and not src[pos].strip():
                    out.append(src[pos])
                    pos += 1
                # Drop a previously generated API-field subsection, if present,
                # up to the next subsection or chapter header.
                if pos < total and src[pos].startswith("### API 源字段"):
                    pos += 1
                    while pos < total and not (src[pos].startswith("### ") or src[pos].startswith("## ")):
                        pos += 1
                out.append(section_body)
                out.append("")
            continue
        out.append(current)
        pos += 1
    return "\n".join(out)
def detect_ods_schema(conn) -> str:
    """Detect which ODS schema the database uses, preferring 'ods' over 'billiards_ods'.

    Exits the process with status 1 when neither schema exists.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT schema_name FROM information_schema.schemata
            WHERE schema_name IN ('ods', 'billiards_ods')
            ORDER BY schema_name
        """)
        present = {row[0] for row in cur.fetchall()}
    # Preference order: 'ods' first, then the legacy 'billiards_ods' name.
    for candidate in ("ods", "billiards_ods"):
        if candidate in present:
            return candidate
    print("错误: 未找到 ods 或 billiards_ods schema", file=sys.stderr)
    sys.exit(1)
def main():
    """Entry point: sample payloads per ODS table, build field mappings, and update the report in place."""
    global ODS_SCHEMA
    connection = psycopg2.connect(get_db_dsn())
    connection.set_client_encoding("UTF8")
    ODS_SCHEMA = detect_ods_schema(connection)
    print(f"检测到 ODS schema: {ODS_SCHEMA}")
    print("正在从数据库提取 API 原始字段...")
    rendered: dict[str, str] = {}
    for tbl in ODS_TABLES:
        print(f" 处理: {tbl}")
        samples = fetch_sample_payloads(connection, tbl, sample_count=10)
        if not samples:
            # Table has no payload rows yet — nothing to document.
            print(f" 警告: {tbl} 无 payload 数据,跳过")
            continue
        field_types = merge_payloads_keys(samples)
        columns = get_ods_columns(connection, tbl)
        mapping_rows = compute_mapping(field_types, columns)
        rendered[tbl] = generate_api_section(tbl, field_types, columns, mapping_rows)
    connection.close()
    print(f"\n读取现有文档: {INPUT_DOC}")
    doc_text = INPUT_DOC.read_text(encoding="utf-8")
    print("插入 API 源字段小节...")
    OUTPUT_DOC.write_text(insert_sections_into_doc(doc_text, rendered), encoding="utf-8")
    print(f"文档已更新: {OUTPUT_DOC}")
    print(f" 处理了 {len(rendered)} 个表的 API 源字段映射")


if __name__ == "__main__":
    main()