399 lines
14 KiB
Python
399 lines
14 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
从数据库 payload 字段提取 API 原始 JSON 字段,生成 API 源字段 → ODS 映射文档。
|
||
直接从 API 返回的 JSON 分析,不依赖处理代码。
|
||
|
||
用法: python scripts/ops/gen_api_field_mapping.py
|
||
输出: 在 docs/reports/dataflow_api_ods_dwd.md 的每个表章节中插入 API 源字段小节
|
||
"""
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
from collections import OrderedDict
|
||
from pathlib import Path
|
||
|
||
import psycopg2
|
||
|
||
ROOT = Path(__file__).resolve().parents[2]
INPUT_DOC = ROOT / "docs" / "reports" / "dataflow_api_ods_dwd.md"
OUTPUT_DOC = INPUT_DOC  # updated in place

# ODS schema name (detected dynamically from the database).
ODS_SCHEMA = None  # set at runtime by detect_ods_schema()

# ODS tables to document (order matches the chapters in the markdown doc).
ODS_TABLES = [
    "assistant_accounts_master",
    "assistant_cancellation_records",
    "assistant_service_records",
    "goods_stock_movements",
    "goods_stock_summary",
    "group_buy_packages",
    "group_buy_redemption_records",
    "member_balance_changes",
    "member_profiles",
    "member_stored_value_cards",
    "payment_transactions",
    "platform_coupon_redemption_records",
    "recharge_settlements",
    "refund_transactions",
    "settlement_records",
    "settlement_ticket_details",
    "site_tables_master",
    "stock_goods_category_tree",
    "store_goods_master",
    "store_goods_sales_records",
    "table_fee_discount_records",
    "table_fee_transactions",
    "tenant_goods_master",
]

# ETL metadata columns (populated by the pipeline, not sourced from the API).
ETL_META_COLS = {
    "content_hash", "source_file", "source_endpoint",
    "fetched_at", "payload", "record_index",
}

# Nested layers flattened by the ETL's merge_record_layers logic.
# NOTE(review): get_top_level_keys() hardcodes these same keys instead of
# reading this constant — confirm it is still needed before relying on it.
FLATTEN_KEYS = {"data", "settleList"}
|
||
|
||
|
||
def get_db_dsn() -> str:
    """Read the database connection string from .env files.

    Loads the pipeline-specific .env first (with override), then the
    repo-root .env, and exits with an error message when none of the
    known DSN environment variables is set.
    """
    from dotenv import load_dotenv

    pipeline_env = ROOT / "apps" / "etl" / "pipelines" / "feiqiu" / ".env"
    if pipeline_env.exists():
        load_dotenv(pipeline_env, override=True)
    load_dotenv(ROOT / ".env")

    # First non-empty variable wins, in priority order.
    for var in ("PG_DSN", "DB_DSN", "DATABASE_URL"):
        value = os.environ.get(var)
        if value:
            return value

    print("错误: 未找到 PG_DSN / DB_DSN / DATABASE_URL 环境变量", file=sys.stderr)
    sys.exit(1)
|
||
|
||
|
||
def flatten_json_keys(obj: dict, prefix: str = "") -> list[tuple[str, str]]:
    """Recursively collect every key of a JSON object with its value type.

    Returns ``[(key_path, value_type), ...]``. Nested object keys are
    joined with ".", arrays are tagged "array", and the first dict
    element found inside an array is expanded under ``key[]`` as a
    representative sample.
    """
    if not isinstance(obj, dict):
        return []

    out: list[tuple[str, str]] = []
    for key, value in obj.items():
        path = f"{prefix}.{key}" if prefix else key
        # Order matters below: bool is a subclass of int in Python.
        if value is None:
            out.append((path, "null"))
        elif isinstance(value, bool):
            out.append((path, "boolean"))
        elif isinstance(value, int):
            out.append((path, "integer"))
        elif isinstance(value, float):
            out.append((path, "number"))
        elif isinstance(value, str):
            out.append((path, "string"))
        elif isinstance(value, list):
            out.append((path, "array"))
            # Expand only the first dict element of the array.
            first_dict = next((e for e in value if isinstance(e, dict)), None)
            if first_dict is not None:
                out.extend(flatten_json_keys(first_dict, f"{path}[]"))
        elif isinstance(value, dict):
            out.append((path, "object"))
            out.extend(flatten_json_keys(value, path))
    return out
|
||
|
||
|
||
def get_top_level_keys(obj: dict) -> list[tuple[str, str]]:
    """Extract top-level keys and value types after layer flattening.

    Mimics the ETL's merge_record_layers: every nested "data" dict and a
    nested "settleList" dict are merged into the top level, with the
    outer keys winning on conflicts.
    """
    def _json_type(value) -> str:
        # Order matters: bool is a subclass of int.
        if value is None:
            return "null"
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        if isinstance(value, str):
            return "string"
        if isinstance(value, list):
            return "array"
        if isinstance(value, dict):
            return "object"
        return type(value).__name__

    merged = dict(obj)

    # Fold in every nested "data" layer (outer keys take precedence).
    layer = merged.get("data")
    while isinstance(layer, dict):
        merged = {**layer, **merged}
        layer = layer.get("data")

    # Fold in a nested "settleList" dict the same way (single level).
    settle = merged.get("settleList")
    if isinstance(settle, dict):
        merged = {**settle, **merged}

    return [(key, _json_type(value)) for key, value in merged.items()]
|
||
|
||
|
||
def fetch_sample_payloads(conn, table: str, sample_count: int = 5) -> list[dict]:
    """Fetch up to *sample_count* recent payload samples from one ODS table.

    Several samples are returned so the caller can merge their keys and
    cover more of the API field space. Strings are JSON-decoded; any
    payload that is not a dict afterwards is dropped.
    """
    # Schema/table names come from the detected schema and the fixed
    # ODS_TABLES list, so f-string interpolation is safe here.
    query = f"""
        SELECT payload
        FROM {ODS_SCHEMA}.{table}
        WHERE payload IS NOT NULL
        ORDER BY fetched_at DESC
        LIMIT {sample_count}
    """
    with conn.cursor() as cur:
        cur.execute(query)
        rows = cur.fetchall()

    samples: list[dict] = []
    for (raw,) in rows:
        parsed = json.loads(raw) if isinstance(raw, str) else raw
        if isinstance(parsed, dict):
            samples.append(parsed)
    return samples
|
||
|
||
|
||
def merge_payloads_keys(payloads: list[dict]) -> OrderedDict[str, str]:
    """Merge top-level keys from several payloads into one ordered mapping.

    First-seen key order is kept; a "null" type recorded earlier is
    upgraded when a later payload shows a concrete type for that key.
    """
    combined: OrderedDict[str, str] = OrderedDict()
    for payload in payloads:
        for key, vtype in get_top_level_keys(payload):
            if key not in combined:
                combined[key] = vtype
            elif combined[key] == "null" and vtype != "null":
                combined[key] = vtype
    return combined
|
||
|
||
|
||
def get_ods_columns(conn, table: str) -> list[tuple[str, str]]:
    """Return (column_name, data_type) pairs for an ODS table, in ordinal order."""
    query = """
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(query, (ODS_SCHEMA, table))
        return [(name, dtype) for name, dtype in cur.fetchall()]
|
||
|
||
|
||
def compute_mapping(api_keys: OrderedDict, ods_cols: list[tuple[str, str]]) -> list[dict]:
    """Compute the API field → ODS column mapping rows.

    Matching is case-insensitive, mirroring the ETL's
    _get_value_case_insensitive lookup. Known nested-object keys are
    flagged instead of matched, and ODS columns never matched by any API
    field (other than ETL metadata columns) are appended as derived rows.
    """
    # Case-insensitive lookup: lowercase column name -> (name, type).
    lookup = {name.lower(): (name, ctype) for name, ctype in ods_cols}

    rows: list[dict] = []
    matched: set[str] = set()

    for field, ftype in api_keys.items():
        key = field.lower()

        # Nested container keys (siteProfile, tableProfile, data, settleList)
        # never map to a column directly.
        if ftype == "object" and key in ("siteprofile", "tableprofile", "data", "settlelist"):
            if key in ("data", "settlelist"):
                note = "嵌套对象,展平后各字段独立映射"
            else:
                note = "嵌套对象,不直接映射到列"
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "—",
                "ods_type": "—",
                "mapping": note,
            })
            continue

        if key in lookup:
            col_name, col_type = lookup[key]
            matched.add(col_name.lower())
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "同名映射" if field == col_name else "大小写不敏感匹配",
            })
        else:
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "—",
                "ods_type": "—",
                "mapping": "未入 ODS 列(仅存于 payload)",
            })

    # ODS columns with no API source: either ETL metadata (skipped) or
    # values the ETL derives / extracts from nested objects.
    for col_name, col_type in ods_cols:
        low = col_name.lower()
        if low not in matched and low not in ETL_META_COLS:
            rows.append({
                "api_field": "—",
                "api_type": "—",
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "ETL 派生/嵌套提取",
            })

    return rows
|
||
|
||
|
||
def generate_api_section(table: str, api_keys: OrderedDict, ods_cols: list[tuple[str, str]], mappings: list[dict]) -> str:
    """Render the "API 源字段" markdown subsection for one table.

    Fixes vs. the previous version: the dead local ``unmapped_count``
    (computed but never used) is removed, and table rows are numbered by
    rendered position instead of list position, so numbering has no gaps
    even if ETL-derived rows ("api_field" == "—") interleave with API rows.

    ``table`` and ``ods_cols`` are currently unused in the output; they
    are kept for signature compatibility with callers.
    """
    lines = [
        f"### API 源字段({len(api_keys)} 个)",
        "",
        "> 以下字段从 `payload` JSONB 中提取,展示 API 返回 JSON 的顶层结构(经 `merge_record_layers` 展平后)。",
        "",
        "| # | API 字段名 | JSON 类型 | 映射到 ODS 列 | 说明 |",
        "|---|-----------|-----------|--------------|------|",
    ]

    # Number only the rows we actually render; ETL-derived rows are
    # covered by the ODS column section, not listed here.
    row_no = 0
    for m in mappings:
        if m["api_field"] == "—":
            continue
        row_no += 1
        ods_display = f"`{m['ods_column']}`" if m["ods_column"] != "—" else "—"
        lines.append(f"| {row_no} | `{m['api_field']}` | {m['api_type']} | {ods_display} | {m['mapping']} |")

    mapped_count = sum(1 for m in mappings if m["ods_column"] != "—" and m["api_field"] != "—")
    payload_only = [m["api_field"] for m in mappings if m["mapping"] == "未入 ODS 列(仅存于 payload)"]

    lines.append("")
    if payload_only:
        lines.append(f"> 映射统计:{mapped_count} 个字段映射到 ODS 列,{len(payload_only)} 个字段仅存于 `payload` JSONB 中。")
        lines.append(f"> 仅存于 payload 的字段:{', '.join(f'`{f}`' for f in payload_only)}")
    else:
        lines.append(f"> 映射统计:{mapped_count} 个字段全部映射到 ODS 列。")
    lines.append("")

    return "\n".join(lines)
|
||
|
||
|
||
def insert_sections_into_doc(doc_text: str, sections: dict[str, str]) -> str:
    """Insert or replace the "### API 源字段" subsection in each table chapter.

    For every "## <table>" heading whose table has an entry in
    *sections*, the section text is inserted right after the heading and
    its trailing blank lines. An existing "### API 源字段" subsection is
    removed first, so reruns are idempotent.

    Restructured with guard clauses so that a matched heading whose
    table is NOT in *sections* is emitted exactly once (the previous
    flow could append such a heading twice and drop the next line).
    """
    lines = doc_text.split("\n")
    result: list[str] = []
    heading_re = re.compile(r"^## (\w+)\s*$")
    i = 0

    while i < len(lines):
        line = lines[i]
        m = heading_re.match(line)

        # Ordinary line, or a table heading we have no section for.
        if not m or m.group(1) not in sections:
            result.append(line)
            i += 1
            continue

        # Table heading with a generated section: keep the heading and
        # the blank lines that follow it.
        result.append(line)
        i += 1
        while i < len(lines) and lines[i].strip() == "":
            result.append(lines[i])
            i += 1

        # Drop a previously generated section (up to the next heading).
        if i < len(lines) and lines[i].startswith("### API 源字段"):
            i += 1
            while i < len(lines) and not (
                lines[i].startswith("### ") or lines[i].startswith("## ")
            ):
                i += 1

        result.append(sections[m.group(1)])
        result.append("")

    return "\n".join(result)
|
||
|
||
|
||
def detect_ods_schema(conn) -> str:
    """Detect the ODS schema name, preferring 'ods' over 'billiards_ods'.

    Exits with an error when neither schema exists in the database.
    """
    query = """
        SELECT schema_name FROM information_schema.schemata
        WHERE schema_name IN ('ods', 'billiards_ods')
        ORDER BY schema_name
    """
    with conn.cursor() as cur:
        cur.execute(query)
        found = {row[0] for row in cur.fetchall()}

    for candidate in ("ods", "billiards_ods"):
        if candidate in found:
            return candidate

    print("错误: 未找到 ods 或 billiards_ods schema", file=sys.stderr)
    sys.exit(1)
|
||
|
||
|
||
def main():
    """Extract API payload fields per ODS table and update the mapping doc in place."""
    global ODS_SCHEMA

    conn = psycopg2.connect(get_db_dsn())
    conn.set_client_encoding("UTF8")

    ODS_SCHEMA = detect_ods_schema(conn)
    print(f"检测到 ODS schema: {ODS_SCHEMA}")

    print("正在从数据库提取 API 原始字段...")
    sections: dict[str, str] = {}
    for table in ODS_TABLES:
        print(f"  处理: {table}")
        payloads = fetch_sample_payloads(conn, table, sample_count=10)
        if not payloads:
            print(f"    警告: {table} 无 payload 数据,跳过")
            continue
        api_keys = merge_payloads_keys(payloads)
        ods_cols = get_ods_columns(conn, table)
        mappings = compute_mapping(api_keys, ods_cols)
        sections[table] = generate_api_section(table, api_keys, ods_cols, mappings)

    conn.close()

    print(f"\n读取现有文档: {INPUT_DOC}")
    doc_text = INPUT_DOC.read_text(encoding="utf-8")

    print("插入 API 源字段小节...")
    new_doc = insert_sections_into_doc(doc_text, sections)
    OUTPUT_DOC.write_text(new_doc, encoding="utf-8")

    print(f"文档已更新: {OUTPUT_DOC}")
    print(f"  处理了 {len(sections)} 个表的 API 源字段映射")
|
||
|
||
|
||
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|