在准备环境前提交此次全部更改。
This commit is contained in:
398
scripts/ops/gen_api_field_mapping.py
Normal file
398
scripts/ops/gen_api_field_mapping.py
Normal file
@@ -0,0 +1,398 @@
|
||||
# -*- coding: utf-8 -*-
"""Extract raw API JSON fields from the database `payload` column and
generate the API-source-field → ODS mapping document.

Analysis works directly on the JSON returned by the API; it does not
depend on the processing code.

Usage:  python scripts/ops/gen_api_field_mapping.py
Output: inserts an "API source fields" subsection into each table chapter
        of docs/reports/dataflow_api_ods_dwd.md
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from collections import OrderedDict
|
||||
from pathlib import Path
|
||||
|
||||
import psycopg2
|
||||
|
||||
# Repository root (this file lives at scripts/ops/, two levels down).
ROOT = Path(__file__).resolve().parents[2]
INPUT_DOC = ROOT / "docs" / "reports" / "dataflow_api_ods_dwd.md"
OUTPUT_DOC = INPUT_DOC  # document is updated in place

# ODS schema name (detected dynamically from the database).
ODS_SCHEMA = None  # set at runtime by main() via detect_ods_schema()

# ODS tables to process (same order as the chapters in the document).
ODS_TABLES = [
    "assistant_accounts_master",
    "assistant_cancellation_records",
    "assistant_service_records",
    "goods_stock_movements",
    "goods_stock_summary",
    "group_buy_packages",
    "group_buy_redemption_records",
    "member_balance_changes",
    "member_profiles",
    "member_stored_value_cards",
    "payment_transactions",
    "platform_coupon_redemption_records",
    "recharge_settlements",
    "refund_transactions",
    "settlement_records",
    "settlement_ticket_details",
    "site_tables_master",
    "stock_goods_category_tree",
    "store_goods_master",
    "store_goods_sales_records",
    "table_fee_discount_records",
    "table_fee_transactions",
    "tenant_goods_master",
]

# ETL metadata columns (not sourced from the API payload).
ETL_META_COLS = {
    "content_hash", "source_file", "source_endpoint",
    "fetched_at", "payload", "record_index",
}

# Nested layers flattened by the ETL (merge_record_layers logic).
FLATTEN_KEYS = {"data", "settleList"}
|
||||
|
||||
def get_db_dsn() -> str:
    """Read the database connection string (DSN) from .env files.

    Loads the pipeline-specific .env first (with override), then the
    repo-root .env, and returns the first DSN found among PG_DSN /
    DB_DSN / DATABASE_URL. Exits with status 1 when none is set.
    """
    from dotenv import load_dotenv

    pipeline_env = ROOT / "apps" / "etl" / "pipelines" / "feiqiu" / ".env"
    if pipeline_env.exists():
        load_dotenv(pipeline_env, override=True)
    load_dotenv(ROOT / ".env")

    for var_name in ("PG_DSN", "DB_DSN", "DATABASE_URL"):
        dsn = os.environ.get(var_name)
        if dsn:
            return dsn

    print("错误: 未找到 PG_DSN / DB_DSN / DATABASE_URL 环境变量", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
def flatten_json_keys(obj: dict, prefix: str = "") -> list[tuple[str, str]]:
    """Recursively collect every leaf key of a JSON object with its value type.

    Returns a list of (key_path, value_type) pairs. Nested object keys are
    joined with "."; arrays are reported as "array" and the first dict
    element of an array is expanded under "key[]".
    """
    if not isinstance(obj, dict):
        return []

    pairs: list[tuple[str, str]] = []
    for key, value in obj.items():
        path = f"{prefix}.{key}" if prefix else key
        if value is None:
            pairs.append((path, "null"))
        elif isinstance(value, bool):  # must precede int: bool is an int subclass
            pairs.append((path, "boolean"))
        elif isinstance(value, int):
            pairs.append((path, "integer"))
        elif isinstance(value, float):
            pairs.append((path, "number"))
        elif isinstance(value, str):
            pairs.append((path, "string"))
        elif isinstance(value, list):
            pairs.append((path, "array"))
            # Expand only one dict element as a representative sample.
            for element in value:
                if isinstance(element, dict):
                    pairs.extend(flatten_json_keys(element, f"{path}[]"))
                    break
        elif isinstance(value, dict):
            pairs.append((path, "object"))
            pairs.extend(flatten_json_keys(value, path))
    return pairs
|
||||
|
||||
|
||||
def get_top_level_keys(obj: dict) -> list[tuple[str, str]]:
    """Return top-level keys and JSON types of *obj* after layer flattening.

    Mirrors the ETL's merge_record_layers: every nested "data" dict is
    folded upward (outer keys win on conflict), then a dict-valued
    "settleList" is folded in the same way.
    """

    def _json_type(value) -> str:
        # Order matters: bool is a subclass of int and must be tested first.
        if value is None:
            return "null"
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        if isinstance(value, str):
            return "string"
        if isinstance(value, list):
            return "array"
        if isinstance(value, dict):
            return "object"
        return type(value).__name__

    merged = dict(obj)

    # Fold nested "data" layers upward; outer keys take precedence.
    inner = merged.get("data")
    while isinstance(inner, dict):
        merged = {**inner, **merged}
        inner = inner.get("data")

    # Fold a dict-valued "settleList" layer the same way.
    settle = merged.get("settleList")
    if isinstance(settle, dict):
        merged = {**settle, **merged}

    return [(key, _json_type(value)) for key, value in merged.items()]
|
||||
|
||||
|
||||
def fetch_sample_payloads(conn, table: str, sample_count: int = 5) -> list[dict]:
    """Fetch up to *sample_count* recent payload samples from one ODS table.

    Multiple rows are sampled so that merging their keys covers more of the
    API's optional fields. String payloads are JSON-decoded; only dict
    payloads are kept.

    Args:
        conn: open psycopg2 connection.
        table: ODS table name — must come from the fixed ODS_TABLES whitelist.
        sample_count: maximum number of rows to sample.

    Returns:
        List of payload dicts (possibly empty).
    """
    # Identifiers cannot be bound as query parameters. ODS_SCHEMA is detected
    # from information_schema and `table` comes from the fixed ODS_TABLES
    # whitelist, so interpolating them is safe here. The LIMIT value, however,
    # is passed as a bound parameter instead of being formatted into the SQL.
    sql = f"""
        SELECT payload
        FROM {ODS_SCHEMA}.{table}
        WHERE payload IS NOT NULL
        ORDER BY fetched_at DESC
        LIMIT %s
    """
    with conn.cursor() as cur:
        cur.execute(sql, (sample_count,))
        rows = cur.fetchall()

    payloads = []
    for row in rows:
        p = row[0]
        if isinstance(p, str):
            # payload stored as text rather than jsonb: decode it first.
            p = json.loads(p)
        if isinstance(p, dict):
            payloads.append(p)
    return payloads
|
||||
|
||||
|
||||
def merge_payloads_keys(payloads: list[dict]) -> OrderedDict[str, str]:
    """Union the top-level keys of several payloads.

    First-seen key order is preserved; a key first recorded as "null" is
    upgraded to the first concrete type seen in a later payload.
    """
    combined: OrderedDict[str, str] = OrderedDict()
    for payload in payloads:
        for key, vtype in get_top_level_keys(payload):
            known = combined.get(key)
            if known is None:
                combined[key] = vtype
            elif known == "null" and vtype != "null":
                combined[key] = vtype
    return combined
|
||||
|
||||
|
||||
def get_ods_columns(conn, table: str) -> list[tuple[str, str]]:
    """Return (column_name, data_type) pairs for an ODS table, in ordinal order."""
    sql = """
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(sql, (ODS_SCHEMA, table))
        rows = cur.fetchall()
    return [(name, dtype) for name, dtype in rows]
|
||||
|
||||
|
||||
def compute_mapping(api_keys: OrderedDict, ods_cols: list[tuple[str, str]]) -> list[dict]:
    """Compute the API-field → ODS-column mapping rows.

    Matching is case-insensitive, mirroring the ETL's
    _get_value_case_insensitive lookup. Known nested-object keys are listed
    but never mapped to a column; ODS columns with no API counterpart
    (other than ETL metadata columns) are appended as derived columns.
    """
    # Lowercase lookup table for ODS columns (ETL matches case-insensitively).
    lower_to_col = {name.lower(): (name, dtype) for name, dtype in ods_cols}

    nested_keys = ("siteprofile", "tableprofile", "data", "settlelist")
    flattened_keys = ("data", "settlelist")

    rows: list[dict] = []
    seen_cols: set[str] = set()

    for field, ftype in api_keys.items():
        field_lower = field.lower()

        # Known nested containers (siteProfile, tableProfile, data, settleList).
        if ftype == "object" and field_lower in nested_keys:
            note = (
                "嵌套对象,展平后各字段独立映射"
                if field_lower in flattened_keys
                else "嵌套对象,不直接映射到列"
            )
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "—",
                "ods_type": "—",
                "mapping": note,
            })
            continue

        match = lower_to_col.get(field_lower)
        if match is not None:
            col_name, col_type = match
            seen_cols.add(col_name.lower())
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "同名映射" if field == col_name else "大小写不敏感匹配",
            })
        else:
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "—",
                "ods_type": "—",
                "mapping": "未入 ODS 列(仅存于 payload)",
            })

    # ODS columns never matched by an API field — unless they are known
    # ETL metadata columns, report them as derived / nested-extracted.
    for col_name, col_type in ods_cols:
        lower = col_name.lower()
        if lower not in seen_cols and lower not in ETL_META_COLS:
            rows.append({
                "api_field": "—",
                "api_type": "—",
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "ETL 派生/嵌套提取",
            })

    return rows
|
||||
|
||||
|
||||
def generate_api_section(table: str, api_keys: OrderedDict, ods_cols: list[tuple[str, str]], mappings: list[dict]) -> str:
    """Render the "API source fields" Markdown subsection for one table.

    Args:
        table: ODS table name (kept for interface compatibility; unused here).
        api_keys: merged API key → JSON type mapping; its size drives the
            count in the heading.
        ods_cols: ODS column list (kept for interface compatibility; unused).
        mappings: rows produced by compute_mapping().

    Returns:
        The subsection as a Markdown string.
    """
    lines = []

    # API source-field table.
    lines.append(f"### API 源字段({len(api_keys)} 个)")
    lines.append("")
    lines.append("> 以下字段从 `payload` JSONB 中提取,展示 API 返回 JSON 的顶层结构(经 `merge_record_layers` 展平后)。")
    lines.append("")
    lines.append("| # | API 字段名 | JSON 类型 | 映射到 ODS 列 | 说明 |")
    lines.append("|---|-----------|-----------|--------------|------|")

    for idx, m in enumerate(mappings, 1):
        api_f = m["api_field"]
        api_t = m["api_type"]
        ods_c = m["ods_column"]
        note = m["mapping"]
        if api_f == "—":
            continue  # ETL-derived columns are summarized separately, not listed here
        ods_display = f"`{ods_c}`" if ods_c != "—" else "—"
        lines.append(f"| {idx} | `{api_f}` | {api_t} | {ods_display} | {note} |")

    # Mapping statistics. (A previously computed but unused "unmapped_count"
    # was removed; payload_only already carries that information.)
    mapped_count = sum(1 for m in mappings if m["ods_column"] != "—" and m["api_field"] != "—")
    payload_only = [m["api_field"] for m in mappings if m["mapping"] == "未入 ODS 列(仅存于 payload)"]

    lines.append("")
    if payload_only:
        lines.append(f"> 映射统计:{mapped_count} 个字段映射到 ODS 列,{len(payload_only)} 个字段仅存于 `payload` JSONB 中。")
        lines.append(f"> 仅存于 payload 的字段:{', '.join(f'`{f}`' for f in payload_only)}")
    else:
        lines.append(f"> 映射统计:{mapped_count} 个字段全部映射到 ODS 列。")
    lines.append("")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def insert_sections_into_doc(doc_text: str, sections: dict[str, str]) -> str:
    """Insert (or refresh) each table's API-source-fields subsection.

    For every "## <table>" heading that has an entry in *sections*, the
    section text is inserted right after the heading and any blank lines
    following it. An existing "### API 源字段" subsection is dropped first,
    so repeated runs are idempotent.
    """
    src = doc_text.split("\n")
    out: list[str] = []
    pos = 0
    total = len(src)

    while pos < total:
        current = src[pos]
        heading = re.match(r"^## (\w+)\s*$", current)

        if heading and heading.group(1) in sections:
            out.append(current)
            pos += 1

            # Keep the blank lines that directly follow the heading.
            while pos < total and not src[pos].strip():
                out.append(src[pos])
                pos += 1

            # Drop a previously generated API section, up to the next
            # heading of any level.
            if pos < total and src[pos].startswith("### API 源字段"):
                pos += 1
                while pos < total and not (
                    src[pos].startswith("### ") or src[pos].startswith("## ")
                ):
                    pos += 1

            out.append(sections[heading.group(1)])
            out.append("")
            continue

        out.append(current)
        pos += 1

    return "\n".join(out)
|
||||
|
||||
|
||||
def detect_ods_schema(conn) -> str:
    """Detect which ODS schema exists in the database.

    Prefers "ods" over "billiards_ods" when both are present; exits with
    status 1 when neither schema exists.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT schema_name FROM information_schema.schemata
            WHERE schema_name IN ('ods', 'billiards_ods')
            ORDER BY schema_name
        """)
        found = {row[0] for row in cur.fetchall()}

    # Preference order: plain "ods" wins over the prefixed variant.
    for candidate in ("ods", "billiards_ods"):
        if candidate in found:
            return candidate

    print("错误: 未找到 ods 或 billiards_ods schema", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
def main():
    """Entry point: extract API fields from ODS payloads and update the doc.

    Connects to the database, detects the ODS schema, builds an API-field →
    ODS-column mapping section per table, then rewrites the target Markdown
    document in place. The DB connection is now closed in a finally block so
    it is released even when extraction fails midway (the original leaked it
    on any exception).
    """
    global ODS_SCHEMA

    dsn = get_db_dsn()
    conn = psycopg2.connect(dsn)
    sections = {}
    try:
        conn.set_client_encoding("UTF8")

        ODS_SCHEMA = detect_ods_schema(conn)
        print(f"检测到 ODS schema: {ODS_SCHEMA}")

        print("正在从数据库提取 API 原始字段...")
        for table in ODS_TABLES:
            print(f" 处理: {table}")
            payloads = fetch_sample_payloads(conn, table, sample_count=10)
            if not payloads:
                print(f" 警告: {table} 无 payload 数据,跳过")
                continue

            api_keys = merge_payloads_keys(payloads)
            ods_cols = get_ods_columns(conn, table)
            mappings = compute_mapping(api_keys, ods_cols)
            sections[table] = generate_api_section(table, api_keys, ods_cols, mappings)
    finally:
        # Always release the connection, even if a table fails mid-run.
        conn.close()

    print(f"\n读取现有文档: {INPUT_DOC}")
    doc_text = INPUT_DOC.read_text(encoding="utf-8")

    print("插入 API 源字段小节...")
    new_doc = insert_sections_into_doc(doc_text, sections)

    OUTPUT_DOC.write_text(new_doc, encoding="utf-8")
    print(f"文档已更新: {OUTPUT_DOC}")
    print(f" 处理了 {len(sections)} 个表的 API 源字段映射")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run only when executed as a script, not on import.
    main()
|
||||
Reference in New Issue
Block a user