在准备环境前提交此次全部更改。

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

View File

@@ -0,0 +1,398 @@
# -*- coding: utf-8 -*-
"""
从数据库 payload 字段提取 API 原始 JSON 字段,生成 API 源字段 → ODS 映射文档。
直接从 API 返回的 JSON 分析,不依赖处理代码。
用法: python scripts/ops/gen_api_field_mapping.py
输出: 在 docs/reports/dataflow_api_ods_dwd.md 的每个表章节中插入 API 源字段小节
"""
import json
import os
import re
import sys
from collections import OrderedDict
from pathlib import Path
import psycopg2
# Repository root: two levels up from scripts/ops/gen_api_field_mapping.py.
ROOT = Path(__file__).resolve().parents[2]
INPUT_DOC = ROOT / "docs" / "reports" / "dataflow_api_ods_dwd.md"
OUTPUT_DOC = INPUT_DOC  # the document is updated in place
# ODS schema name; detected from the database at runtime by detect_ods_schema().
ODS_SCHEMA = None  # assigned in main()
# ODS tables to process (order matches the chapter order in the target document).
ODS_TABLES = [
    "assistant_accounts_master",
    "assistant_cancellation_records",
    "assistant_service_records",
    "goods_stock_movements",
    "goods_stock_summary",
    "group_buy_packages",
    "group_buy_redemption_records",
    "member_balance_changes",
    "member_profiles",
    "member_stored_value_cards",
    "payment_transactions",
    "platform_coupon_redemption_records",
    "recharge_settlements",
    "refund_transactions",
    "settlement_records",
    "settlement_ticket_details",
    "site_tables_master",
    "stock_goods_category_tree",
    "store_goods_master",
    "store_goods_sales_records",
    "table_fee_discount_records",
    "table_fee_transactions",
    "tenant_goods_master",
]
# ETL metadata columns (populated by the pipeline itself, not sourced from the API).
ETL_META_COLS = {
    "content_hash", "source_file", "source_endpoint",
    "fetched_at", "payload", "record_index",
}
# Nested layers flattened by the ETL's merge_record_layers logic.
# NOTE(review): not referenced anywhere in this script — confirm before removing.
FLATTEN_KEYS = {"data", "settleList"}
def get_db_dsn() -> str:
    """Read the database DSN from .env files.

    Loads the pipeline-local .env first (if present, overriding the current
    environment), then the repo-root .env, and resolves the DSN from
    PG_DSN / DB_DSN / DATABASE_URL — first non-empty value wins.
    Exits the process with status 1 when none of them is set.
    """
    from dotenv import load_dotenv
    pipeline_env = ROOT / "apps" / "etl" / "pipelines" / "feiqiu" / ".env"
    if pipeline_env.exists():
        load_dotenv(pipeline_env, override=True)
    load_dotenv(ROOT / ".env")
    for var in ("PG_DSN", "DB_DSN", "DATABASE_URL"):
        dsn = os.environ.get(var)
        if dsn:
            return dsn
    print("错误: 未找到 PG_DSN / DB_DSN / DATABASE_URL 环境变量", file=sys.stderr)
    sys.exit(1)
def flatten_json_keys(obj: dict, prefix: str = "") -> list[tuple[str, str]]:
    """Recursively collect every key of a JSON object with its value's JSON type.

    Returns a list of ``(key_path, json_type)`` tuples. Nested object keys are
    joined with ``.``; a list value is reported as ``"array"`` and the first
    dict element found in it (if any) is expanded under ``"<path>[]"``.
    Non-dict input yields an empty list.
    """
    if not isinstance(obj, dict):
        return []
    collected: list[tuple[str, str]] = []
    for key, value in obj.items():
        path = f"{prefix}.{key}" if prefix else key
        if value is None:
            collected.append((path, "null"))
        elif isinstance(value, bool):
            # bool must be tested before int: bool is an int subclass.
            collected.append((path, "boolean"))
        elif isinstance(value, int):
            collected.append((path, "integer"))
        elif isinstance(value, float):
            collected.append((path, "number"))
        elif isinstance(value, str):
            collected.append((path, "string"))
        elif isinstance(value, list):
            collected.append((path, "array"))
            # Expand only the first dict element of the array.
            first_dict = next((e for e in value if isinstance(e, dict)), None)
            if first_dict is not None:
                collected.extend(flatten_json_keys(first_dict, f"{path}[]"))
        elif isinstance(value, dict):
            collected.append((path, "object"))
            collected.extend(flatten_json_keys(value, path))
    return collected
def get_top_level_keys(obj: dict) -> list[tuple[str, str]]:
    """Return the top-level keys of *obj* with their JSON types.

    Simulates the ETL's ``merge_record_layers`` view: nested ``data`` layers
    are flattened repeatedly and a dict-valued ``settleList`` layer once,
    with outer keys winning on collision.
    """
    merged = dict(obj)
    # Flatten nested "data" layers (outer keys take precedence).
    inner = merged.get("data")
    while isinstance(inner, dict):
        merged = {**inner, **merged}
        inner = inner.get("data")
    # Flatten a dict-valued "settleList" layer.
    settle = merged.get("settleList")
    if isinstance(settle, dict):
        merged = {**settle, **merged}

    def json_type(value) -> str:
        """Map a Python value to its JSON type name."""
        if value is None:
            return "null"
        if isinstance(value, bool):  # before int: bool subclasses int
            return "boolean"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        if isinstance(value, str):
            return "string"
        if isinstance(value, list):
            return "array"
        if isinstance(value, dict):
            return "object"
        return type(value).__name__

    return [(key, json_type(value)) for key, value in merged.items()]
def fetch_sample_payloads(conn, table: str, sample_count: int = 5) -> list[dict]:
    """Fetch up to *sample_count* recent ``payload`` samples from an ODS table.

    Payloads stored as JSON strings are parsed; anything that is not a dict
    after parsing is dropped. The table name is interpolated into the SQL
    directly — safe here because callers pass names from the fixed
    ODS_TABLES allowlist only.
    """
    sql = f"""
    SELECT payload
    FROM {ODS_SCHEMA}.{table}
    WHERE payload IS NOT NULL
    ORDER BY fetched_at DESC
    LIMIT {sample_count}
    """
    with conn.cursor() as cur:
        cur.execute(sql)
        rows = cur.fetchall()
    samples: list[dict] = []
    for (raw,) in rows:
        parsed = json.loads(raw) if isinstance(raw, str) else raw
        if isinstance(parsed, dict):
            samples.append(parsed)
    return samples
def merge_payloads_keys(payloads: list[dict]) -> OrderedDict[str, str]:
    """Union the top-level keys of several payloads.

    Keeps first-seen key order; a key first observed with type ``"null"`` is
    upgraded to the first non-null type seen in a later payload.
    """
    combined: OrderedDict[str, str] = OrderedDict()
    for payload in payloads:
        for key, vtype in get_top_level_keys(payload):
            known = combined.get(key)
            if known is None:
                combined[key] = vtype
            elif known == "null" and vtype != "null":
                combined[key] = vtype
    return combined
def get_ods_columns(conn, table: str) -> list[tuple[str, str]]:
    """Return ``(column_name, data_type)`` pairs for an ODS table.

    Reads information_schema.columns under the detected ODS_SCHEMA,
    preserving ordinal column order.
    """
    sql = """
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE table_schema = %s AND table_name = %s
    ORDER BY ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(sql, (ODS_SCHEMA, table))
        rows = cur.fetchall()
    return [(name, dtype) for name, dtype in rows]
def compute_mapping(api_keys: OrderedDict, ods_cols: list[tuple[str, str]]) -> list[dict]:
    """Compute the API-field → ODS-column mapping rows.

    Mirrors the ETL's case-insensitive matching (_get_value_case_insensitive).
    Emits one row per API field, then one trailing row per ODS column that
    neither matched an API field nor is an ETL metadata column.
    """
    # Case-insensitive lookup of ODS column names.
    lookup = {name.lower(): (name, dtype) for name, dtype in ods_cols}
    rows: list[dict] = []
    consumed: set[str] = set()
    for field, ftype in api_keys.items():
        lowered = field.lower()
        # Known nested-object keys are skipped: their contents map individually.
        if ftype == "object" and lowered in ("siteprofile", "tableprofile", "data", "settlelist"):
            note = (
                "嵌套对象,展平后各字段独立映射"
                if lowered in ("data", "settlelist")
                else "嵌套对象,不直接映射到列"
            )
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "",
                "ods_type": "",
                "mapping": note,
            })
            continue
        hit = lookup.get(lowered)
        if hit is not None:
            col_name, col_type = hit
            consumed.add(col_name.lower())
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "同名映射" if field == col_name else "大小写不敏感匹配",
            })
        else:
            # NOTE(review): this label (it lacks a closing paren) is matched
            # verbatim in generate_api_section — keep the two in sync.
            rows.append({
                "api_field": field,
                "api_type": ftype,
                "ods_column": "",
                "ods_type": "",
                "mapping": "未入 ODS 列(仅存于 payload",
            })
    # ODS columns never matched by an API field and not ETL metadata:
    # presumably extracted from nested objects by the ETL.
    for col_name, col_type in ods_cols:
        low = col_name.lower()
        if low not in consumed and low not in ETL_META_COLS:
            rows.append({
                "api_field": "",
                "api_type": "",
                "ods_column": col_name,
                "ods_type": col_type,
                "mapping": "ETL 派生/嵌套提取",
            })
    return rows
def generate_api_section(table: str, api_keys: OrderedDict, ods_cols: list[tuple[str, str]], mappings: list[dict]) -> str:
    """Render the "### API 源字段" Markdown subsection for one table.

    Args:
        table: ODS table name (not rendered; kept for interface stability).
        api_keys: merged API top-level keys — only its length is used.
        ods_cols: ODS columns (unused here; kept for interface stability).
        mappings: rows produced by compute_mapping().

    Returns:
        Markdown text: a field table followed by mapping statistics.

    Fixes vs. previous version: removed the dead ``unmapped_count`` variable,
    and ``payload_only`` no longer relies on exact equality with the full
    free-text mapping label (which is easy to break when the label wording
    changes) — it now classifies rows structurally, excluding nested-object
    rows by their "嵌套对象" label prefix.
    """
    lines = []
    lines.append(f"### API 源字段({len(api_keys)} 个)")
    lines.append("")
    lines.append("> 以下字段从 `payload` JSONB 中提取,展示 API 返回 JSON 的顶层结构(经 `merge_record_layers` 展平后)。")
    lines.append("")
    lines.append("| # | API 字段名 | JSON 类型 | 映射到 ODS 列 | 说明 |")
    lines.append("|---|-----------|-----------|--------------|------|")
    for idx, m in enumerate(mappings, 1):
        if m["api_field"] == "":
            continue  # ETL-derived columns are summarized in the notes, not listed per-field
        ods_display = f"`{m['ods_column']}`" if m["ods_column"] != "" else ""
        lines.append(f"| {idx} | `{m['api_field']}` | {m['api_type']} | {ods_display} | {m['mapping']} |")
    # Statistics: an API field counts as "payload only" when it has no ODS
    # column and is not one of the skipped nested-object rows.
    mapped_count = sum(1 for m in mappings if m["ods_column"] != "" and m["api_field"] != "")
    payload_only = [
        m["api_field"]
        for m in mappings
        if m["api_field"] != "" and m["ods_column"] == ""
        and not m["mapping"].startswith("嵌套对象")
    ]
    lines.append("")
    if payload_only:
        lines.append(f"> 映射统计:{mapped_count} 个字段映射到 ODS 列,{len(payload_only)} 个字段仅存于 `payload` JSONB 中。")
        lines.append(f"> 仅存于 payload 的字段:{', '.join(f'`{f}`' for f in payload_only)}")
    else:
        lines.append(f"> 映射统计:{mapped_count} 个字段全部映射到 ODS 列。")
    lines.append("")
    return "\n".join(lines)
def insert_sections_into_doc(doc_text: str, sections: dict[str, str]) -> str:
    """Splice the generated "### API 源字段" subsection into each table chapter.

    For every "## <table>" heading whose table has an entry in *sections*, the
    new subsection is inserted right after the heading (and any blank lines
    following it). An already-present "### API 源字段" subsection is removed
    first, so the operation is idempotent across reruns.

    Args:
        doc_text: full Markdown document text.
        sections: table name -> rendered subsection Markdown.

    Returns:
        The updated document text.
    """
    lines = doc_text.split("\n")
    result = []
    i = 0
    while i < len(lines):
        line = lines[i]
        # Detect a "## table_name" chapter heading (deeper headings like
        # "### x" do not match because their third char is not a word char).
        m = re.match(r"^## (\w+)\s*$", line)
        if m:
            table_name = m.group(1)
            result.append(line)
            i += 1
            if table_name in sections:
                # Keep the blank lines that follow the heading.
                while i < len(lines) and lines[i].strip() == "":
                    result.append(lines[i])
                    i += 1
                # Drop a pre-existing "### API 源字段" subsection: skip its
                # lines until the next "### " or "## " heading.
                if i < len(lines) and lines[i].startswith("### API 源字段"):
                    i += 1
                    while i < len(lines):
                        if lines[i].startswith("### ") or lines[i].startswith("## "):
                            break
                        i += 1
                # Insert the freshly generated subsection.
                result.append(sections[table_name])
                result.append("")
            continue
        result.append(line)
        i += 1
    return "\n".join(result)
def detect_ods_schema(conn) -> str:
    """Detect the ODS schema name, preferring ``ods`` over ``billiards_ods``.

    Exits the process with status 1 when neither schema exists.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT schema_name FROM information_schema.schemata
            WHERE schema_name IN ('ods', 'billiards_ods')
            ORDER BY schema_name
        """)
        found = {row[0] for row in cur.fetchall()}
    for candidate in ("ods", "billiards_ods"):
        if candidate in found:
            return candidate
    print("错误: 未找到 ods 或 billiards_ods schema", file=sys.stderr)
    sys.exit(1)
def main():
    """Script entry point: build per-table API-field sections from DB payload
    samples and splice them into the dataflow document in place."""
    global ODS_SCHEMA
    conn = psycopg2.connect(get_db_dsn())
    conn.set_client_encoding("UTF8")
    ODS_SCHEMA = detect_ods_schema(conn)
    print(f"检测到 ODS schema: {ODS_SCHEMA}")
    print("正在从数据库提取 API 原始字段...")
    sections: dict[str, str] = {}
    for table in ODS_TABLES:
        print(f" 处理: {table}")
        payloads = fetch_sample_payloads(conn, table, sample_count=10)
        if not payloads:
            print(f" 警告: {table} 无 payload 数据,跳过")
            continue
        api_keys = merge_payloads_keys(payloads)
        ods_cols = get_ods_columns(conn, table)
        sections[table] = generate_api_section(
            table, api_keys, ods_cols, compute_mapping(api_keys, ods_cols)
        )
    conn.close()
    print(f"\n读取现有文档: {INPUT_DOC}")
    doc_text = INPUT_DOC.read_text(encoding="utf-8")
    print("插入 API 源字段小节...")
    OUTPUT_DOC.write_text(insert_sections_into_doc(doc_text, sections), encoding="utf-8")
    print(f"文档已更新: {OUTPUT_DOC}")
    print(f" 处理了 {len(sections)} 个表的 API 源字段映射")
if __name__ == "__main__":  # script entry point
    main()