Files
Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/compare_api_ods_v2.py

462 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
API reference docs vs actual ODS table structure comparison script (v2).

Extracts field names from the JSON samples in docs/api-reference/*.md,
queries the actual columns of the PostgreSQL ``billiards_ods`` schema,
and writes a diff report (JSON + Markdown) plus ALTER SQL.

Usage: python scripts/compare_api_ods_v2.py
"""
import json
import os
import re
import sys
from datetime import datetime

# Project root (two levels above this script); prepended to sys.path so
# sibling project modules are importable when run as a script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

from dotenv import load_dotenv

# Load DB credentials (PG_DSN etc.) from the project .env before psycopg2 use.
load_dotenv(os.path.join(ROOT, ".env"))

import psycopg2

# ODS meta columns (ETL bookkeeping), not sourced from the API payloads —
# excluded from the business-column comparison.
ODS_META_COLS = {
    "source_file", "source_endpoint", "fetched_at",
    "payload", "content_hash",
}
def load_registry():
    """Load and return the API registry from docs/api-reference/api_registry.json."""
    registry_path = os.path.join(ROOT, "docs", "api-reference", "api_registry.json")
    with open(registry_path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)
def extract_fields_from_md(md_path, api_id):
    """Extract field names (lowercased) from the JSON samples in an API doc.

    Scans every ```json fenced block in *md_path*, keeps the largest
    parseable JSON object (the response sample is normally the biggest),
    and returns a ``(fields, has_nested, error)`` tuple.

    Two special layouts are recognized:

    * settlement docs shaped like ``{"siteProfile": ..., "settleList": {...}}``
      -> only the inner ``settleList`` key names are returned;
    * ``{"goodsCategoryList": [...]}`` array wrappers -> the key names of the
      first element (a flattened category node) are returned.

    NOTE: *api_id* is accepted for caller symmetry but not used here.
    """
    with open(md_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # All fenced ```json ... ``` code blocks in the document.
    raw_blocks = re.findall(r'```json\s*\n(.*?)\n```', text, re.DOTALL)
    if not raw_blocks:
        return None, None, "无 JSON 样例"

    # Keep the largest parseable dict: the response sample usually wins.
    best = None
    best_size = -1
    for raw in raw_blocks:
        try:
            candidate = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(candidate, dict):
            continue
        size = len(str(candidate))
        if size > best_size:
            best, best_size = candidate, size

    if best is None:
        return None, None, "无法解析 JSON 样例"

    fields = set()

    # Nested settlement layout: only settleList's inner keys become columns.
    if "siteProfile" in best and "settleList" in best:
        inner = best.get("settleList", {})
        if isinstance(inner, dict):
            fields.update(key.lower() for key in inner)
        return fields, True, None

    # Array-wrapped category tree: take the first node's key names.
    if "goodsCategoryList" in best and isinstance(best["goodsCategoryList"], list):
        nodes = best["goodsCategoryList"]
        if nodes and isinstance(nodes[0], dict):
            _extract_flat(nodes[0], fields)
        return fields, True, None

    # Standard flat response: every top-level key is a column candidate.
    fields.update(key.lower() for key in best)
    return fields, False, None
def _extract_flat(obj, fields):
"""递归提取字典的标量字段名(跳过数组/嵌套对象值,但保留键名)"""
if not isinstance(obj, dict):
return
for k, v in obj.items():
fields.add(k.lower())
def get_all_ods_columns(conn):
    """Fetch column metadata for every table in the ``billiards_ods`` schema.

    Returns ``{table_name: {column_name: {"data_type": str,
    "ordinal_position": int}}}`` built from information_schema.columns.
    """
    cursor = conn.cursor()
    cursor.execute("""
        SELECT table_name, column_name, data_type, ordinal_position
        FROM information_schema.columns
        WHERE table_schema = 'billiards_ods'
        ORDER BY table_name, ordinal_position
    """)
    records = cursor.fetchall()
    cursor.close()

    catalog = {}
    for table, column, dtype, position in records:
        catalog.setdefault(table, {})[column] = {
            "data_type": dtype,
            "ordinal_position": position,
        }
    return catalog
def guess_pg_type(name):
    """Guess a PostgreSQL column type from a field name.

    Used to build ``ALTER TABLE ... ADD COLUMN`` suggestions; the result is
    a name-based heuristic and must be reviewed by a human before execution.

    Precedence: id-like -> numeric(18,2) -> timestamp -> boolean -> integer
    -> text (fallback).
    """
    n = name.lower()
    # Identifier-like names (id, *_id, *id) map to bigint.
    if n == "id" or n.endswith("_id") or n.endswith("id"):
        return "bigint"
    money_kw = ["amount", "money", "price", "cost", "fee", "discount",
                "deduct", "balance", "charge", "sale", "refund",
                "promotion", "adjust", "rounding", "prepay", "income",
                "royalty", "grade", "point", "stock", "num"]
    for kw in money_kw:
        if kw in n:
            return "numeric(18,2)"
    if "time" in n or "date" in n:
        return "timestamp without time zone"
    # BUG FIX: the camelCase check previously tested n[2].isupper() on the
    # already-lowercased name, so it could never match and fields like
    # "isVip" fell through (usually to text). Test the ORIGINAL name's
    # third character instead.
    if n.startswith("is_") or (n.startswith("is") and len(name) > 2 and name[2].isupper()):
        return "boolean"
    if n.startswith("able_") or n.startswith("can"):
        return "boolean"
    int_kw = ["status", "type", "sort", "count", "seconds", "level",
              "channel", "method", "way", "enabled", "switch", "delete",
              "first", "single", "trash", "confirm", "clock", "cycle",
              "delay", "free", "virtual", "online", "show", "audit",
              "freeze", "send", "required", "scene", "range", "tag",
              "on", "minutes", "number", "duration"]
    for kw in int_kw:
        if kw in n:
            return "integer"
    return "text"
def compare_one(api_entry, md_path, ods_tables):
    """Compare one API's documented JSON fields with its ODS table columns.

    Returns a result dict whose ``status`` is ``skip`` (no backing table or
    flagged endpoint), ``error`` (doc/table problem), ``ok`` (aligned) or
    ``drift`` (the API has fields the ODS table lacks).
    """
    table = api_entry.get("ods_table")
    result = {
        "api_id": api_entry["id"],
        "name_zh": api_entry.get("name_zh", ""),
        "ods_table": table,
    }

    # Endpoints without a backing table, or explicitly flagged, are skipped.
    if not table:
        result.update(status="skip", reason="无对应 ODS 表ods_table=null")
        return result
    if api_entry.get("skip"):
        result.update(status="skip", reason="接口标记为 skip暂不可用")
        return result

    # Pull the documented field set out of the markdown JSON samples.
    api_fields, has_nested, err = extract_fields_from_md(md_path, api_entry["id"])
    if err:
        result.update(status="error", reason=err)
        return result
    if table not in ods_tables:
        result.update(status="error", reason=f"ODS 表 {table} 不存在")
        return result

    columns = ods_tables[table]
    # Business columns only: ETL bookkeeping columns are not API-sourced.
    business_cols = {c for c in columns if c not in ODS_META_COLS}

    api_lower = {f.lower() for f in api_fields}
    ods_lower = {c.lower() for c in business_cols}

    api_only = sorted(api_lower - ods_lower)  # documented but missing in ODS
    ods_only = sorted(ods_lower - api_lower)  # present only in ODS (non-meta)
    matched = sorted(api_lower & ods_lower)

    result.update(
        status="drift" if api_only else "ok",
        has_nested_structure=has_nested,
        api_field_count=len(api_lower),
        ods_biz_col_count=len(business_cols),
        ods_total_col_count=len(columns),
        matched_count=len(matched),
        api_only=api_only,
        api_only_count=len(api_only),
        ods_only=ods_only,
        ods_only_count=len(ods_only),
        matched=matched,
    )
    return result
def generate_alter_sql(results, ods_tables):
    """Build ALTER TABLE statements for every API-only field of drifted tables.

    *ods_tables* is accepted for signature compatibility with the caller but
    is not consulted; everything needed is already present in *results*.
    """
    statements = []
    drifted = (r for r in results if r.get("status") == "drift" and r.get("api_only"))
    for entry in drifted:
        table = entry["ods_table"]
        statements.extend(
            f"ALTER TABLE billiards_ods.{table} "
            f"ADD COLUMN IF NOT EXISTS {column} {guess_pg_type(column)};"
            for column in entry["api_only"]
        )
    return statements
def generate_markdown_report(results, alter_sqls):
    """Generate the Markdown comparison report as a single string.

    Builds a summary table over all *results*, then detail sections for
    drifted tables, ODS-only columns, and the generated ALTER SQL
    (*alter_sqls*, already-formatted statements).
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Report header plus the summary-table header rows.
    lines = [
        "# API 参考文档 vs ODS 实际表结构 对比报告 (v2)",
        "",
        f"> 生成时间:{now}",
        "> 数据来源:`docs/api-reference/*.md` JSON 样例 vs `billiards_ods` 实际列",
        "",
        "---",
        "",
        "## 一、汇总",
        "",
        "| API 接口 | 中文名 | ODS 表 | 状态 | API 字段数 | ODS 业务列数 | 匹配 | API 独有 | ODS 独有 |",
        "|----------|--------|--------|------|-----------|-------------|------|---------|---------|",
    ]
    # Aggregate counters for the summary footer.
    total_api_only = 0
    total_ods_only = 0
    ok_count = 0
    drift_count = 0
    skip_count = 0
    error_count = 0
    for r in results:
        status = r.get("status", "?")
        # skip/error rows render with dashes instead of counts.
        if status == "skip":
            skip_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table', '-')} "
                f"| ⏭️ 跳过 | - | - | - | - | - |"
            )
            continue
        if status == "error":
            error_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table', '-')} "
                f"| ❌ 错误 | - | - | - | - | - |"
            )
            continue
        api_only_n = r.get("api_only_count", 0)
        ods_only_n = r.get("ods_only_count", 0)
        total_api_only += api_only_n
        total_ods_only += ods_only_n
        if status == "ok":
            ok_count += 1
            badge = "✅ 对齐"
        else:
            drift_count += 1
            badge = "⚠️ 漂移"
        lines.append(
            f"| {r['api_id']} | {r['name_zh']} | {r['ods_table']} "
            f"| {badge} | {r['api_field_count']} | {r['ods_biz_col_count']} "
            f"| {r['matched_count']} | {api_only_n} | {ods_only_n} |"
        )
    lines.extend([
        "",
        f"**统计**:对齐 {ok_count} / 漂移 {drift_count} / 跳过 {skip_count} / 错误 {error_count}",
        f"**API 独有字段总计**{total_api_only}(需要 ALTER TABLE ADD COLUMN",
        f"**ODS 独有列总计**{total_ods_only}API 中不存在,可能是历史遗留或 ETL 派生列)",
        "",
    ])
    # Detail section: per-table field differences for each drifted table.
    drift_results = [r for r in results if r.get("status") == "drift"]
    if drift_results:
        lines.extend(["---", "", "## 二、漂移详情", ""])
        for r in drift_results:
            lines.extend([
                f"### {r['api_id']}{r['name_zh']})→ `{r['ods_table']}`",
                "",
            ])
            if r["api_only"]:
                lines.append("**API 有 / ODS 缺**")
                for f in r["api_only"]:
                    pg_type = guess_pg_type(f)
                    lines.append(f"- `{f}` → 建议类型 `{pg_type}`")
                lines.append("")
            if r["ods_only"]:
                lines.append("**ODS 有 / API 无**(非元列):")
                for f in r["ods_only"]:
                    lines.append(f"- `{f}`")
                lines.append("")
    # ODS-only column details (all tables with status ok or drift).
    ods_only_results = [r for r in results if r.get("ods_only") and r.get("status") in ("ok", "drift")]
    if ods_only_results:
        lines.extend(["---", "", "## 三、ODS 独有列详情API 中不存在)", ""])
        for r in ods_only_results:
            if not r["ods_only"]:
                continue
            lines.extend([
                f"### `{r['ods_table']}`{r['name_zh']}",
                "",
                "| 列名 | 说明 |",
                "|------|------|",
            ])
            for f in r["ods_only"]:
                lines.append(f"| `{f}` | ODS 独有API JSON 样例中不存在 |")
            lines.append("")
    # Generated ALTER statements, fenced as a sql code block.
    if alter_sqls:
        lines.extend([
            "---", "",
            "## 四、ALTER SQL对齐 ODS 表结构)", "",
            "```sql",
            "-- 自动生成的 ALTER TABLE 语句",
            f"-- 生成时间:{now}",
            "-- 注意:类型为根据字段名猜测,请人工复核后执行",
            "",
        ])
        lines.extend(alter_sqls)
        lines.extend(["", "```", ""])
    return "\n".join(lines)
def main():
    """Run the full comparison and write the JSON / Markdown / SQL outputs.

    Requires the PG_DSN environment variable (loaded from .env at import
    time). Exits with status 1 if it is missing.
    """
    dsn = os.environ.get("PG_DSN")
    if not dsn:
        print("错误:未设置 PG_DSN 环境变量", file=sys.stderr)
        sys.exit(1)
    print("连接数据库...")
    conn = psycopg2.connect(dsn)
    print("查询 ODS 表结构...")
    ods_tables = get_all_ods_columns(conn)
    print(f"{len(ods_tables)} 张 ODS 表")
    print("加载 API 注册表...")
    registry = load_registry()
    print(f"{len(registry)} 个 API 端点")
    results = []
    for entry in registry:
        api_id = entry["id"]
        ods_table = entry.get("ods_table")
        md_path = os.path.join(ROOT, "docs", "api-reference", f"{api_id}.md")
        # A missing doc file is recorded as an error result, not a crash.
        if not os.path.exists(md_path):
            results.append({
                "api_id": api_id,
                "name_zh": entry.get("name_zh", ""),
                "ods_table": ods_table,
                "status": "error",
                "reason": f"文档不存在: {md_path}",
            })
            continue
        r = compare_one(entry, md_path, ods_tables)
        results.append(r)
        # Per-endpoint progress line with a status icon and drift counts.
        status_icon = {"ok": "", "drift": "⚠️", "skip": "⏭️", "error": ""}.get(r["status"], "?")
        extra = ""
        if r.get("api_only_count"):
            extra = f" (API独有: {r['api_only_count']})"
        if r.get("ods_only_count"):
            extra += f" (ODS独有: {r['ods_only_count']})"
        print(f" {status_icon} {api_id}{ods_table or '-'}{extra}")
    conn.close()
    # Build ALTER statements from the drifted results.
    alter_sqls = generate_alter_sql(results, ods_tables)
    # Write the machine-readable JSON report.
    json_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nJSON 报告: {json_path}")
    # Write the human-readable Markdown report.
    md_report = generate_markdown_report(results, alter_sqls)
    md_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.md")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_report)
    print(f"Markdown 报告: {md_path}")
    # Write the migration SQL file (wrapped in a transaction) if anything drifted.
    if alter_sqls:
        sql_path = os.path.join(ROOT, "database", "migrations",
                                "20260213_align_ods_with_api_v2.sql")
        os.makedirs(os.path.dirname(sql_path), exist_ok=True)
        with open(sql_path, "w", encoding="utf-8") as f:
            f.write("-- API vs ODS 对齐迁移脚本 (v2)\n")
            f.write(f"-- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("-- 注意:类型为根据字段名猜测,请人工复核后执行\n\n")
            f.write("BEGIN;\n\n")
            for sql in alter_sqls:
                f.write(sql + "\n")
            f.write("\nCOMMIT;\n")
        print(f"ALTER SQL: {sql_path}")
    else:
        print("无需 ALTER SQL所有表已对齐")
    # Final console summary of result statuses.
    ok_n = sum(1 for r in results if r.get("status") == "ok")
    drift_n = sum(1 for r in results if r.get("status") == "drift")
    skip_n = sum(1 for r in results if r.get("status") == "skip")
    err_n = sum(1 for r in results if r.get("status") == "error")
    print(f"\n汇总:对齐 {ok_n} / 漂移 {drift_n} / 跳过 {skip_n} / 错误 {err_n}")
    print(f"ALTER SQL 语句数:{len(alter_sqls)}")
# Script entry point: run the comparison only when executed directly.
if __name__ == "__main__":
    main()
# ──────────────────────────────────────────────
# AI_CHANGELOG:
# - 日期: 2026-02-13
# Prompt: P20260213-223000 — 用 API 参考文档比对数据库 ODS 实际表结构(重做,不依赖 DDL
# 直接原因: 前次比对脚本 stock_goods_category_tree 嵌套结构解析 bug需重写脚本
# 变更摘要: 完整重写脚本,从 api-reference/*.md JSON 样例提取字段,查询 PG billiards_ods 实际列,
# 处理三种特殊结构(标准/settleList 嵌套/goodsCategoryList 数组包装),输出 JSON+MD 报告
# 风险与验证: 纯分析脚本,不修改数据库;验证方式:运行脚本确认 "对齐 22 / 漂移 0"
# ──────────────────────────────────────────────