# -*- coding: utf-8 -*-
"""
Compare the API reference docs against the actual ODS table structure (v2).

Extracts field names from the JSON samples in docs/api-reference/*.md,
queries the actual columns of the PostgreSQL schema billiards_ods,
and writes a diff report as JSON and Markdown, plus ALTER SQL.

Usage: python scripts/compare_api_ods_v2.py
"""

import json
import os
import re
import sys
from datetime import datetime

# Project root: two directory levels above this script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

from dotenv import load_dotenv

load_dotenv(os.path.join(ROOT, ".env"))

import psycopg2


# ODS meta columns (managed by the ETL itself, not sourced from the API).
# They are excluded from the comparison against the API fields.
ODS_META_COLS = {
    "source_file", "source_endpoint", "fetched_at",
    "payload", "content_hash",
}


def load_registry():
    """Load the API registry.

    Reads ``docs/api-reference/api_registry.json`` below the project root.

    Returns:
        The decoded JSON content of the registry file.

    Raises:
        OSError: If the registry file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = os.path.join(ROOT, "docs", "api-reference", "api_registry.json")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def extract_fields_from_md(md_path, api_id):
    """Extract lower-cased field names from the JSON sample of an API doc.

    Every fenced ``json`` code block in the Markdown file is parsed. The
    largest block that decodes to a JSON object (measured by the length of
    its ``str()`` form; the first one wins on ties) is used as the sample.

    Three sample layouts are handled:

    * ``siteProfile`` and ``settleList`` both present (settlement_records /
      recharge_settlements): the keys of the ``settleList`` object are
      returned. ``siteProfile`` keys are NOT collected.
      NOTE(review): the previous docstring said siteProfile fields were
      extracted too, but the code never did that - confirm which is intended.
    * ``goodsCategoryList`` is a list (stock_goods_category_tree): the keys
      of its first element are returned; the outer wrapper keys are not.
    * anything else: the top-level keys of the sample are returned.

    Args:
        md_path: Path of the Markdown document to read (UTF-8).
        api_id: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Returns:
        A tuple ``(fields, has_nested, error)``. On success ``fields`` is a
        set of lower-cased names, ``has_nested`` is True for the two wrapped
        layouts and ``error`` is None. On failure ``fields`` and
        ``has_nested`` are None and ``error`` is a message string.
    """
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Collect the body of every fenced json code block.
    json_blocks = re.findall(r'```json\s*\n(.*?)\n```', content, re.DOTALL)
    if not json_blocks:
        return None, None, "无 JSON 样例"

    # Pick the largest JSON object; the response sample is usually the biggest.
    sample_json = None
    sample_size = -1
    for block in json_blocks:
        try:
            parsed = json.loads(block)
        except json.JSONDecodeError:
            continue
        if not isinstance(parsed, dict):
            continue
        size = len(str(parsed))
        if size > sample_size:
            sample_json, sample_size = parsed, size

    if sample_json is None:
        return None, None, "无法解析 JSON 样例"

    fields = set()

    # settlement_records / recharge_settlements nested layout:
    # { "siteProfile": {...}, "settleList": {...} }
    if "siteProfile" in sample_json and "settleList" in sample_json:
        settle_list = sample_json["settleList"]
        if isinstance(settle_list, dict):
            fields.update(key.lower() for key in settle_list)
        return fields, True, None

    # stock_goods_category_tree: goodsCategoryList is an array wrapper and
    # the ODS table stores the category node fields, so the outer keys
    # (e.g. total / goodsCategoryList) are not treated as ODS columns.
    category_list = sample_json.get("goodsCategoryList")
    if isinstance(category_list, list):
        if category_list and isinstance(category_list[0], dict):
            _extract_flat(category_list[0], fields)
        return fields, True, None

    fields.update(key.lower() for key in sample_json)
    return fields, False, None


def _extract_flat(obj, fields):
|
||
"""递归提取字典的标量字段名(跳过数组/嵌套对象值,但保留键名)"""
|
||
if not isinstance(obj, dict):
|
||
return
|
||
for k, v in obj.items():
|
||
fields.add(k.lower())
|
||
|
||
|
||
def get_all_ods_columns(conn):
    """Query the column metadata of every table in the billiards_ods schema.

    Args:
        conn: An open DB-API connection (``main`` passes a psycopg2
            connection).

    Returns:
        A dict mapping table name to a dict of column name to
        ``{"data_type": ..., "ordinal_position": ...}``, filled in
        ``table_name, ordinal_position`` order.
    """
    cur = conn.cursor()
    try:
        cur.execute("""
            SELECT table_name, column_name, data_type, ordinal_position
            FROM information_schema.columns
            WHERE table_schema = 'billiards_ods'
            ORDER BY table_name, ordinal_position
        """)
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    tables = {}
    for table_name, col_name, data_type, pos in rows:
        tables.setdefault(table_name, {})[col_name] = {
            "data_type": data_type,
            "ordinal_position": pos,
        }
    return tables


# Substrings that make guess_pg_type() suggest numeric(18,2).
_MONEY_KEYWORDS = (
    "amount", "money", "price", "cost", "fee", "discount",
    "deduct", "balance", "charge", "sale", "refund",
    "promotion", "adjust", "rounding", "prepay", "income",
    "royalty", "grade", "point", "stock", "num",
)

# Substrings that make guess_pg_type() suggest integer.
_INT_KEYWORDS = (
    "status", "type", "sort", "count", "seconds", "level",
    "channel", "method", "way", "enabled", "switch", "delete",
    "first", "single", "trash", "confirm", "clock", "cycle",
    "delay", "free", "virtual", "online", "show", "audit",
    "freeze", "send", "required", "scene", "range", "tag",
    "on", "minutes", "number", "duration",
)


def guess_pg_type(name):
    """Guess a PostgreSQL type from a field name (for ALTER TABLE ADD COLUMN).

    The rules are substring heuristics applied in a fixed order; the first
    match wins:

    1. name ends with ``id``                      -> ``bigint``
    2. name contains a money keyword              -> ``numeric(18,2)``
    3. name contains ``time`` or ``date``         -> ``timestamp without time zone``
    4. ``is_`` prefix, or camelCase ``isXxx``     -> ``boolean``
    5. ``able_`` or ``can`` prefix                -> ``boolean``
    6. name contains an integer keyword           -> ``integer``
    7. otherwise                                  -> ``text``

    Because matching is by substring, short keywords such as ``on`` or
    ``num`` match many unrelated names; the result is only a suggestion.

    Args:
        name: The field name, in any letter case.

    Returns:
        The suggested PostgreSQL type name as a string.
    """
    n = name.lower()
    # endswith("id") also covers the exact name "id" and the "_id" suffix.
    if n.endswith("id"):
        return "bigint"
    if any(kw in n for kw in _MONEY_KEYWORDS):
        return "numeric(18,2)"
    if "time" in n or "date" in n:
        return "timestamp without time zone"
    # The camelCase test must look at the original name: the lower-cased
    # copy never has an upper-case third character.
    if n.startswith("is_") or (
        n.startswith("is") and len(name) > 2 and name[2].isupper()
    ):
        return "boolean"
    if n.startswith(("able_", "can")):
        return "boolean"
    if any(kw in n for kw in _INT_KEYWORDS):
        return "integer"
    return "text"


def compare_one(api_entry, md_path, ods_tables):
    """Compare one API endpoint with its ODS table.

    Args:
        api_entry: Registry entry; ``id`` is required, while ``ods_table``,
            ``name_zh`` and ``skip`` are optional.
        md_path: Path of the endpoint's Markdown reference document.
        ods_tables: Mapping of table name to its columns, as returned by
            ``get_all_ods_columns``.

    Returns:
        A result dict that always holds ``api_id``, ``name_zh``,
        ``ods_table`` and ``status``. ``status`` is ``"skip"`` or ``"error"``
        (with a ``reason``), ``"ok"`` when no API field is missing in the
        ODS table, or ``"drift"`` otherwise. For ok/drift the dict also
        carries the counts and the sorted ``api_only``, ``ods_only`` and
        ``matched`` name lists.
    """
    api_id = api_entry["id"]
    ods_table = api_entry.get("ods_table")
    name_zh = api_entry.get("name_zh", "")

    result = {
        "api_id": api_id,
        "name_zh": name_zh,
        "ods_table": ods_table,
    }

    if not ods_table:
        result["status"] = "skip"
        result["reason"] = "无对应 ODS 表(ods_table=null)"
        return result

    if api_entry.get("skip"):
        result["status"] = "skip"
        result["reason"] = "接口标记为 skip(暂不可用)"
        return result

    # Field names taken from the JSON sample of the API document.
    api_fields, has_nested, err = extract_fields_from_md(md_path, api_id)
    if err:
        result["status"] = "error"
        result["reason"] = err
        return result

    # Columns of the ODS table.
    if ods_table not in ods_tables:
        result["status"] = "error"
        result["reason"] = f"ODS 表 {ods_table} 不存在"
        return result

    ods_cols = ods_tables[ods_table]
    ods_biz_cols = {c for c in ods_cols if c not in ODS_META_COLS}

    # Compare case-insensitively.
    api_lower = {f.lower() for f in api_fields}
    ods_lower = {c.lower() for c in ods_biz_cols}

    api_only = sorted(api_lower - ods_lower)   # in the API, missing in ODS
    ods_only = sorted(ods_lower - api_lower)   # in ODS (non-meta), not in the API
    matched = sorted(api_lower & ods_lower)    # present on both sides

    # Only API fields missing in ODS count as drift; ODS-only columns do not.
    result["status"] = "ok" if not api_only else "drift"
    result["has_nested_structure"] = has_nested
    result["api_field_count"] = len(api_lower)
    result["ods_biz_col_count"] = len(ods_biz_cols)
    result["ods_total_col_count"] = len(ods_cols)
    result["matched_count"] = len(matched)
    result["api_only"] = api_only
    result["api_only_count"] = len(api_only)
    result["ods_only"] = ods_only
    result["ods_only_count"] = len(ods_only)
    result["matched"] = matched

    return result


def _quote_ident(identifier):
    """Return ``identifier`` as a double-quoted SQL identifier."""
    return '"' + str(identifier).replace('"', '""') + '"'


def generate_alter_sql(results, ods_tables):
    """Generate ALTER TABLE statements for API fields missing in ODS.

    One ``ADD COLUMN IF NOT EXISTS`` statement is produced for every name in
    the ``api_only`` list of each result whose status is ``"drift"``. The
    column type comes from ``guess_pg_type``.

    Table and column names are double-quoted so that reserved words or
    unusual characters coming from the JSON samples still yield valid SQL.

    Args:
        results: Result dicts as produced by ``compare_one``.
        ods_tables: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Returns:
        A list of SQL statement strings, in result order.
    """
    sqls = []
    for r in results:
        if r.get("status") != "drift" or not r.get("api_only"):
            continue
        table = _quote_ident(r["ods_table"])
        for field in r["api_only"]:
            pg_type = guess_pg_type(field)
            sqls.append(
                f"ALTER TABLE billiards_ods.{table} "
                f"ADD COLUMN IF NOT EXISTS {_quote_ident(field)} {pg_type};"
            )
    return sqls


def generate_markdown_report(results, alter_sqls):
    """Render the comparison results as a Markdown report.

    The report has a summary table with one row per result, a statistics
    line, a details section for results with status ``"drift"``, a section
    listing ODS-only columns of ok/drift results, and a fenced SQL block
    when ``alter_sqls`` is not empty.

    Args:
        results: Result dicts as produced by ``compare_one`` / ``main``.
        alter_sqls: SQL statement strings to embed in the report.

    Returns:
        The whole report as one newline-joined string.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        "# API 参考文档 vs ODS 实际表结构 对比报告 (v2)",
        "",
        f"> 生成时间:{now}",
        "> 数据来源:`docs/api-reference/*.md` JSON 样例 vs `billiards_ods` 实际列",
        "",
        "---",
        "",
        "## 一、汇总",
        "",
        "| API 接口 | 中文名 | ODS 表 | 状态 | API 字段数 | ODS 业务列数 | 匹配 | API 独有 | ODS 独有 |",
        "|----------|--------|--------|------|-----------|-------------|------|---------|---------|",
    ]

    total_api_only = 0
    total_ods_only = 0
    ok_count = 0
    drift_count = 0
    skip_count = 0
    error_count = 0

    for r in results:
        status = r.get("status", "?")
        if status == "skip":
            skip_count += 1
            # The key is always present but may be None, so use "or" rather
            # than a .get() default to avoid printing "None".
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table') or '-'} "
                f"| ⏭️ 跳过 | - | - | - | - | - |"
            )
            continue
        if status == "error":
            error_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {r.get('ods_table') or '-'} "
                f"| ❌ 错误 | - | - | - | - | - |"
            )
            continue

        api_only_n = r.get("api_only_count", 0)
        ods_only_n = r.get("ods_only_count", 0)
        total_api_only += api_only_n
        total_ods_only += ods_only_n

        if status == "ok":
            ok_count += 1
            badge = "✅ 对齐"
        else:
            drift_count += 1
            badge = "⚠️ 漂移"

        lines.append(
            f"| {r['api_id']} | {r['name_zh']} | {r['ods_table']} "
            f"| {badge} | {r['api_field_count']} | {r['ods_biz_col_count']} "
            f"| {r['matched_count']} | {api_only_n} | {ods_only_n} |"
        )

    lines.extend([
        "",
        f"**统计**:对齐 {ok_count} / 漂移 {drift_count} / 跳过 {skip_count} / 错误 {error_count}",
        f"**API 独有字段总计**:{total_api_only}(需要 ALTER TABLE ADD COLUMN)",
        f"**ODS 独有列总计**:{total_ods_only}(API 中不存在,可能是历史遗留或 ETL 派生列)",
        "",
    ])

    # Details: field differences of every drifted table.
    drift_results = [r for r in results if r.get("status") == "drift"]
    if drift_results:
        lines.extend(["---", "", "## 二、漂移详情", ""])
        for r in drift_results:
            lines.extend([
                f"### {r['api_id']}({r['name_zh']})→ `{r['ods_table']}`",
                "",
            ])
            if r["api_only"]:
                lines.append("**API 有 / ODS 缺**:")
                for f in r["api_only"]:
                    pg_type = guess_pg_type(f)
                    lines.append(f"- `{f}` → 建议类型 `{pg_type}`")
                lines.append("")
            if r["ods_only"]:
                lines.append("**ODS 有 / API 无**(非元列):")
                for f in r["ods_only"]:
                    lines.append(f"- `{f}`")
                lines.append("")

    # ODS-only column details (all compared tables that have any).
    ods_only_results = [
        r for r in results
        if r.get("ods_only") and r.get("status") in ("ok", "drift")
    ]
    if ods_only_results:
        lines.extend(["---", "", "## 三、ODS 独有列详情(API 中不存在)", ""])
        for r in ods_only_results:
            lines.extend([
                f"### `{r['ods_table']}`({r['name_zh']})",
                "",
                "| 列名 | 说明 |",
                "|------|------|",
            ])
            for f in r["ods_only"]:
                lines.append(f"| `{f}` | ODS 独有,API JSON 样例中不存在 |")
            lines.append("")

    # ALTER SQL
    if alter_sqls:
        lines.extend([
            "---", "",
            "## 四、ALTER SQL(对齐 ODS 表结构)", "",
            "```sql",
            "-- 自动生成的 ALTER TABLE 语句",
            f"-- 生成时间:{now}",
            "-- 注意:类型为根据字段名猜测,请人工复核后执行",
            "",
        ])
        lines.extend(alter_sqls)
        lines.extend(["", "```", ""])

    return "\n".join(lines)


def main():
    """Run the comparison and write the JSON, Markdown and SQL outputs.

    Reads the connection string from the ``PG_DSN`` environment variable and
    exits with status 1 when it is not set. Outputs:

    * ``docs/reports/api_ods_comparison_v2.json``
    * ``docs/reports/api_ods_comparison_v2.md``
    * ``database/migrations/20260213_align_ods_with_api_v2.sql`` (only when
      at least one ALTER statement was generated)
    """
    dsn = os.environ.get("PG_DSN")
    if not dsn:
        print("错误:未设置 PG_DSN 环境变量", file=sys.stderr)
        sys.exit(1)

    print("连接数据库...")
    conn = psycopg2.connect(dsn)
    try:
        print("查询 ODS 表结构...")
        ods_tables = get_all_ods_columns(conn)
    finally:
        # The connection is only needed for the schema query; close it even
        # when that query fails.
        conn.close()
    print(f" 共 {len(ods_tables)} 张 ODS 表")

    print("加载 API 注册表...")
    registry = load_registry()
    print(f" 共 {len(registry)} 个 API 端点")

    results = []
    for entry in registry:
        api_id = entry["id"]
        ods_table = entry.get("ods_table")
        md_path = os.path.join(ROOT, "docs", "api-reference", f"{api_id}.md")

        if not os.path.exists(md_path):
            results.append({
                "api_id": api_id,
                "name_zh": entry.get("name_zh", ""),
                "ods_table": ods_table,
                "status": "error",
                "reason": f"文档不存在: {md_path}",
            })
            continue

        r = compare_one(entry, md_path, ods_tables)
        results.append(r)

        status_icon = {"ok": "✅", "drift": "⚠️", "skip": "⏭️", "error": "❌"}.get(r["status"], "?")
        extra = ""
        if r.get("api_only_count"):
            extra = f" (API独有: {r['api_only_count']})"
        if r.get("ods_only_count"):
            extra += f" (ODS独有: {r['ods_only_count']})"
        print(f" {status_icon} {api_id} → {ods_table or '-'}{extra}")

    # Generate the ALTER SQL statements.
    alter_sqls = generate_alter_sql(results, ods_tables)

    # Write the JSON report.
    json_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nJSON 报告: {json_path}")

    # Write the Markdown report (same directory as the JSON report).
    md_report = generate_markdown_report(results, alter_sqls)
    report_md_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.md")
    with open(report_md_path, "w", encoding="utf-8") as f:
        f.write(md_report)
    print(f"Markdown 报告: {report_md_path}")

    # Write the ALTER SQL migration file, wrapped in one transaction.
    if alter_sqls:
        sql_path = os.path.join(ROOT, "database", "migrations",
                                "20260213_align_ods_with_api_v2.sql")
        os.makedirs(os.path.dirname(sql_path), exist_ok=True)
        with open(sql_path, "w", encoding="utf-8") as f:
            f.write("-- API vs ODS 对齐迁移脚本 (v2)\n")
            f.write(f"-- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("-- 注意:类型为根据字段名猜测,请人工复核后执行\n\n")
            f.write("BEGIN;\n\n")
            for sql in alter_sqls:
                f.write(sql + "\n")
            f.write("\nCOMMIT;\n")
        print(f"ALTER SQL: {sql_path}")
    else:
        print("无需 ALTER SQL(所有表已对齐)")

    # Summary counts.
    ok_n = sum(1 for r in results if r.get("status") == "ok")
    drift_n = sum(1 for r in results if r.get("status") == "drift")
    skip_n = sum(1 for r in results if r.get("status") == "skip")
    err_n = sum(1 for r in results if r.get("status") == "error")
    print(f"\n汇总:对齐 {ok_n} / 漂移 {drift_n} / 跳过 {skip_n} / 错误 {err_n}")
    print(f"ALTER SQL 语句数:{len(alter_sqls)}")


if __name__ == "__main__":
    main()


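# ──────────────────────────────────────────────
# AI_CHANGELOG:
# - Date: 2026-02-13
#   Prompt: P20260213-223000 — compare the API reference docs against the
#     actual ODS table structure in the database (redo, not relying on DDL)
#   Direct cause: the previous comparison script had a bug parsing the nested
#     stock_goods_category_tree structure, so the script had to be rewritten
#   Change summary: full rewrite; extracts fields from the api-reference/*.md
#     JSON samples, queries the actual PG billiards_ods columns, handles three
#     structures (standard / nested settleList / goodsCategoryList array
#     wrapper), and outputs JSON + MD reports
#   Risk and verification: pure analysis script, does not modify the database;
#     verify by running the script and confirming the summary line reads
#     "对齐 22 / 漂移 0" (aligned 22 / drift 0)
# ──────────────────────────────────────────────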