# -*- coding: utf-8 -*-
"""Compare API reference JSON fields with ODS database table columns.

Generates a comparison report (Markdown + JSON) and ALTER SQL migration.
Supports camelCase -> snake_case normalized matching.

Usage:
    python scripts/compare_api_ods.py

Requires:
    psycopg2, python-dotenv
"""
import json
import os
import re
import sys

# Make the project root importable before any project-local imports.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
import psycopg2

load_dotenv()

# Connection string read from the environment (.env supported via dotenv).
PG_DSN = os.getenv("PG_DSN")
ENDPOINTS_DIR = os.path.join("docs", "api-reference", "endpoints")
REGISTRY_FILE = os.path.join("docs", "api-reference", "api_registry.json")

# ODS metadata columns (added automatically by the ETL framework;
# they are not API fields and must be excluded from the comparison).
ODS_META_COLUMNS = {
    "source_file", "source_endpoint", "fetched_at", "payload", "content_hash"
}

# JSON type -> recommended PostgreSQL type mapping.
TYPE_MAP = {
    "int": "bigint",
    "float": "numeric(18,2)",
    "string": "text",
    "bool": "boolean",
    "list": "jsonb",
    "dict": "jsonb",
    "object": "jsonb",
    "array": "jsonb",
}
|
||
|
||
|
||
def camel_to_snake(name):
    """Convert camelCase/PascalCase to lowercase snake_case."""
    # First split runs of capitals from a following word, e.g. ABCDef -> ABC_Def.
    broken_runs = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', name)
    # Then insert '_' between a lowercase letter/digit and the next capital.
    separated = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', broken_runs)
    return separated.lower()
|
||
|
||
|
||
def normalize_field_name(name):
    """Normalize a field name: camelCase -> snake_case, dots -> underscores,
    then strip leading/trailing underscores."""
    snake = camel_to_snake(name)
    return snake.replace(".", "_").strip("_")
|
||
|
||
|
||
def parse_api_fields(md_path):
    """Parse the response-field table from an API reference markdown file.

    Returns {original_field_name: json_type} (types lowercased).
    Sub-fields of nested objects (names containing '.', e.g.
    siteProfile.address) are skipped.
    """
    with open(md_path, "r", encoding="utf-8") as doc:
        content = doc.read()

    # Table row format: | # | field name | type | example |
    row_pattern = re.compile(r"\|\s*\d+\s*\|\s*`([^`]+)`\s*\|\s*(\w+)\s*\|")

    fields = {}
    for match in row_pattern.finditer(content):
        name = match.group(1).strip()
        # Skip nested child fields (e.g. siteProfile.address).
        if "." in name:
            continue
        fields[name] = match.group(2).strip().lower()
    return fields
|
||
|
||
|
||
def get_ods_columns(cursor, table_name):
    """Return {column_name: data_type} for a table in the billiards_ods schema.

    Columns come back in ordinal position order; an empty dict means the
    table does not exist in the schema.
    """
    cursor.execute(
        """
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'billiards_ods' AND table_name = %s
        ORDER BY ordinal_position
        """,
        (table_name,),
    )
    return {col: dtype for col, dtype in cursor.fetchall()}
|
||
|
||
|
||
def suggest_pg_type(json_type):
    """Recommend a PostgreSQL column type for a JSON type (default: text)."""
    if json_type in TYPE_MAP:
        return TYPE_MAP[json_type]
    return "text"
|
||
|
||
|
||
def compare_table(api_fields, ods_columns, table_name):
    """Compare one table's API fields against its ODS columns using
    normalized-name matching (three passes: exact, snake_case-normalized,
    underscore-stripped lowercase fallback).

    Note: ``table_name`` is currently unused by the comparison itself.

    Returns a 4-tuple (truly_missing, extra_in_ods, matched_pairs, case_matched):
    - truly_missing: fields present in the API but genuinely absent from
      ODS, as {api_name: json_type}
    - extra_in_ods: columns present in ODS but absent from the API, as
      {col_name: pg_type}
    - matched_pairs: exactly matched fields, as [(api_name, ods_name)]
    - case_matched: fields matched only after normalization, as
      [(api_name, ods_name)]
    """
    # Exclude ODS metadata columns (ETL-added; not part of the API payload).
    ods_biz = {k: v for k, v in ods_columns.items() if k not in ODS_META_COLUMNS}

    # Build normalized indexes.
    # NOTE: if two API fields normalize to the same key, the later one wins
    # here — earlier duplicates are silently dropped from the comparison.
    # api: normalized -> (original_name, type)
    api_norm = {}
    for name, typ in api_fields.items():
        norm = normalize_field_name(name)
        api_norm[norm] = (name, typ)

    # ods: normalized -> (original_name, type)
    ods_norm = {}
    for name, typ in ods_biz.items():
        norm = name.lower()  # ODS column names are already lowercase
        ods_norm[norm] = (name, typ)

    matched_pairs = []
    case_matched = []
    # api_matched_norms holds NORMALIZED API names; ods_matched_norms,
    # despite its name, holds ORIGINAL ODS column names.
    api_matched_norms = set()
    ods_matched_norms = set()

    # Pass 1: exact match (API field name == ODS column name).
    for api_name, api_type in api_fields.items():
        if api_name in ods_biz:
            matched_pairs.append((api_name, api_name))
            api_matched_norms.add(normalize_field_name(api_name))
            ods_matched_norms.add(api_name)

    # Pass 2: normalized match (camelCase -> snake_case).
    for norm_name, (api_name, api_type) in api_norm.items():
        if norm_name in api_matched_norms:
            continue
        if norm_name in ods_norm:
            ods_name = ods_norm[norm_name][0]
            if ods_name not in ods_matched_norms:
                case_matched.append((api_name, ods_name))
                api_matched_norms.add(norm_name)
                ods_matched_norms.add(ods_name)

    # Pass 3: fallback — compare with all underscores removed (pure
    # lowercase), to catch e.g. "site_id" vs "siteid".
    for norm_name, (api_name, api_type) in api_norm.items():
        if norm_name in api_matched_norms:
            continue
        flat = norm_name.replace("_", "")
        for ods_col, (ods_name, ods_type) in ods_norm.items():
            if ods_name in ods_matched_norms:
                continue
            if ods_col.replace("_", "") == flat:
                case_matched.append((api_name, ods_name))
                api_matched_norms.add(norm_name)
                ods_matched_norms.add(ods_name)
                break

    # Compute what is genuinely missing from ODS and what is extra in ODS.
    truly_missing = {}
    for norm_name, (api_name, api_type) in api_norm.items():
        if norm_name not in api_matched_norms:
            truly_missing[api_name] = api_type

    extra_in_ods = {}
    for ods_name, ods_type in ods_biz.items():
        if ods_name not in ods_matched_norms:
            extra_in_ods[ods_name] = ods_type

    return truly_missing, extra_in_ods, matched_pairs, case_matched
|
||
|
||
|
||
def generate_alter_sql(table_name, missing_fields):
    """Build ALTER TABLE ... ADD COLUMN statements for missing fields.

    Column names are snake_cased; each statement carries a trailing comment
    with the original API field name. Statements are ordered by field name.
    """
    statements = []
    for api_field, json_type in sorted(missing_fields.items()):
        column = normalize_field_name(api_field)
        pg_type = suggest_pg_type(json_type)
        statement = (
            f"ALTER TABLE billiards_ods.{table_name} ADD COLUMN IF NOT EXISTS "
            f"{column} {pg_type}; -- API 字段: {api_field}"
        )
        statements.append(statement)
    return statements
|
||
|
||
|
||
def main():
    """Compare every registered API endpoint against its ODS table.

    Reads the API registry, parses each endpoint's field table from its
    markdown doc, queries the live ODS schema, then writes:
    - a JSON report (docs/reports/api_ods_comparison.json)
    - a Markdown report (docs/reports/api_ods_comparison.md)
    - an ALTER-SQL migration (database/migrations/...)
    """
    # Load the API registry.
    with open(REGISTRY_FILE, "r", encoding="utf-8") as f:
        registry = json.load(f)

    # Build id -> ods_table (and id -> display name) mappings,
    # skipping entries without an ODS table or explicitly marked "skip".
    api_to_ods = {}
    api_names = {}
    for entry in registry:
        if entry.get("ods_table") and not entry.get("skip"):
            api_to_ods[entry["id"]] = entry["ods_table"]
            api_names[entry["id"]] = entry.get("name_zh", entry["id"])

    # NOTE(review): PG_DSN is None when the env var is unset; psycopg2 will
    # then raise here — consider failing fast with a clearer error message.
    conn = psycopg2.connect(PG_DSN)
    cursor = conn.cursor()

    results = []
    all_alter_sqls = []

    for api_id, ods_table in sorted(api_to_ods.items()):
        md_path = os.path.join(ENDPOINTS_DIR, f"{api_id}.md")
        # No markdown doc for this endpoint: record NO_DOC and move on.
        if not os.path.exists(md_path):
            results.append({
                "api_id": api_id, "name_zh": api_names.get(api_id, ""),
                "ods_table": ods_table, "status": "NO_DOC",
                "api_fields": 0, "ods_cols": 0,
            })
            continue

        api_fields = parse_api_fields(md_path)
        ods_columns = get_ods_columns(cursor, ods_table)

        # Table missing from the billiards_ods schema: record NO_TABLE.
        if not ods_columns:
            results.append({
                "api_id": api_id, "name_zh": api_names.get(api_id, ""),
                "ods_table": ods_table, "status": "NO_TABLE",
                "api_fields": len(api_fields), "ods_cols": 0,
            })
            continue

        missing, extra, matched, case_matched = compare_table(
            api_fields, ods_columns, ods_table
        )
        alter_sqls = generate_alter_sql(ods_table, missing)
        all_alter_sqls.extend(alter_sqls)

        # Business column count = all columns minus ETL metadata columns.
        ods_biz_count = len({k: v for k, v in ods_columns.items()
                             if k not in ODS_META_COLUMNS})

        # Extra ODS columns alone do not count as drift; only missing fields do.
        status = "OK" if not missing else "DRIFT"
        results.append({
            "api_id": api_id,
            "name_zh": api_names.get(api_id, ""),
            "ods_table": ods_table,
            "status": status,
            "api_fields": len(api_fields),
            "ods_cols": ods_biz_count,
            "exact_match": len(matched),
            "case_match": len(case_matched),
            "total_match": len(matched) + len(case_matched),
            "missing_in_ods": missing,
            "extra_in_ods": extra,
            "case_matched_pairs": case_matched,
        })

    cursor.close()
    conn.close()

    # ── JSON report ──
    report_json = os.path.join("docs", "reports", "api_ods_comparison.json")
    os.makedirs(os.path.dirname(report_json), exist_ok=True)
    # Convert tuples to lists (and copy dicts) for JSON serialization.
    json_results = []
    for r in results:
        jr = dict(r)
        if "case_matched_pairs" in jr:
            jr["case_matched_pairs"] = [list(p) for p in jr["case_matched_pairs"]]
        if "missing_in_ods" in jr:
            jr["missing_in_ods"] = dict(jr["missing_in_ods"])
        if "extra_in_ods" in jr:
            jr["extra_in_ods"] = dict(jr["extra_in_ods"])
        json_results.append(jr)
    with open(report_json, "w", encoding="utf-8") as f:
        json.dump(json_results, f, ensure_ascii=False, indent=2)

    # ── Markdown report ──
    report_md = os.path.join("docs", "reports", "api_ods_comparison.md")
    with open(report_md, "w", encoding="utf-8") as f:
        f.write("# API JSON 字段 vs ODS 表列 对比报告\n\n")
        f.write("> 自动生成于 2026-02-13 | 数据来源:数据库实际表结构 + API 参考文档\n")
        f.write("> 比对逻辑:camelCase → snake_case 归一化匹配 + 去下划线纯小写兜底\n\n")

        # Summary counters.
        ok_count = sum(1 for r in results if r["status"] == "OK")
        drift_count = sum(1 for r in results if r["status"] == "DRIFT")
        total_missing = sum(len(r.get("missing_in_ods", {})) for r in results)
        total_extra = sum(len(r.get("extra_in_ods", {})) for r in results)

        f.write("## 汇总\n\n")
        f.write("| 指标 | 值 |\n|------|----|")
        f.write(f"\n| 比对表数 | {len(results)} |")
        f.write(f"\n| 完全一致(含大小写归一化) | {ok_count} |")
        f.write(f"\n| 存在差异 | {drift_count} |")
        f.write(f"\n| ODS 缺失字段总数 | {total_missing} |")
        f.write(f"\n| ODS 多余列总数 | {total_extra} |")
        f.write(f"\n| 生成 ALTER SQL 数 | {len(all_alter_sqls)} |\n\n")

        # Per-table overview.
        f.write("## 逐表对比总览\n\n")
        f.write("| # | API ID | 中文名 | ODS 表 | 状态 | API字段 | ODS列 | 精确匹配 | 大小写匹配 | ODS缺失 | ODS多余 |\n")
        f.write("|---|--------|--------|--------|------|---------|-------|----------|-----------|---------|--------|\n")
        for i, r in enumerate(results, 1):
            missing_count = len(r.get("missing_in_ods", {}))
            extra_count = len(r.get("extra_in_ods", {}))
            exact = r.get("exact_match", 0)
            case = r.get("case_match", 0)
            # OK -> check mark, DRIFT -> warning, NO_DOC / NO_TABLE -> cross.
            icon = "✅" if r["status"] == "OK" else "⚠️" if r["status"] == "DRIFT" else "❌"
            f.write(f"| {i} | {r['api_id']} | {r.get('name_zh','')} | {r['ods_table']} | "
                    f"{icon} | {r['api_fields']} | {r['ods_cols']} | {exact} | {case} | "
                    f"{missing_count} | {extra_count} |\n")

        # Drift details: one section per drifted table.
        has_drift = any(r["status"] == "DRIFT" for r in results)
        if has_drift:
            f.write("\n## 差异详情\n\n")
            for r in results:
                if r["status"] != "DRIFT":
                    continue
                f.write(f"### {r.get('name_zh','')}(`{r['ods_table']}`)\n\n")

                missing = r.get("missing_in_ods", {})
                extra = r.get("extra_in_ods", {})
                case_pairs = r.get("case_matched_pairs", [])

                if case_pairs:
                    f.write("**大小写归一化匹配(已自动对齐,无需操作):**\n\n")
                    f.write("| API 字段名 (camelCase) | ODS 列名 (lowercase) |\n")
                    f.write("|----------------------|---------------------|\n")
                    for api_n, ods_n in sorted(case_pairs):
                        f.write(f"| `{api_n}` | `{ods_n}` |\n")
                    f.write("\n")

                if missing:
                    f.write("**ODS 真正缺失的字段(需要 ADD COLUMN):**\n\n")
                    f.write("| 字段名 | JSON 类型 | 建议 PG 列名 | 建议 PG 类型 |\n")
                    f.write("|--------|-----------|-------------|-------------|\n")
                    for fname, ftype in sorted(missing.items()):
                        f.write(f"| `{fname}` | {ftype} | `{normalize_field_name(fname)}` | {suggest_pg_type(ftype)} |\n")
                    f.write("\n")

                if extra:
                    f.write("**ODS 多余的列(API 中不存在):**\n\n")
                    f.write("| 列名 | PG 类型 | 可能原因 |\n")
                    f.write("|------|---------|--------|\n")
                    for cname, ctype in sorted(extra.items()):
                        f.write(f"| `{cname}` | {ctype} | ETL 自行添加 / 历史遗留 / API 新版已移除 |\n")
                    f.write("\n")

    # ── ALTER SQL migration ──
    sql_path = os.path.join("database", "migrations", "20260213_align_ods_with_api.sql")
    os.makedirs(os.path.dirname(sql_path), exist_ok=True)
    with open(sql_path, "w", encoding="utf-8") as f:
        f.write("-- ============================================================\n")
        f.write("-- ODS 表与 API JSON 字段对齐迁移\n")
        f.write("-- 自动生成于 2026-02-13\n")
        f.write("-- 基于: docs/api-reference/ 文档 vs billiards_ods 实际表结构\n")
        f.write("-- 比对逻辑: camelCase → snake_case 归一化后再比较\n")
        f.write("-- ============================================================\n\n")
        if all_alter_sqls:
            f.write("BEGIN;\n\n")
            current_table = ""
            for sql in all_alter_sqls:
                # Extract the table name to group statements under a comment.
                tbl = sql.split("billiards_ods.")[1].split(" ")[0]
                if tbl != current_table:
                    if current_table:
                        f.write("\n")
                    f.write(f"-- ── {tbl} ──\n")
                    current_table = tbl
                f.write(sql + "\n")
            f.write("\nCOMMIT;\n")
        else:
            f.write("-- 无需变更,所有 ODS 表已与 API JSON 字段对齐。\n")

    # Console summary.
    print(f"[完成] 比对 {len(results)} 张表")
    print(f"  - 完全一致: {ok_count}")
    print(f"  - 存在差异: {drift_count}")
    print(f"  - ODS 缺失字段: {total_missing}")
    print(f"  - ODS 多余列: {total_extra}")
    print(f"  - ALTER SQL: {len(all_alter_sqls)} 条")
    print(f"  - 报告: {report_md}")
    print(f"  - JSON: {report_json}")
    print(f"  - SQL: {sql_path}")
|
||
|
||
|
||
# Script entry point.
if __name__ == "__main__":
    main()
|
||
|
||
# AI_CHANGELOG:
# - 日期: 2026-02-13
# - Prompt: P20260213-210000 — "用新梳理的API返回的JSON文档比对数据库ODS层"
# - 直接原因: 用户要求比对 API 参考文档与 ODS 实际表结构,生成对比报告和 ALTER SQL
# - 变更摘要: 新建比对脚本,支持 camelCase→snake_case 归一化匹配,输出 MD/JSON 报告和迁移 SQL
# - 风险与验证: 纯分析脚本,不修改数据库;验证:python scripts/compare_api_ods.py 检查输出