# -*- coding: utf-8 -*-
"""Compare API reference-doc JSON fields against ODS database table columns.

Produces a comparison report (Markdown + JSON) and an ALTER SQL migration.
Field matching supports camelCase -> snake_case normalization, with a
flattened (underscore-stripped) lowercase match as a fallback.

Usage:
    python scripts/compare_api_ods.py

Requires: psycopg2, python-dotenv
"""
import json
import os
import re
import sys

# Make the project root importable when run as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
import psycopg2

load_dotenv()

# NOTE(review): PG_DSN is None when the env var is absent — psycopg2 then
# falls back to libpq defaults; confirm that is the intended behavior.
PG_DSN = os.getenv("PG_DSN")
ENDPOINTS_DIR = os.path.join("docs", "api-reference", "endpoints")
REGISTRY_FILE = os.path.join("docs", "api-reference", "api_registry.json")

# ODS metadata columns (added automatically by the ETL framework;
# they are not API fields and are excluded from the comparison).
ODS_META_COLUMNS = {
    "source_file", "source_endpoint", "fetched_at", "payload", "content_hash"
}

# JSON type -> recommended PostgreSQL type mapping.
TYPE_MAP = {
    "int": "bigint",
    "float": "numeric(18,2)",
    "string": "text",
    "bool": "boolean",
    "list": "jsonb",
    "dict": "jsonb",
    "object": "jsonb",
    "array": "jsonb",
}


def camel_to_snake(name):
    """Convert camelCase/PascalCase to lowercase snake_case."""
    # Handle runs of capitals, e.g. ABCDef -> abc_def.
    s1 = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', name)
    s2 = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', s1)
    return s2.lower()


def normalize_field_name(name):
    """Normalize a field name: camelCase -> snake_case, dots -> underscores,
    all lowercase, with leading/trailing underscores stripped."""
    return camel_to_snake(name).replace(".", "_").strip("_")


def parse_api_fields(md_path):
    """Parse the response-field table out of an API doc (Markdown).

    Returns {original_field_name: json_type}. Sub-fields of nested objects
    (e.g. ``siteProfile.address``) are skipped.
    """
    fields = {}
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()
    # Table row format: | # | `field` | type | example |
    pattern = r"\|\s*\d+\s*\|\s*`([^`]+)`\s*\|\s*(\w+)\s*\|"
    for m in re.finditer(pattern, content):
        field_name = m.group(1).strip()
        field_type = m.group(2).strip().lower()
        # Skip nested sub-fields (e.g. siteProfile.address).
        if "." in field_name:
            continue
        fields[field_name] = field_type
    return fields


def get_ods_columns(cursor, table_name):
    """Query the column list of an ODS table.

    Returns {column_name: data_type}, empty if the table does not exist.
    """
    cursor.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = 'billiards_ods' AND table_name = %s
        ORDER BY ordinal_position
    """, (table_name,))
    cols = {}
    for row in cursor.fetchall():
        cols[row[0]] = row[1]
    return cols


def suggest_pg_type(json_type):
    """Map a JSON type to a recommended PG type (defaults to text)."""
    return TYPE_MAP.get(json_type, "text")


def compare_table(api_fields, ods_columns, table_name):
    """Compare one table using normalized name matching.

    ``table_name`` is kept for interface compatibility but unused here.

    Returns (truly_missing, extra_in_ods, matched_pairs, case_matched):
    - truly_missing: fields present in the API but absent from ODS
      {api_name: json_type}
    - extra_in_ods: columns present in ODS but absent from the API
      {col_name: pg_type}
    - matched_pairs: exact-name matches [(api_name, ods_name)]
    - case_matched: matches found via normalization [(api_name, ods_name)]
    """
    # Exclude ODS metadata columns from the business comparison.
    ods_biz = {k: v for k, v in ods_columns.items() if k not in ODS_META_COLUMNS}

    # Normalized indexes: normalized name -> (original_name, type).
    api_norm = {}
    for name, typ in api_fields.items():
        api_norm[normalize_field_name(name)] = (name, typ)
    ods_norm = {}
    for name, typ in ods_biz.items():
        # ODS column names are already lowercase snake_case.
        ods_norm[name.lower()] = (name, typ)

    matched_pairs = []
    case_matched = []
    api_matched_norms = set()   # normalized API names already matched
    ods_matched_norms = set()   # original ODS column names already matched

    # Round 1: exact match (API field name == ODS column name).
    for api_name in api_fields:
        if api_name in ods_biz:
            matched_pairs.append((api_name, api_name))
            api_matched_norms.add(normalize_field_name(api_name))
            ods_matched_norms.add(api_name)

    # Round 2: normalized match (camelCase -> snake_case).
    for norm_name, (api_name, api_type) in api_norm.items():
        if norm_name in api_matched_norms:
            continue
        if norm_name in ods_norm:
            ods_name = ods_norm[norm_name][0]
            if ods_name not in ods_matched_norms:
                case_matched.append((api_name, ods_name))
                api_matched_norms.add(norm_name)
                ods_matched_norms.add(ods_name)

    # Round 3: fallback — compare with all underscores stripped.
    for norm_name, (api_name, api_type) in api_norm.items():
        if norm_name in api_matched_norms:
            continue
        flat = norm_name.replace("_", "")
        for ods_col, (ods_name, ods_type) in ods_norm.items():
            if ods_name in ods_matched_norms:
                continue
            if ods_col.replace("_", "") == flat:
                case_matched.append((api_name, ods_name))
                api_matched_norms.add(norm_name)
                ods_matched_norms.add(ods_name)
                break

    # Whatever is still unmatched is truly missing / extra.
    truly_missing = {}
    for norm_name, (api_name, api_type) in api_norm.items():
        if norm_name not in api_matched_norms:
            truly_missing[api_name] = api_type
    extra_in_ods = {}
    for ods_name, ods_type in ods_biz.items():
        if ods_name not in ods_matched_norms:
            extra_in_ods[ods_name] = ods_type

    return truly_missing, extra_in_ods, matched_pairs, case_matched


def generate_alter_sql(table_name, missing_fields):
    """Generate ALTER TABLE ADD COLUMN statements (snake_case column names)."""
    sqls = []
    for field_name, json_type in sorted(missing_fields.items()):
        pg_type = suggest_pg_type(json_type)
        col_name = normalize_field_name(field_name)
        sqls.append(
            f"ALTER TABLE billiards_ods.{table_name} ADD COLUMN IF NOT EXISTS "
            f"{col_name} {pg_type}; -- API 字段: {field_name}"
        )
    return sqls


def _write_json_report(results):
    """Serialize raw comparison results to JSON; returns the report path."""
    report_json = os.path.join("docs", "reports", "api_ods_comparison.json")
    os.makedirs(os.path.dirname(report_json), exist_ok=True)
    # Convert tuples to lists so the output is plain JSON.
    json_results = []
    for r in results:
        jr = dict(r)
        if "case_matched_pairs" in jr:
            jr["case_matched_pairs"] = [list(p) for p in jr["case_matched_pairs"]]
        if "missing_in_ods" in jr:
            jr["missing_in_ods"] = dict(jr["missing_in_ods"])
        if "extra_in_ods" in jr:
            jr["extra_in_ods"] = dict(jr["extra_in_ods"])
        json_results.append(jr)
    with open(report_json, "w", encoding="utf-8") as f:
        json.dump(json_results, f, ensure_ascii=False, indent=2)
    return report_json


def _write_md_report(results, all_alter_sqls,
                     ok_count, drift_count, total_missing, total_extra):
    """Write the human-readable Markdown report; returns the report path."""
    report_md = os.path.join("docs", "reports", "api_ods_comparison.md")
    os.makedirs(os.path.dirname(report_md), exist_ok=True)
    with open(report_md, "w", encoding="utf-8") as f:
        f.write("# API JSON 字段 vs ODS 表列 对比报告\n\n")
        f.write("> 自动生成于 2026-02-13 | 数据来源:数据库实际表结构 + API 参考文档\n")
        f.write("> 比对逻辑:camelCase → snake_case 归一化匹配 + 去下划线纯小写兜底\n\n")

        # Summary section.
        f.write("## 汇总\n\n")
        f.write("| 指标 | 值 |\n|------|----|")
        f.write(f"\n| 比对表数 | {len(results)} |")
        f.write(f"\n| 完全一致(含大小写归一化) | {ok_count} |")
        f.write(f"\n| 存在差异 | {drift_count} |")
        f.write(f"\n| ODS 缺失字段总数 | {total_missing} |")
        f.write(f"\n| ODS 多余列总数 | {total_extra} |")
        f.write(f"\n| 生成 ALTER SQL 数 | {len(all_alter_sqls)} |\n\n")

        # Per-table overview.
        f.write("## 逐表对比总览\n\n")
        f.write("| # | API ID | 中文名 | ODS 表 | 状态 | API字段 | ODS列 | 精确匹配 | 大小写匹配 | ODS缺失 | ODS多余 |\n")
        f.write("|---|--------|--------|--------|------|---------|-------|----------|-----------|---------|--------|\n")
        for i, r in enumerate(results, 1):
            missing_count = len(r.get("missing_in_ods", {}))
            extra_count = len(r.get("extra_in_ods", {}))
            exact = r.get("exact_match", 0)
            case = r.get("case_match", 0)
            # OK -> check, DRIFT -> warning, NO_DOC / NO_TABLE -> cross.
            icon = "✅" if r["status"] == "OK" else "⚠️" if r["status"] == "DRIFT" else "❌"
            f.write(f"| {i} | {r['api_id']} | {r.get('name_zh','')} | {r['ods_table']} | "
                    f"{icon} | {r['api_fields']} | {r['ods_cols']} | {exact} | {case} | "
                    f"{missing_count} | {extra_count} |\n")

        # Drift details, only when at least one table drifted.
        has_drift = any(r["status"] == "DRIFT" for r in results)
        if has_drift:
            f.write("\n## 差异详情\n\n")
            for r in results:
                if r["status"] != "DRIFT":
                    continue
                f.write(f"### {r.get('name_zh','')}(`{r['ods_table']}`)\n\n")
                missing = r.get("missing_in_ods", {})
                extra = r.get("extra_in_ods", {})
                case_pairs = r.get("case_matched_pairs", [])
                if case_pairs:
                    f.write("**大小写归一化匹配(已自动对齐,无需操作):**\n\n")
                    f.write("| API 字段名 (camelCase) | ODS 列名 (lowercase) |\n")
                    f.write("|----------------------|---------------------|\n")
                    for api_n, ods_n in sorted(case_pairs):
                        f.write(f"| `{api_n}` | `{ods_n}` |\n")
                    f.write("\n")
                if missing:
                    f.write("**ODS 真正缺失的字段(需要 ADD COLUMN):**\n\n")
                    f.write("| 字段名 | JSON 类型 | 建议 PG 列名 | 建议 PG 类型 |\n")
                    f.write("|--------|-----------|-------------|-------------|\n")
                    for fname, ftype in sorted(missing.items()):
                        f.write(f"| `{fname}` | {ftype} | `{normalize_field_name(fname)}` | {suggest_pg_type(ftype)} |\n")
                    f.write("\n")
                if extra:
                    f.write("**ODS 多余的列(API 中不存在):**\n\n")
                    f.write("| 列名 | PG 类型 | 可能原因 |\n")
                    f.write("|------|---------|--------|\n")
                    for cname, ctype in sorted(extra.items()):
                        f.write(f"| `{cname}` | {ctype} | ETL 自行添加 / 历史遗留 / API 新版已移除 |\n")
                    f.write("\n")
    return report_md


def _write_alter_sql(all_alter_sqls):
    """Write the ALTER SQL migration file; returns its path."""
    sql_path = os.path.join("database", "migrations", "20260213_align_ods_with_api.sql")
    os.makedirs(os.path.dirname(sql_path), exist_ok=True)
    with open(sql_path, "w", encoding="utf-8") as f:
        f.write("-- ============================================================\n")
        f.write("-- ODS 表与 API JSON 字段对齐迁移\n")
        f.write("-- 自动生成于 2026-02-13\n")
        f.write("-- 基于: docs/api-reference/ 文档 vs billiards_ods 实际表结构\n")
        f.write("-- 比对逻辑: camelCase → snake_case 归一化后再比较\n")
        f.write("-- ============================================================\n\n")
        if all_alter_sqls:
            f.write("BEGIN;\n\n")
            current_table = ""
            for sql in all_alter_sqls:
                # Extract the table name to group statements with a comment.
                tbl = sql.split("billiards_ods.")[1].split(" ")[0]
                if tbl != current_table:
                    if current_table:
                        f.write("\n")
                    f.write(f"-- ── {tbl} ──\n")
                    current_table = tbl
                f.write(sql + "\n")
            f.write("\nCOMMIT;\n")
        else:
            f.write("-- 无需变更,所有 ODS 表已与 API JSON 字段对齐。\n")
    return sql_path


def main():
    """Compare every registered API doc with its ODS table and emit reports."""
    # Load the API registry.
    with open(REGISTRY_FILE, "r", encoding="utf-8") as f:
        registry = json.load(f)

    # Build id -> ods_table mapping, skipping unmapped or skipped entries.
    api_to_ods = {}
    api_names = {}
    for entry in registry:
        if entry.get("ods_table") and not entry.get("skip"):
            api_to_ods[entry["id"]] = entry["ods_table"]
            api_names[entry["id"]] = entry.get("name_zh", entry["id"])

    results = []
    all_alter_sqls = []
    conn = psycopg2.connect(PG_DSN)
    try:
        cursor = conn.cursor()
        try:
            for api_id, ods_table in sorted(api_to_ods.items()):
                md_path = os.path.join(ENDPOINTS_DIR, f"{api_id}.md")
                if not os.path.exists(md_path):
                    results.append({
                        "api_id": api_id,
                        "name_zh": api_names.get(api_id, ""),
                        "ods_table": ods_table,
                        "status": "NO_DOC",
                        "api_fields": 0,
                        "ods_cols": 0,
                    })
                    continue
                api_fields = parse_api_fields(md_path)
                ods_columns = get_ods_columns(cursor, ods_table)
                if not ods_columns:
                    results.append({
                        "api_id": api_id,
                        "name_zh": api_names.get(api_id, ""),
                        "ods_table": ods_table,
                        "status": "NO_TABLE",
                        "api_fields": len(api_fields),
                        "ods_cols": 0,
                    })
                    continue
                missing, extra, matched, case_matched = compare_table(
                    api_fields, ods_columns, ods_table
                )
                all_alter_sqls.extend(generate_alter_sql(ods_table, missing))
                # Count business columns only (metadata columns excluded).
                ods_biz_count = sum(
                    1 for k in ods_columns if k not in ODS_META_COLUMNS
                )
                status = "OK" if not missing else "DRIFT"
                results.append({
                    "api_id": api_id,
                    "name_zh": api_names.get(api_id, ""),
                    "ods_table": ods_table,
                    "status": status,
                    "api_fields": len(api_fields),
                    "ods_cols": ods_biz_count,
                    "exact_match": len(matched),
                    "case_match": len(case_matched),
                    "total_match": len(matched) + len(case_matched),
                    "missing_in_ods": missing,
                    "extra_in_ods": extra,
                    "case_matched_pairs": case_matched,
                })
        finally:
            cursor.close()
    finally:
        conn.close()

    # Summary counters shared by the Markdown report and the console output.
    ok_count = sum(1 for r in results if r["status"] == "OK")
    drift_count = sum(1 for r in results if r["status"] == "DRIFT")
    total_missing = sum(len(r.get("missing_in_ods", {})) for r in results)
    total_extra = sum(len(r.get("extra_in_ods", {})) for r in results)

    report_json = _write_json_report(results)
    report_md = _write_md_report(results, all_alter_sqls,
                                 ok_count, drift_count, total_missing, total_extra)
    sql_path = _write_alter_sql(all_alter_sqls)

    print(f"[完成] 比对 {len(results)} 张表")
    print(f"  - 完全一致: {ok_count}")
    print(f"  - 存在差异: {drift_count}")
    print(f"  - ODS 缺失字段: {total_missing}")
    print(f"  - ODS 多余列: {total_extra}")
    print(f"  - ALTER SQL: {len(all_alter_sqls)} 条")
    print(f"  - 报告: {report_md}")
    print(f"  - JSON: {report_json}")
    print(f"  - SQL: {sql_path}")


if __name__ == "__main__":
    main()

# AI_CHANGELOG:
# - Date: 2026-02-13
# - Prompt: P20260213-210000 — "用新梳理的API返回的JSON文档比对数据库ODS层"
# - Trigger: user asked to compare the API reference docs against the actual
#   ODS table structures and produce a comparison report plus ALTER SQL.
# - Summary: new comparison script with camelCase -> snake_case normalized
#   matching; outputs MD/JSON reports and a migration SQL file.
# - Risk & verification: analysis-only script, does not modify the database;
#   verify with: python scripts/compare_api_ods.py and inspect the outputs.