# -*- coding: utf-8 -*-
"""
Compare the API reference docs against the actual ODS table structure (v2).

Field names are extracted from the JSON samples in docs/api-reference/*.md,
the actual columns are read from the PostgreSQL schema ``billiards_ods``, and
a diff report is written as JSON and Markdown together with ALTER SQL.

Usage:
    python scripts/compare_api_ods_v2.py
"""
import json
import os
import re
import sys
from datetime import datetime

ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, ROOT)

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional: without it PG_DSN must already be present in
    # the process environment (main() reports when it is not).
    load_dotenv = None

if load_dotenv is not None:
    load_dotenv(os.path.join(ROOT, ".env"))

try:
    import psycopg2
except ImportError:
    # Defer the failure to main() so the pure comparison helpers stay
    # importable without a database driver.
    psycopg2 = None

# ODS meta columns (managed by the ETL, not coming from the API).
ODS_META_COLS = {
    "source_file",
    "source_endpoint",
    "fetched_at",
    "payload",
    "content_hash",
}

# Substrings that make guess_pg_type() pick numeric(18,2).
_MONEY_KEYWORDS = (
    "amount", "money", "price", "cost", "fee", "discount", "deduct",
    "balance", "charge", "sale", "refund", "promotion", "adjust",
    "rounding", "prepay", "income", "royalty", "grade", "point",
    "stock", "num",
)

# Substrings that make guess_pg_type() pick integer.
_INT_KEYWORDS = (
    "status", "type", "sort", "count", "seconds", "level", "channel",
    "method", "way", "enabled", "switch", "delete", "first", "single",
    "trash", "confirm", "clock", "cycle", "delay", "free", "virtual",
    "online", "show", "audit", "freeze", "send", "required", "scene",
    "range", "tag", "on", "minutes", "number", "duration",
)


def load_registry():
    """Load and return the API registry (docs/api-reference/api_registry.json)."""
    path = os.path.join(ROOT, "docs", "api-reference", "api_registry.json")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def extract_fields_from_md(md_path, api_id):
    """
    Extract lowercased field names from the JSON sample of a markdown doc.

    The largest ```json fenced block that parses to a JSON object is taken
    as the response sample. Three shapes are handled:

    * both ``siteProfile`` and ``settleList`` present: only the keys of
      ``settleList`` are collected (when it is an object);
    * ``goodsCategoryList`` is a list: the keys of its first element are
      collected (when that element is an object);
    * anything else: the top-level keys of the sample.

    Args:
        md_path: Path of the markdown file to read (UTF-8).
        api_id: API identifier; accepted for interface compatibility and not
            used by the extraction itself.

    Returns:
        ``(fields, has_nested, None)`` on success, where ``fields`` is a set
        of lowercased names and ``has_nested`` tells whether one of the two
        wrapper shapes was detected; ``(None, None, message)`` when no usable
        JSON sample is found.
    """
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Collect every ```json ... ``` fenced block.
    json_blocks = re.findall(r'```json\s*\n(.*?)\n```', content, re.DOTALL)
    if not json_blocks:
        return None, None, "无 JSON 样例"

    # Pick the largest JSON object (the response sample is usually the largest).
    sample_json = None
    for block in json_blocks:
        try:
            parsed = json.loads(block)
        except json.JSONDecodeError:
            continue
        if not isinstance(parsed, dict):
            continue
        if sample_json is None or len(str(parsed)) > len(str(sample_json)):
            sample_json = parsed

    if sample_json is None:
        return None, None, "无法解析 JSON 样例"

    fields = set()
    has_nested = False

    # settlement_records / recharge_settlements nested structure:
    # { "siteProfile": {...}, "settleList": {...} }
    if "siteProfile" in sample_json and "settleList" in sample_json:
        has_nested = True
        settle = sample_json.get("settleList", {})
        if isinstance(settle, dict):
            fields.update(key.lower() for key in settle)
        return fields, has_nested, None

    # CHANGE: special handling for the stock_goods_category_tree structure.
    # intent: goodsCategoryList is an array wrapper; ODS stores the flattened
    #         category-node fields.
    # assumptions: the outer total/goodsCategoryList keys are not ODS columns.
    category_list = sample_json.get("goodsCategoryList")
    if isinstance(category_list, list):
        has_nested = True
        if category_list and isinstance(category_list[0], dict):
            _extract_flat(category_list[0], fields)
        return fields, has_nested, None

    fields.update(key.lower() for key in sample_json)
    return fields, has_nested, None


def _extract_flat(obj, fields):
    """Add the lowercased top-level keys of ``obj`` to ``fields``.

    Not recursive: keys whose values are arrays or nested objects are kept,
    but their contents are not visited. Non-dict input is ignored.
    """
    if not isinstance(obj, dict):
        return
    fields.update(key.lower() for key in obj)


def get_all_ods_columns(conn):
    """Query the column information of every table in schema billiards_ods.

    Args:
        conn: An open DB-API connection (created with psycopg2 in main()).

    Returns:
        ``{table_name: {column_name: {"data_type": ..., "ordinal_position": ...}}}``
    """
    cur = conn.cursor()
    try:
        cur.execute("""
            SELECT table_name, column_name, data_type, ordinal_position
            FROM information_schema.columns
            WHERE table_schema = 'billiards_ods'
            ORDER BY table_name, ordinal_position
        """)
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    tables = {}
    for table_name, col_name, data_type, pos in rows:
        tables.setdefault(table_name, {})[col_name] = {
            "data_type": data_type,
            "ordinal_position": pos,
        }
    return tables


def guess_pg_type(name):
    """Guess a PostgreSQL type from a field name (for ALTER TABLE ADD COLUMN).

    Purely name-based heuristic, checked in this order: id suffix, money
    keywords, time/date, boolean prefixes, integer keywords; ``text`` is the
    fallback. The result is only a suggestion and needs manual review.
    """
    n = name.lower()
    # endswith("id") also covers the exact name "id" and the "_id" suffix.
    if n.endswith("id"):
        return "bigint"
    if any(kw in n for kw in _MONEY_KEYWORDS):
        return "numeric(18,2)"
    if "time" in n or "date" in n:
        return "timestamp without time zone"
    # The camelCase "isXxx" test must look at the original name: the
    # lowercased copy can never have an uppercase third character.
    is_camel_flag = name.startswith("is") and len(name) > 2 and name[2].isupper()
    if n.startswith("is_") or is_camel_flag:
        return "boolean"
    if n.startswith("able_") or n.startswith("can"):
        return "boolean"
    if any(kw in n for kw in _INT_KEYWORDS):
        return "integer"
    return "text"


def compare_one(api_entry, md_path, ods_tables):
    """Compare a single API with its ODS table.

    Args:
        api_entry: Registry entry; reads ``id``, ``ods_table``, ``name_zh``
            and ``skip``.
        md_path: Path of the API's markdown reference document.
        ods_tables: Mapping as returned by get_all_ods_columns().

    Returns:
        A result dict whose ``status`` is ``skip``, ``error``, ``ok`` or
        ``drift``. ``ok``/``drift`` results also carry the field counts and
        the sorted ``api_only`` / ``ods_only`` / ``matched`` name lists.
    """
    api_id = api_entry["id"]
    ods_table = api_entry.get("ods_table")
    name_zh = api_entry.get("name_zh", "")

    result = {
        "api_id": api_id,
        "name_zh": name_zh,
        "ods_table": ods_table,
    }

    if not ods_table:
        result["status"] = "skip"
        result["reason"] = "无对应 ODS 表(ods_table=null)"
        return result

    if api_entry.get("skip"):
        result["status"] = "skip"
        result["reason"] = "接口标记为 skip(暂不可用)"
        return result

    # Extract the fields of the API JSON sample.
    api_fields, has_nested, err = extract_fields_from_md(md_path, api_id)
    if err:
        result["status"] = "error"
        result["reason"] = err
        return result

    # Fetch the columns of the ODS table.
    if ods_table not in ods_tables:
        result["status"] = "error"
        result["reason"] = f"ODS 表 {ods_table} 不存在"
        return result

    ods_cols = ods_tables[ods_table]
    ods_biz_cols = {c for c in ods_cols if c not in ODS_META_COLS}

    # Compare case-insensitively.
    api_lower = {f.lower() for f in api_fields}
    ods_lower = {c.lower() for c in ods_biz_cols}

    api_only = sorted(api_lower - ods_lower)   # in the API, missing in ODS
    ods_only = sorted(ods_lower - api_lower)   # in ODS (non-meta), not in the API
    matched = sorted(api_lower & ods_lower)    # present on both sides

    # Only API-side extras count as drift; ODS-only columns are just reported.
    result["status"] = "ok" if not api_only else "drift"
    result["has_nested_structure"] = has_nested
    result["api_field_count"] = len(api_lower)
    result["ods_biz_col_count"] = len(ods_biz_cols)
    result["ods_total_col_count"] = len(ods_cols)
    result["matched_count"] = len(matched)
    result["api_only"] = api_only
    result["api_only_count"] = len(api_only)
    result["ods_only"] = ods_only
    result["ods_only_count"] = len(ods_only)
    result["matched"] = matched
    return result


def _quote_ident(identifier):
    """Return ``identifier`` as a double-quoted SQL identifier."""
    return '"' + str(identifier).replace('"', '""') + '"'


def generate_alter_sql(results, ods_tables):
    """Generate ALTER TABLE statements for every API-only field of drifted tables.

    Table and column names are double-quoted so that reserved words or
    unusual characters taken from the documentation cannot produce invalid
    SQL.

    Args:
        results: Result dicts as produced by compare_one().
        ods_tables: Unused; kept for interface compatibility.

    Returns:
        A list of SQL statement strings (possibly empty).
    """
    sqls = []
    for r in results:
        if r.get("status") != "drift" or not r.get("api_only"):
            continue
        table = _quote_ident(r["ods_table"])
        for field in r["api_only"]:
            pg_type = guess_pg_type(field)
            sqls.append(
                f"ALTER TABLE billiards_ods.{table} "
                f"ADD COLUMN IF NOT EXISTS {_quote_ident(field)} {pg_type};"
            )
    return sqls


def generate_markdown_report(results, alter_sqls):
    """Render the comparison results as a Markdown report.

    Sections: summary table with statistics, per-table drift details,
    ODS-only column details, and the generated ALTER SQL (the last three
    only when they have content).

    Args:
        results: Result dicts as produced by compare_one() / main().
        alter_sqls: Statements as produced by generate_alter_sql().

    Returns:
        The report as a single string.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [
        "# API 参考文档 vs ODS 实际表结构 对比报告 (v2)",
        "",
        f"> 生成时间:{now}",
        "> 数据来源:`docs/api-reference/*.md` JSON 样例 vs `billiards_ods` 实际列",
        "",
        "---",
        "",
        "## 一、汇总",
        "",
        "| API 接口 | 中文名 | ODS 表 | 状态 | API 字段数 | ODS 业务列数 | 匹配 | API 独有 | ODS 独有 |",
        "|----------|--------|--------|------|-----------|-------------|------|---------|---------|",
    ]

    total_api_only = 0
    total_ods_only = 0
    ok_count = 0
    drift_count = 0
    skip_count = 0
    error_count = 0

    for r in results:
        status = r.get("status", "?")
        # The key exists with value None for entries without an ODS table, so
        # a .get() default would never apply; fall back explicitly.
        table_cell = r.get("ods_table") or "-"

        if status == "skip":
            skip_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {table_cell} "
                f"| ⏭️ 跳过 | - | - | - | - | - |"
            )
            continue
        if status == "error":
            error_count += 1
            lines.append(
                f"| {r['api_id']} | {r['name_zh']} | {table_cell} "
                f"| ❌ 错误 | - | - | - | - | - |"
            )
            continue

        api_only_n = r.get("api_only_count", 0)
        ods_only_n = r.get("ods_only_count", 0)
        total_api_only += api_only_n
        total_ods_only += ods_only_n

        if status == "ok":
            ok_count += 1
            badge = "✅ 对齐"
        else:
            drift_count += 1
            badge = "⚠️ 漂移"

        lines.append(
            f"| {r['api_id']} | {r['name_zh']} | {r['ods_table']} "
            f"| {badge} | {r['api_field_count']} | {r['ods_biz_col_count']} "
            f"| {r['matched_count']} | {api_only_n} | {ods_only_n} |"
        )

    lines.extend([
        "",
        f"**统计**:对齐 {ok_count} / 漂移 {drift_count} / 跳过 {skip_count} / 错误 {error_count}",
        f"**API 独有字段总计**:{total_api_only}(需要 ALTER TABLE ADD COLUMN)",
        f"**ODS 独有列总计**:{total_ods_only}(API 中不存在,可能是历史遗留或 ETL 派生列)",
        "",
    ])

    # Details: field differences of every drifted table.
    drift_results = [r for r in results if r.get("status") == "drift"]
    if drift_results:
        lines.extend(["---", "", "## 二、漂移详情", ""])
        for r in drift_results:
            lines.extend([
                f"### {r['api_id']}({r['name_zh']})→ `{r['ods_table']}`",
                "",
            ])
            if r["api_only"]:
                lines.append("**API 有 / ODS 缺**:")
                for f in r["api_only"]:
                    pg_type = guess_pg_type(f)
                    lines.append(f"- `{f}` → 建议类型 `{pg_type}`")
                lines.append("")
            if r["ods_only"]:
                lines.append("**ODS 有 / API 无**(非元列):")
                for f in r["ods_only"]:
                    lines.append(f"- `{f}`")
                lines.append("")

    # ODS-only column details (all compared tables).
    ods_only_results = [
        r for r in results
        if r.get("ods_only") and r.get("status") in ("ok", "drift")
    ]
    if ods_only_results:
        lines.extend(["---", "", "## 三、ODS 独有列详情(API 中不存在)", ""])
        for r in ods_only_results:
            if not r["ods_only"]:
                continue
            lines.extend([
                f"### `{r['ods_table']}`({r['name_zh']})",
                "",
                "| 列名 | 说明 |",
                "|------|------|",
            ])
            for f in r["ods_only"]:
                lines.append(f"| `{f}` | ODS 独有,API JSON 样例中不存在 |")
            lines.append("")

    # ALTER SQL
    if alter_sqls:
        lines.extend([
            "---",
            "",
            "## 四、ALTER SQL(对齐 ODS 表结构)",
            "",
            "```sql",
            "-- 自动生成的 ALTER TABLE 语句",
            f"-- 生成时间:{now}",
            "-- 注意:类型为根据字段名猜测,请人工复核后执行",
            "",
        ])
        lines.extend(alter_sqls)
        lines.extend(["", "```", ""])

    return "\n".join(lines)


def main():
    """Run the comparison and write the JSON, Markdown and SQL outputs.

    Reads the connection string from the PG_DSN environment variable and
    exits with status 1 when it is missing or psycopg2 is unavailable.
    """
    dsn = os.environ.get("PG_DSN")
    if not dsn:
        print("错误:未设置 PG_DSN 环境变量", file=sys.stderr)
        if load_dotenv is None:
            print("提示:未安装 python-dotenv,未加载 .env 文件", file=sys.stderr)
        sys.exit(1)
    if psycopg2 is None:
        print("错误:未安装 psycopg2,无法连接数据库", file=sys.stderr)
        sys.exit(1)

    print("连接数据库...")
    conn = psycopg2.connect(dsn)
    try:
        print("查询 ODS 表结构...")
        ods_tables = get_all_ods_columns(conn)
    finally:
        # Only the schema query needs the connection; always release it.
        conn.close()
    print(f" 共 {len(ods_tables)} 张 ODS 表")

    print("加载 API 注册表...")
    registry = load_registry()
    print(f" 共 {len(registry)} 个 API 端点")

    status_icons = {"ok": "✅", "drift": "⚠️", "skip": "⏭️", "error": "❌"}
    results = []
    for entry in registry:
        api_id = entry["id"]
        ods_table = entry.get("ods_table")
        md_path = os.path.join(ROOT, "docs", "api-reference", f"{api_id}.md")

        if not os.path.exists(md_path):
            results.append({
                "api_id": api_id,
                "name_zh": entry.get("name_zh", ""),
                "ods_table": ods_table,
                "status": "error",
                "reason": f"文档不存在: {md_path}",
            })
            continue

        r = compare_one(entry, md_path, ods_tables)
        results.append(r)

        status_icon = status_icons.get(r["status"], "?")
        extra = ""
        if r.get("api_only_count"):
            extra = f" (API独有: {r['api_only_count']})"
        if r.get("ods_only_count"):
            extra += f" (ODS独有: {r['ods_only_count']})"
        print(f" {status_icon} {api_id} → {ods_table or '-'}{extra}")

    # Generate the ALTER SQL.
    alter_sqls = generate_alter_sql(results, ods_tables)

    # Write the JSON report.
    json_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.json")
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nJSON 报告: {json_path}")

    # Write the Markdown report (same directory as the JSON report).
    md_report = generate_markdown_report(results, alter_sqls)
    report_md_path = os.path.join(ROOT, "docs", "reports", "api_ods_comparison_v2.md")
    with open(report_md_path, "w", encoding="utf-8") as f:
        f.write(md_report)
    print(f"Markdown 报告: {report_md_path}")

    # Write the ALTER SQL migration file.
    if alter_sqls:
        sql_path = os.path.join(
            ROOT, "database", "migrations", "20260213_align_ods_with_api_v2.sql"
        )
        os.makedirs(os.path.dirname(sql_path), exist_ok=True)
        with open(sql_path, "w", encoding="utf-8") as f:
            f.write("-- API vs ODS 对齐迁移脚本 (v2)\n")
            f.write(f"-- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("-- 注意:类型为根据字段名猜测,请人工复核后执行\n\n")
            f.write("BEGIN;\n\n")
            for sql in alter_sqls:
                f.write(sql + "\n")
            f.write("\nCOMMIT;\n")
        print(f"ALTER SQL: {sql_path}")
    else:
        print("无需 ALTER SQL(所有表已对齐)")

    # Final statistics.
    ok_n = sum(1 for r in results if r.get("status") == "ok")
    drift_n = sum(1 for r in results if r.get("status") == "drift")
    skip_n = sum(1 for r in results if r.get("status") == "skip")
    err_n = sum(1 for r in results if r.get("status") == "error")
    print(f"\n汇总:对齐 {ok_n} / 漂移 {drift_n} / 跳过 {skip_n} / 错误 {err_n}")
    print(f"ALTER SQL 语句数:{len(alter_sqls)}")


if __name__ == "__main__":
    main()

# ──────────────────────────────────────────────
# AI_CHANGELOG:
# - Date: 2026-02-13
#   Prompt: P20260213-223000 — compare the actual database ODS table structure
#           against the API reference docs (redo, without relying on DDL)
#   Direct cause: the previous comparison script had a parsing bug for the nested
#           stock_goods_category_tree structure, so the script had to be rewritten
#   Change summary: full rewrite; extracts fields from the api-reference/*.md JSON
#           samples, queries the actual PG billiards_ods columns, handles three
#           structures (standard / settleList nesting / goodsCategoryList array
#           wrapper), and outputs JSON + MD reports
#   Risk and verification: pure analysis script, does not modify the database;
#           verify by running the script and confirming "对齐 22 / 漂移 0"
# ──────────────────────────────────────────────