# -*- coding: utf-8 -*-
"""Compare the column lists documented in BD_Manual files with the real DB columns.

Usage:  python scripts/ops/_verify_bd_manual_fields.py
Output: a difference report on stdout (warnings and errors go to stderr).

Requires the ``TEST_DB_DSN`` environment variable; it may be supplied through
the ``.env`` file located two directories above this script.
"""
import os
import re
import sys

# Directory two levels above this script. The original code already located
# ``.env`` there; the BD_Manual paths are now anchored to the same directory so
# the script no longer depends on the current working directory.
_PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', '..')
)

# Tables to verify, grouped by the BD_Manual document that describes them.
TABLES_TO_CHECK = [
    # assistant_service_records
    "dwd.dwd_assistant_service_log",
    "dwd.dwd_assistant_service_log_ex",
    # recharge_settlements
    "dwd.dwd_recharge_order",
    "dwd.dwd_recharge_order_ex",
    # store_goods_master
    "dwd.dim_store_goods",
    "dwd.dim_store_goods_ex",
    # site_tables_master
    "dwd.dim_table",
    "dwd.dim_table_ex",
    # goods_stock_movements
    "dwd.dwd_goods_stock_movement",
    # goods_stock_summary
    "dwd.dwd_goods_stock_summary",
    # member_balance_changes
    "dwd.dwd_member_balance_change",
    "dwd.dwd_member_balance_change_ex",
    # store_goods_sales_records
    "dwd.dwd_store_goods_sale",
    "dwd.dwd_store_goods_sale_ex",
    # DWS
    "dws.dws_goods_stock_daily_summary",
    "dws.dws_goods_stock_monthly_summary",
]

# Declared by the original script but never populated or read in this file;
# kept so that any external reference to the name keeps working.
BD_MANUAL_COLS: dict[str, list[str]] = {}

# BD_Manual documents to parse, relative to the project root.
BD_FILES = [
    "docs/database/BD_Manual_assistant_service_records.md",
    "docs/database/BD_Manual_recharge_settlements.md",
    "docs/database/BD_Manual_store_goods_master.md",
    "docs/database/BD_Manual_site_tables_master.md",
    "docs/database/BD_Manual_goods_stock_movements.md",
    "docs/database/BD_Manual_goods_stock_summary.md",
    "docs/database/BD_Manual_member_balance_changes.md",
    "docs/database/BD_Manual_store_goods_sales_records.md",
    "docs/database/BD_Manual_dws_goods_stock_summary.md",
]

# SCD2 bookkeeping columns; excluded from both sides of the comparison.
SCD2_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}

# Section heading that introduces a table,
# e.g. "## 1. dwd_assistant_service_log(主表)" -> "dwd_assistant_service_log".
_TABLE_HEADING_RE = re.compile(r'^##\s+\d+\.\s+(\w+)')

# First-cell texts that mark the header row of a column-listing table.
_HEADER_FIRST_CELLS = ('DWD 列名', 'DWS 列名', 'ODS 字段', '日期')


def parse_md_table_cols(filepath: str) -> dict[str, list[str]]:
    """Extract the documented column names per table from a BD_Manual file.

    A heading matching ``## <number>. <name>`` selects the current table.
    Below it, a markdown table whose header row starts with one of the
    recognised header cells is read row by row; the first cell of each row
    (backticks stripped) is taken as a column name. Struck-through names
    (``~~...``) and the ``scd2_*`` placeholder are skipped.

    Args:
        filepath: Path of the UTF-8 encoded markdown file.

    Returns:
        Mapping of table name to its column names in document order.

    Raises:
        OSError: If the file cannot be opened.
    """
    result: dict[str, list[str]] = {}
    current_table = None
    in_table = False

    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            heading = _TABLE_HEADING_RE.match(line)
            if heading:
                current_table = heading.group(1)
                in_table = False
                continue

            if '|' not in line:
                # A markdown table ends at the first non-table line. Clearing
                # the flag keeps rows of later, unrelated tables under the same
                # heading from being collected as column names.
                in_table = False
                continue

            if current_table is None:
                continue

            cells = [c for c in (part.strip() for part in line.split('|')) if c]
            if len(cells) < 2:
                continue

            first = cells[0]
            # Skip the header/body separator row (|---|---| or | :--- |).
            if first.startswith(('---', ':---')):
                continue
            # A recognised header row starts a column-listing table.
            if first in _HEADER_FIRST_CELLS:
                in_table = True
                continue
            if not in_table:
                continue

            col = first.strip('`').strip()
            if col and not col.startswith('~~') and col != 'scd2_*':
                result.setdefault(current_table, []).append(col)

    return result


def _load_doc_columns(rel_paths: list[str]) -> dict[str, list[str]]:
    """Parse every BD_Manual file and merge the per-table column lists.

    Paths are resolved against the project root. A missing file is reported on
    stderr and skipped so the remaining tables are still checked. When a table
    name appears in several files, the last file wins.
    """
    merged: dict[str, list[str]] = {}
    for rel_path in rel_paths:
        path = os.path.join(_PROJECT_ROOT, rel_path)
        try:
            parsed = parse_md_table_cols(path)
        except FileNotFoundError:
            print(f"WARNING: BD_Manual 文件不存在: {path}", file=sys.stderr)
            continue
        merged.update(parsed)
    return merged


def _report_table(full_table: str, doc_cols: list[str], db_cols: list[str]) -> None:
    """Print the comparison result for one table to stdout."""
    table = full_table.split('.', 1)[1]
    if not doc_cols:
        print(f"\n⚠️ {full_table}: 文档中未找到列定义(表名 '{table}' 未匹配)")
        print(f" DB 列 ({len(db_cols)}): {db_cols}")
        return

    # Drop SCD2 columns on BOTH sides: a document that lists them explicitly
    # (instead of the "scd2_*" placeholder) must not be flagged as a mismatch.
    doc_cols = [c for c in doc_cols if c not in SCD2_COLS]
    db_cols_no_scd2 = [c for c in db_cols if c not in SCD2_COLS]

    doc_set = set(doc_cols)
    db_set = set(db_cols_no_scd2)
    in_doc_not_db = doc_set - db_set
    in_db_not_doc = db_set - doc_set

    status = "✅" if not in_doc_not_db and not in_db_not_doc else "❌"
    print(f"\n{status} {full_table}: 文档 {len(doc_cols)} 列, DB {len(db_cols_no_scd2)} 列 (不含 SCD2)")
    if in_doc_not_db:
        print(f" 📄 文档有但 DB 无: {sorted(in_doc_not_db)}")
    if in_db_not_doc:
        print(f" 🗄️ DB 有但文档无: {sorted(in_db_not_doc)}")


def main() -> None:
    """Load configuration, parse the documents, query the DB and print the report.

    Exits with status 1 when ``TEST_DB_DSN`` is not set.
    """
    # Third-party imports are deferred to this entry point so that importing
    # the module (e.g. to reuse parse_md_table_cols) needs neither package.
    from dotenv import load_dotenv
    load_dotenv(os.path.join(_PROJECT_ROOT, '.env'))

    dsn = os.environ.get("TEST_DB_DSN")
    if not dsn:
        print("ERROR: TEST_DB_DSN 未设置", file=sys.stderr)
        sys.exit(1)

    import psycopg2

    all_doc_cols = _load_doc_columns(BD_FILES)

    conn = psycopg2.connect(dsn)
    try:
        # The cursor context manager closes the cursor on exit.
        with conn.cursor() as cur:
            for full_table in TABLES_TO_CHECK:
                schema, table = full_table.split('.', 1)
                cur.execute(
                    """
                    SELECT column_name
                    FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s
                    ORDER BY ordinal_position
                    """,
                    (schema, table),
                )
                db_cols = [row[0] for row in cur.fetchall()]
                # Documents are keyed by bare table name (no schema prefix).
                _report_table(full_table, all_doc_cols.get(table, []), db_cols)
    finally:
        conn.close()


if __name__ == "__main__":
    main()