Files
Neo-ZQYY/scripts/ops/_verify_bd_manual_fields.py

154 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""Compare the column lists in the BD_Manual documents with the actual database columns and print a diff report.

Usage: python scripts/ops/_verify_bd_manual_fields.py
Output: diff report on stdout
"""
import os
import sys
import re
# Load environment variables from the .env file located two directories above
# this script, before anything reads the environment.
from dotenv import load_dotenv
_ENV_FILE = os.path.join(os.path.dirname(__file__), '..', '..', '.env')
load_dotenv(_ENV_FILE)
import psycopg2

# Connection string of the database to inspect; the script aborts with exit
# status 1 when it is missing or empty.
DSN = os.environ.get("TEST_DB_DSN")
if DSN is None or DSN == "":
    print("ERROR: TEST_DB_DSN 未设置", file=sys.stderr)
    raise SystemExit(1)
# Tables to verify, grouped by the BD_Manual topic that documents them.
# Every entry is a schema-qualified name ("schema.table").
_TABLES_BY_MANUAL: dict[str, list[str]] = {
    "assistant_service_records": [
        "dwd.dwd_assistant_service_log",
        "dwd.dwd_assistant_service_log_ex",
    ],
    "recharge_settlements": [
        "dwd.dwd_recharge_order",
        "dwd.dwd_recharge_order_ex",
    ],
    "store_goods_master": [
        "dwd.dim_store_goods",
        "dwd.dim_store_goods_ex",
    ],
    "site_tables_master": [
        "dwd.dim_table",
        "dwd.dim_table_ex",
    ],
    "goods_stock_movements": [
        "dwd.dwd_goods_stock_movement",
    ],
    "goods_stock_summary": [
        "dwd.dwd_goods_stock_summary",
    ],
    "member_balance_changes": [
        "dwd.dwd_member_balance_change",
        "dwd.dwd_member_balance_change_ex",
    ],
    "store_goods_sales_records": [
        "dwd.dwd_store_goods_sale",
        "dwd.dwd_store_goods_sale_ex",
    ],
    "DWS": [
        "dws.dws_goods_stock_daily_summary",
        "dws.dws_goods_stock_monthly_summary",
    ],
}

# Flat list of all tables, in declaration order.
TABLES_TO_CHECK = [
    qualified_name
    for group in _TABLES_BY_MANUAL.values()
    for qualified_name in group
]

# Mapping of BD_Manual file -> column names listed in that document.
# NOTE(review): nothing in the visible script fills or reads this mapping;
# confirm it is unused before removing it.
BD_MANUAL_COLS: dict[str, list[str]] = {}
# Section heading that introduces a table, e.g. "## 1. dwd_assistant_service_log";
# the captured group is the table name.
_TABLE_HEADING_RE = re.compile(r'^##\s+\d+\.\s+(\w+)')
# First-cell texts that mark the header row of a column-listing table.
_COLUMN_TABLE_HEADERS = ('DWD 列名', 'DWS 列名', 'ODS 字段', '日期')


def parse_md_table_cols(filepath: str) -> dict[str, list[str]]:
    """Extract the documented column names per table from a BD_Manual markdown file.

    A table section starts at a heading of the form ``## <n>. <table_name>``.
    Inside a section, only markdown tables whose header row starts with one of
    the recognised header cells are read; the first cell of every following
    row is taken as a column name. Surrounding backticks are removed, and rows
    that are struck through (``~~...``) or are the ``scd2_*`` wildcard are
    skipped.

    A markdown table ends at the first line that contains no ``|``. Other
    tables in the same section (with an unrecognised header) are therefore
    ignored instead of being mistaken for column rows.

    Args:
        filepath: Path of the UTF-8 encoded markdown file.

    Returns:
        Mapping of table name to its column names in document order. Tables
        without any extracted column are absent from the mapping.

    Raises:
        OSError: If the file cannot be opened.
    """
    result: dict[str, list[str]] = {}
    current_table = None
    in_table = False
    with open(filepath, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()

            heading = _TABLE_HEADING_RE.match(line)
            if heading:
                current_table = heading.group(1)
                in_table = False
                continue

            if '|' not in line:
                # Blank or prose line: whatever table we were in is over, so a
                # later table must present a recognised header again.
                in_table = False
                continue
            if current_table is None:
                continue

            cells = [c for c in (part.strip() for part in line.split('|')) if c]
            if len(cells) < 2:
                continue
            first = cells[0]

            # Header/body separator row such as "|---|---|" or "| :--- |".
            if first.startswith(('---', ':---')):
                continue
            # Header row of a column-listing table.
            if first in _COLUMN_TABLE_HEADERS:
                in_table = True
                continue
            if not in_table:
                continue

            col = first.strip('`').strip()
            if col and not col.startswith('~~') and col != 'scd2_*':
                result.setdefault(current_table, []).append(col)
    return result
# BD_Manual documents to parse. The paths are relative to the directory two
# levels above this script (the directory the documented usage is run from).
BD_FILES = [
    "docs/database/BD_Manual_assistant_service_records.md",
    "docs/database/BD_Manual_recharge_settlements.md",
    "docs/database/BD_Manual_store_goods_master.md",
    "docs/database/BD_Manual_site_tables_master.md",
    "docs/database/BD_Manual_goods_stock_movements.md",
    "docs/database/BD_Manual_goods_stock_summary.md",
    "docs/database/BD_Manual_member_balance_changes.md",
    "docs/database/BD_Manual_store_goods_sales_records.md",
    "docs/database/BD_Manual_dws_goods_stock_summary.md",
]

# Resolve the documents against the script location instead of the current
# working directory, so the script also works when launched from elsewhere.
# This is the same base directory the .env file is loaded from.
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))

# Table name -> documented column names, merged over all documents. When the
# same table name appears in several documents, the last one parsed wins.
all_doc_cols: dict[str, list[str]] = {}
for doc_rel_path in BD_FILES:
    all_doc_cols.update(parse_md_table_cols(os.path.join(_REPO_ROOT, doc_rel_path)))
# SCD2 bookkeeping columns. They are excluded from BOTH sides of the
# comparison: the documents normally mention them only as the "scd2_*"
# wildcard, so an explicit listing must not be reported as a difference.
SCD2_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}

# Query the actual columns of every table and print the difference report.
conn = psycopg2.connect(DSN)
try:
    # The cursor is closed by the with-block; the connection by the finally.
    with conn.cursor() as cur:
        for full_table in TABLES_TO_CHECK:
            schema, table = full_table.split('.')
            cur.execute("""
                SELECT column_name
                FROM information_schema.columns
                WHERE table_schema = %s AND table_name = %s
                ORDER BY ordinal_position
            """, (schema, table))
            db_cols = [row[0] for row in cur.fetchall()]
            db_cols_no_scd2 = [c for c in db_cols if c not in SCD2_COLS]

            # The documents are keyed by the bare table name (no schema).
            doc_cols = all_doc_cols.get(table, [])
            if not doc_cols:
                print(f"\n⚠️ {full_table}: 文档中未找到列定义(表名 '{table}' 未匹配)")
                print(f" DB 列 ({len(db_cols)}): {db_cols}")
                continue
            doc_cols_no_scd2 = [c for c in doc_cols if c not in SCD2_COLS]

            doc_set = set(doc_cols_no_scd2)
            db_set = set(db_cols_no_scd2)
            in_doc_not_db = doc_set - db_set
            in_db_not_doc = db_set - doc_set

            # Visible marker so matching and differing tables can be told
            # apart at a glance.
            status = "✅" if not in_doc_not_db and not in_db_not_doc else "❌"
            print(f"\n{status} {full_table}: 文档 {len(doc_cols_no_scd2)} 列, DB {len(db_cols_no_scd2)} 列 (不含 SCD2)")
            if in_doc_not_db:
                print(f" 📄 文档有但 DB 无: {sorted(in_doc_not_db)}")
            if in_db_not_doc:
                print(f" 🗄️ DB 有但文档无: {sorted(in_db_not_doc)}")
finally:
    conn.close()