"""
Full reconciliation of the database documentation (one-off audit script).

Connects to the test databases, queries ``information_schema`` and compares
the result with the existing documents under ``docs/database/``.
Prints a JSON summary to stdout.
"""
from __future__ import annotations

import json
import os
import re
import sys
from pathlib import Path

# Schemas inspected in the ETL database (six layers) and the application database.
ETL_SCHEMAS = ["ods", "dwd", "dws", "meta", "core", "app"]
APP_SCHEMAS = ["public", "auth"]

# A ``schema.table`` mention inside a document only counts for these schemas.
KNOWN_SCHEMAS = frozenset(ETL_SCHEMAS + APP_SCHEMAS)

# Matches every ``word.word`` pair; filtering by schema happens afterwards.
_SCHEMA_TABLE_RE = re.compile(r"(\w+)\.(\w+)")

# Retained as module attributes for backward compatibility. They reflect the
# process environment at import time only; main() resolves the DSNs again
# after the root .env file has been loaded.
TEST_ETL_DSN = os.environ.get("TEST_DB_DSN")
TEST_APP_DSN = os.environ.get("TEST_APP_DB_DSN")


def _index_tables(tables, columns) -> dict:
    """Group column rows under their ``schema.table`` key.

    Args:
        tables: Iterable of ``(schema, table)`` rows.
        columns: Iterable of ``(schema, table, column_name, data_type,
            is_nullable, column_default)`` rows, already in the desired order.

    Returns:
        ``{"schema.table": {"schema", "table", "columns": [...]}}``.
    """
    result = {
        f"{schema}.{table}": {"schema": schema, "table": table, "columns": []}
        for schema, table in tables
    }
    for schema, table, col_name, data_type, nullable, default in columns:
        entry = result.get(f"{schema}.{table}")
        if entry is None:
            # The column query has no table_type filter, so it can return
            # rows for relations that are not in the BASE TABLE list.
            continue
        entry["columns"].append({
            "name": col_name,
            "type": data_type,
            "nullable": nullable,
            "default": default,
        })
    return result


def query_tables_and_columns(dsn: str, schemas: list[str]) -> dict:
    """Query every base table and its columns in the given schemas.

    Args:
        dsn: Connection string passed to ``psycopg2.connect``.
        schemas: Schema names to inspect. An empty list returns ``{}``
            without opening a connection.

    Returns:
        Mapping of ``"schema.table"`` to a dict with the keys ``schema``,
        ``table`` and ``columns`` (a list of dicts with ``name``, ``type``,
        ``nullable`` and ``default``, ordered by ordinal position).
    """
    if not schemas:
        # "IN ()" is not valid SQL; there is nothing to look up anyway.
        return {}

    # Imported lazily so the pure helpers in this module stay importable
    # on machines without the database driver.
    import psycopg2

    # Only "%s" markers are interpolated into the SQL text; the schema names
    # themselves are passed as bound parameters.
    placeholders = ",".join(["%s"] * len(schemas))
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            cur.execute(
                f"""
                SELECT table_schema, table_name
                FROM information_schema.tables
                WHERE table_schema IN ({placeholders})
                  AND table_type = 'BASE TABLE'
                ORDER BY table_schema, table_name
                """,
                schemas,
            )
            tables = cur.fetchall()

            cur.execute(
                f"""
                SELECT table_schema, table_name, column_name,
                       data_type, is_nullable, column_default
                FROM information_schema.columns
                WHERE table_schema IN ({placeholders})
                ORDER BY table_schema, table_name, ordinal_position
                """,
                schemas,
            )
            columns = cur.fetchall()
    finally:
        conn.close()

    return _index_tables(tables, columns)


def scan_existing_docs(docs_dir: Path) -> set[str]:
    """Collect the table references already covered by ``BD_Manual_*.md``.

    Two kinds of lowercase entries are gathered:

    * the file stem with ``BD_Manual_`` removed (a table-name keyword);
    * every ``schema.table`` mention in the file body whose schema is one of
      ``KNOWN_SCHEMAS`` (matched case-insensitively).

    A file that cannot be read or decoded as UTF-8 is reported on stderr and
    still contributes its file-name keyword.

    Args:
        docs_dir: Directory that holds the ``BD_Manual_*.md`` documents.

    Returns:
        Set of lowercase keywords and ``schema.table`` references.
    """
    documented: set[str] = set()
    for doc_path in docs_dir.glob("BD_Manual_*.md"):
        # Table-name keyword taken from the file name.
        documented.add(doc_path.stem.replace("BD_Manual_", "").lower())

        try:
            content = doc_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            # Best effort: keep scanning, but make the skipped file visible so
            # a resulting "missing" entry can be traced back to it.
            print(f"WARNING: cannot read {doc_path}: {exc}", file=sys.stderr)
            continue

        for match in _SCHEMA_TABLE_RE.finditer(content):
            schema = match.group(1).lower()
            if schema in KNOWN_SCHEMAS:
                documented.add(f"{schema}.{match.group(2).lower()}")
    return documented


def reconcile(db_tables: dict, documented: set[str]) -> dict:
    """Find the database tables that no document covers.

    A table counts as documented when either its lowercase ``schema.table``
    key or its lowercase bare table name is in ``documented``.

    Args:
        db_tables: Mapping as returned by ``query_tables_and_columns``.
        documented: Set as returned by ``scan_existing_docs``.

    Returns:
        Dict with ``total_db_tables``, ``documented_refs``, ``missing_docs``
        (sorted by key, each with ``schema_table`` and ``column_count``) and
        ``missing_count``.
    """
    missing = [
        {"schema_table": key, "column_count": len(info["columns"])}
        for key, info in sorted(db_tables.items())
        if key.lower() not in documented
        and info["table"].lower() not in documented
    ]
    return {
        "total_db_tables": len(db_tables),
        "documented_refs": len(documented),
        "missing_docs": missing,
        "missing_count": len(missing),
    }


def _load_dsns(project_root: Path) -> tuple[str, str]:
    """Load the root ``.env`` and return ``(etl_dsn, app_dsn)``.

    Exits with status 1 and an error on stderr when either ``TEST_DB_DSN``
    or ``TEST_APP_DB_DSN`` is unset or empty.
    """
    # Imported lazily so importing this module needs only the standard library.
    from dotenv import load_dotenv

    load_dotenv(project_root / ".env")
    etl_dsn = os.environ.get("TEST_DB_DSN")
    app_dsn = os.environ.get("TEST_APP_DB_DSN")
    if not etl_dsn or not app_dsn:
        print("ERROR: TEST_DB_DSN or TEST_APP_DB_DSN not set", file=sys.stderr)
        sys.exit(1)
    return etl_dsn, app_dsn


def main() -> None:
    """Run the audit and print the JSON summary to stdout."""
    project_root = Path(__file__).resolve().parents[2]
    etl_dsn, app_dsn = _load_dsns(project_root)
    docs_dir = project_root / "docs" / "database"

    # ETL database (six schema layers) and application database.
    etl_tables = query_tables_and_columns(etl_dsn, ETL_SCHEMAS)
    app_tables = query_tables_and_columns(app_dsn, APP_SCHEMAS)

    # The two databases use disjoint schema names, so keys cannot collide.
    all_tables = {**etl_tables, **app_tables}

    documented = scan_existing_docs(docs_dir)
    result = reconcile(all_tables, documented)

    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()