150 lines
4.5 KiB
Python
150 lines
4.5 KiB
Python
"""One-off DB documentation reconciliation script (for auditing).

Connects to the test databases, queries information_schema, and compares
the result against the existing docs under docs/database/.

Prints a JSON summary to stdout.
"""

from __future__ import annotations

import json
import os
import re
import sys
from pathlib import Path

from dotenv import load_dotenv

# Load the repository-root .env (two directory levels above this script).
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# DSNs for the ETL warehouse and the application database, respectively.
TEST_ETL_DSN = os.environ.get("TEST_DB_DSN")
TEST_APP_DSN = os.environ.get("TEST_APP_DB_DSN")

# Fail fast with a clear message before importing the DB driver.
if not TEST_ETL_DSN or not TEST_APP_DSN:
    print("ERROR: TEST_DB_DSN or TEST_APP_DB_DSN not set", file=sys.stderr)
    sys.exit(1)

# Deliberately imported after the config check so a missing DSN exits
# cleanly above instead of surfacing as a driver error.
import psycopg2  # noqa: E402
|
||
|
||
|
||
def query_tables_and_columns(dsn: str, schemas: list[str]) -> dict:
    """Return all base tables and their columns for the given schemas.

    Args:
        dsn: libpq connection string for the target database.
        schemas: schema names to inspect; an empty list yields ``{}``.

    Returns:
        Mapping of ``"schema.table"`` -> ``{"schema", "table", "columns"}``,
        where ``"columns"`` is a list of dicts with name/type/nullable/default
        in ``ordinal_position`` order.
    """
    # Guard: an empty schema list would render as invalid SQL "IN ()".
    if not schemas:
        return {}

    # Built once; both queries filter on the same schema list.
    placeholders = ",".join(["%s"] * len(schemas))

    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            # Base tables only — views are excluded on purpose.
            cur.execute(
                f"""
                SELECT table_schema, table_name
                FROM information_schema.tables
                WHERE table_schema IN ({placeholders})
                AND table_type = 'BASE TABLE'
                ORDER BY table_schema, table_name
                """,
                schemas,
            )
            tables = cur.fetchall()

            # All columns for the same schemas, in declaration order.
            cur.execute(
                f"""
                SELECT table_schema, table_name, column_name,
                       data_type, is_nullable, column_default
                FROM information_schema.columns
                WHERE table_schema IN ({placeholders})
                ORDER BY table_schema, table_name, ordinal_position
                """,
                schemas,
            )
            columns = cur.fetchall()
    finally:
        conn.close()

    result: dict = {}
    for schema, table in tables:
        result[f"{schema}.{table}"] = {
            "schema": schema,
            "table": table,
            "columns": [],
        }

    for schema, table, col_name, data_type, nullable, default in columns:
        key = f"{schema}.{table}"
        # Columns whose table is not a base table (e.g. views) are dropped.
        if key in result:
            result[key]["columns"].append({
                "name": col_name,
                "type": data_type,
                "nullable": nullable,
                "default": default,
            })

    return result
|
||
|
||
|
||
def scan_existing_docs(docs_dir: Path) -> set[str]:
    """Scan BD_Manual_*.md files under *docs_dir* for documented table names.

    Collects two kinds of references, all lower-cased:
      - the file-name stem with the ``BD_Manual_`` prefix stripped;
      - every ``schema.table`` occurrence in the file body whose schema is
        one of the known warehouse/app schemas.

    Files that cannot be read or decoded are skipped (best-effort audit).
    """
    # Hoisted out of the loop: compiled once, reused for every file.
    ref_pattern = re.compile(r"(\w+)\.(\w+)")
    known_schemas = frozenset({
        "ods", "dwd", "dws", "meta", "core", "app", "public", "auth",
    })

    documented: set[str] = set()
    for doc_file in docs_dir.glob("BD_Manual_*.md"):
        # The file name itself names a documented table keyword.
        documented.add(doc_file.stem.replace("BD_Manual_", "").lower())
        try:
            content = doc_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Best-effort: an unreadable doc just contributes nothing.
            continue
        # Also harvest schema.table references from the file body.
        for match in ref_pattern.finditer(content):
            schema, table = match.group(1), match.group(2)
            if schema in known_schemas:
                documented.add(f"{schema}.{table}".lower())
    return documented
|
||
|
||
|
||
def reconcile(db_tables: dict, documented: set[str]) -> dict:
    """Reconcile DB tables against documented references; report the gaps.

    A table counts as covered when either its full ``schema.table`` key or
    its bare table name appears (lower-cased) in *documented*.
    """
    def _covered(full_key: str, info: dict) -> bool:
        # Coverage on either the qualified key or the bare table name.
        return (
            full_key.lower() in documented
            or info["table"].lower() in documented
        )

    missing = [
        {"schema_table": full_key, "column_count": len(info["columns"])}
        for full_key, info in sorted(db_tables.items())
        if not _covered(full_key, info)
    ]

    return {
        "total_db_tables": len(db_tables),
        "documented_refs": len(documented),
        "missing_docs": missing,
        "missing_count": len(missing),
    }
|
||
|
||
|
||
def main():
    """Query both test DBs, reconcile against docs, print JSON to stdout."""
    repo_root = Path(__file__).resolve().parents[2]
    docs_dir = repo_root / "docs" / "database"

    # ETL warehouse (six-layer schemas) plus the application database.
    etl_tables = query_tables_and_columns(
        TEST_ETL_DSN, ["ods", "dwd", "dws", "meta", "core", "app"]
    )
    app_tables = query_tables_and_columns(TEST_APP_DSN, ["public", "auth"])
    all_tables = {**etl_tables, **app_tables}

    # Compare against the currently documented tables and emit the summary.
    documented = scan_existing_docs(docs_dir)
    summary = reconcile(all_tables, documented)
    print(json.dumps(summary, ensure_ascii=False, indent=2))
|
||
|
||
|
||
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|