# Source: Neo-ZQYY/scripts/ops/_db_docs_reconcile.py
"""
DB 文档全量对账脚本(审计用,一次性)。
连接测试库,查询 information_schema与 docs/database/ 现有文档对比。
输出 JSON 摘要到 stdout。
"""
from __future__ import annotations
import json
import os
import re
import sys
from pathlib import Path
from dotenv import load_dotenv
# 加载根 .env
load_dotenv(Path(__file__).resolve().parents[2] / ".env")
TEST_ETL_DSN = os.environ.get("TEST_DB_DSN")
TEST_APP_DSN = os.environ.get("TEST_APP_DB_DSN")
if not TEST_ETL_DSN or not TEST_APP_DSN:
print("ERROR: TEST_DB_DSN or TEST_APP_DB_DSN not set", file=sys.stderr)
sys.exit(1)
import psycopg2 # noqa: E402
def query_tables_and_columns(dsn: str, schemas: list[str]) -> dict:
    """Query all base tables and their columns in the given schemas.

    Args:
        dsn: psycopg2 connection string.
        schemas: schema names to inspect. An empty list returns ``{}``
            immediately — the original code would otherwise build an
            invalid ``IN ()`` SQL clause.

    Returns:
        Mapping of ``"schema.table"`` to
        ``{"schema": ..., "table": ..., "columns": [...]}`` where
        ``columns`` preserves ``ordinal_position`` order.
    """
    if not schemas:
        return {}
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            placeholders = ",".join(["%s"] * len(schemas))
            # Base tables only — views are excluded on purpose.
            cur.execute(
                f"""
                SELECT table_schema, table_name
                FROM information_schema.tables
                WHERE table_schema IN ({placeholders})
                  AND table_type = 'BASE TABLE'
                ORDER BY table_schema, table_name
                """,
                schemas,
            )
            tables = cur.fetchall()
            # All columns for the same schemas, in declaration order.
            cur.execute(
                f"""
                SELECT table_schema, table_name, column_name,
                       data_type, is_nullable, column_default
                FROM information_schema.columns
                WHERE table_schema IN ({placeholders})
                ORDER BY table_schema, table_name, ordinal_position
                """,
                schemas,
            )
            columns = cur.fetchall()
    finally:
        conn.close()

    result: dict[str, dict] = {
        f"{schema}.{table}": {"schema": schema, "table": table, "columns": []}
        for schema, table in tables
    }
    for schema, table, col_name, data_type, nullable, default in columns:
        key = f"{schema}.{table}"
        # Columns belonging to non-base tables (e.g. views) have no entry
        # in `result` and are skipped.
        if key in result:
            result[key]["columns"].append({
                "name": col_name,
                "type": data_type,
                "nullable": nullable,
                "default": default,
            })
    return result
def scan_existing_docs(docs_dir: Path) -> set[str]:
    """Scan ``BD_Manual_*.md`` files and collect documented table names.

    Args:
        docs_dir: directory containing the ``BD_Manual_*.md`` documents.

    Returns:
        A set of lowercase strings containing both bare table keywords
        taken from file names and ``schema.table`` references harvested
        from file contents.
    """
    # Only references qualified with one of these schemas count as coverage.
    known_schemas = {
        "ods", "dwd", "dws", "meta", "core", "app",
        "public", "auth",
    }
    # Matches schema.table_name references; compiled once outside the loop.
    ref_pattern = re.compile(r"(\w+)\.(\w+)")
    documented: set[str] = set()
    for doc_file in docs_dir.glob("BD_Manual_*.md"):
        # The file name itself encodes a table keyword.
        documented.add(doc_file.stem.replace("BD_Manual_", "").lower())
        # Best effort: also harvest schema.table references from the content.
        # Narrowed from a bare `except Exception: pass` so real bugs are not
        # silently swallowed; unreadable docs are reported but non-fatal.
        try:
            content = doc_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            print(f"WARN: skipped {doc_file}: {exc}", file=sys.stderr)
            continue
        for m in ref_pattern.finditer(content):
            schema, table = m.group(1), m.group(2)
            if schema in known_schemas:
                documented.add(f"{schema}.{table}".lower())
    return documented
def reconcile(db_tables: dict, documented: set[str]) -> dict:
    """Reconcile DB tables against documented names.

    A table counts as documented when either its lowercase
    ``"schema.table"`` key or its bare lowercase table name appears in
    *documented*. Returns a summary dict with the list of tables that
    lack documentation, sorted by qualified name.
    """
    undocumented = [
        {"schema_table": name, "column_count": len(meta["columns"])}
        for name, meta in sorted(db_tables.items())
        if name.lower() not in documented
        and meta["table"].lower() not in documented
    ]
    return {
        "total_db_tables": len(db_tables),
        "documented_refs": len(documented),
        "missing_docs": undocumented,
        "missing_count": len(undocumented),
    }
def main():
    """Run the full reconciliation and print a JSON summary to stdout."""
    repo_root = Path(__file__).resolve().parents[2]
    docs_dir = repo_root / "docs" / "database"

    # Audit both databases: the ETL warehouse (six-layer schemas) and the
    # application DB, then merge the results into one table inventory.
    etl_tables = query_tables_and_columns(
        TEST_ETL_DSN, ["ods", "dwd", "dws", "meta", "core", "app"]
    )
    app_tables = query_tables_and_columns(TEST_APP_DSN, ["public", "auth"])
    all_tables = {**etl_tables, **app_tables}

    documented = scan_existing_docs(docs_dir)
    summary = reconcile(all_tables, documented)
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()