150 lines
4.5 KiB
Python
150 lines
4.5 KiB
Python
"""One-off DB documentation reconciliation script (for auditing).

Connects to the test databases, queries information_schema, and compares
the result against the existing docs under docs/database/.

Prints a JSON summary to stdout.
"""

from __future__ import annotations

import json
import os
import re
import sys
from pathlib import Path

from dotenv import load_dotenv

# Load the repository-root .env (two directory levels above this script).
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# DSNs for the ETL warehouse and the application database, respectively.
TEST_ETL_DSN = os.environ.get("TEST_DB_DSN")
TEST_APP_DSN = os.environ.get("TEST_APP_DB_DSN")

# Fail fast with a clear message before importing the DB driver.
if not TEST_ETL_DSN or not TEST_APP_DSN:
    print("ERROR: TEST_DB_DSN or TEST_APP_DB_DSN not set", file=sys.stderr)
    sys.exit(1)

# Deliberately imported after the config check so a missing DSN exits
# cleanly above instead of surfacing as a driver error.
import psycopg2  # noqa: E402
|
||
|
||
|
||
def query_tables_and_columns(dsn: str, schemas: list[str]) -> dict:
    """Return all base tables and their columns for the given schemas.

    Args:
        dsn: libpq connection string for the target database.
        schemas: schema names to inspect; an empty list yields ``{}``.

    Returns:
        Mapping of ``"schema.table"`` -> ``{"schema", "table", "columns"}``,
        where ``"columns"`` is a list of dicts with name/type/nullable/default
        in ``ordinal_position`` order.
    """
    # Guard: an empty schema list would render as invalid SQL "IN ()".
    if not schemas:
        return {}

    # Built once; both queries filter on the same schema list.
    placeholders = ",".join(["%s"] * len(schemas))

    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            # Base tables only — views are excluded on purpose.
            cur.execute(
                f"""
                SELECT table_schema, table_name
                FROM information_schema.tables
                WHERE table_schema IN ({placeholders})
                AND table_type = 'BASE TABLE'
                ORDER BY table_schema, table_name
                """,
                schemas,
            )
            tables = cur.fetchall()

            # All columns for the same schemas, in declaration order.
            cur.execute(
                f"""
                SELECT table_schema, table_name, column_name,
                       data_type, is_nullable, column_default
                FROM information_schema.columns
                WHERE table_schema IN ({placeholders})
                ORDER BY table_schema, table_name, ordinal_position
                """,
                schemas,
            )
            columns = cur.fetchall()
    finally:
        conn.close()

    result: dict = {}
    for schema, table in tables:
        result[f"{schema}.{table}"] = {
            "schema": schema,
            "table": table,
            "columns": [],
        }

    for schema, table, col_name, data_type, nullable, default in columns:
        key = f"{schema}.{table}"
        # Columns whose table is not a base table (e.g. views) are dropped.
        if key in result:
            result[key]["columns"].append({
                "name": col_name,
                "type": data_type,
                "nullable": nullable,
                "default": default,
            })

    return result
|
||
|
||
|
||
def scan_existing_docs(docs_dir: Path) -> set[str]:
    """Scan BD_Manual_*.md files under *docs_dir* for documented table names.

    Collects two kinds of references, all lower-cased:
      - the file-name stem with the ``BD_Manual_`` prefix stripped;
      - every ``schema.table`` occurrence in the file body whose schema is
        one of the known warehouse/app schemas.

    Files that cannot be read or decoded are skipped (best-effort audit).
    """
    # Hoisted out of the loop: compiled once, reused for every file.
    ref_pattern = re.compile(r"(\w+)\.(\w+)")
    known_schemas = frozenset({
        "ods", "dwd", "dws", "meta", "core", "app", "public", "auth",
    })

    documented: set[str] = set()
    for doc_file in docs_dir.glob("BD_Manual_*.md"):
        # The file name itself names a documented table keyword.
        documented.add(doc_file.stem.replace("BD_Manual_", "").lower())
        try:
            content = doc_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Best-effort: an unreadable doc just contributes nothing.
            continue
        # Also harvest schema.table references from the file body.
        for match in ref_pattern.finditer(content):
            schema, table = match.group(1), match.group(2)
            if schema in known_schemas:
                documented.add(f"{schema}.{table}".lower())
    return documented
|
||
|
||
|
||
def reconcile(db_tables: dict, documented: set[str]) -> dict:
    """Reconcile DB tables against documented references; report the gaps.

    A table counts as covered when either its full ``schema.table`` key or
    its bare table name appears (lower-cased) in *documented*.
    """
    def _covered(full_key: str, info: dict) -> bool:
        # Coverage on either the qualified key or the bare table name.
        return (
            full_key.lower() in documented
            or info["table"].lower() in documented
        )

    missing = [
        {"schema_table": full_key, "column_count": len(info["columns"])}
        for full_key, info in sorted(db_tables.items())
        if not _covered(full_key, info)
    ]

    return {
        "total_db_tables": len(db_tables),
        "documented_refs": len(documented),
        "missing_docs": missing,
        "missing_count": len(missing),
    }
|
||
|
||
|
||
def main():
    """Query both test DBs, reconcile against docs, print JSON to stdout."""
    repo_root = Path(__file__).resolve().parents[2]
    docs_dir = repo_root / "docs" / "database"

    # ETL warehouse (six-layer schemas) plus the application database.
    etl_tables = query_tables_and_columns(
        TEST_ETL_DSN, ["ods", "dwd", "dws", "meta", "core", "app"]
    )
    app_tables = query_tables_and_columns(TEST_APP_DSN, ["public", "auth"])
    all_tables = {**etl_tables, **app_tables}

    # Compare against the currently documented tables and emit the summary.
    documented = scan_existing_docs(docs_dir)
    summary = reconcile(all_tables, documented)
    print(json.dumps(summary, ensure_ascii=False, indent=2))
|
||
|
||
|
||
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|