#!/usr/bin/env python3 """ BD_Manual 文档体系验证脚本。 # AI_CHANGELOG [2026-02-13] 新增:验证 Property 1/4/5/6/7/8/9/10,支持 --pg-dsn 参数 验证 docs/database/ 下的目录结构、文档覆盖率、格式完整性和命名规范。 需要连接 PostgreSQL 获取 ods schema 的表清单作为基准。 用法: python scripts/validate_bd_manual.py --pg-dsn "postgresql://user:pass@host/db" python scripts/validate_bd_manual.py # 从 PG_DSN 环境变量或 .env 读取 """ from __future__ import annotations import argparse import os import re import sys from pathlib import Path from dataclasses import dataclass, field # --------------------------------------------------------------------------- # 常量 # --------------------------------------------------------------------------- BD_MANUAL_ROOT = Path("docs/database") ODS_MAIN_DIR = BD_MANUAL_ROOT / "ODS" / "main" ODS_MAPPINGS_DIR = BD_MANUAL_ROOT / "ODS" / "mappings" ODS_DICT_PATH = Path("docs/database/overview/ods_tables_dictionary.md") # 四个数据层,每层都应有 main/ 和 changes/ DATA_LAYERS = ["ODS", "DWD", "DWS", "ETL_Admin"] # ODS 文档必须包含的章节标题(Property 5) ODS_DOC_REQUIRED_SECTIONS = [ "表信息", "字段说明", "使用说明", "可回溯性", ] # ODS 文档"表信息"表格中必须出现的属性关键词 ODS_DOC_TABLE_INFO_KEYS = ["Schema", "表名", "主键", "数据来源", "说明"] # ODS 文档必须提及的 ETL 元数据字段 ODS_DOC_ETL_META_FIELDS = [ "content_hash", "source_file", "source_endpoint", "fetched_at", "payload", ] # 映射文档必须包含的章节/关键内容(Property 8) MAPPING_DOC_REQUIRED_SECTIONS = [ "端点信息", "字段映射", "ETL 补充字段", ] # 映射文档"端点信息"表格中必须出现的属性关键词 MAPPING_DOC_ENDPOINT_KEYS = ["接口路径", "ODS 对应表", "JSON 数据路径"] # --------------------------------------------------------------------------- # 数据结构 # --------------------------------------------------------------------------- @dataclass class CheckResult: """单条验证结果。""" property_id: str # 如 "Property 1" description: str passed: bool details: list[str] = field(default_factory=list) # 失败时的具体说明 # --------------------------------------------------------------------------- # 数据库查询:获取 ODS 表清单 # --------------------------------------------------------------------------- def fetch_ods_tables(pg_dsn: str) -> list[str]: """从 ods schema 获取所有用户表名(排除系统表)。""" import psycopg2 sql = """ SELECT table_name FROM information_schema.tables WHERE table_schema = 'ods' AND table_type = 'BASE TABLE' ORDER BY table_name; """ with psycopg2.connect(pg_dsn) as conn: with conn.cursor() as cur: cur.execute(sql) return [row[0] for row in cur.fetchall()] # --------------------------------------------------------------------------- # Property 1: 数据层目录结构一致性 # --------------------------------------------------------------------------- def check_directory_structure() -> CheckResult: """ODS/DWD/DWS/ETL_Admin 各层都应有 main/ 和 changes/ 子目录。""" missing: list[str] = [] for layer in DATA_LAYERS: for sub in ("main", "changes"): p = BD_MANUAL_ROOT / layer / sub if not p.is_dir(): missing.append(str(p)) return CheckResult( property_id="Property 1", description="数据层目录结构一致性(main/ + changes/)", passed=len(missing) == 0, details=[f"缺失目录: {d}" for d in missing], ) # --------------------------------------------------------------------------- # Property 4: ODS 表级文档覆盖率 # --------------------------------------------------------------------------- def check_ods_doc_coverage(ods_tables: list[str]) -> CheckResult: """ods 中每张表都应有 BD_manual_{表名}.md。""" missing: list[str] = [] for tbl in ods_tables: expected = ODS_MAIN_DIR / f"BD_manual_{tbl}.md" if not expected.is_file(): missing.append(tbl) return CheckResult( property_id="Property 4", description="ODS 表级文档覆盖率", passed=len(missing) == 0, details=[f"缺失文档: BD_manual_{t}.md" for t in missing], ) # --------------------------------------------------------------------------- # Property 5: ODS 表级文档格式完整性 # --------------------------------------------------------------------------- def _check_single_ods_doc(filepath: Path) -> list[str]: """检查单份 ODS 文档是否包含必要章节和内容,返回问题列表。""" issues: list[str] = [] name = filepath.name try: content = filepath.read_text(encoding="utf-8") except Exception as e: return [f"{name}: 无法读取 ({e})"] # 检查必要章节 for section in ODS_DOC_REQUIRED_SECTIONS: # 匹配 ## 章节标题(允许前后有空格) pattern = rf"^##\s+.*{re.escape(section)}" if not re.search(pattern, content, re.MULTILINE): issues.append(f"{name}: 缺少「{section}」章节") # 检查"表信息"表格中的关键属性 for key in ODS_DOC_TABLE_INFO_KEYS: if key not in content: issues.append(f"{name}: 表信息缺少「{key}」属性") # 检查 ETL 元数据字段是否被提及 meta_missing = [f for f in ODS_DOC_ETL_META_FIELDS if f not in content] if meta_missing: issues.append(f"{name}: 未提及 ETL 元数据字段: {', '.join(meta_missing)}") return issues def check_ods_doc_format() -> CheckResult: """每份 ODS 文档应包含:表信息、字段说明、使用说明、可回溯性、ETL 元数据字段。""" all_issues: list[str] = [] if not ODS_MAIN_DIR.is_dir(): return CheckResult( property_id="Property 5", description="ODS 表级文档格式完整性", passed=False, details=["ODS/main/ 目录不存在"], ) for f in sorted(ODS_MAIN_DIR.glob("BD_manual_*.md")): all_issues.extend(_check_single_ods_doc(f)) return CheckResult( property_id="Property 5", description="ODS 表级文档格式完整性", passed=len(all_issues) == 0, details=all_issues, ) # --------------------------------------------------------------------------- # Property 6: ODS 表级文档命名规范 # --------------------------------------------------------------------------- def check_ods_doc_naming() -> CheckResult: """ODS/main/ 下的文件名应匹配 BD_manual_{表名}.md。""" bad: list[str] = [] if not ODS_MAIN_DIR.is_dir(): return CheckResult( property_id="Property 6", description="ODS 表级文档命名规范", passed=False, details=["ODS/main/ 目录不存在"], ) pattern = re.compile(r"^BD_manual_[a-z][a-z0-9_]*\.md$") for f in sorted(ODS_MAIN_DIR.iterdir()): if f.suffix == ".md" and not pattern.match(f.name): bad.append(f.name) return CheckResult( property_id="Property 6", description="ODS 表级文档命名规范(BD_manual_{表名}.md)", passed=len(bad) == 0, details=[f"命名不规范: {n}" for n in bad], ) # --------------------------------------------------------------------------- # Property 7: 映射文档覆盖率 # --------------------------------------------------------------------------- def check_mapping_doc_coverage(ods_tables: list[str]) -> CheckResult: """每个有 ODS 表的 API 端点都应有映射文档。 策略:遍历 ODS 表,检查 mappings/ 下是否存在至少一个 mapping_*_{表名}.md 文件。 """ missing: list[str] = [] if not ODS_MAPPINGS_DIR.is_dir(): return CheckResult( property_id="Property 7", description="映射文档覆盖率", passed=False, details=["ODS/mappings/ 目录不存在"], ) existing_mappings = {f.name for f in ODS_MAPPINGS_DIR.glob("mapping_*.md")} for tbl in ods_tables: # 查找 mapping_*_{表名}.md found = any( name.endswith(f"_{tbl}.md") and name.startswith("mapping_") for name in existing_mappings ) if not found: missing.append(tbl) return CheckResult( property_id="Property 7", description="映射文档覆盖率(每张 ODS 表至少一份映射文档)", passed=len(missing) == 0, details=[f"缺失映射文档: mapping_*_{t}.md" for t in missing], ) # --------------------------------------------------------------------------- # Property 8: 映射文档内容完整性 # --------------------------------------------------------------------------- def _check_single_mapping_doc(filepath: Path) -> list[str]: """检查单份映射文档是否包含必要章节和内容。""" issues: list[str] = [] name = filepath.name try: content = filepath.read_text(encoding="utf-8") except Exception as e: return [f"{name}: 无法读取 ({e})"] # 检查必要章节 for section in MAPPING_DOC_REQUIRED_SECTIONS: pattern = rf"^##\s+.*{re.escape(section)}" if not re.search(pattern, content, re.MULTILINE): issues.append(f"{name}: 缺少「{section}」章节") # 检查端点信息表格中的关键属性 for key in MAPPING_DOC_ENDPOINT_KEYS: if key not in content: issues.append(f"{name}: 端点信息缺少「{key}」属性") # 检查 ETL 补充字段是否被提及 etl_missing = [f for f in ODS_DOC_ETL_META_FIELDS if f not in content] if etl_missing: issues.append(f"{name}: 未提及 ETL 补充字段: {', '.join(etl_missing)}") return issues def check_mapping_doc_content() -> CheckResult: """每份映射文档应包含:端点路径、ODS 表名、JSON 数据路径、字段映射表、ETL 补充字段。""" all_issues: list[str] = [] if not ODS_MAPPINGS_DIR.is_dir(): return CheckResult( property_id="Property 8", description="映射文档内容完整性", passed=False, details=["ODS/mappings/ 目录不存在"], ) for f in sorted(ODS_MAPPINGS_DIR.glob("mapping_*.md")): all_issues.extend(_check_single_mapping_doc(f)) return CheckResult( property_id="Property 8", description="映射文档内容完整性", passed=len(all_issues) == 0, details=all_issues, ) # --------------------------------------------------------------------------- # Property 9: 映射文档命名规范 # --------------------------------------------------------------------------- def check_mapping_doc_naming() -> CheckResult: """映射文档文件名应匹配 mapping_{API端点名}_{ODS表名}.md。""" bad: list[str] = [] if not ODS_MAPPINGS_DIR.is_dir(): return CheckResult( property_id="Property 9", description="映射文档命名规范", passed=False, details=["ODS/mappings/ 目录不存在"], ) # mapping_{EndpointName}_{table_name}.md # 端点名:PascalCase(字母数字),表名:snake_case pattern = re.compile(r"^mapping_[A-Z][A-Za-z0-9]+_[a-z][a-z0-9_]*\.md$") for f in sorted(ODS_MAPPINGS_DIR.iterdir()): if f.suffix == ".md" and f.name.startswith("mapping_"): if not pattern.match(f.name): bad.append(f.name) return CheckResult( property_id="Property 9", description="映射文档命名规范(mapping_{API端点名}_{ODS表名}.md)", passed=len(bad) == 0, details=[f"命名不规范: {n}" for n in bad], ) # --------------------------------------------------------------------------- # Property 10: ODS 数据字典覆盖率 # --------------------------------------------------------------------------- def check_ods_dictionary_coverage(ods_tables: list[str]) -> CheckResult: """数据字典中应包含所有 ODS 表条目。""" if not ODS_DICT_PATH.is_file(): return CheckResult( property_id="Property 10", description="ODS 数据字典覆盖率", passed=False, details=[f"数据字典文件不存在: {ODS_DICT_PATH}"], ) try: content = ODS_DICT_PATH.read_text(encoding="utf-8") except Exception as e: return CheckResult( property_id="Property 10", description="ODS 数据字典覆盖率", passed=False, details=[f"无法读取数据字典: {e}"], ) missing: list[str] = [] for tbl in ods_tables: # 在字典内容中查找表名(反引号包裹或直接出现) if tbl not in content: missing.append(tbl) return CheckResult( property_id="Property 10", description="ODS 数据字典覆盖率", passed=len(missing) == 0, details=[f"数据字典缺失条目: {t}" for t in missing], ) # --------------------------------------------------------------------------- # 报告输出 # --------------------------------------------------------------------------- def print_report(results: list[CheckResult]) -> None: """打印验证报告。""" print("=" * 60) print("BD_Manual 文档体系验证报告") print("=" * 60) passed_count = sum(1 for r in results if r.passed) total = len(results) for r in results: status = "✓ PASS" if r.passed else "✗ FAIL" print(f"\n[{status}] {r.property_id}: {r.description}") if not r.passed: for d in r.details[:20]: # 最多显示 20 条 print(f" - {d}") if len(r.details) > 20: print(f" ... 还有 {len(r.details) - 20} 条问题") print("\n" + "-" * 60) print(f"结果: {passed_count}/{total} 项通过") if passed_count < total: print("存在未通过的验证项,请检查上述详情。") else: print("所有验证项均通过 ✓") print("=" * 60) # --------------------------------------------------------------------------- # 主入口 # --------------------------------------------------------------------------- def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( description="验证 BD_Manual 文档体系的覆盖率、格式和命名规范", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例: # 从 .env 或 PG_DSN 环境变量读取连接字符串 python scripts/validate_bd_manual.py # 指定连接字符串 python scripts/validate_bd_manual.py --pg-dsn "postgresql://user:pass@host/db" """, ) parser.add_argument( "--pg-dsn", help="PostgreSQL 连接字符串(默认从 PG_DSN 环境变量或 .env 读取)", ) args = parser.parse_args(argv) # 加载 .env try: from dotenv import load_dotenv load_dotenv() except ImportError: pass pg_dsn = args.pg_dsn or os.environ.get("PG_DSN") if not pg_dsn: print( "✗ 未提供 PG_DSN,请通过 --pg-dsn 参数或 PG_DSN 环境变量指定", file=sys.stderr, ) return 1 # 获取 ODS 表清单 try: ods_tables = fetch_ods_tables(pg_dsn) except Exception as e: print(f"✗ 连接数据库失败: {e}", file=sys.stderr) return 1 if not ods_tables: print("⚠ ods schema 中未找到任何表", file=sys.stderr) return 1 print(f"从数据库获取到 {len(ods_tables)} 张 ODS 表\n") # 运行所有验证 results: list[CheckResult] = [ check_directory_structure(), # Property 1 check_ods_doc_coverage(ods_tables), # Property 4 check_ods_doc_format(), # Property 5 check_ods_doc_naming(), # Property 6 check_mapping_doc_coverage(ods_tables),# Property 7 check_mapping_doc_content(), # Property 8 check_mapping_doc_naming(), # Property 9 check_ods_dictionary_coverage(ods_tables), # Property 10 ] print_report(results) # 任一验证失败则返回非零退出码 if any(not r.passed for r in results): return 1 return 0 if __name__ == "__main__": sys.exit(main())