Files
Neo-ZQYY/apps/etl/connectors/feiqiu/scripts/validate_bd_manual.py

489 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
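Validation script for the BD_Manual documentation set.

# AI_CHANGELOG [2026-02-13] Added: validates Properties 1/4/5/6/7/8/9/10; supports the --pg-dsn argument.

Validates the directory structure, documentation coverage, format completeness
and naming conventions under docs/database/.
Requires a PostgreSQL connection to fetch the table list of the ods schema,
which is used as the baseline.

Usage:
    python scripts/validate_bd_manual.py --pg-dsn "postgresql://user:pass@host/db"
    python scripts/validate_bd_manual.py  # reads PG_DSN from the environment or .env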
"""
from __future__ import annotations
import argparse
import os
import re
import sys
from pathlib import Path
from dataclasses import dataclass, field
# ---------------------------------------------------------------------------
# 常量
# ---------------------------------------------------------------------------
# Root of the BD_Manual documentation tree and the ODS locations derived from it.
BD_MANUAL_ROOT = Path("docs") / "database"
ODS_MAIN_DIR = BD_MANUAL_ROOT.joinpath("ODS", "main")
ODS_MAPPINGS_DIR = BD_MANUAL_ROOT.joinpath("ODS", "mappings")
ODS_DICT_PATH = Path("docs", "database", "overview", "ods_tables_dictionary.md")

# The four data layers; each one is expected to have main/ and changes/.
DATA_LAYERS = ["ODS", "DWD", "DWS", "ETL_Admin"]

# Section titles every ODS document must contain (Property 5).
ODS_DOC_REQUIRED_SECTIONS = ["表信息", "字段说明", "使用说明", "可回溯性"]

# Attribute keywords that must appear in the "table info" table of an ODS document.
ODS_DOC_TABLE_INFO_KEYS = [
    "Schema",
    "表名",
    "主键",
    "数据来源",
    "说明",
]

# ETL metadata fields every ODS document must mention.
ODS_DOC_ETL_META_FIELDS = ["content_hash", "source_file", "source_endpoint", "fetched_at", "payload"]

# Sections / key content every mapping document must contain (Property 8).
MAPPING_DOC_REQUIRED_SECTIONS = ["端点信息", "字段映射", "ETL 补充字段"]

# Attribute keywords that must appear in the "endpoint info" table of a mapping document.
MAPPING_DOC_ENDPOINT_KEYS = [
    "接口路径",
    "ODS 对应表",
    "JSON 数据路径",
]
# ---------------------------------------------------------------------------
# 数据结构
# ---------------------------------------------------------------------------
@dataclass
class CheckResult:
    """Outcome of a single validation check."""

    # Identifier of the checked property, e.g. "Property 1".
    property_id: str
    # Short description of the check.
    description: str
    # Whether the check passed.
    passed: bool
    # Specific explanations recorded when the check fails; a fresh empty
    # list is created for every instance that does not supply one.
    details: list[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# 数据库查询:获取 ODS 表清单
# ---------------------------------------------------------------------------
def fetch_ods_tables(pg_dsn: str) -> list[str]:
    """Return the names of all base tables in the ``ods`` schema.

    Args:
        pg_dsn: PostgreSQL connection string handed to ``psycopg2.connect``.

    Returns:
        The ``table_name`` values from ``information_schema.tables`` for
        schema ``ods`` and type ``BASE TABLE``, ordered by name.

    Raises:
        Any exception raised by psycopg2 while connecting or querying is
        propagated to the caller.
    """
    # Imported here, so psycopg2 is only needed when this function is called.
    import psycopg2

    sql = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'ods'
        AND table_type = 'BASE TABLE'
        ORDER BY table_name;
    """
    conn = psycopg2.connect(pg_dsn)
    try:
        # In psycopg2, ``with conn`` only commits/rolls back the transaction;
        # it does NOT close the connection, hence the explicit close() below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(sql)
                return [row[0] for row in cur.fetchall()]
    finally:
        conn.close()
# ---------------------------------------------------------------------------
# Property 1: 数据层目录结构一致性
# ---------------------------------------------------------------------------
def check_directory_structure() -> CheckResult:
    """Property 1: every data layer must have ``main/`` and ``changes/``.

    For each layer in ``DATA_LAYERS`` the two sub-directories are looked up
    under ``BD_MANUAL_ROOT``; every one that is not a directory is reported.
    """
    candidates = (
        BD_MANUAL_ROOT / layer / sub
        for layer in DATA_LAYERS
        for sub in ("main", "changes")
    )
    absent = [str(path) for path in candidates if not path.is_dir()]
    return CheckResult(
        property_id="Property 1",
        description="数据层目录结构一致性main/ + changes/",
        passed=not absent,
        details=[f"缺失目录: {path}" for path in absent],
    )
# ---------------------------------------------------------------------------
# Property 4: ODS 表级文档覆盖率
# ---------------------------------------------------------------------------
def check_ods_doc_coverage(ods_tables: list[str]) -> CheckResult:
    """Property 4: every ODS table must have a ``BD_manual_{table}.md`` file.

    Args:
        ods_tables: Table names to look for under ``ODS_MAIN_DIR``.

    Returns:
        A result that fails when at least one expected file is absent; the
        details name each missing document.
    """
    undocumented = [
        table
        for table in ods_tables
        if not (ODS_MAIN_DIR / f"BD_manual_{table}.md").is_file()
    ]
    return CheckResult(
        property_id="Property 4",
        description="ODS 表级文档覆盖率",
        passed=not undocumented,
        details=[f"缺失文档: BD_manual_{table}.md" for table in undocumented],
    )
# ---------------------------------------------------------------------------
# Property 5: ODS 表级文档格式完整性
# ---------------------------------------------------------------------------
def _check_single_ods_doc(filepath: Path) -> list[str]:
    """Check one ODS document for required sections and content.

    Args:
        filepath: Path of the markdown document to inspect.

    Returns:
        A list of problem descriptions (empty when nothing is wrong). When
        the file cannot be read as UTF-8, the list holds a single
        "cannot read" entry and no further checks are made.
    """
    doc_name = filepath.name
    try:
        text = filepath.read_text(encoding="utf-8")
    except Exception as exc:
        return [f"{doc_name}: 无法读取 ({exc})"]

    # Required sections: a level-2 heading ("## ...") containing the title.
    problems = [
        f"{doc_name}: 缺少「{title}」章节"
        for title in ODS_DOC_REQUIRED_SECTIONS
        if re.search(rf"^##\s+.*{re.escape(title)}", text, re.MULTILINE) is None
    ]

    # Table-info attribute keywords: looked up anywhere in the document.
    problems.extend(
        f"{doc_name}: 表信息缺少「{keyword}」属性"
        for keyword in ODS_DOC_TABLE_INFO_KEYS
        if keyword not in text
    )

    # ETL metadata fields: all absent ones are reported in a single entry.
    unmentioned = [name for name in ODS_DOC_ETL_META_FIELDS if name not in text]
    if unmentioned:
        joined = ", ".join(unmentioned)
        problems.append(f"{doc_name}: 未提及 ETL 元数据字段: {joined}")
    return problems
def check_ods_doc_format() -> CheckResult:
    """Property 5: run the per-document format check on every ODS document.

    All ``BD_manual_*.md`` files in ``ODS_MAIN_DIR`` are inspected in sorted
    order. A missing directory is itself reported as a failure.
    """
    if ODS_MAIN_DIR.is_dir():
        problems = [
            issue
            for doc in sorted(ODS_MAIN_DIR.glob("BD_manual_*.md"))
            for issue in _check_single_ods_doc(doc)
        ]
        ok = not problems
    else:
        problems = ["ODS/main/ 目录不存在"]
        ok = False
    return CheckResult(
        property_id="Property 5",
        description="ODS 表级文档格式完整性",
        passed=ok,
        details=problems,
    )
# ---------------------------------------------------------------------------
# Property 6: ODS 表级文档命名规范
# ---------------------------------------------------------------------------
def check_ods_doc_naming() -> CheckResult:
    """Property 6: ``.md`` entries in ODS/main/ must be named ``BD_manual_{table}.md``.

    The table part must start with a lowercase letter and contain only
    lowercase letters, digits and underscores. A missing directory is
    reported as a failure.
    """
    if not ODS_MAIN_DIR.is_dir():
        return CheckResult(
            property_id="Property 6",
            description="ODS 表级文档命名规范",
            passed=False,
            details=["ODS/main/ 目录不存在"],
        )

    valid_name = re.compile(r"^BD_manual_[a-z][a-z0-9_]*\.md$")
    offenders = [
        entry.name
        for entry in sorted(ODS_MAIN_DIR.iterdir())
        if entry.suffix == ".md" and valid_name.match(entry.name) is None
    ]
    return CheckResult(
        property_id="Property 6",
        description="ODS 表级文档命名规范BD_manual_{表名}.md",
        passed=not offenders,
        details=[f"命名不规范: {filename}" for filename in offenders],
    )
# ---------------------------------------------------------------------------
# Property 7: 映射文档覆盖率
# ---------------------------------------------------------------------------
def check_mapping_doc_coverage(ods_tables: list[str]) -> CheckResult:
    """Property 7: every ODS table must have at least one mapping document.

    Mapping files are named ``mapping_{Endpoint}_{table}.md``, where the
    endpoint segment contains no underscore (the convention this script's
    mapping-naming check enforces). The table name is therefore everything
    after the first underscore that follows the ``mapping_`` prefix, and it
    is compared to each ODS table name exactly.

    A plain ``endswith("_{table}.md")`` test is not sufficient: a document
    for ``sales_orders`` would also be counted as covering ``orders``.
    Note that a file whose endpoint segment contains an underscore does not
    count toward coverage.

    Args:
        ods_tables: Table names that each need a mapping document.

    Returns:
        A result that fails when some table has no mapping document, or when
        the mappings directory does not exist.
    """
    if not ODS_MAPPINGS_DIR.is_dir():
        return CheckResult(
            property_id="Property 7",
            description="映射文档覆盖率",
            passed=False,
            details=["ODS/mappings/ 目录不存在"],
        )

    # Group 1 captures the table segment of a well-formed mapping file name.
    name_re = re.compile(r"^mapping_[^_]+_(.+)\.md$")
    covered: set[str] = set()
    for doc in ODS_MAPPINGS_DIR.glob("mapping_*.md"):
        match = name_re.match(doc.name)
        if match:
            covered.add(match.group(1))

    missing = [tbl for tbl in ods_tables if tbl not in covered]
    return CheckResult(
        property_id="Property 7",
        description="映射文档覆盖率(每张 ODS 表至少一份映射文档)",
        passed=not missing,
        details=[f"缺失映射文档: mapping_*_{t}.md" for t in missing],
    )
# ---------------------------------------------------------------------------
# Property 8: 映射文档内容完整性
# ---------------------------------------------------------------------------
def _check_single_mapping_doc(filepath: Path) -> list[str]:
    """Check one mapping document for required sections and content.

    Args:
        filepath: Path of the markdown mapping document to inspect.

    Returns:
        A list of problem descriptions (empty when nothing is wrong). When
        the file cannot be read as UTF-8, the list holds a single
        "cannot read" entry and no further checks are made.
    """
    doc_name = filepath.name
    try:
        text = filepath.read_text(encoding="utf-8")
    except Exception as exc:
        return [f"{doc_name}: 无法读取 ({exc})"]

    problems: list[str] = []

    # Required sections: a level-2 heading ("## ...") containing the title.
    for title in MAPPING_DOC_REQUIRED_SECTIONS:
        heading = re.compile(rf"^##\s+.*{re.escape(title)}", re.MULTILINE)
        if heading.search(text) is None:
            problems.append(f"{doc_name}: 缺少「{title}」章节")

    # Endpoint-info attribute keywords: looked up anywhere in the document.
    problems += [
        f"{doc_name}: 端点信息缺少「{keyword}」属性"
        for keyword in MAPPING_DOC_ENDPOINT_KEYS
        if keyword not in text
    ]

    # ETL supplementary fields: all absent ones are reported in one entry.
    absent_fields = [name for name in ODS_DOC_ETL_META_FIELDS if name not in text]
    if absent_fields:
        joined = ", ".join(absent_fields)
        problems.append(f"{doc_name}: 未提及 ETL 补充字段: {joined}")
    return problems
def check_mapping_doc_content() -> CheckResult:
    """Property 8: run the per-document content check on every mapping document.

    All ``mapping_*.md`` files in ``ODS_MAPPINGS_DIR`` are inspected in
    sorted order. A missing directory is itself reported as a failure.
    """
    if ODS_MAPPINGS_DIR.is_dir():
        problems = [
            issue
            for doc in sorted(ODS_MAPPINGS_DIR.glob("mapping_*.md"))
            for issue in _check_single_mapping_doc(doc)
        ]
        ok = not problems
    else:
        problems = ["ODS/mappings/ 目录不存在"]
        ok = False
    return CheckResult(
        property_id="Property 8",
        description="映射文档内容完整性",
        passed=ok,
        details=problems,
    )
# ---------------------------------------------------------------------------
# Property 9: 映射文档命名规范
# ---------------------------------------------------------------------------
def check_mapping_doc_naming() -> CheckResult:
    """Property 9: mapping files must be named ``mapping_{Endpoint}_{table}.md``.

    Only ``.md`` entries whose name starts with ``mapping_`` are examined.
    The endpoint segment must start with an uppercase letter followed by
    letters/digits; the table segment must start with a lowercase letter and
    contain only lowercase letters, digits and underscores. A missing
    directory is reported as a failure.
    """
    if not ODS_MAPPINGS_DIR.is_dir():
        return CheckResult(
            property_id="Property 9",
            description="映射文档命名规范",
            passed=False,
            details=["ODS/mappings/ 目录不存在"],
        )

    valid_name = re.compile(r"^mapping_[A-Z][A-Za-z0-9]+_[a-z][a-z0-9_]*\.md$")
    offenders = [
        entry.name
        for entry in sorted(ODS_MAPPINGS_DIR.iterdir())
        if entry.suffix == ".md"
        and entry.name.startswith("mapping_")
        and valid_name.match(entry.name) is None
    ]
    return CheckResult(
        property_id="Property 9",
        description="映射文档命名规范mapping_{API端点名}_{ODS表名}.md",
        passed=not offenders,
        details=[f"命名不规范: {filename}" for filename in offenders],
    )
# ---------------------------------------------------------------------------
# Property 10: ODS 数据字典覆盖率
# ---------------------------------------------------------------------------
def check_ods_dictionary_coverage(ods_tables: list[str]) -> CheckResult:
    """Property 10: the ODS data dictionary must mention every ODS table.

    A table counts as present only when its name occurs as a whole
    identifier, i.e. not directly preceded or followed by a letter, digit or
    underscore. The ``BD_manual_`` document prefix is also accepted in front
    of the name, so a reference such as ``BD_manual_{table}.md`` still
    counts. A bare substring test would wrongly accept ``member`` because it
    occurs inside ``member_profiles``.

    Args:
        ods_tables: Table names that must appear in the dictionary.

    Returns:
        A result that fails when a table is not mentioned, or when the
        dictionary file is missing or unreadable.
    """
    if not ODS_DICT_PATH.is_file():
        return CheckResult(
            property_id="Property 10",
            description="ODS 数据字典覆盖率",
            passed=False,
            details=[f"数据字典文件不存在: {ODS_DICT_PATH}"],
        )
    try:
        content = ODS_DICT_PATH.read_text(encoding="utf-8")
    except Exception as e:
        return CheckResult(
            property_id="Property 10",
            description="ODS 数据字典覆盖率",
            passed=False,
            details=[f"无法读取数据字典: {e}"],
        )

    def _mentioned(tbl: str) -> bool:
        # Whole-identifier match; backticks, pipes, dots, spaces and CJK
        # characters around the name are all valid delimiters.
        pattern = (
            r"(?<![A-Za-z0-9_])(?:BD_manual_)?"
            + re.escape(tbl)
            + r"(?![A-Za-z0-9_])"
        )
        return re.search(pattern, content) is not None

    missing = [tbl for tbl in ods_tables if not _mentioned(tbl)]
    return CheckResult(
        property_id="Property 10",
        description="ODS 数据字典覆盖率",
        passed=not missing,
        details=[f"数据字典缺失条目: {t}" for t in missing],
    )
# ---------------------------------------------------------------------------
# 报告输出
# ---------------------------------------------------------------------------
def print_report(results: list[CheckResult]) -> None:
"""打印验证报告。"""
print("=" * 60)
print("BD_Manual 文档体系验证报告")
print("=" * 60)
passed_count = sum(1 for r in results if r.passed)
total = len(results)
for r in results:
status = "✓ PASS" if r.passed else "✗ FAIL"
print(f"\n[{status}] {r.property_id}: {r.description}")
if not r.passed:
for d in r.details[:20]: # 最多显示 20 条
print(f" - {d}")
if len(r.details) > 20:
print(f" ... 还有 {len(r.details) - 20} 条问题")
print("\n" + "-" * 60)
print(f"结果: {passed_count}/{total} 项通过")
if passed_count < total:
print("存在未通过的验证项,请检查上述详情。")
else:
print("所有验证项均通过 ✓")
print("=" * 60)
# ---------------------------------------------------------------------------
# 主入口
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser(
description="验证 BD_Manual 文档体系的覆盖率、格式和命名规范",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
示例:
# 从 .env 或 PG_DSN 环境变量读取连接字符串
python scripts/validate_bd_manual.py
# 指定连接字符串
python scripts/validate_bd_manual.py --pg-dsn "postgresql://user:pass@host/db"
""",
)
parser.add_argument(
"--pg-dsn",
help="PostgreSQL 连接字符串(默认从 PG_DSN 环境变量或 .env 读取)",
)
args = parser.parse_args(argv)
# 加载 .env
try:
from dotenv import load_dotenv
load_dotenv()
except ImportError:
pass
pg_dsn = args.pg_dsn or os.environ.get("PG_DSN")
if not pg_dsn:
print(
"✗ 未提供 PG_DSN请通过 --pg-dsn 参数或 PG_DSN 环境变量指定",
file=sys.stderr,
)
return 1
# 获取 ODS 表清单
try:
ods_tables = fetch_ods_tables(pg_dsn)
except Exception as e:
print(f"✗ 连接数据库失败: {e}", file=sys.stderr)
return 1
if not ods_tables:
print("⚠ ods schema 中未找到任何表", file=sys.stderr)
return 1
print(f"从数据库获取到 {len(ods_tables)} 张 ODS 表\n")
# 运行所有验证
results: list[CheckResult] = [
check_directory_structure(), # Property 1
check_ods_doc_coverage(ods_tables), # Property 4
check_ods_doc_format(), # Property 5
check_ods_doc_naming(), # Property 6
check_mapping_doc_coverage(ods_tables),# Property 7
check_mapping_doc_content(), # Property 8
check_mapping_doc_naming(), # Property 9
check_ods_dictionary_coverage(ods_tables), # Property 10
]
print_report(results)
# 任一验证失败则返回非零退出码
if any(not r.passed for r in results):
return 1
return 0
if __name__ == "__main__":
sys.exit(main())