在准备环境前提交一次全部更改。
This commit is contained in:
488
apps/etl/connectors/feiqiu/scripts/validate_bd_manual.py
Normal file
488
apps/etl/connectors/feiqiu/scripts/validate_bd_manual.py
Normal file
@@ -0,0 +1,488 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
BD_Manual 文档体系验证脚本。
|
||||
|
||||
# AI_CHANGELOG [2026-02-13] 新增:验证 Property 1/4/5/6/7/8/9/10,支持 --pg-dsn 参数
|
||||
|
||||
验证 docs/database/ 下的目录结构、文档覆盖率、格式完整性和命名规范。
|
||||
需要连接 PostgreSQL 获取 ods schema 的表清单作为基准。
|
||||
|
||||
用法:
|
||||
python scripts/validate_bd_manual.py --pg-dsn "postgresql://user:pass@host/db"
|
||||
python scripts/validate_bd_manual.py # 从 PG_DSN 环境变量或 .env 读取
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 常量
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Locations inside the BD_Manual documentation tree. All paths are relative,
# so they resolve against the current working directory at run time.
BD_MANUAL_ROOT = Path("docs/database")
ODS_MAIN_DIR = BD_MANUAL_ROOT / "ODS" / "main"
ODS_MAPPINGS_DIR = BD_MANUAL_ROOT / "ODS" / "mappings"
ODS_DICT_PATH = BD_MANUAL_ROOT / "overview" / "ods_tables_dictionary.md"

# The four data layers; each one should contain main/ and changes/.
DATA_LAYERS = ["ODS", "DWD", "DWS", "ETL_Admin"]

# Section headings every ODS document must contain (Property 5).
ODS_DOC_REQUIRED_SECTIONS = ["表信息", "字段说明", "使用说明", "可回溯性"]

# Attribute keywords that must appear in the table-info table of an ODS document.
ODS_DOC_TABLE_INFO_KEYS = ["Schema", "表名", "主键", "数据来源", "说明"]

# ETL metadata fields every ODS document must mention.
ODS_DOC_ETL_META_FIELDS = [
    "content_hash",
    "source_file",
    "source_endpoint",
    "fetched_at",
    "payload",
]

# Section headings / key content every mapping document must contain (Property 8).
MAPPING_DOC_REQUIRED_SECTIONS = ["端点信息", "字段映射", "ETL 补充字段"]

# Attribute keywords that must appear in the endpoint-info table of a mapping document.
MAPPING_DOC_ENDPOINT_KEYS = ["接口路径", "ODS 对应表", "JSON 数据路径"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 数据结构
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class CheckResult:
    """Outcome of validating one documentation property."""

    # Identifier of the validated property, e.g. "Property 1".
    property_id: str
    # Short human-readable summary of what the property verifies.
    description: str
    # Whether the validation succeeded.
    passed: bool
    # Specific explanations for a failure; defaults to a fresh empty list.
    details: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 数据库查询:获取 ODS 表清单
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fetch_ods_tables(pg_dsn: str) -> list[str]:
    """Return the names of all base tables in the ``ods`` schema, sorted by name.

    Args:
        pg_dsn: PostgreSQL connection string passed to ``psycopg2.connect``.

    Returns:
        Table names from ``information_schema.tables`` restricted to
        ``table_schema = 'ods'`` and ``table_type = 'BASE TABLE'``.

    Raises:
        ImportError: if psycopg2 is not installed.
        psycopg2.Error: if connecting or querying fails.
    """
    # Imported lazily so the module can be loaded without psycopg2 installed.
    from contextlib import closing

    import psycopg2

    sql = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'ods'
          AND table_type = 'BASE TABLE'
        ORDER BY table_name;
    """
    # psycopg2's ``with connection`` block only commits or rolls back the
    # transaction; it does NOT close the connection. ``closing`` makes sure
    # the connection is released even when the query raises.
    with closing(psycopg2.connect(pg_dsn)) as conn:
        with conn, conn.cursor() as cur:
            cur.execute(sql)
            return [row[0] for row in cur.fetchall()]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 1: 数据层目录结构一致性
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_directory_structure() -> CheckResult:
    """Property 1: every data layer has both a ``main/`` and a ``changes/`` directory.

    Returns:
        A result that passes when all expected directories exist; otherwise
        ``details`` lists each missing directory path.
    """
    expected_dirs = [
        BD_MANUAL_ROOT / layer / sub
        for layer in DATA_LAYERS
        for sub in ("main", "changes")
    ]
    absent = [str(path) for path in expected_dirs if not path.is_dir()]

    return CheckResult(
        property_id="Property 1",
        description="数据层目录结构一致性(main/ + changes/)",
        passed=not absent,
        details=[f"缺失目录: {d}" for d in absent],
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 4: ODS 表级文档覆盖率
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_ods_doc_coverage(ods_tables: list[str]) -> CheckResult:
    """Property 4: every ODS table has a ``BD_manual_{table}.md`` file in ODS/main/.

    Args:
        ods_tables: Table names to look up documentation for.

    Returns:
        A result that passes when no document is missing; otherwise
        ``details`` names each missing document.
    """
    undocumented = [
        table
        for table in ods_tables
        if not (ODS_MAIN_DIR / f"BD_manual_{table}.md").is_file()
    ]

    return CheckResult(
        property_id="Property 4",
        description="ODS 表级文档覆盖率",
        passed=not undocumented,
        details=[f"缺失文档: BD_manual_{t}.md" for t in undocumented],
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 5: ODS 表级文档格式完整性
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _check_single_ods_doc(filepath: Path) -> list[str]:
    """Validate one ODS document and return the problems found.

    Args:
        filepath: Path of the markdown document to inspect.

    Returns:
        One message per problem, each prefixed with the file name. An
        unreadable file yields a single "cannot read" message.
    """
    doc_name = filepath.name
    try:
        text = filepath.read_text(encoding="utf-8")
    except Exception as exc:
        return [f"{doc_name}: 无法读取 ({exc})"]

    # Required sections: a level-2 heading ("## ...") whose text contains the title.
    problems = [
        f"{doc_name}: 缺少「{section}」章节"
        for section in ODS_DOC_REQUIRED_SECTIONS
        if re.search(rf"^##\s+.*{re.escape(section)}", text, re.MULTILINE) is None
    ]

    # Table-info attributes.
    # NOTE(review): this is a substring test over the whole document, not only
    # the table-info section, so a keyword appearing anywhere satisfies it.
    problems.extend(
        f"{doc_name}: 表信息缺少「{key}」属性"
        for key in ODS_DOC_TABLE_INFO_KEYS
        if key not in text
    )

    # ETL metadata fields: all missing ones are reported in a single message.
    unmentioned = [name for name in ODS_DOC_ETL_META_FIELDS if name not in text]
    if unmentioned:
        problems.append(f"{doc_name}: 未提及 ETL 元数据字段: {', '.join(unmentioned)}")

    return problems
|
||||
|
||||
|
||||
def check_ods_doc_format() -> CheckResult:
    """Property 5: every ``BD_manual_*.md`` file in ODS/main/ has the required content.

    Returns:
        A failing result when ODS/main/ does not exist; otherwise a result
        whose ``details`` collects the problems of all documents, visited in
        sorted path order.
    """
    if not ODS_MAIN_DIR.is_dir():
        return CheckResult(
            property_id="Property 5",
            description="ODS 表级文档格式完整性",
            passed=False,
            details=["ODS/main/ 目录不存在"],
        )

    problems = [
        issue
        for doc in sorted(ODS_MAIN_DIR.glob("BD_manual_*.md"))
        for issue in _check_single_ods_doc(doc)
    ]

    return CheckResult(
        property_id="Property 5",
        description="ODS 表级文档格式完整性",
        passed=not problems,
        details=problems,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 6: ODS 表级文档命名规范
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_ods_doc_naming() -> CheckResult:
    """Property 6: ``.md`` files in ODS/main/ are named ``BD_manual_{table}.md``.

    Returns:
        A failing result when ODS/main/ does not exist; otherwise a result
        listing every ``.md`` entry whose name does not match the pattern.
    """
    if not ODS_MAIN_DIR.is_dir():
        return CheckResult(
            property_id="Property 6",
            description="ODS 表级文档命名规范",
            passed=False,
            details=["ODS/main/ 目录不存在"],
        )

    # Table part: lowercase letter first, then lowercase letters, digits or "_".
    valid_name = re.compile(r"^BD_manual_[a-z][a-z0-9_]*\.md$")
    offenders = [
        entry.name
        for entry in sorted(ODS_MAIN_DIR.iterdir())
        if entry.suffix == ".md" and valid_name.match(entry.name) is None
    ]

    return CheckResult(
        property_id="Property 6",
        description="ODS 表级文档命名规范(BD_manual_{表名}.md)",
        passed=not offenders,
        details=[f"命名不规范: {n}" for n in offenders],
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 7: 映射文档覆盖率
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_mapping_doc_coverage(ods_tables: list[str]) -> CheckResult:
    """Property 7: every ODS table has at least one mapping document.

    A mapping document is a file in ODS/mappings/ named
    ``mapping_{endpoint}_{table}.md``. The endpoint part is taken to be the
    text up to the first underscore after ``mapping_``; the rest, minus the
    ``.md`` suffix, is the table name. Matching the full table part avoids
    the false positive of a plain suffix test, where table ``card`` would be
    considered covered by ``mapping_X_member_card.md``.

    Args:
        ods_tables: Table names that need a mapping document.

    Returns:
        A failing result when ODS/mappings/ does not exist; otherwise a
        result listing every table without a mapping document.
    """
    if not ODS_MAPPINGS_DIR.is_dir():
        return CheckResult(
            property_id="Property 7",
            description="映射文档覆盖率",
            passed=False,
            details=["ODS/mappings/ 目录不存在"],
        )

    prefix, suffix = "mapping_", ".md"
    # Collect the covered table names once so each lookup below is O(1).
    covered: set[str] = set()
    for doc in ODS_MAPPINGS_DIR.glob("mapping_*.md"):
        stem = doc.name[len(prefix):-len(suffix)]
        endpoint, sep, table = stem.partition("_")
        # Skip names that lack an endpoint, the separator, or a table part.
        if endpoint and sep and table:
            covered.add(table)

    missing = [tbl for tbl in ods_tables if tbl not in covered]

    return CheckResult(
        property_id="Property 7",
        description="映射文档覆盖率(每张 ODS 表至少一份映射文档)",
        passed=not missing,
        details=[f"缺失映射文档: mapping_*_{t}.md" for t in missing],
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 8: 映射文档内容完整性
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _check_single_mapping_doc(filepath: Path) -> list[str]:
    """Validate one mapping document and return the problems found.

    Args:
        filepath: Path of the markdown document to inspect.

    Returns:
        One message per problem, each prefixed with the file name. An
        unreadable file yields a single "cannot read" message.
    """
    doc_name = filepath.name
    try:
        text = filepath.read_text(encoding="utf-8")
    except Exception as exc:
        return [f"{doc_name}: 无法读取 ({exc})"]

    # Required sections: a level-2 heading ("## ...") whose text contains the title.
    problems = [
        f"{doc_name}: 缺少「{section}」章节"
        for section in MAPPING_DOC_REQUIRED_SECTIONS
        if re.search(rf"^##\s+.*{re.escape(section)}", text, re.MULTILINE) is None
    ]

    # Endpoint-info attributes (substring test over the whole document).
    problems.extend(
        f"{doc_name}: 端点信息缺少「{key}」属性"
        for key in MAPPING_DOC_ENDPOINT_KEYS
        if key not in text
    )

    # ETL supplementary fields: the same field list as for ODS documents;
    # all missing ones are reported in a single message.
    unmentioned = [name for name in ODS_DOC_ETL_META_FIELDS if name not in text]
    if unmentioned:
        problems.append(f"{doc_name}: 未提及 ETL 补充字段: {', '.join(unmentioned)}")

    return problems
|
||||
|
||||
|
||||
def check_mapping_doc_content() -> CheckResult:
    """Property 8: every ``mapping_*.md`` file in ODS/mappings/ has the required content.

    Returns:
        A failing result when ODS/mappings/ does not exist; otherwise a
        result whose ``details`` collects the problems of all documents,
        visited in sorted path order.
    """
    if not ODS_MAPPINGS_DIR.is_dir():
        return CheckResult(
            property_id="Property 8",
            description="映射文档内容完整性",
            passed=False,
            details=["ODS/mappings/ 目录不存在"],
        )

    problems = [
        issue
        for doc in sorted(ODS_MAPPINGS_DIR.glob("mapping_*.md"))
        for issue in _check_single_mapping_doc(doc)
    ]

    return CheckResult(
        property_id="Property 8",
        description="映射文档内容完整性",
        passed=not problems,
        details=problems,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 9: 映射文档命名规范
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_mapping_doc_naming() -> CheckResult:
    """Property 9: mapping files are named ``mapping_{Endpoint}_{table}.md``.

    Only ``.md`` entries whose name starts with ``mapping_`` are examined.

    Returns:
        A failing result when ODS/mappings/ does not exist; otherwise a
        result listing every examined file whose name does not match.
    """
    if not ODS_MAPPINGS_DIR.is_dir():
        return CheckResult(
            property_id="Property 9",
            description="映射文档命名规范",
            passed=False,
            details=["ODS/mappings/ 目录不存在"],
        )

    # Endpoint: starts with an uppercase letter, alphanumeric only.
    # Table: lowercase letter first, then lowercase letters, digits or "_".
    valid_name = re.compile(r"^mapping_[A-Z][A-Za-z0-9]+_[a-z][a-z0-9_]*\.md$")
    offenders = [
        entry.name
        for entry in sorted(ODS_MAPPINGS_DIR.iterdir())
        if entry.suffix == ".md"
        and entry.name.startswith("mapping_")
        and valid_name.match(entry.name) is None
    ]

    return CheckResult(
        property_id="Property 9",
        description="映射文档命名规范(mapping_{API端点名}_{ODS表名}.md)",
        passed=not offenders,
        details=[f"命名不规范: {n}" for n in offenders],
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Property 10: ODS 数据字典覆盖率
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def check_ods_dictionary_coverage(ods_tables: list[str]) -> CheckResult:
    """Property 10: the ODS data dictionary mentions every ODS table.

    A table counts as mentioned when its name appears as a whole identifier,
    i.e. not embedded in a longer run of ASCII letters, digits or
    underscores, or as the table part of a ``BD_manual_{table}`` file name.
    A plain substring test would wrongly accept table ``member`` when only
    ``member_card`` is listed.

    Args:
        ods_tables: Table names that must appear in the dictionary.

    Returns:
        A failing result when the dictionary file is missing or unreadable;
        otherwise a result listing every table that is not mentioned.
    """
    if not ODS_DICT_PATH.is_file():
        return CheckResult(
            property_id="Property 10",
            description="ODS 数据字典覆盖率",
            passed=False,
            details=[f"数据字典文件不存在: {ODS_DICT_PATH}"],
        )

    try:
        content = ODS_DICT_PATH.read_text(encoding="utf-8")
    except Exception as e:
        return CheckResult(
            property_id="Property 10",
            description="ODS 数据字典覆盖率",
            passed=False,
            details=[f"无法读取数据字典: {e}"],
        )

    # Explicit ASCII identifier classes are used instead of ``\b``: for str
    # patterns ``\w`` is Unicode-aware, so ``\b`` would not see a boundary
    # between a table name and directly adjacent CJK text.
    ident = "A-Za-z0-9_"

    def is_mentioned(tbl: str) -> bool:
        pattern = (
            rf"(?:(?<![{ident}])|(?<=BD_manual_))"
            rf"{re.escape(tbl)}"
            rf"(?![{ident}])"
        )
        return re.search(pattern, content) is not None

    missing = [tbl for tbl in ods_tables if not is_mentioned(tbl)]

    return CheckResult(
        property_id="Property 10",
        description="ODS 数据字典覆盖率",
        passed=not missing,
        details=[f"数据字典缺失条目: {t}" for t in missing],
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 报告输出
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def print_report(results: list[CheckResult]) -> None:
    """Print the validation report to standard output.

    Each result gets a PASS/FAIL line; failed results are followed by at
    most 20 detail lines plus a count of the remaining ones. A summary with
    the number of passed checks closes the report.

    Args:
        results: Check results, printed in the given order.
    """
    heavy_rule = "=" * 60
    max_shown = 20  # cap on detail lines printed per failed check

    print(heavy_rule)
    print("BD_Manual 文档体系验证报告")
    print(heavy_rule)

    for res in results:
        label = "✓ PASS" if res.passed else "✗ FAIL"
        print(f"\n[{label}] {res.property_id}: {res.description}")
        if res.passed:
            continue
        for line in res.details[:max_shown]:
            print(f" - {line}")
        hidden = len(res.details) - max_shown
        if hidden > 0:
            print(f" ... 还有 {hidden} 条问题")

    total = len(results)
    passed_count = len([res for res in results if res.passed])

    print("\n" + "-" * 60)
    print(f"结果: {passed_count}/{total} 项通过")
    if passed_count < total:
        print("存在未通过的验证项,请检查上述详情。")
    else:
        print("所有验证项均通过 ✓")
    print(heavy_rule)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 主入口
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: run all checks and report the outcome.

    The connection string comes from ``--pg-dsn`` or, failing that, the
    ``PG_DSN`` environment variable (after an optional ``.env`` load).

    Args:
        argv: Argument list for the parser; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 when every check passes. 1 when the DSN is missing, fetching the
        table list raises, the ``ods`` schema has no tables, or any check fails.
    """
    cli = argparse.ArgumentParser(
        description="验证 BD_Manual 文档体系的覆盖率、格式和命名规范",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
# 从 .env 或 PG_DSN 环境变量读取连接字符串
python scripts/validate_bd_manual.py

# 指定连接字符串
python scripts/validate_bd_manual.py --pg-dsn "postgresql://user:pass@host/db"
""",
    )
    cli.add_argument(
        "--pg-dsn",
        help="PostgreSQL 连接字符串(默认从 PG_DSN 环境变量或 .env 读取)",
    )
    options = cli.parse_args(argv)

    # python-dotenv is optional: without it only the real environment is used.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    # The command-line value takes precedence over the environment.
    dsn = options.pg_dsn
    if not dsn:
        dsn = os.environ.get("PG_DSN")
    if not dsn:
        print(
            "✗ 未提供 PG_DSN,请通过 --pg-dsn 参数或 PG_DSN 环境变量指定",
            file=sys.stderr,
        )
        return 1

    # The list of ODS tables in the database is the baseline for the checks.
    try:
        ods_tables = fetch_ods_tables(dsn)
    except Exception as exc:
        print(f"✗ 连接数据库失败: {exc}", file=sys.stderr)
        return 1

    if not ods_tables:
        print("⚠ ods schema 中未找到任何表", file=sys.stderr)
        return 1

    print(f"从数据库获取到 {len(ods_tables)} 张 ODS 表\n")

    # Run every check, in property order.
    outcomes: list[CheckResult] = [
        check_directory_structure(),                # Property 1
        check_ods_doc_coverage(ods_tables),         # Property 4
        check_ods_doc_format(),                     # Property 5
        check_ods_doc_naming(),                     # Property 6
        check_mapping_doc_coverage(ods_tables),     # Property 7
        check_mapping_doc_content(),                # Property 8
        check_mapping_doc_naming(),                 # Property 9
        check_ods_dictionary_coverage(ods_tables),  # Property 10
    ]

    print_report(outcomes)

    # Any failed check results in a non-zero exit code.
    return 0 if all(outcome.passed for outcome in outcomes) else 1
|
||||
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user