Files
Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/audit/doc_alignment_analyzer.py

609 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
文档对齐分析器 — 检查文档与代码之间的映射关系、过期点、冲突点和缺失点。
文档来源:
- docs/ 目录(.md, .txt, .csv, .json)
- 根目录 README.md
- 各模块内的 README.md
- .kiro/steering/ 引导文件
- docs/test-json-doc/ API 响应样本
"""
from __future__ import annotations
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from scripts.audit import AlignmentIssue, DocMapping
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Documentation file extensions (.json is handled separately, only for the
# docs/test-json-doc API sample directory).
_DOC_EXTENSIONS = {".md", ".txt", ".csv"}
# Core code directories -- their modules should be reported when undocumented.
_CORE_CODE_DIRS = {
    "tasks",
    "loaders",
    "orchestration",
    "quality",
    "models",
    "utils",
    "api",
    "scd",
    "config",
    "database",
}
# Common metadata columns of ODS tables, meant to be ignored when comparing.
# NOTE(review): this set is not referenced anywhere in the visible portion of
# this file -- confirm whether the API-sample comparison should consult it.
_ODS_META_COLUMNS = {"content_hash", "payload", "created_at", "updated_at", "id"}
# SQL keywords excluded when parsing column names out of DDL bodies.
_SQL_KEYWORDS = {
    "primary", "key", "not", "null", "default", "unique", "check",
    "references", "foreign", "constraint", "index", "create", "table",
    "if", "exists", "serial", "bigserial", "true", "false",
}
# ---------------------------------------------------------------------------
# 安全读取文件(编码回退)
# ---------------------------------------------------------------------------
def _safe_read(path: Path) -> str:
"""尝试以 utf-8 → gbk → latin-1 回退读取文件内容。"""
for enc in ("utf-8", "gbk", "latin-1"):
try:
return path.read_text(encoding=enc)
except (UnicodeDecodeError, UnicodeError):
continue
return ""
# ---------------------------------------------------------------------------
# scan_docs -- scan all documentation sources
# ---------------------------------------------------------------------------
def scan_docs(repo_root: Path) -> list[str]:
    """Collect every documentation file, returned as a sorted list of
    repo-relative forward-slash paths (duplicates removed).

    Sources scanned:
      1. docs/ recursively (.md, .txt, .csv, plus .json API samples)
      2. the repository-level README.md
      3. README.md inside each first-level module directory
      4. files directly under .kiro/steering/
    """
    found: set[str] = set()

    def relative(path: Path) -> str:
        # Normalise to forward slashes so reports are OS-independent.
        return str(path.relative_to(repo_root)).replace("\\", "/")

    # 1. docs/ tree (recursive; .json included for test-json-doc samples).
    docs_root = repo_root / "docs"
    if docs_root.is_dir():
        wanted = _DOC_EXTENSIONS | {".json"}
        found.update(
            relative(f)
            for f in docs_root.rglob("*")
            if f.is_file() and f.suffix.lower() in wanted
        )

    # 2. Repository-level README.
    if (repo_root / "README.md").is_file():
        found.add("README.md")

    # 3. Per-module READMEs (docs/ and .kiro are covered elsewhere).
    for entry in repo_root.iterdir():
        if not entry.is_dir() or entry.name in ("docs", ".kiro"):
            continue
        module_readme = entry / "README.md"
        if module_readme.is_file():
            found.add(relative(module_readme))

    # 4. Steering documents.
    steering = repo_root / ".kiro" / "steering"
    if steering.is_dir():
        found.update(relative(f) for f in steering.iterdir() if f.is_file())

    return sorted(found)
# ---------------------------------------------------------------------------
# extract_code_references -- pull code references out of a document
# ---------------------------------------------------------------------------
def extract_code_references(doc_path: Path) -> list[str]:
    """Extract code references (backtick-quoted file paths, class names,
    function names, ...) from a document.

    Rules:
      - only backtick-delimited spans are considered
      - single-character spans are dropped
      - purely numeric / version-like spans (digits and dots) are dropped
      - backslashes are normalised to forward slashes
      - duplicates are removed, first occurrence order preserved
    """
    if not doc_path.is_file():
        return []
    text = _safe_read(doc_path)
    if not text:
        return []

    # dict preserves insertion order, giving cheap ordered de-duplication.
    ordered: dict[str, None] = {}
    for span in re.findall(r"`([^`]+)`", text):
        candidate = span.strip().replace("\\", "/")
        if len(candidate) <= 1:
            continue  # single characters are noise, not references
        if re.fullmatch(r"[\d.]+", candidate):
            continue  # bare numbers / version strings
        ordered.setdefault(candidate, None)
    return list(ordered)
# ---------------------------------------------------------------------------
# check_reference_validity -- does a documented reference still resolve?
# ---------------------------------------------------------------------------
def check_reference_validity(ref: str, repo_root: Path) -> bool:
    """Return True when a code reference from the docs still resolves.

    Resolution order:
      1. the reference taken literally as a path under the repo root
      2. the reference with a legacy package prefix stripped
      3. a dotted module path (no slashes) converted to a .py file or a
         package directory, e.g. config.settings -> config/settings.py
    """
    # 1. Literal file or directory path.
    if (repo_root / ref).exists():
        return True

    # 2. Historical package prefixes kept for backwards compatibility.
    for legacy in ("FQ-ETL/", "etl_billiards/"):
        if ref.startswith(legacy) and (repo_root / ref[len(legacy):]).exists():
            return True

    # 3. Dotted module notation.
    if "." in ref and "/" not in ref:
        dotted = ref.replace(".", "/")
        if (repo_root / (dotted + ".py")).exists():
            return True
        if (repo_root / dotted).is_dir():  # might be a package
            return True

    return False
# ---------------------------------------------------------------------------
# find_undocumented_modules -- core code files no document mentions
# ---------------------------------------------------------------------------
def find_undocumented_modules(
    repo_root: Path,
    documented: set[str],
) -> list[str]:
    """List core-module .py files missing from *documented*.

    Only files under _CORE_CODE_DIRS are inspected and __init__.py is
    skipped.  Returns sorted repo-relative forward-slash paths.
    """
    missing: list[str] = []
    for dir_name in sorted(_CORE_CODE_DIRS):
        base = repo_root / dir_name
        if not base.is_dir():
            continue
        for source in base.rglob("*.py"):
            if source.name == "__init__.py":
                continue
            # Normalise separators so comparisons work cross-platform.
            rel_path = str(source.relative_to(repo_root)).replace("\\", "/")
            if rel_path not in documented:
                missing.append(rel_path)
    return sorted(missing)
# ---------------------------------------------------------------------------
# DDL / data-dictionary parsing helpers
# ---------------------------------------------------------------------------
def _parse_ddl_tables(sql: str) -> dict[str, set[str]]:
    """Extract {table_name: {column names}} from CREATE TABLE DDL.

    Schema prefixes are dropped (billiards_dwd.dim_member -> dim_member).
    A column name is the first identifier of each line in the table body,
    lowercased, with SQL keywords filtered out.
    """
    create_pattern = re.compile(
        r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
        r"(?:\w+\.)?(\w+)\s*\(",
        re.IGNORECASE,
    )
    result: dict[str, set[str]] = {}
    for hit in create_pattern.finditer(sql):
        # Walk forward to the parenthesis closing the column list,
        # tracking nesting so types like numeric(10, 2) don't end it early.
        cursor = hit.end()
        nesting = 1
        while cursor < len(sql) and nesting > 0:
            ch = sql[cursor]
            if ch == "(":
                nesting += 1
            elif ch == ")":
                nesting -= 1
            cursor += 1
        body = sql[hit.end():cursor - 1]

        cols: set[str] = set()
        for raw_line in body.split("\n"):
            stripped = raw_line.strip().rstrip(",")
            if not stripped:
                continue
            first_word = re.match(r"(\w+)", stripped)
            if first_word is None:
                continue
            ident = first_word.group(1).lower()
            # Constraint lines start with keywords (PRIMARY, FOREIGN, ...).
            if ident not in _SQL_KEYWORDS:
                cols.add(ident)
        result[hit.group(1)] = cols
    return result
def _parse_dictionary_tables(md: str) -> dict[str, set[str]]:
"""从数据字典 Markdown 中提取表名和字段名。
约定:
- 表名出现在 ## 标题中(可能带反引号)
- 字段名出现在 Markdown 表格的第一列
- 跳过表头行(含"字段"字样)和分隔行(含 ---
"""
tables: dict[str, set[str]] = {}
current_table: str | None = None
for line in md.split("\n"):
# 匹配 ## 标题中的表名
heading = re.match(r"^##\s+`?(\w+)`?", line)
if heading:
current_table = heading.group(1)
tables[current_table] = set()
continue
if current_table is None:
continue
# 跳过分隔行
if re.match(r"^\s*\|[-\s|]+\|\s*$", line):
continue
# 解析表格行
row_match = re.match(r"^\s*\|\s*(\S+)", line)
if row_match:
field = row_match.group(1)
# 跳过表头(含"字段"字样)
if field in ("字段",):
continue
tables[current_table].add(field)
return tables
# ---------------------------------------------------------------------------
# check_ddl_vs_dictionary -- compare DDL against the data dictionary
# ---------------------------------------------------------------------------
def check_ddl_vs_dictionary(repo_root: Path) -> list[AlignmentIssue]:
    """Cross-check DDL files against the data-dictionary documents.

    Produces:
      1. a "missing" issue for each table the DDL defines but the
         dictionary never mentions
      2. a "conflict" issue for each column present in the DDL but absent
         from the dictionary entry of the same table
    """
    def _merge(target: dict[str, set[str]], source: dict[str, set[str]]) -> None:
        # Union column/field sets when the same table appears in several files.
        for name, members in source.items():
            target.setdefault(name, set()).update(members)

    # All table definitions found in DDL files.
    ddl: dict[str, set[str]] = {}
    database_dir = repo_root / "database"
    if database_dir.is_dir():
        for ddl_file in sorted(database_dir.glob("schema_*.sql")):
            _merge(ddl, _parse_ddl_tables(_safe_read(ddl_file)))

    # All table definitions found in dictionary documents.
    dictionary: dict[str, set[str]] = {}
    docs_dir = repo_root / "docs"
    if docs_dir.is_dir():
        for md_file in sorted(docs_dir.glob("*dictionary*.md")):
            _merge(dictionary, _parse_dictionary_tables(_safe_read(md_file)))

    # Compare.
    findings: list[AlignmentIssue] = []
    for table, ddl_columns in sorted(ddl.items()):
        if table not in dictionary:
            findings.append(AlignmentIssue(
                doc_path="docs/*dictionary*.md",
                issue_type="missing",
                description=f"DDL 定义了表 `{table}`,但数据字典中未收录",
                related_code=f"database/schema_*.sql ({table})",
            ))
            continue
        for column in sorted(ddl_columns - dictionary[table]):
            findings.append(AlignmentIssue(
                doc_path="docs/*dictionary*.md",
                issue_type="conflict",
                description=f"表 `{table}` 的列 `{column}` 在 DDL 中存在但数据字典中缺失",
                related_code=f"database/schema_*.sql ({table}.{column})",
            ))
    return findings
# ---------------------------------------------------------------------------
# check_api_samples_vs_parsers -- compare API samples with ODS schemas
# ---------------------------------------------------------------------------
def check_api_samples_vs_parsers(repo_root: Path) -> list[AlignmentIssue]:
    """Check API response samples against the ODS table schemas.

    Strategy:
      1. walk docs/test-json-doc/*.json
      2. take each sample's top-level key names (first element when the
         sample is a JSON array)
      3. find an ODS table whose name contains the sample file's stem
      4. report sample fields the matched table does not define

    NOTE(review): the module docstring mentions ignoring ODS metadata
    columns, but this comparison only flags sample-side extras, so
    _ODS_META_COLUMNS is never consulted in this function -- confirm
    that is intentional.
    """
    findings: list[AlignmentIssue] = []
    sample_dir = repo_root / "docs" / "test-json-doc"
    if not sample_dir.is_dir():
        return findings

    # ODS table schemas, full column sets (later files overwrite earlier).
    ods_schema: dict[str, set[str]] = {}
    database_dir = repo_root / "database"
    if database_dir.is_dir():
        for ddl_file in sorted(database_dir.glob("schema_*ODS*.sql")):
            ods_schema.update(_parse_ddl_tables(_safe_read(ddl_file)))

    for sample_file in sorted(sample_dir.glob("*.json")):
        entity = sample_file.stem  # file stem doubles as the entity name
        try:
            payload = json.loads(_safe_read(sample_file))
        except (json.JSONDecodeError, ValueError):
            continue  # unparseable sample -- skip silently

        # Top-level field names of the sample.
        fields: set[str] = set()
        if isinstance(payload, list) and payload:
            # Array-shaped sample: use the keys of the first element.
            if isinstance(payload[0], dict):
                fields = set(payload[0].keys())
        elif isinstance(payload, dict):
            fields = set(payload.keys())
        if not fields:
            continue

        # First ODS table whose name contains the entity name wins
        # (e.g. "test_entity" matches "billiards_ods.test_entity").
        entity_lower = entity.lower()
        table: str | None = None
        table_cols: set[str] = set()
        for name, cols in ods_schema.items():
            if entity_lower in name.lower():
                table = name
                table_cols = cols
                break
        if table is None:
            continue

        # Sample fields the matched ODS table does not define.
        for field in sorted(fields - table_cols):
            findings.append(AlignmentIssue(
                doc_path=f"docs/test-json-doc/{sample_file.name}",
                issue_type="conflict",
                description=(
                    f"API 样本字段 `{field}` 在 ODS 表 `{table}` 中未定义"
                ),
                related_code=f"database/schema_*ODS*.sql ({table})",
            ))
    return findings
# ---------------------------------------------------------------------------
# build_mappings -- relate each document to the code it references
# ---------------------------------------------------------------------------
def build_mappings(
    doc_paths: list[str],
    repo_root: Path,
) -> list[DocMapping]:
    """Build a DocMapping for every document path.

    Status semantics:
      - "orphan"  -- the document contains no code references at all
      - "stale"   -- at least one reference no longer resolves
      - "aligned" -- every reference resolves
    Only valid references are recorded in related_code.
    """
    result: list[DocMapping] = []
    for rel in doc_paths:
        full_path = repo_root / rel
        references = extract_code_references(full_path)
        resolved = [
            r for r in references if check_reference_validity(r, repo_root)
        ]

        if not references:
            state = "orphan"
        elif len(resolved) < len(references):
            state = "stale"  # something documented no longer exists
        else:
            state = "aligned"

        result.append(DocMapping(
            doc_path=rel,
            doc_topic=_infer_topic(full_path, rel),
            related_code=resolved,
            status=state,
        ))
    return result
def _infer_topic(doc_path: Path, doc_rel: str) -> str:
    """Best-effort document topic: the first Markdown H1 when present,
    otherwise the relative path itself."""
    if doc_path.is_file() and doc_path.suffix.lower() in (".md", ".txt"):
        try:
            for raw in _safe_read(doc_path).split("\n"):
                candidate = raw.strip()
                if candidate.startswith("# "):
                    return candidate[2:].strip()
        except Exception:
            # _safe_read already swallows decode errors; this guard only
            # covers unexpected failures so topic inference never aborts
            # the audit.
            pass
    return doc_rel
# ---------------------------------------------------------------------------
# render_alignment_report -- Markdown report of documentation alignment
# ---------------------------------------------------------------------------
def render_alignment_report(
    mappings: list[DocMapping],
    issues: list[AlignmentIssue],
    repo_root: str,
) -> str:
    """Render the documentation-alignment report as Markdown.

    Sections: mapping table, stale items, conflicts, missing items, and a
    summary of counts.
    """
    out: list[str] = []

    # --- Header ---
    generated = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    out += [
        "# 文档对齐报告",
        "",
        f"- 生成时间:{generated}",
        f"- 仓库路径:`{repo_root}`",
        "",
        "## 映射关系",
        "",
    ]

    # --- Mapping table ---
    if mappings:
        out.append("| 文档路径 | 主题 | 关联代码 | 状态 |")
        out.append("|---|---|---|---|")
        for entry in mappings:
            codes = (
                ", ".join(f"`{c}`" for c in entry.related_code)
                if entry.related_code
                else ""
            )
            out.append(
                f"| `{entry.doc_path}` | {entry.doc_topic} | {codes} | {entry.status} |"
            )
    else:
        out.append("未发现文档映射关系。")
    out.append("")

    # --- Group issues by type (unknown types are dropped, as before) ---
    grouped = {"stale": [], "conflict": [], "missing": []}
    for issue in issues:
        if issue.issue_type in grouped:
            grouped[issue.issue_type].append(issue)

    def section(title, rows, empty_message):
        # One Markdown section per issue type; table when non-empty.
        out.append(title)
        out.append("")
        if rows:
            out.append("| 文档路径 | 描述 | 关联代码 |")
            out.append("|---|---|---|")
            for row in rows:
                out.append(
                    f"| `{row.doc_path}` | {row.description} | `{row.related_code}` |"
                )
        else:
            out.append(empty_message)
        out.append("")

    section("## 过期点", grouped["stale"], "未发现过期点。")
    section("## 冲突点", grouped["conflict"], "未发现冲突点。")
    section("## 缺失点", grouped["missing"], "未发现缺失点。")

    # --- Summary counts ---
    out += [
        "## 统计摘要",
        "",
        f"- 文档总数:{len(mappings)}",
        f"- 过期点数量:{len(grouped['stale'])}",
        f"- 冲突点数量:{len(grouped['conflict'])}",
        f"- 缺失点数量:{len(grouped['missing'])}",
        "",
    ]
    return "\n".join(out)