# -*- coding: utf-8 -*-
"""
文档对齐分析器 — 检查文档与代码之间的映射关系、过期点、冲突点和缺失点。

文档来源:
- docs/ 目录(.md, .txt, .csv, .json)
- 根目录 README.md
- 各模块内的 README.md
- .kiro/steering/ 引导文件
- docs/test-json-doc/ API 响应样本
"""

from __future__ import annotations

import json
import re
from datetime import datetime, timezone
from pathlib import Path

from scripts.audit import AlignmentIssue, DocMapping

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# File extensions treated as documentation.
_DOC_EXTENSIONS = {".md", ".txt", ".csv"}

# Core code directories — modules here should be reported when undocumented.
_CORE_CODE_DIRS = {
    "tasks",
    "loaders",
    "orchestration",
    "quality",
    "models",
    "utils",
    "api",
    "scd",
    "config",
    "database",
}

# Common ETL metadata columns in ODS tables, ignored during comparison.
_ODS_META_COLUMNS = {"content_hash", "payload", "created_at", "updated_at", "id"}

# SQL keywords excluded when parsing column names out of DDL statements.
_SQL_KEYWORDS = {
    "primary", "key", "not", "null", "default", "unique", "check",
    "references", "foreign", "constraint", "index", "create", "table",
    "if", "exists", "serial", "bigserial", "true", "false",
}

# ---------------------------------------------------------------------------
|
||
# 安全读取文件(编码回退)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _safe_read(path: Path) -> str:
|
||
"""尝试以 utf-8 → gbk → latin-1 回退读取文件内容。"""
|
||
for enc in ("utf-8", "gbk", "latin-1"):
|
||
try:
|
||
return path.read_text(encoding=enc)
|
||
except (UnicodeDecodeError, UnicodeError):
|
||
continue
|
||
return ""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# scan_docs — scan every documentation source
# ---------------------------------------------------------------------------


def scan_docs(repo_root: Path) -> list[str]:
    """Scan all documentation files and return sorted, de-duplicated relative paths.

    Sources:
    1. .md/.txt/.csv/.json files anywhere under docs/
    2. README.md at the repository root
    3. README.md inside each top-level module directory (e.g. gui/README.md)
    4. steering files under .kiro/steering/
    """
    found: set[str] = set()

    def normalized(p: Path) -> str:
        # Forward-slash relative path, stable across platforms.
        return str(p.relative_to(repo_root)).replace("\\", "/")

    # 1. docs/ (recursive; .json included for test-json-doc samples)
    docs_dir = repo_root / "docs"
    if docs_dir.is_dir():
        found.update(
            normalized(p)
            for p in docs_dir.rglob("*")
            if p.is_file()
            and (p.suffix.lower() in _DOC_EXTENSIONS or p.suffix.lower() == ".json")
        )

    # 2. root README.md
    if (repo_root / "README.md").is_file():
        found.add("README.md")

    # 3. per-module README.md (docs/ and .kiro/ handled separately above/below)
    for child in repo_root.iterdir():
        if not child.is_dir() or child.name in ("docs", ".kiro"):
            continue
        readme = child / "README.md"
        if readme.is_file():
            found.add(normalized(readme))

    # 4. .kiro/steering/
    steering = repo_root / ".kiro" / "steering"
    if steering.is_dir():
        found.update(normalized(p) for p in steering.iterdir() if p.is_file())

    return sorted(found)
# ---------------------------------------------------------------------------
# extract_code_references — pull code references out of a document
# ---------------------------------------------------------------------------


def extract_code_references(doc_path: Path) -> list[str]:
    """Extract code references (backticked file paths, class/function names).

    Rules:
    - take every backtick-quoted span
    - normalize backslashes to forward slashes
    - drop single-character spans
    - drop purely numeric / version-like spans
    - de-duplicate, preserving first-seen order
    """
    if not doc_path.is_file():
        return []

    text = _safe_read(doc_path)
    if not text:
        return []

    # dict keys keep insertion order, giving ordered de-duplication for free
    ordered: dict[str, None] = {}
    for span in re.findall(r"`([^`]+)`", text):
        candidate = span.strip().replace("\\", "/")
        if len(candidate) <= 1:
            continue  # too short to be a meaningful reference
        if re.fullmatch(r"[\d.]+", candidate):
            continue  # bare number or version string like 1.2.3
        ordered.setdefault(candidate, None)

    return list(ordered)
# ---------------------------------------------------------------------------
# check_reference_validity — verify a documented reference still resolves
# ---------------------------------------------------------------------------


def check_reference_validity(ref: str, repo_root: Path) -> bool:
    """Return True if a code reference found in the docs still resolves.

    Resolution order:
    1. literal file/directory path under the repo root
    2. same path with a legacy package prefix stripped
       (FQ-ETL/, etl_billiards/ — kept for historical documents)
    3. dotted module path converted to a file or a package directory
       (config.settings → config/settings.py, or config/settings/)
    """
    # 1. literal path
    if (repo_root / ref).exists():
        return True

    # 2. legacy prefixes
    stripped_variants = [
        ref[len(prefix):]
        for prefix in ("FQ-ETL/", "etl_billiards/")
        if ref.startswith(prefix)
    ]
    if any((repo_root / variant).exists() for variant in stripped_variants):
        return True

    # 3. dotted module path (only when it is not already a slash path)
    if "." in ref and "/" not in ref:
        dotted = ref.replace(".", "/")
        return (repo_root / f"{dotted}.py").exists() or (repo_root / dotted).is_dir()

    return False
# ---------------------------------------------------------------------------
# find_undocumented_modules — core code modules with no documentation
# ---------------------------------------------------------------------------


def find_undocumented_modules(
    repo_root: Path,
    documented: set[str],
) -> list[str]:
    """Return sorted relative paths of core .py modules absent from *documented*.

    Only the directories listed in _CORE_CODE_DIRS are inspected, and
    __init__.py files are skipped.
    """
    missing: set[str] = set()

    for dir_name in _CORE_CODE_DIRS:
        base = repo_root / dir_name
        if not base.is_dir():
            continue
        for module in base.rglob("*.py"):
            if module.name == "__init__.py":
                continue
            # normalize to forward slashes for cross-platform comparison
            rel = str(module.relative_to(repo_root)).replace("\\", "/")
            if rel not in documented:
                missing.add(rel)

    return sorted(missing)
# ---------------------------------------------------------------------------
# DDL / data-dictionary parsing helpers
# ---------------------------------------------------------------------------


def _parse_ddl_tables(sql: str) -> dict[str, set[str]]:
    """Extract table and column names from DDL SQL.

    Returns {table_name: {column_names}}. A schema prefix on the table name
    is dropped (billiards_dwd.dim_member → dim_member).
    """
    # CREATE TABLE [IF NOT EXISTS] [schema.]table_name (
    create_stmt = re.compile(
        r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
        r"(?:\w+\.)?(\w+)\s*\(",
        re.IGNORECASE,
    )
    first_word = re.compile(r"(\w+)")

    tables: dict[str, set[str]] = {}
    for stmt in create_stmt.finditer(sql):
        # Walk forward to the parenthesis that closes the column list.
        body_start = stmt.end()
        cursor = body_start
        depth = 1
        while cursor < len(sql) and depth:
            ch = sql[cursor]
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
            cursor += 1
        body = sql[body_start:cursor - 1]

        # One column per line — the first identifier on the line is the name.
        cols: set[str] = set()
        for raw_line in body.split("\n"):
            stripped = raw_line.strip().rstrip(",")
            if not stripped:
                continue
            token = first_word.match(stripped)
            if token is None:
                continue
            word = token.group(1).lower()
            # Constraint lines start with SQL keywords; skip them.
            if word not in _SQL_KEYWORDS:
                cols.add(word)

        tables[stmt.group(1)] = cols

    return tables
def _parse_dictionary_tables(md: str) -> dict[str, set[str]]:
|
||
"""从数据字典 Markdown 中提取表名和字段名。
|
||
|
||
约定:
|
||
- 表名出现在 ## 标题中(可能带反引号)
|
||
- 字段名出现在 Markdown 表格的第一列
|
||
- 跳过表头行(含"字段"字样)和分隔行(含 ---)
|
||
"""
|
||
tables: dict[str, set[str]] = {}
|
||
current_table: str | None = None
|
||
|
||
for line in md.split("\n"):
|
||
# 匹配 ## 标题中的表名
|
||
heading = re.match(r"^##\s+`?(\w+)`?", line)
|
||
if heading:
|
||
current_table = heading.group(1)
|
||
tables[current_table] = set()
|
||
continue
|
||
|
||
if current_table is None:
|
||
continue
|
||
|
||
# 跳过分隔行
|
||
if re.match(r"^\s*\|[-\s|]+\|\s*$", line):
|
||
continue
|
||
|
||
# 解析表格行
|
||
row_match = re.match(r"^\s*\|\s*(\S+)", line)
|
||
if row_match:
|
||
field = row_match.group(1)
|
||
# 跳过表头(含"字段"字样)
|
||
if field in ("字段",):
|
||
continue
|
||
tables[current_table].add(field)
|
||
|
||
return tables
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# check_ddl_vs_dictionary — compare DDL against the data dictionary
# ---------------------------------------------------------------------------


def check_ddl_vs_dictionary(repo_root: Path) -> list[AlignmentIssue]:
    """Compare DDL files with the data-dictionary documents.

    Reports:
    1. tables defined in DDL but absent from the dictionary → "missing"
    2. columns of a shared table present in DDL but absent from the
       dictionary → "conflict"
    """
    def _merge(target: dict[str, set[str]], parsed: dict[str, set[str]]) -> None:
        # Union per-table column sets across multiple source files.
        for name, cols in parsed.items():
            target.setdefault(name, set()).update(cols)

    # All DDL table definitions.
    ddl_tables: dict[str, set[str]] = {}
    db_dir = repo_root / "database"
    if db_dir.is_dir():
        for sql_file in sorted(db_dir.glob("schema_*.sql")):
            _merge(ddl_tables, _parse_ddl_tables(_safe_read(sql_file)))

    # All data-dictionary table definitions.
    dict_tables: dict[str, set[str]] = {}
    docs_dir = repo_root / "docs"
    if docs_dir.is_dir():
        for md_file in sorted(docs_dir.glob("*dictionary*.md")):
            _merge(dict_tables, _parse_dictionary_tables(_safe_read(md_file)))

    # Compare.
    issues: list[AlignmentIssue] = []
    for tbl in sorted(ddl_tables):
        if tbl not in dict_tables:
            issues.append(AlignmentIssue(
                doc_path="docs/*dictionary*.md",
                issue_type="missing",
                description=f"DDL 定义了表 `{tbl}`,但数据字典中未收录",
                related_code=f"database/schema_*.sql ({tbl})",
            ))
            continue
        for col in sorted(ddl_tables[tbl] - dict_tables[tbl]):
            issues.append(AlignmentIssue(
                doc_path="docs/*dictionary*.md",
                issue_type="conflict",
                description=f"表 `{tbl}` 的列 `{col}` 在 DDL 中存在但数据字典中缺失",
                related_code=f"database/schema_*.sql ({tbl}.{col})",
            ))

    return issues
# ---------------------------------------------------------------------------
# check_api_samples_vs_parsers — compare API samples against ODS tables
# ---------------------------------------------------------------------------


def check_api_samples_vs_parsers(repo_root: Path) -> list[AlignmentIssue]:
    """Cross-check API response samples against the ODS table structures.

    Strategy:
    1. scan the .json files under docs/test-json-doc/
    2. collect each sample's top-level field names
    3. find an ODS table whose name contains the sample's file stem
    4. report sample fields absent from the table as "conflict" issues,
       ignoring ODS metadata columns (see _ODS_META_COLUMNS)
    """
    issues: list[AlignmentIssue] = []

    sample_dir = repo_root / "docs" / "test-json-doc"
    if not sample_dir.is_dir():
        return issues

    # Collect ODS table definitions (all columns kept; metadata names are
    # excluded at comparison time below).
    ods_tables: dict[str, set[str]] = {}
    db_dir = repo_root / "database"
    if db_dir.is_dir():
        for sql_file in sorted(db_dir.glob("schema_*ODS*.sql")):
            content = _safe_read(sql_file)
            for tbl, cols in _parse_ddl_tables(content).items():
                ods_tables[tbl] = cols

    # Compare sample files one by one.
    for json_file in sorted(sample_dir.glob("*.json")):
        entity_name = json_file.stem  # file stem doubles as the entity name

        try:
            data = json.loads(_safe_read(json_file))
        except (json.JSONDecodeError, ValueError):
            continue  # unparseable sample — nothing to compare

        # Top-level field names: dict keys, or the keys of the first element
        # of an array-shaped sample.
        sample_fields: set[str] = set()
        if isinstance(data, list) and data:
            if isinstance(data[0], dict):
                sample_fields = set(data[0].keys())
        elif isinstance(data, dict):
            sample_fields = set(data.keys())

        if not sample_fields:
            continue

        # Find the matching ODS table: table name contains the entity name
        # (e.g. test_entity matches billiards_ods.test_entity).
        matched_table: str | None = None
        matched_cols: set[str] = set()
        entity_lower = entity_name.lower()
        for tbl, cols in ods_tables.items():
            if entity_lower in tbl.lower():
                matched_table, matched_cols = tbl, cols
                break

        if matched_table is None:
            continue

        # Fields present in the sample but missing from the ODS table.
        # BUGFIX: the docstring promised that ODS metadata columns are
        # ignored during comparison, but _ODS_META_COLUMNS was never applied;
        # subtract it so ETL-internal names are not reported as conflicts.
        # (Assumes "ignore" means sample fields with metadata names are
        # exempt — confirm against the dictionary conventions.)
        extra_fields = sample_fields - matched_cols - _ODS_META_COLUMNS
        for field in sorted(extra_fields):
            issues.append(AlignmentIssue(
                doc_path=f"docs/test-json-doc/{json_file.name}",
                issue_type="conflict",
                description=(
                    f"API 样本字段 `{field}` 在 ODS 表 `{matched_table}` 中未定义"
                ),
                related_code=f"database/schema_*ODS*.sql ({matched_table})",
            ))

    return issues
# ---------------------------------------------------------------------------
# build_mappings — build the document-to-code mapping table
# ---------------------------------------------------------------------------


def build_mappings(
    doc_paths: list[str],
    repo_root: Path,
) -> list[DocMapping]:
    """Build a DocMapping for every document, linking it to its valid code refs."""
    mappings: list[DocMapping] = []

    for doc_rel in doc_paths:
        doc_path = repo_root / doc_rel
        refs = extract_code_references(doc_path)

        # Partition references into those that still resolve and the rest.
        valid_refs = [r for r in refs if check_reference_validity(r, repo_root)]
        has_stale = len(valid_refs) != len(refs)

        if not refs:
            status = "orphan"   # document references no code at all
        elif has_stale:
            status = "stale"    # at least one reference no longer resolves
        else:
            status = "aligned"

        mappings.append(DocMapping(
            doc_path=doc_rel,
            doc_topic=_infer_topic(doc_path, doc_rel),
            related_code=valid_refs,
            status=status,
        ))

    return mappings
def _infer_topic(doc_path: Path, doc_rel: str) -> str:
    """Infer a document's topic: first Markdown H1 title, else the relative path."""
    if doc_path.is_file() and doc_path.suffix.lower() in (".md", ".txt"):
        try:
            for raw in _safe_read(doc_path).split("\n"):
                stripped = raw.strip()
                if stripped.startswith("# "):
                    return stripped[2:].strip()
        except Exception:
            # Best-effort: unreadable content simply falls back to the path.
            pass
    return doc_rel
# ---------------------------------------------------------------------------
# render_alignment_report — generate the Markdown alignment report
# ---------------------------------------------------------------------------


def _issue_section(title: str, empty_message: str, items: list[AlignmentIssue]) -> list[str]:
    """Render one issue section: "## <title>" plus a table, or a placeholder line."""
    lines = [f"## {title}", ""]
    if items:
        lines.append("| 文档路径 | 描述 | 关联代码 |")
        lines.append("|---|---|---|")
        for i in items:
            lines.append(f"| `{i.doc_path}` | {i.description} | `{i.related_code}` |")
    else:
        lines.append(empty_message)
    lines.append("")
    return lines


def render_alignment_report(
    mappings: list[DocMapping],
    issues: list[AlignmentIssue],
    repo_root: str,
) -> str:
    """Generate the document-alignment report as Markdown.

    Sections: mapping table, stale items, conflict items, missing items,
    and a summary of counts. (The three issue sections were previously
    rendered by three copies of the same loop; now delegated to
    _issue_section.)
    """
    lines: list[str] = []

    # --- header ---
    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    lines.append("# 文档对齐报告")
    lines.append("")
    lines.append(f"- 生成时间:{now}")
    lines.append(f"- 仓库路径:`{repo_root}`")
    lines.append("")

    # --- mapping table ---
    lines.append("## 映射关系")
    lines.append("")
    if mappings:
        lines.append("| 文档路径 | 主题 | 关联代码 | 状态 |")
        lines.append("|---|---|---|---|")
        for m in mappings:
            code_str = ", ".join(f"`{c}`" for c in m.related_code) if m.related_code else "—"
            lines.append(f"| `{m.doc_path}` | {m.doc_topic} | {code_str} | {m.status} |")
    else:
        lines.append("未发现文档映射关系。")
    lines.append("")

    # --- group issues by type; each type gets its own section ---
    stale = [i for i in issues if i.issue_type == "stale"]
    conflict = [i for i in issues if i.issue_type == "conflict"]
    missing = [i for i in issues if i.issue_type == "missing"]

    lines.extend(_issue_section("过期点", "未发现过期点。", stale))
    lines.extend(_issue_section("冲突点", "未发现冲突点。", conflict))
    lines.extend(_issue_section("缺失点", "未发现缺失点。", missing))

    # --- summary ---
    lines.append("## 统计摘要")
    lines.append("")
    lines.append(f"- 文档总数:{len(mappings)}")
    lines.append(f"- 过期点数量:{len(stale)}")
    lines.append(f"- 冲突点数量:{len(conflict)}")
    lines.append(f"- 缺失点数量:{len(missing)}")
    lines.append("")

    return "\n".join(lines)