初始提交：飞球 ETL 系统全量代码

2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions
--- a/scripts/audit/doc_alignment_analyzer.py
+++ b/scripts/audit/doc_alignment_analyzer.py
@@ -0,0 +1,617 @@
+# -*- coding: utf-8 -*-
+"""
+文档对齐分析器 — 检查文档与代码之间的映射关系、过期点、冲突点和缺失点。
+
+文档来源：
+- docs/ 目录（.md, .txt, .csv, .json）
+- 根目录 README.md
+- 开发笔记/ 目录
+- 各模块内的 README.md
+- .kiro/steering/ 引导文件
+- docs/test-json-doc/ API 响应样本
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from datetime import datetime, timezone
+from pathlib import Path
+
+from scripts.audit import AlignmentIssue, DocMapping
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+# File extensions treated as documentation files
+_DOC_EXTENSIONS = {".md", ".txt", ".csv"}
+
+# Core code directories — modules in these should be reported when they
+# have no documentation
+_CORE_CODE_DIRS = {
+    "tasks",
+    "loaders",
+    "orchestration",
+    "quality",
+    "models",
+    "utils",
+    "api",
+    "scd",
+    "config",
+    "database",
+}
+
+# Generic metadata columns of ODS tables, intended to be ignored in comparisons
+_ODS_META_COLUMNS = {"content_hash", "payload", "created_at", "updated_at", "id"}
+
+# SQL keywords, excluded when extracting column names from DDL
+_SQL_KEYWORDS = {
+    "primary", "key", "not", "null", "default", "unique", "check",
+    "references", "foreign", "constraint", "index", "create", "table",
+    "if", "exists", "serial", "bigserial", "true", "false",
+}
+
+
+# ---------------------------------------------------------------------------
+# Safe file reading (with encoding fallback)
+# ---------------------------------------------------------------------------
+
+def _safe_read(path: Path) -> str:
+    """Read a text file, trying utf-8 (BOM-aware), then gbk, then latin-1.
+
+    Args:
+        path: File to read.
+
+    Returns:
+        The decoded file content. A leading UTF-8 byte-order mark is dropped
+        so callers that inspect the first characters (Markdown titles, JSON
+        documents) are not confused by it.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+    """
+    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also strips a
+    # leading BOM, which "utf-8" would leave in the text as "\ufeff".
+    for enc in ("utf-8-sig", "gbk"):
+        try:
+            return path.read_text(encoding=enc)
+        except UnicodeError:
+            continue
+    # latin-1 maps every byte to a code point, so this last attempt cannot
+    # fail on decoding.
+    return path.read_text(encoding="latin-1")
+
+
+# ---------------------------------------------------------------------------
+# scan_docs — scan every documentation source
+# ---------------------------------------------------------------------------
+
+def scan_docs(repo_root: Path) -> list[str]:
+    """Collect the relative paths of all documentation files, sorted.
+
+    Sources:
+    1. .md, .txt, .csv and .json files anywhere under docs/
+    2. README.md in the repository root
+    3. every file under the 开发笔记/ directory
+    4. README.md directly inside each top-level module directory
+       (e.g. gui/README.md)
+    5. files directly inside .kiro/steering/
+    """
+
+    def _posix_rel(target: Path) -> str:
+        """Return the path relative to the repo root, using forward slashes."""
+        return str(target.relative_to(repo_root)).replace("\\", "/")
+
+    found: set[str] = set()
+
+    # 1. docs/ tree (recursive; .json covers the samples in test-json-doc)
+    docs_root = repo_root / "docs"
+    if docs_root.is_dir():
+        found.update(
+            _posix_rel(entry)
+            for entry in docs_root.rglob("*")
+            if entry.is_file()
+            and (
+                entry.suffix.lower() in _DOC_EXTENSIONS
+                or entry.suffix.lower() == ".json"
+            )
+        )
+
+    # 2. Root README.md
+    if (repo_root / "README.md").is_file():
+        found.add("README.md")
+
+    # 3. Development notes: every file, regardless of extension
+    notes_root = repo_root / "开发笔记"
+    if notes_root.is_dir():
+        found.update(
+            _posix_rel(entry) for entry in notes_root.rglob("*") if entry.is_file()
+        )
+
+    # 4. README.md of each top-level module directory
+    already_covered = ("docs", "开发笔记", ".kiro")
+    for module_dir in repo_root.iterdir():
+        if module_dir.name in already_covered or not module_dir.is_dir():
+            continue
+        module_readme = module_dir / "README.md"
+        if module_readme.is_file():
+            found.add(_posix_rel(module_readme))
+
+    # 5. Steering files (non-recursive)
+    steering_root = repo_root / ".kiro" / "steering"
+    if steering_root.is_dir():
+        found.update(
+            _posix_rel(entry) for entry in steering_root.iterdir() if entry.is_file()
+        )
+
+    return sorted(found)
+
+
+# ---------------------------------------------------------------------------
+# extract_code_references — extract code references from a document
+# ---------------------------------------------------------------------------
+
+def extract_code_references(doc_path: Path) -> list[str]:
+    """Extract inline-code references (paths, class names, ...) from a document.
+
+    Rules:
+    - only the content of single-line inline code spans is considered;
+      fenced code blocks are ignored
+    - surrounding whitespace is stripped and backslashes become forward slashes
+    - single-character references are skipped
+    - pure numbers / version strings (digits and dots only) are skipped
+    - duplicates are dropped, first occurrence wins
+
+    Args:
+        doc_path: Document to scan.
+
+    Returns:
+        References in order of first appearance; an empty list when the path
+        is not a file or the file has no readable content.
+    """
+    if not doc_path.is_file():
+        return []
+
+    text = _safe_read(doc_path)
+    if not text:
+        return []
+
+    # Remove fenced code blocks first. Their triple backticks would otherwise
+    # be paired with each other by the inline pattern, producing one huge
+    # bogus "reference" and shifting the pairing of every later backtick.
+    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)
+
+    seen: set[str] = set()
+    results: list[str] = []
+
+    # A reference never spans lines, so newlines are excluded from the match;
+    # this also keeps an unterminated fence from swallowing following text.
+    for raw in re.findall(r"`([^`\n]+)`", text):
+        ref = raw.strip().replace("\\", "/")
+        # Skip single characters
+        if len(ref) <= 1:
+            continue
+        # Skip pure numbers and version strings
+        if re.fullmatch(r"[\d.]+", ref):
+            continue
+        if ref in seen:
+            continue
+        seen.add(ref)
+        results.append(ref)
+
+    return results
+
+
+# ---------------------------------------------------------------------------
+# check_reference_validity — check whether a reference is still valid
+# ---------------------------------------------------------------------------
+
+def check_reference_validity(ref: str, repo_root: Path) -> bool:
+    """Check whether a code reference found in a document still resolves.
+
+    Strategies, tried in order:
+    1. the reference as a file/directory path relative to the repo root
+    2. the same after removing a legacy ``FQ-ETL/`` or ``etl_billiards/``
+       prefix (kept for compatibility with older documents)
+    3. a dotted module path converted to a file path or package directory
+       (e.g. ``config.settings`` → ``config/settings.py``)
+
+    Args:
+        ref: Reference text, using forward slashes.
+        repo_root: Repository root the reference is resolved against.
+
+    Returns:
+        True if any strategy finds an existing path, otherwise False. An
+        empty reference, or one the filesystem cannot even probe, is invalid.
+    """
+    if not ref:
+        # An empty reference would resolve to the repo root itself.
+        return False
+
+    def _present(candidate: str, *, directory_only: bool = False) -> bool:
+        """Probe a candidate path, treating filesystem errors as 'absent'."""
+        try:
+            target = repo_root / candidate
+            return target.is_dir() if directory_only else target.exists()
+        except (OSError, ValueError):
+            # Arbitrary backtick text (long snippets, odd characters) can make
+            # the stat call itself fail, e.g. with "File name too long".
+            return False
+
+    # 1. Direct path
+    if _present(ref):
+        return True
+
+    # 2. Strip legacy package prefixes
+    for prefix in ("FQ-ETL/", "etl_billiards/"):
+        if ref.startswith(prefix) and _present(ref[len(prefix):]):
+            return True
+
+    # 3. Dotted module path → module file, or package directory
+    if "." in ref and "/" not in ref:
+        module_path = ref.replace(".", "/")
+        if _present(module_path + ".py") or _present(module_path, directory_only=True):
+            return True
+
+    return False
+
+
+# ---------------------------------------------------------------------------
+# find_undocumented_modules — find core code modules without documentation
+# ---------------------------------------------------------------------------
+
+def find_undocumented_modules(
+    repo_root: Path,
+    documented: set[str],
+) -> list[str]:
+    """Return the core modules that are not listed in ``documented``.
+
+    Only .py files below the directories named in _CORE_CODE_DIRS are
+    considered, and __init__.py files are skipped. Paths are relative to
+    ``repo_root``, use forward slashes, and are returned sorted.
+    """
+    missing: list[str] = []
+
+    for package in sorted(_CORE_CODE_DIRS):
+        package_root = repo_root / package
+        if package_root.is_dir():
+            relative_sources = (
+                # Normalise Windows separators
+                str(source.relative_to(repo_root)).replace("\\", "/")
+                for source in package_root.rglob("*.py")
+                if source.name != "__init__.py"
+            )
+            missing.extend(
+                rel for rel in relative_sources if rel not in documented
+            )
+
+    missing.sort()
+    return missing
+
+
+# ---------------------------------------------------------------------------
+# DDL / data-dictionary parsing helpers
+# ---------------------------------------------------------------------------
+
+def _parse_ddl_tables(sql: str) -> dict[str, set[str]]:
+    """Extract table names and column names from DDL SQL.
+
+    A schema prefix on the table name is dropped
+    (``billiards_dwd.dim_member`` → ``dim_member``). Column names are
+    lower-cased; table names are returned as written.
+
+    The body of each CREATE TABLE is split on its top-level commas, so the
+    result does not depend on how the statement is laid out (one column per
+    line, everything on one line, leading-comma style). The first identifier
+    of each item is taken as a column name unless it is an unquoted SQL
+    keyword such as PRIMARY or CONSTRAINT.
+
+    Args:
+        sql: Text of one or more DDL statements.
+
+    Returns:
+        Mapping of table name to its set of column names. If a table is
+        created more than once, the last definition wins.
+    """
+    # Blank out comments and string literals up front so that parentheses,
+    # commas or "CREATE TABLE" text inside them cannot derail the scan.
+    noise_re = re.compile(r"'[^']*'|--[^\n]*|/\*.*?\*/", re.DOTALL)
+    cleaned = noise_re.sub(
+        lambda m: "''" if m.group(0).startswith("'") else " ",
+        sql,
+    )
+
+    # Matches CREATE TABLE [IF NOT EXISTS] [schema.]table_name (
+    create_re = re.compile(
+        r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
+        r"(?:\w+\.)?(\w+)\s*\(",
+        re.IGNORECASE,
+    )
+    # Leading identifier of an item: either "quoted" or a bare word
+    ident_re = re.compile(r'\s*(?:"([^"]+)"|(\w+))')
+
+    tables: dict[str, set[str]] = {}
+
+    for match in create_re.finditer(cleaned):
+        # Walk to the matching closing parenthesis, cutting the body at every
+        # comma that is not nested inside another pair of parentheses.
+        items: list[str] = []
+        depth = 1
+        item_start = pos = match.end()
+        while pos < len(cleaned):
+            ch = cleaned[pos]
+            if ch == "(":
+                depth += 1
+            elif ch == ")":
+                depth -= 1
+                if depth == 0:
+                    break
+            elif ch == "," and depth == 1:
+                items.append(cleaned[item_start:pos])
+                item_start = pos + 1
+            pos += 1
+        # Last item; for an unterminated body this runs to the end of input.
+        items.append(cleaned[item_start:pos])
+
+        columns: set[str] = set()
+        for item in items:
+            ident = ident_re.match(item)
+            if ident is None:
+                continue
+            quoted, bare = ident.groups()
+            if quoted is not None:
+                # A quoted identifier is always a column, even if it spells
+                # a keyword.
+                columns.add(quoted.lower())
+            elif bare.lower() not in _SQL_KEYWORDS:
+                columns.add(bare.lower())
+
+        tables[match.group(1)] = columns
+
+    return tables
+
+
+def _parse_dictionary_tables(md: str) -> dict[str, set[str]]:
+    """Extract table names and field names from data-dictionary Markdown.
+
+    Conventions:
+    - a table name is the first word of a ``##`` heading; it may be wrapped
+      in backticks and may carry a schema prefix, which is dropped
+      (``billiards_dwd.dim_member`` → ``dim_member``)
+    - field names are the first cell of each Markdown table row below the
+      heading; wrapping backticks / asterisks are removed and the name is
+      lower-cased so it can be compared with lower-cased DDL column names
+    - separator rows (``|---|``, ``|:---|``) and the header row directly
+      above a separator are skipped, as is a first cell of ``字段``
+
+    Args:
+        md: Markdown text of the data dictionary.
+
+    Returns:
+        Mapping of table name to its set of field names. Fields of a table
+        whose heading appears more than once are merged.
+    """
+    heading_re = re.compile(r"^##\s+`?(?:\w+\.)?(\w+)`?")
+    # The colon covers column-alignment markers such as |:---|:--:|
+    separator_re = re.compile(r"^\s*\|[-:\s|]+\|\s*$")
+    row_re = re.compile(r"^\s*\|\s*([^|\s]+)")
+
+    tables: dict[str, set[str]] = {}
+    current_table: str | None = None
+    lines = md.split("\n")
+
+    for idx, line in enumerate(lines):
+        heading = heading_re.match(line)
+        if heading:
+            current_table = heading.group(1)
+            tables.setdefault(current_table, set())
+            continue
+
+        if current_table is None or separator_re.match(line):
+            continue
+
+        row = row_re.match(line)
+        if row is None:
+            continue
+
+        # A row immediately followed by a separator is the table header,
+        # whatever language its labels are written in.
+        following = lines[idx + 1] if idx + 1 < len(lines) else ""
+        if separator_re.match(following):
+            continue
+
+        field = row.group(1).strip("`*").lower()
+        # "字段" is the header label; skip it even without a separator row
+        if not field or field == "字段":
+            continue
+        tables[current_table].add(field)
+
+    return tables
+
+
+# ---------------------------------------------------------------------------
+# check_ddl_vs_dictionary — compare DDL against the data dictionary
+# ---------------------------------------------------------------------------
+
+def check_ddl_vs_dictionary(repo_root: Path) -> list[AlignmentIssue]:
+    """Report DDL tables and columns that the data dictionary does not cover.
+
+    DDL is read from database/schema_*.sql and the dictionary from
+    docs/*dictionary*.md. Two kinds of issue are produced:
+    1. a table defined in DDL but absent from the dictionary → "missing"
+    2. a column of a table known to both that only the DDL has → "conflict"
+    """
+
+    def _gather(folder: Path, pattern: str, parser) -> dict[str, set[str]]:
+        """Parse every matching file in ``folder`` and merge the results per table."""
+        merged: dict[str, set[str]] = {}
+        if not folder.is_dir():
+            return merged
+        for source in sorted(folder.glob(pattern)):
+            for name, members in parser(_safe_read(source)).items():
+                merged.setdefault(name, set()).update(members)
+        return merged
+
+    ddl_tables = _gather(repo_root / "database", "schema_*.sql", _parse_ddl_tables)
+    dict_tables = _gather(
+        repo_root / "docs", "*dictionary*.md", _parse_dictionary_tables
+    )
+
+    issues: list[AlignmentIssue] = []
+
+    for tbl in sorted(ddl_tables):
+        if tbl not in dict_tables:
+            issues.append(AlignmentIssue(
+                doc_path="docs/*dictionary*.md",
+                issue_type="missing",
+                description=f"DDL 定义了表 `{tbl}`，但数据字典中未收录",
+                related_code=f"database/schema_*.sql ({tbl})",
+            ))
+            continue
+
+        # Table is documented: look for columns only the DDL knows about
+        for col in sorted(ddl_tables[tbl] - dict_tables[tbl]):
+            issues.append(AlignmentIssue(
+                doc_path="docs/*dictionary*.md",
+                issue_type="conflict",
+                description=f"表 `{tbl}` 的列 `{col}` 在 DDL 中存在但数据字典中缺失",
+                related_code=f"database/schema_*.sql ({tbl}.{col})",
+            ))
+
+    return issues
+
+
+# ---------------------------------------------------------------------------
+# check_api_samples_vs_parsers — compare API samples against ODS tables
+# ---------------------------------------------------------------------------
+
+def check_api_samples_vs_parsers(repo_root: Path) -> list[AlignmentIssue]:
+    """Compare API response samples with the ODS table definitions.
+
+    Strategy:
+    1. read every .json file directly inside docs/test-json-doc/
+    2. take the top-level keys of the sample (of its first element when the
+       sample is a list)
+    3. find the ODS table for the sample: a table whose name equals the file
+       stem is preferred, otherwise the first table whose name contains it
+    4. report every sample field the table does not define, comparing names
+       case-insensitively because parsed DDL column names are lower-cased
+
+    Only the sample → table direction is checked, so columns that exist
+    solely in the ODS table (such as its metadata columns) never produce an
+    issue. Samples that cannot be parsed, have no fields, or match no table
+    are skipped.
+
+    Args:
+        repo_root: Repository root.
+
+    Returns:
+        One "conflict" issue per undefined sample field, ordered by sample
+        file name and then field name.
+    """
+    issues: list[AlignmentIssue] = []
+
+    sample_dir = repo_root / "docs" / "test-json-doc"
+    if not sample_dir.is_dir():
+        return issues
+
+    # Collect the ODS table definitions
+    ods_tables: dict[str, set[str]] = {}
+    db_dir = repo_root / "database"
+    if db_dir.is_dir():
+        for sql_file in sorted(db_dir.glob("schema_*ODS*.sql")):
+            for tbl, cols in _parse_ddl_tables(_safe_read(sql_file)).items():
+                ods_tables[tbl] = cols
+
+    for json_file in sorted(sample_dir.glob("*.json")):
+        # The file name without extension is the entity name
+        entity_lower = json_file.stem.lower()
+
+        try:
+            # json.loads rejects a leading BOM, so drop it if present
+            data = json.loads(_safe_read(json_file).lstrip("\ufeff"))
+        except ValueError:  # includes json.JSONDecodeError
+            continue
+
+        # Top-level field names; for a list, those of its first element
+        record = data[0] if isinstance(data, list) and data else data
+        if not isinstance(record, dict) or not record:
+            continue
+        sample_fields = set(record)
+
+        # An exact name match wins over a substring match, so a sample named
+        # "order" is not compared against e.g. "order_item".
+        matched_table = next(
+            (tbl for tbl in ods_tables if tbl.lower() == entity_lower),
+            None,
+        )
+        if matched_table is None:
+            matched_table = next(
+                (tbl for tbl in ods_tables if entity_lower in tbl.lower()),
+                None,
+            )
+        if matched_table is None:
+            continue
+
+        known_cols = {col.lower() for col in ods_tables[matched_table]}
+
+        # Fields present in the sample but not defined on the ODS table
+        extra_fields = (f for f in sample_fields if f.lower() not in known_cols)
+        for field in sorted(extra_fields):
+            issues.append(AlignmentIssue(
+                doc_path=f"docs/test-json-doc/{json_file.name}",
+                issue_type="conflict",
+                description=(
+                    f"API 样本字段 `{field}` 在 ODS 表 `{matched_table}` 中未定义"
+                ),
+                related_code=f"database/schema_*ODS*.sql ({matched_table})",
+            ))
+
+    return issues
+
+
+# ---------------------------------------------------------------------------
+# build_mappings — build the document ↔ code mappings
+# ---------------------------------------------------------------------------
+
+def build_mappings(
+    doc_paths: list[str],
+    repo_root: Path,
+) -> list[DocMapping]:
+    """Build one DocMapping per document, linking it to the code it references.
+
+    The status is "orphan" when the document contains no code references,
+    "stale" when at least one reference no longer resolves, and "aligned"
+    otherwise. Only the references that still resolve are recorded as
+    related code.
+    """
+    mappings: list[DocMapping] = []
+
+    for doc_rel in doc_paths:
+        doc_file = repo_root / doc_rel
+        refs = extract_code_references(doc_file)
+
+        # Keep only the references that still point at something
+        valid_refs = [
+            ref for ref in refs if check_reference_validity(ref, repo_root)
+        ]
+
+        # Topic comes from the first top-level heading, else the path
+        topic = _infer_topic(doc_file, doc_rel)
+
+        if not refs:
+            status = "orphan"
+        elif len(valid_refs) != len(refs):
+            # At least one reference failed the validity check
+            status = "stale"
+        else:
+            status = "aligned"
+
+        mappings.append(DocMapping(
+            doc_path=doc_rel,
+            doc_topic=topic,
+            related_code=valid_refs,
+            status=status,
+        ))
+
+    return mappings
+
+
+def _infer_topic(doc_path: Path, doc_rel: str) -> str:
+    """Infer a document's topic: its first top-level heading, else its path.
+
+    Args:
+        doc_path: Location of the document on disk.
+        doc_rel: Repository-relative path, used as the fallback topic.
+
+    Returns:
+        The text of the first line starting with ``# `` in a .md/.txt file,
+        or ``doc_rel`` when the file has another extension, does not exist,
+        cannot be read, or has no such line.
+    """
+    if doc_path.suffix.lower() not in (".md", ".txt") or not doc_path.is_file():
+        return doc_rel
+
+    try:
+        text = _safe_read(doc_path)
+    except OSError:
+        # Unreadable file: the topic is cosmetic, so fall back to the path
+        return doc_rel
+
+    # A byte-order mark would hide a heading on the very first line
+    for line in text.lstrip("\ufeff").split("\n"):
+        line = line.strip()
+        if line.startswith("# "):
+            return line[2:].strip()
+    return doc_rel
+
+
+# ---------------------------------------------------------------------------
+# render_alignment_report — render the alignment report as Markdown
+# ---------------------------------------------------------------------------
+
+def _md_cell(value: object) -> str:
+    """Return ``value`` as text that cannot break a Markdown table row."""
+    # A raw "|" would start a new cell and a newline would end the row.
+    return str(value).replace("|", "\\|").replace("\n", " ")
+
+
+def _issue_section_lines(
+    title: str,
+    empty_text: str,
+    items: list[AlignmentIssue],
+) -> list[str]:
+    """Render one issue section: heading, then a table or the empty notice."""
+    section = [title, ""]
+    if items:
+        section.append("| 文档路径 | 描述 | 关联代码 |")
+        section.append("|---|---|---|")
+        section.extend(
+            f"| `{_md_cell(i.doc_path)}` | {_md_cell(i.description)} "
+            f"| `{_md_cell(i.related_code)}` |"
+            for i in items
+        )
+    else:
+        section.append(empty_text)
+    section.append("")
+    return section
+
+
+def render_alignment_report(
+    mappings: list[DocMapping],
+    issues: list[AlignmentIssue],
+    repo_root: str,
+) -> str:
+    """Render the documentation alignment report as Markdown.
+
+    Sections, in order: header (UTC generation time and repository path),
+    mapping table, stale issues, conflict issues, missing issues, and a
+    summary with the counts. Issues whose ``issue_type`` is not "stale",
+    "conflict" or "missing" are not shown. Values placed in table cells have
+    ``|`` escaped and newlines replaced by spaces.
+
+    Args:
+        mappings: Document ↔ code mappings for the mapping table.
+        issues: Alignment issues, grouped by their ``issue_type``.
+        repo_root: Repository path shown in the header.
+
+    Returns:
+        The report text, ending with a newline.
+    """
+    lines: list[str] = []
+
+    # --- Header ---
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    lines.append("# 文档对齐报告")
+    lines.append("")
+    lines.append(f"- 生成时间：{now}")
+    lines.append(f"- 仓库路径：`{repo_root}`")
+    lines.append("")
+
+    # --- Mappings ---
+    lines.append("## 映射关系")
+    lines.append("")
+    if mappings:
+        lines.append("| 文档路径 | 主题 | 关联代码 | 状态 |")
+        lines.append("|---|---|---|---|")
+        for m in mappings:
+            code_str = (
+                ", ".join(f"`{_md_cell(c)}`" for c in m.related_code)
+                if m.related_code
+                else "—"
+            )
+            lines.append(
+                f"| `{_md_cell(m.doc_path)}` | {_md_cell(m.doc_topic)} "
+                f"| {code_str} | {_md_cell(m.status)} |"
+            )
+    else:
+        lines.append("未发现文档映射关系。")
+    lines.append("")
+
+    # --- Group by issue_type ---
+    stale = [i for i in issues if i.issue_type == "stale"]
+    conflict = [i for i in issues if i.issue_type == "conflict"]
+    missing = [i for i in issues if i.issue_type == "missing"]
+
+    # --- Stale / conflict / missing sections share one layout ---
+    lines.extend(_issue_section_lines("## 过期点", "未发现过期点。", stale))
+    lines.extend(_issue_section_lines("## 冲突点", "未发现冲突点。", conflict))
+    lines.extend(_issue_section_lines("## 缺失点", "未发现缺失点。", missing))
+
+    # --- Summary ---
+    lines.append("## 统计摘要")
+    lines.append("")
+    lines.append(f"- 文档总数：{len(mappings)}")
+    lines.append(f"- 过期点数量：{len(stale)}")
+    lines.append(f"- 冲突点数量：{len(conflict)}")
+    lines.append(f"- 缺失点数量：{len(missing)}")
+    lines.append("")
+
+    return "\n".join(lines)