初始提交：飞球 ETL 系统全量代码

2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions
--- a/tests/unit/test_audit_report_properties.py
+++ b/tests/unit/test_audit_report_properties.py
@@ -0,0 +1,485 @@
+# -*- coding: utf-8 -*-
+"""
+属性测试 — 报告输出属性
+
+Feature: repo-audit
+- Property 13: 统计摘要一致性
+- Property 14: 报告头部元信息
+- Property 15: 写操作仅限 docs/audit/
+
+Validates: Requirements 4.2, 4.5, 4.6, 4.7, 5.2
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import string
+from pathlib import Path
+
+from hypothesis import given, settings, assume
+from hypothesis import strategies as st
+
+from scripts.audit import (
+    AlignmentIssue,
+    Category,
+    Disposition,
+    DocMapping,
+    FlowNode,
+    InventoryItem,
+)
+from scripts.audit.inventory_analyzer import render_inventory_report
+from scripts.audit.flow_analyzer import render_flow_report
+from scripts.audit.doc_alignment_analyzer import render_alignment_report
+
+# ---------------------------------------------------------------------------
+# 共享生成器策略
+# ---------------------------------------------------------------------------
+
+# Characters allowed inside a single generated path segment.
+_PATH_CHARS = string.ascii_letters + string.digits + "_-."
+
+# One path component: 1 to 12 characters taken from _PATH_CHARS.
+_path_segment = st.text(alphabet=_PATH_CHARS, min_size=1, max_size=12)
+
+# A relative path built from one to three segments joined with "/".
+_rel_path = st.lists(_path_segment, min_size=1, max_size=3).map("/".join)
+
+# Short free text (1 to 30 characters) from the letter, number, punctuation,
+# symbol and separator categories.  "|", "\n" and "\r" are excluded --
+# presumably so a generated value cannot break a Markdown table row.
+_safe_text = st.text(
+    min_size=1,
+    max_size=30,
+    alphabet=st.characters(
+        whitelist_categories=("L", "N", "P", "S", "Z"),
+        blacklist_characters="|\n\r",
+    ),
+)
+
+# A repository-root-like string: raw text of 3 to 40 characters that is
+# normalised so it starts with exactly one leading "/".
+_repo_root_str = st.text(
+    min_size=3,
+    max_size=40,
+    alphabet=string.ascii_letters + string.digits + "/_-.",
+).map(lambda raw: "/" + raw.lstrip("/"))
+
+
+# ---------------------------------------------------------------------------
+# InventoryItem 生成器
+# ---------------------------------------------------------------------------
+
+def _inventory_item_st() -> st.SearchStrategy[InventoryItem]:
+    """Return a strategy that builds InventoryItem objects from random fields."""
+    field_strategies = {
+        "rel_path": _rel_path,
+        "category": st.sampled_from(list(Category)),
+        "disposition": st.sampled_from(list(Disposition)),
+        "description": _safe_text,
+    }
+    return st.builds(InventoryItem, **field_strategies)
+
+
+# Lists of zero to twenty inventory items.
+_inventory_list = st.lists(_inventory_item_st(), max_size=20)
+
+
+# ---------------------------------------------------------------------------
+# FlowNode 生成器（限制深度和宽度）
+# ---------------------------------------------------------------------------
+
+def _flow_node_st(max_depth: int = 2) -> st.SearchStrategy[FlowNode]:
+    """Return a strategy for random FlowNode trees of bounded size.
+
+    ``max_depth`` is the number of further child levels allowed below the
+    generated node; at zero (or below) the node is a leaf.  Every inner node
+    gets at most three children, which keeps the trees from exploding.
+    """
+    if max_depth > 0:
+        child_nodes = st.lists(_flow_node_st(max_depth - 1), max_size=3)
+    else:
+        # Leaf level: always an empty child list.
+        child_nodes = st.just([])
+
+    return st.builds(
+        FlowNode,
+        name=_path_segment,
+        source_file=_rel_path,
+        node_type=st.sampled_from(["entry", "module", "class", "function"]),
+        children=child_nodes,
+    )
+
+
+# Up to five root trees, and up to ten orphan module paths.
+_flow_tree_list = st.lists(_flow_node_st(), max_size=5)
+_orphan_list = st.lists(_rel_path, max_size=10)
+
+
+# ---------------------------------------------------------------------------
+# DocMapping / AlignmentIssue 生成器
+# ---------------------------------------------------------------------------
+
+# The three issue types the alignment report counts separately.
+_issue_type_st = st.sampled_from(["stale", "conflict", "missing"])
+
+
+def _alignment_issue_st() -> st.SearchStrategy[AlignmentIssue]:
+    """Return a strategy that builds AlignmentIssue objects from random fields."""
+    field_strategies = {
+        "doc_path": _rel_path,
+        "issue_type": _issue_type_st,
+        "description": _safe_text,
+        "related_code": _rel_path,
+    }
+    return st.builds(AlignmentIssue, **field_strategies)
+
+
+def _doc_mapping_st() -> st.SearchStrategy[DocMapping]:
+    """Return a strategy that builds DocMapping objects from random fields."""
+    field_strategies = {
+        "doc_path": _rel_path,
+        "doc_topic": _safe_text,
+        "related_code": st.lists(_rel_path, max_size=5),
+        "status": st.sampled_from(["aligned", "stale", "conflict", "orphan"]),
+    }
+    return st.builds(DocMapping, **field_strategies)
+
+
+# Up to fifteen mappings / issues per generated example.
+_mapping_list = st.lists(_doc_mapping_st(), max_size=15)
+_issue_list = st.lists(_alignment_issue_st(), max_size=15)
+
+
+# ===========================================================================
+# Property 13: 统计摘要一致性
+# ===========================================================================
+
+
+class TestProperty13SummaryConsistency:
+    """Property 13: summary statistics agree with the rendered input.
+
+    Feature: repo-audit, Property 13
+    Validates: Requirements 4.5, 4.6, 4.7
+
+    For any report, the per-category / per-label counts shown in the summary
+    must match the size of the corresponding input list.
+    """
+
+    # --- 13a: category counts of render_inventory_report sum to len(items) ---
+
+    @given(items=_inventory_list)
+    @settings(max_examples=100)
+    def test_inventory_category_counts_sum(
+        self, items: list[InventoryItem]
+    ) -> None:
+        """Feature: repo-audit, Property 13 (Validates: Requirements 4.5).
+
+        The per-category counts in the inventory summary add up to the
+        number of items.
+        """
+        rendered = render_inventory_report(items, "/tmp/repo")
+        expected = len(items)
+
+        # Sum the count column of the "按用途分类" summary table.
+        counted = _extract_summary_total(rendered, "按用途分类")
+        assert counted == expected, (
+            f"分类计数之和 {counted} != 条目总数 {expected}"
+        )
+
+    # --- 13b: disposition counts of render_inventory_report sum to len(items) ---
+
+    @given(items=_inventory_list)
+    @settings(max_examples=100)
+    def test_inventory_disposition_counts_sum(
+        self, items: list[InventoryItem]
+    ) -> None:
+        """Feature: repo-audit, Property 13 (Validates: Requirements 4.5).
+
+        The per-disposition counts in the inventory summary add up to the
+        number of items.
+        """
+        rendered = render_inventory_report(items, "/tmp/repo")
+        expected = len(items)
+
+        counted = _extract_summary_total(rendered, "按处置标签")
+        assert counted == expected, (
+            f"处置标签计数之和 {counted} != 条目总数 {expected}"
+        )
+
+    # --- 13c: orphan count of render_flow_report equals len(orphans) ---
+
+    @given(trees=_flow_tree_list, orphans=_orphan_list)
+    @settings(max_examples=100)
+    def test_flow_orphan_count_matches(
+        self, trees: list[FlowNode], orphans: list[str]
+    ) -> None:
+        """Feature: repo-audit, Property 13 (Validates: Requirements 4.6).
+
+        The orphan-module figure in the flow summary equals the length of
+        the ``orphans`` list.
+        """
+        rendered = render_flow_report(trees, orphans, "/tmp/repo")
+        expected = len(orphans)
+
+        # Read the number from the "孤立模块" row of the summary table.
+        reported = _extract_flow_stat(rendered, "孤立模块")
+        assert reported == expected, (
+            f"报告中孤立模块数 {reported} != orphans 列表长度 {expected}"
+        )
+
+    # --- 13d: issue-type counts of render_alignment_report match the input ---
+
+    @given(mappings=_mapping_list, issues=_issue_list)
+    @settings(max_examples=100)
+    def test_alignment_issue_counts_match(
+        self, mappings: list[DocMapping], issues: list[AlignmentIssue]
+    ) -> None:
+        """Feature: repo-audit, Property 13 (Validates: Requirements 4.7).
+
+        The stale / conflict / missing figures in the alignment summary
+        equal the number of issues of each type in ``issues``.
+        """
+        rendered = render_alignment_report(mappings, issues, "/tmp/repo")
+
+        # (issue_type value, label in the report, label in the failure message)
+        checks = (
+            ("stale", "过期点数量", "过期点"),
+            ("conflict", "冲突点数量", "冲突点"),
+            ("missing", "缺失点数量", "缺失点"),
+        )
+        for issue_type, report_label, short_name in checks:
+            expected = sum(
+                1 for issue in issues if issue.issue_type == issue_type
+            )
+            actual = _extract_alignment_stat(rendered, report_label)
+            assert actual == expected, (
+                f"{short_name}: 报告 {actual} != 实际 {expected}"
+            )
+
+
+# ===========================================================================
+# Property 14: 报告头部元信息
+# ===========================================================================
+
+
+class TestProperty14ReportHeader:
+    """Property 14: report header metadata.
+
+    Feature: repo-audit, Property 14
+    Validates: Requirements 4.2
+
+    For any rendered report, the header must contain an ISO-formatted
+    timestamp and the repository root path.
+    """
+
+    _ISO_TS_RE = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z")
+
+    def _assert_header(self, report: str, repo_root: str, kind: str) -> None:
+        """Check the first 500 characters of ``report`` for both header fields.
+
+        ``kind`` is the report name used as the prefix of failure messages.
+        """
+        head = report[:500]
+        assert self._ISO_TS_RE.search(head), (
+            f"{kind} 报告头部缺少 ISO 格式时间戳"
+        )
+        assert repo_root in head, (
+            f"{kind} 报告头部缺少仓库路径 '{repo_root}'"
+        )
+
+    @given(items=_inventory_list, repo_root=_repo_root_str)
+    @settings(max_examples=100)
+    def test_inventory_report_header(
+        self, items: list[InventoryItem], repo_root: str
+    ) -> None:
+        """Feature: repo-audit, Property 14 (Validates: Requirements 4.2).
+
+        The inventory report header contains an ISO timestamp and the
+        repository path.
+        """
+        self._assert_header(
+            render_inventory_report(items, repo_root), repo_root, "inventory"
+        )
+
+    @given(trees=_flow_tree_list, orphans=_orphan_list, repo_root=_repo_root_str)
+    @settings(max_examples=100)
+    def test_flow_report_header(
+        self, trees: list[FlowNode], orphans: list[str], repo_root: str
+    ) -> None:
+        """Feature: repo-audit, Property 14 (Validates: Requirements 4.2).
+
+        The flow report header contains an ISO timestamp and the repository
+        path.
+        """
+        self._assert_header(
+            render_flow_report(trees, orphans, repo_root), repo_root, "flow"
+        )
+
+    @given(mappings=_mapping_list, issues=_issue_list, repo_root=_repo_root_str)
+    @settings(max_examples=100)
+    def test_alignment_report_header(
+        self, mappings: list[DocMapping], issues: list[AlignmentIssue], repo_root: str
+    ) -> None:
+        """Feature: repo-audit, Property 14 (Validates: Requirements 4.2).
+
+        The alignment report header contains an ISO timestamp and the
+        repository path.
+        """
+        self._assert_header(
+            render_alignment_report(mappings, issues, repo_root),
+            repo_root,
+            "alignment",
+        )
+
+
+# ===========================================================================
+# Property 15: 写操作仅限 docs/audit/
+# ===========================================================================
+
+
+class TestProperty15WritesOnlyDocsAudit:
+    """Property 15: the audit writes only under docs/audit/.
+
+    Feature: repo-audit, Property 15
+    Validates: Requirements 5.2
+
+    For any audit run, every file write must target a path below
+    docs/audit/.  The test needs a real file system, so it runs only a few
+    examples.
+    """
+
+    @staticmethod
+    def _make_minimal_repo(base: Path, variant: int) -> Path:
+        """Create a minimal repository under ``base`` and return its root.
+
+        ``variant`` selects (modulo 3) one extra file or directory so the
+        generated repositories are not all identical.
+        """
+        repo = base / f"repo_{variant}"
+        repo.mkdir()
+
+        # Mandatory CLI entry point.
+        cli_dir = repo / "cli"
+        cli_dir.mkdir()
+        (cli_dir / "__init__.py").write_text("", encoding="utf-8")
+        (cli_dir / "main.py").write_text(
+            "# -*- coding: utf-8 -*-\ndef main(): pass\n",
+            encoding="utf-8",
+        )
+
+        # config package.
+        config_dir = repo / "config"
+        config_dir.mkdir()
+        (config_dir / "__init__.py").write_text("", encoding="utf-8")
+
+        # docs directory (docs/audit/ itself is left for the audit to create).
+        docs_dir = repo / "docs"
+        docs_dir.mkdir()
+
+        # Variant-specific extra content.
+        if variant % 3 == 0:
+            (repo / "README.md").write_text("# 项目\n", encoding="utf-8")
+        if variant % 3 == 1:
+            scripts_dir = repo / "scripts"
+            scripts_dir.mkdir()
+            (scripts_dir / "__init__.py").write_text("", encoding="utf-8")
+        if variant % 3 == 2:
+            (docs_dir / "notes.md").write_text("# 笔记\n", encoding="utf-8")
+
+        return repo
+
+    @staticmethod
+    def _snapshot_files(repo: Path) -> dict[str, float]:
+        """Map every file outside docs/audit/ to its modification time.
+
+        Keys are POSIX-style paths relative to ``repo``.
+        """
+        snap: dict[str, float] = {}
+        for p in repo.rglob("*"):
+            if p.is_file():
+                rel = p.relative_to(repo).as_posix()
+                # The trailing slash matters: a bare "docs/audit" prefix would
+                # also exempt siblings such as docs/audit.md or docs/audit_old/.
+                if not rel.startswith("docs/audit/"):
+                    snap[rel] = p.stat().st_mtime
+        return snap
+
+    @given(variant=st.integers(min_value=0, max_value=9))
+    # deadline=None: each example does real file-system work, whose duration
+    # should not be judged against Hypothesis' per-example time limit.
+    @settings(max_examples=10, deadline=None)
+    def test_writes_only_under_docs_audit(self, variant: int) -> None:
+        """Feature: repo-audit, Property 15 (Validates: Requirements 5.2).
+
+        After ``run_audit`` the docs/audit/ directory must contain at least
+        one entry, and no file outside docs/audit/ may have been created,
+        deleted or modified (modification is detected via mtime).
+        """
+        import tempfile
+        from scripts.audit.run_audit import run_audit
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            repo = self._make_minimal_repo(Path(tmp_dir), variant)
+            before_snap = self._snapshot_files(repo)
+
+            run_audit(repo)
+
+            # The audit must have produced output under docs/audit/.
+            audit_dir = repo / "docs" / "audit"
+            assert audit_dir.is_dir(), "docs/audit/ 目录未创建"
+            audit_files = list(audit_dir.iterdir())
+            assert len(audit_files) > 0, "docs/audit/ 下无报告文件"
+
+            # Everything outside docs/audit/ must be exactly as it was.
+            after_snap = self._snapshot_files(repo)
+
+            created = sorted(after_snap.keys() - before_snap.keys())
+            assert not created, (
+                f"docs/audit/ 外出现了新文件: {', '.join(created)}"
+            )
+
+            deleted = sorted(before_snap.keys() - after_snap.keys())
+            assert not deleted, (
+                f"docs/audit/ 外有文件被删除: {', '.join(deleted)}"
+            )
+
+            modified = sorted(
+                rel
+                for rel, mtime in before_snap.items()
+                if after_snap[rel] != mtime
+            )
+            assert not modified, (
+                f"docs/audit/ 外有文件被修改: {', '.join(modified)}"
+            )
+
+
+# ===========================================================================
+# 辅助函数 — 从报告文本中提取统计数字
+# ===========================================================================
+
+def _extract_summary_total(report: str, section_name: str) -> int:
+    """Sum the count column of one summary table in an inventory report.
+
+    Finds the heading line ``### {section_name}`` and adds up the integer in
+    the last non-empty cell of each Markdown table row below it.  The known
+    header rows, the ``|---`` separator row and the ``**合计**`` (total) row
+    are skipped.  Scanning stops at the next Markdown heading of any level,
+    so tables in later sections are never mixed into the sum.
+
+    Args:
+        report: Full report text.
+        section_name: Heading text without the leading ``### ``.
+
+    Returns:
+        The sum of the row counts; 0 when the section is absent or holds no
+        numeric rows.
+    """
+    heading = f"### {section_name}"
+    header_prefixes = ("| 用途分类", "| 处置标签")
+    in_section = False
+    total = 0
+
+    for raw_line in report.split("\n"):
+        line = raw_line.strip()
+        if line == heading:
+            in_section = True
+            continue
+        if not in_section:
+            continue
+        if line.startswith("#"):
+            # Any following heading ends the section, not only "###" ones;
+            # otherwise a later "## ..." table would leak into the sum.
+            break
+        if not line.startswith("|") or "**合计**" in line:
+            continue
+        if line.startswith(header_prefixes) or line.startswith("|---"):
+            continue
+
+        cells = [cell.strip() for cell in line.split("|") if cell.strip()]
+        if not cells:
+            continue
+        try:
+            total += int(cells[-1])
+        except ValueError:
+            # Deliberate best effort: rows whose last cell is not an
+            # integer are not count rows and are ignored.
+            continue
+
+    return total
+
+
+def _extract_flow_stat(report: str, label: str) -> int:
+    """Return the number shown for ``label`` in a flow report table row.
+
+    Looks for the first row shaped like ``| <label> | <digits> |`` anywhere
+    in ``report``; returns -1 when no such row exists.
+    """
+    row_pattern = rf"\|\s*{re.escape(label)}\s*\|\s*(\d+)\s*\|"
+    found = re.search(row_pattern, report)
+    if found is None:
+        return -1
+    return int(found.group(1))
+
+
+def _extract_alignment_stat(report: str, label: str) -> int:
+    """Return the number that follows ``label`` in an alignment report.
+
+    Matches text such as ``- 过期点数量：3``; the colon may be full-width or
+    ASCII.  Returns -1 when the label is not followed by a number.
+    """
+    stat_pattern = rf"{re.escape(label)}[：:]\s*(\d+)"
+    found = re.search(stat_pattern, report)
+    if found is None:
+        return -1
+    return int(found.group(1))