初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

View File

@@ -0,0 +1,485 @@
# -*- coding: utf-8 -*-
"""
属性测试 — 报告输出属性
Feature: repo-audit
- Property 13: 统计摘要一致性
- Property 14: 报告头部元信息
- Property 15: 写操作仅限 docs/audit/
Validates: Requirements 4.2, 4.5, 4.6, 4.7, 5.2
"""
from __future__ import annotations
import os
import re
import string
from pathlib import Path
from hypothesis import given, settings, assume
from hypothesis import strategies as st
from scripts.audit import (
AlignmentIssue,
Category,
Disposition,
DocMapping,
FlowNode,
InventoryItem,
)
from scripts.audit.inventory_analyzer import render_inventory_report
from scripts.audit.flow_analyzer import render_flow_report
from scripts.audit.doc_alignment_analyzer import render_alignment_report
# ---------------------------------------------------------------------------
# 共享生成器策略
# ---------------------------------------------------------------------------
# Characters allowed inside a single generated path segment.
_PATH_CHARS = "".join((string.ascii_letters, string.digits, "_-."))

# One path component: 1-12 characters drawn from _PATH_CHARS.
_path_segment = st.text(alphabet=_PATH_CHARS, min_size=1, max_size=12)

# A relative path built from 1-3 segments joined with "/".
_rel_path = st.lists(_path_segment, min_size=1, max_size=3).map("/".join)

# Free text for description-like fields. "|" and line breaks are excluded;
# the extraction helpers in this module split report text on exactly those
# characters, so allowing them here would corrupt the parsed tables.
_safe_text = st.text(
    alphabet=st.characters(
        whitelist_categories=("L", "N", "P", "S", "Z"),
        blacklist_characters="|\n\r",
    ),
    min_size=1,
    max_size=30,
)

# Repository-root-like string: always starts with exactly one leading "/".
_repo_root_str = st.text(
    alphabet=f"{string.ascii_letters}{string.digits}/_-.",
    min_size=3,
    max_size=40,
).map(lambda raw: "/" + raw.lstrip("/"))
# ---------------------------------------------------------------------------
# InventoryItem 生成器
# ---------------------------------------------------------------------------
def _inventory_item_st() -> st.SearchStrategy[InventoryItem]:
    """Return a strategy that builds InventoryItem objects with random fields.

    Paths come from ``_rel_path``, descriptions from ``_safe_text``, and the
    category / disposition are sampled from every member of their enums.
    """
    field_strategies = {
        "rel_path": _rel_path,
        "category": st.sampled_from(list(Category)),
        "disposition": st.sampled_from(list(Disposition)),
        "description": _safe_text,
    }
    return st.builds(InventoryItem, **field_strategies)


# Lists of 0-20 generated inventory items (the empty list is included).
_inventory_list = st.lists(_inventory_item_st(), min_size=0, max_size=20)
# ---------------------------------------------------------------------------
# FlowNode 生成器(限制深度和宽度)
# ---------------------------------------------------------------------------
def _flow_node_st(max_depth: int = 2) -> st.SearchStrategy[FlowNode]:
    """Return a strategy producing random FlowNode trees of bounded size.

    Args:
        max_depth: Number of child levels allowed below the generated node.
            When it is ``<= 0`` the node is a leaf with no children;
            otherwise each node gets 0-3 children generated with
            ``max_depth - 1``.

    Returns:
        A strategy whose examples are FlowNode objects.
    """
    if max_depth <= 0:
        # builds(list) creates a fresh empty list per example; just([]) would
        # hand the very same list object to every generated leaf.
        children = st.builds(list)
    else:
        children = st.lists(
            _flow_node_st(max_depth - 1),
            min_size=0,
            max_size=3,
        )
    return st.builds(
        FlowNode,
        name=_path_segment,
        source_file=_rel_path,
        node_type=st.sampled_from(["entry", "module", "class", "function"]),
        children=children,
    )


# 0-5 root trees, each built with the default depth limit.
_flow_tree_list = st.lists(_flow_node_st(), min_size=0, max_size=5)
# 0-10 relative paths standing in for orphan modules.
_orphan_list = st.lists(_rel_path, min_size=0, max_size=10)
# ---------------------------------------------------------------------------
# DocMapping / AlignmentIssue 生成器
# ---------------------------------------------------------------------------
# The three issue types counted by the alignment report summary tests.
_issue_type_st = st.sampled_from(["stale", "conflict", "missing"])


def _alignment_issue_st() -> st.SearchStrategy[AlignmentIssue]:
    """Return a strategy that builds AlignmentIssue objects with random fields."""
    issue_fields = dict(
        doc_path=_rel_path,
        issue_type=_issue_type_st,
        description=_safe_text,
        related_code=_rel_path,
    )
    return st.builds(AlignmentIssue, **issue_fields)


def _doc_mapping_st() -> st.SearchStrategy[DocMapping]:
    """Return a strategy that builds DocMapping objects with random fields.

    ``related_code`` is a list of 0-5 relative paths and ``status`` is one of
    "aligned", "stale", "conflict" or "orphan".
    """
    mapping_fields = dict(
        doc_path=_rel_path,
        doc_topic=_safe_text,
        related_code=st.lists(_rel_path, min_size=0, max_size=5),
        status=st.sampled_from(["aligned", "stale", "conflict", "orphan"]),
    )
    return st.builds(DocMapping, **mapping_fields)


# Lists of 0-15 generated mappings / issues (empty lists are included).
_mapping_list = st.lists(_doc_mapping_st(), min_size=0, max_size=15)
_issue_list = st.lists(_alignment_issue_st(), min_size=0, max_size=15)
# ===========================================================================
# Property 13: 统计摘要一致性
# ===========================================================================
class TestProperty13SummaryConsistency:
    """Property 13: summary statistics are consistent.

    Feature: repo-audit, Property 13
    Validates: Requirements 4.5, 4.6, 4.7

    For any rendered report, the counts shown in its statistics summary must
    agree with the lists the report was rendered from.
    """

    # --- 13a: inventory report, per-category counts add up to the list length ---
    @given(items=_inventory_list)
    @settings(max_examples=100)
    def test_inventory_category_counts_sum(
        self, items: list[InventoryItem]
    ) -> None:
        """Feature: repo-audit, Property 13 (Validates: Requirements 4.5).

        The per-category counts in the inventory summary must sum to the
        number of items passed to render_inventory_report.
        """
        expected = len(items)
        # Sum the numeric column of the table under the category heading.
        cat_sum = _extract_summary_total(
            render_inventory_report(items, "/tmp/repo"), "按用途分类"
        )
        assert cat_sum == expected, (
            f"分类计数之和 {cat_sum} != 条目总数 {expected}"
        )

    # --- 13b: inventory report, per-disposition counts add up to the list length ---
    @given(items=_inventory_list)
    @settings(max_examples=100)
    def test_inventory_disposition_counts_sum(
        self, items: list[InventoryItem]
    ) -> None:
        """Feature: repo-audit, Property 13 (Validates: Requirements 4.5).

        The per-disposition counts in the inventory summary must sum to the
        number of items passed to render_inventory_report.
        """
        expected = len(items)
        disp_sum = _extract_summary_total(
            render_inventory_report(items, "/tmp/repo"), "按处置标签"
        )
        assert disp_sum == expected, (
            f"处置标签计数之和 {disp_sum} != 条目总数 {expected}"
        )

    # --- 13c: flow report, orphan-module count equals len(orphans) ---
    @given(trees=_flow_tree_list, orphans=_orphan_list)
    @settings(max_examples=100)
    def test_flow_orphan_count_matches(
        self, trees: list[FlowNode], orphans: list[str]
    ) -> None:
        """Feature: repo-audit, Property 13 (Validates: Requirements 4.6).

        The orphan-module figure in the flow report summary must equal the
        length of the orphans list.
        """
        expected = len(orphans)
        # Read the number from the summary-table row labelled with the
        # orphan-module label.
        orphan_count = _extract_flow_stat(
            render_flow_report(trees, orphans, "/tmp/repo"), "孤立模块"
        )
        assert orphan_count == expected, (
            f"报告中孤立模块数 {orphan_count} != orphans 列表长度 {expected}"
        )

    # --- 13d: alignment report, per-type issue counts match the issues list ---
    @given(mappings=_mapping_list, issues=_issue_list)
    @settings(max_examples=100)
    def test_alignment_issue_counts_match(
        self, mappings: list[DocMapping], issues: list[AlignmentIssue]
    ) -> None:
        """Feature: repo-audit, Property 13 (Validates: Requirements 4.7).

        The stale / conflict / missing counts in the alignment report summary
        must equal the number of issues of the corresponding type.
        """
        report = render_alignment_report(mappings, issues, "/tmp/repo")
        # (issue_type, label looked up in the report, prefix of the failure message)
        checks = (
            ("stale", "过期点数量", "过期点"),
            ("conflict", "冲突点数量", "冲突点"),
            ("missing", "缺失点数量", "缺失点"),
        )
        for issue_type, label, title in checks:
            expected = len([i for i in issues if i.issue_type == issue_type])
            actual = _extract_alignment_stat(report, label)
            assert actual == expected, (
                f"{title}: 报告 {actual} != 实际 {expected}"
            )
# ===========================================================================
# Property 14: 报告头部元信息
# ===========================================================================
class TestProperty14ReportHeader:
    """Property 14: report header metadata.

    Feature: repo-audit, Property 14
    Validates: Requirements 4.2

    For any rendered report, the header must contain an ISO-style timestamp
    and the repository root path string.
    """

    # Timestamp shape accepted by these tests: YYYY-MM-DDTHH:MM:SSZ.
    _ISO_TS_RE = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z")

    def _check_header(self, report: str, repo_root: str, kind: str) -> None:
        """Assert the first 500 characters of *report* hold a timestamp and *repo_root*.

        *kind* is the report name used as the prefix of the failure messages.
        """
        head = report[:500]
        assert self._ISO_TS_RE.search(head), (
            f"{kind} 报告头部缺少 ISO 格式时间戳"
        )
        assert repo_root in head, (
            f"{kind} 报告头部缺少仓库路径 '{repo_root}'"
        )

    @given(items=_inventory_list, repo_root=_repo_root_str)
    @settings(max_examples=100)
    def test_inventory_report_header(
        self, items: list[InventoryItem], repo_root: str
    ) -> None:
        """Feature: repo-audit, Property 14 (Validates: Requirements 4.2).

        The render_inventory_report header must contain an ISO timestamp and
        the repository path.
        """
        self._check_header(
            render_inventory_report(items, repo_root), repo_root, "inventory"
        )

    @given(trees=_flow_tree_list, orphans=_orphan_list, repo_root=_repo_root_str)
    @settings(max_examples=100)
    def test_flow_report_header(
        self, trees: list[FlowNode], orphans: list[str], repo_root: str
    ) -> None:
        """Feature: repo-audit, Property 14 (Validates: Requirements 4.2).

        The render_flow_report header must contain an ISO timestamp and the
        repository path.
        """
        self._check_header(
            render_flow_report(trees, orphans, repo_root), repo_root, "flow"
        )

    @given(mappings=_mapping_list, issues=_issue_list, repo_root=_repo_root_str)
    @settings(max_examples=100)
    def test_alignment_report_header(
        self, mappings: list[DocMapping], issues: list[AlignmentIssue], repo_root: str
    ) -> None:
        """Feature: repo-audit, Property 14 (Validates: Requirements 4.2).

        The render_alignment_report header must contain an ISO timestamp and
        the repository path.
        """
        self._check_header(
            render_alignment_report(mappings, issues, repo_root),
            repo_root,
            "alignment",
        )
# ===========================================================================
# Property 15: 写操作仅限 docs/audit/
# ===========================================================================
class TestProperty15WritesOnlyDocsAudit:
    """Property 15: writes are confined to docs/audit/.

    Feature: repo-audit, Property 15
    Validates: Requirements 5.2

    For any audit run, every file written must live under docs/audit/.
    A real file system is required, so fewer examples are run.
    """

    @staticmethod
    def _make_minimal_repo(base: Path, variant: int) -> Path:
        """Create a minimal repository under *base* and return its root.

        The layout always contains cli/ (``__init__.py`` and ``main.py``),
        config/ (``__init__.py``) and docs/; ``variant % 3`` selects one extra
        file or directory to add some diversity.
        """
        repo = base / f"repo_{variant}"
        repo.mkdir()

        # Required CLI entry point.
        cli_dir = repo / "cli"
        cli_dir.mkdir()
        (cli_dir / "__init__.py").write_text("", encoding="utf-8")
        (cli_dir / "main.py").write_text(
            "# -*- coding: utf-8 -*-\ndef main(): pass\n",
            encoding="utf-8",
        )

        # config package.
        config_dir = repo / "config"
        config_dir.mkdir()
        (config_dir / "__init__.py").write_text("", encoding="utf-8")

        # docs directory (docs/audit/ is deliberately not created here).
        docs_dir = repo / "docs"
        docs_dir.mkdir()

        # Variant-specific extra content.
        flavour = variant % 3
        if flavour == 0:
            (repo / "README.md").write_text("# 项目\n", encoding="utf-8")
        elif flavour == 1:
            scripts_dir = repo / "scripts"
            scripts_dir.mkdir()
            (scripts_dir / "__init__.py").write_text("", encoding="utf-8")
        else:
            (docs_dir / "notes.md").write_text("# 笔记\n", encoding="utf-8")
        return repo

    @staticmethod
    def _snapshot_files(repo: Path) -> dict[str, float]:
        """Return ``{posix relative path: mtime}`` for files outside docs/audit/."""
        snap: dict[str, float] = {}
        for p in repo.rglob("*"):
            if not p.is_file():
                continue
            rel = p.relative_to(repo).as_posix()
            # The trailing slash matters: a bare "docs/audit" prefix would
            # also exempt siblings such as "docs/audit_old/x".
            if not rel.startswith("docs/audit/"):
                snap[rel] = p.stat().st_mtime
        return snap

    @given(variant=st.integers(min_value=0, max_value=9))
    @settings(max_examples=10)
    def test_writes_only_under_docs_audit(self, variant: int) -> None:
        """Feature: repo-audit, Property 15 (Validates: Requirements 5.2).

        After run_audit, docs/audit/ must exist and contain at least one
        entry, and no file outside docs/audit/ may have been created,
        modified or removed.
        """
        import tempfile

        from scripts.audit.run_audit import run_audit

        with tempfile.TemporaryDirectory() as tmp_dir:
            repo = self._make_minimal_repo(Path(tmp_dir), variant)
            before_snap = self._snapshot_files(repo)

            run_audit(repo)

            # Reports must have been produced under docs/audit/.
            audit_dir = repo / "docs" / "audit"
            assert audit_dir.is_dir(), "docs/audit/ 目录未创建"
            audit_files = list(audit_dir.iterdir())
            assert len(audit_files) > 0, "docs/audit/ 下无报告文件"

            after_snap = self._snapshot_files(repo)

            # No new file may appear outside docs/audit/.
            for rel in sorted(after_snap):
                assert rel in before_snap, (
                    f"docs/audit/ 外出现了新文件: {rel}"
                )

            # Pre-existing files outside docs/audit/ must be untouched: a
            # changed mtime means the file was rewritten, a missing entry
            # means it was deleted.
            for rel, mtime in before_snap.items():
                assert after_snap.get(rel) == mtime, (
                    f"docs/audit/ 外的文件被修改或删除: {rel}"
                )
# ===========================================================================
# 辅助函数 — 从报告文本中提取统计数字
# ===========================================================================
def _extract_summary_total(report: str, section_name: str) -> int:
"""从 inventory 报告的统计摘要中提取指定分区的数字之和。
查找 "### {section_name}" 下的 Markdown 表格,
累加每行最后一列的数字(排除合计行)。
"""
lines = report.split("\n")
in_section = False
total = 0
for line in lines:
stripped = line.strip()
if stripped == f"### {section_name}":
in_section = True
continue
if in_section and stripped.startswith("###"):
# 进入下一个子节
break
if in_section and stripped.startswith("|") and "**合计**" not in stripped:
# 跳过表头和分隔行
if stripped.startswith("| 用途分类") or stripped.startswith("| 处置标签"):
continue
if stripped.startswith("|---"):
continue
# 提取最后一列的数字
cells = [c.strip() for c in stripped.split("|") if c.strip()]
if cells:
try:
total += int(cells[-1])
except ValueError:
pass
return total
def _extract_flow_stat(report: str, label: str) -> int:
"""从 flow 报告统计摘要表格中提取指定指标的数字。"""
# 匹配 "| 孤立模块 | 5 |" 格式
pattern = re.compile(rf"\|\s*{re.escape(label)}\s*\|\s*(\d+)\s*\|")
m = pattern.search(report)
return int(m.group(1)) if m else -1
def _extract_alignment_stat(report: str, label: str) -> int:
"""从 alignment 报告统计摘要中提取指定指标的数字。
匹配 "- 过期点数量3" 格式。
"""
# 兼容全角/半角冒号
pattern = re.compile(rf"{re.escape(label)}[:]\s*(\d+)")
m = pattern.search(report)
return int(m.group(1)) if m else -1