Files
Neo-ZQYY/apps/etl/pipelines/feiqiu/tests/unit/test_audit_scanner.py

429 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
单元测试 — 仓库扫描器 (scanner.py)
覆盖:
- 排除模式匹配逻辑
- 递归遍历与 FileEntry 构建
- 空目录检测
- 权限错误容错
"""
from __future__ import annotations
import os
from pathlib import Path
import pytest
from scripts.audit import FileEntry
from scripts.audit.scanner import EXCLUDED_PATTERNS, _is_excluded, scan_repo
# ---------------------------------------------------------------------------
# _is_excluded 单元测试
# ---------------------------------------------------------------------------
class TestIsExcluded:
    """Checks for the exclusion-pattern matcher ``_is_excluded``."""

    @staticmethod
    def _default_verdict(name: str) -> bool:
        """Return the matcher's answer for *name* against ``EXCLUDED_PATTERNS``."""
        return _is_excluded(name, EXCLUDED_PATTERNS)

    def test_exact_match_git(self) -> None:
        """``.git`` is expected to match the default patterns."""
        verdict = self._default_verdict(".git")
        assert verdict is True

    def test_exact_match_pycache(self) -> None:
        """``__pycache__`` is expected to match the default patterns."""
        verdict = self._default_verdict("__pycache__")
        assert verdict is True

    def test_exact_match_pytest_cache(self) -> None:
        """``.pytest_cache`` is expected to match the default patterns."""
        verdict = self._default_verdict(".pytest_cache")
        assert verdict is True

    def test_exact_match_kiro(self) -> None:
        """``.kiro`` is expected to match the default patterns."""
        verdict = self._default_verdict(".kiro")
        assert verdict is True

    def test_wildcard_pyc(self) -> None:
        """A ``*.pyc`` file name is expected to match via the wildcard pattern."""
        verdict = self._default_verdict("module.pyc")
        assert verdict is True

    def test_normal_py_not_excluded(self) -> None:
        """An ordinary ``.py`` file must not be excluded."""
        verdict = self._default_verdict("main.py")
        assert verdict is False

    def test_normal_dir_not_excluded(self) -> None:
        """An ordinary directory name must not be excluded."""
        verdict = self._default_verdict("src")
        assert verdict is False

    def test_empty_patterns(self) -> None:
        """With an empty pattern list nothing is excluded, not even ``.git``."""
        no_patterns = []
        verdict = _is_excluded(".git", no_patterns)
        assert verdict is False

    def test_custom_pattern(self) -> None:
        """A caller-supplied wildcard pattern is honoured."""
        csv_only = ["*.csv"]
        verdict = _is_excluded("data.csv", csv_only)
        assert verdict is True
# ---------------------------------------------------------------------------
# scan_repo 单元测试
# ---------------------------------------------------------------------------
class TestScanRepo:
    """Tests for the recursive traversal performed by ``scan_repo``.

    Every test builds a small tree under pytest's ``tmp_path`` fixture, scans
    it, and inspects the returned entries (``rel_path``, ``is_dir``,
    ``size_bytes``, ``extension``, ``is_empty_dir``).
    """

    def test_basic_structure(self, tmp_path: Path) -> None:
        """Top-level files, directories and nested files are all reported."""
        (tmp_path / "a.py").write_text("# code", encoding="utf-8")
        sub = tmp_path / "sub"
        sub.mkdir()
        (sub / "b.txt").write_text("hello", encoding="utf-8")

        entries = scan_repo(tmp_path)
        paths = {e.rel_path for e in entries}

        assert "a.py" in paths
        assert "sub" in paths
        assert "sub/b.txt" in paths

    def test_file_entry_fields(self, tmp_path: Path) -> None:
        """A file entry carries the expected field values."""
        (tmp_path / "hello.md").write_text("# hi", encoding="utf-8")

        entries = scan_repo(tmp_path)
        md = next(e for e in entries if e.rel_path == "hello.md")

        assert md.is_dir is False
        assert md.size_bytes > 0
        assert md.extension == ".md"
        assert md.is_empty_dir is False

    def test_directory_entry_fields(self, tmp_path: Path) -> None:
        """A non-empty directory entry carries the expected field values."""
        sub = tmp_path / "mydir"
        sub.mkdir()
        (sub / "file.py").write_text("pass", encoding="utf-8")

        entries = scan_repo(tmp_path)
        d = next(e for e in entries if e.rel_path == "mydir")

        assert d.is_dir is True
        assert d.size_bytes == 0
        assert d.extension == ""
        assert d.is_empty_dir is False

    def test_excluded_git_dir(self, tmp_path: Path) -> None:
        """The ``.git`` directory and its contents are excluded."""
        git_dir = tmp_path / ".git"
        git_dir.mkdir()
        (git_dir / "config").write_text("", encoding="utf-8")

        entries = scan_repo(tmp_path)
        paths = {e.rel_path for e in entries}

        assert ".git" not in paths
        assert ".git/config" not in paths

    def test_excluded_pycache(self, tmp_path: Path) -> None:
        """``__pycache__`` directories are excluded, even when nested."""
        cache = tmp_path / "pkg" / "__pycache__"
        cache.mkdir(parents=True)
        (cache / "mod.cpython-310.pyc").write_bytes(b"\x00")

        entries = scan_repo(tmp_path)
        paths = {e.rel_path for e in entries}

        assert not any("__pycache__" in p for p in paths)

    def test_excluded_pyc_files(self, tmp_path: Path) -> None:
        """``*.pyc`` files are excluded while sibling ``.py`` files are kept."""
        (tmp_path / "mod.pyc").write_bytes(b"\x00")
        (tmp_path / "mod.py").write_text("pass", encoding="utf-8")

        entries = scan_repo(tmp_path)
        paths = {e.rel_path for e in entries}

        assert "mod.pyc" not in paths
        assert "mod.py" in paths

    def test_empty_directory_detection(self, tmp_path: Path) -> None:
        """An empty directory is flagged with ``is_empty_dir=True``."""
        (tmp_path / "empty").mkdir()

        entries = scan_repo(tmp_path)
        d = next(e for e in entries if e.rel_path == "empty")

        assert d.is_dir is True
        assert d.is_empty_dir is True

    def test_dir_with_only_excluded_children(self, tmp_path: Path) -> None:
        """A directory whose only children are excluded counts as empty."""
        sub = tmp_path / "pkg"
        sub.mkdir()
        cache = sub / "__pycache__"
        cache.mkdir()
        (cache / "x.pyc").write_bytes(b"\x00")

        entries = scan_repo(tmp_path)
        d = next(e for e in entries if e.rel_path == "pkg")

        assert d.is_empty_dir is True

    def test_custom_exclude_patterns(self, tmp_path: Path) -> None:
        """Patterns passed through ``exclude=`` take effect."""
        (tmp_path / "keep.py").write_text("pass", encoding="utf-8")
        (tmp_path / "skip.log").write_text("log", encoding="utf-8")

        entries = scan_repo(tmp_path, exclude=["*.log"])
        paths = {e.rel_path for e in entries}

        assert "keep.py" in paths
        assert "skip.log" not in paths

    def test_empty_repo(self, tmp_path: Path) -> None:
        """Scanning an empty repository returns an empty list."""
        entries = scan_repo(tmp_path)
        assert entries == []

    def test_results_sorted(self, tmp_path: Path) -> None:
        """Entries are returned sorted by ``rel_path``."""
        (tmp_path / "z.py").write_text("", encoding="utf-8")
        (tmp_path / "a.py").write_text("", encoding="utf-8")
        sub = tmp_path / "m"
        sub.mkdir()
        (sub / "b.py").write_text("", encoding="utf-8")

        entries = scan_repo(tmp_path)
        paths = [e.rel_path for e in entries]

        assert paths == sorted(paths)

    @pytest.mark.skipif(
        os.name == "nt",
        reason="Windows 上 chmod 行为不同,跳过权限测试",
    )
    @pytest.mark.skipif(
        # root bypasses permission bits, so chmod(0o000) would not block the
        # scanner and the "children must not appear" assertion would fail.
        hasattr(os, "geteuid") and os.geteuid() == 0,
        reason="running as root: chmod(0o000) does not deny access",
    )
    def test_permission_error_skipped(self, tmp_path: Path) -> None:
        """An unreadable directory is skipped without aborting the scan."""
        ok_file = tmp_path / "ok.py"
        ok_file.write_text("pass", encoding="utf-8")
        no_access = tmp_path / "secret"
        no_access.mkdir()
        (no_access / "data.txt").write_text("x", encoding="utf-8")
        no_access.chmod(0o000)
        try:
            entries = scan_repo(tmp_path)
            paths = {e.rel_path for e in entries}
            # ok.py must still be scanned normally.
            assert "ok.py" in paths
            # The `secret` directory itself may still be recorded (per the
            # original note, _walk records a directory before attempting
            # iterdir), but its children must not appear.
            assert "secret/data.txt" not in paths
        finally:
            # Restore permissions so pytest can clean up tmp_path.
            no_access.chmod(0o755)

    def test_nested_directories(self, tmp_path: Path) -> None:
        """Every level of a deeply nested tree is traversed."""
        deep = tmp_path / "a" / "b" / "c"
        deep.mkdir(parents=True)
        (deep / "leaf.py").write_text("pass", encoding="utf-8")

        entries = scan_repo(tmp_path)
        paths = {e.rel_path for e in entries}

        assert "a" in paths
        assert "a/b" in paths
        assert "a/b/c" in paths
        assert "a/b/c/leaf.py" in paths

    def test_extension_lowercase(self, tmp_path: Path) -> None:
        """Extensions are normalised to lower case."""
        (tmp_path / "README.MD").write_text("", encoding="utf-8")

        entries = scan_repo(tmp_path)
        md = next(e for e in entries if "README" in e.rel_path)

        assert md.extension == ".md"

    def test_no_extension(self, tmp_path: Path) -> None:
        """A file without an extension reports an empty ``extension``."""
        (tmp_path / "Makefile").write_text("", encoding="utf-8")

        entries = scan_repo(tmp_path)
        f = next(e for e in entries if e.rel_path == "Makefile")

        assert f.extension == ""

    def test_root_not_in_entries(self, tmp_path: Path) -> None:
        """The scan root itself never appears among the entries."""
        (tmp_path / "a.py").write_text("", encoding="utf-8")

        entries = scan_repo(tmp_path)
        paths = {e.rel_path for e in entries}

        assert "." not in paths
        assert "" not in paths
# ---------------------------------------------------------------------------
# 属性测试 — Property 7: 扫描器排除规则
# Feature: repo-audit, Property 7: 扫描器排除规则
# Validates: Requirements 1.1
# ---------------------------------------------------------------------------
import fnmatch
import string
import tempfile
from hypothesis import given, settings
from hypothesis import strategies as st
# --- Generator strategies ---

# Characters allowed in generated file/directory names (no path separators
# or other special characters).
_SAFE_CHARS = "".join((string.ascii_lowercase, string.digits, "_-"))

# Strategy for ordinary names: 1 to 8 characters drawn from _SAFE_CHARS,
# intended not to clash with the exclusion patterns.
_safe_name = st.text(alphabet=_SAFE_CHARS, min_size=1, max_size=8)

# Directory names that appear in the exclusion patterns.
_EXCLUDED_DIR_NAMES = [
    ".git",
    "__pycache__",
    ".pytest_cache",
    ".kiro",
]

# File extension that appears in the exclusion patterns.
_EXCLUDED_FILE_EXT = ".pyc"

# Strategy that picks one of the excluded directory names.
_excluded_dir_name = st.sampled_from(_EXCLUDED_DIR_NAMES)
def _build_tree(tmp: Path, normal_names: list[str], excluded_dirs: list[str],
include_pyc: bool) -> None:
"""在临时目录中构建包含正常文件和被排除条目的文件树。"""
# 创建正常文件
for name in normal_names:
safe = name or "f"
filepath = tmp / f"{safe}.txt"
if not filepath.exists():
filepath.write_text("ok", encoding="utf-8")
# 创建被排除的目录(含子文件)
for dirname in excluded_dirs:
d = tmp / dirname
d.mkdir(exist_ok=True)
(d / "inner.txt").write_text("hidden", encoding="utf-8")
# 可选:创建 .pyc 文件
if include_pyc:
(tmp / "module.pyc").write_bytes(b"\x00")
class TestProperty7ScannerExclusionRules:
    """Property 7: scanner exclusion rules.

    For any file tree, the entries returned by ``scan_repo`` must not contain
    a ``rel_path`` with a segment matching an exclusion pattern (``.git``,
    ``__pycache__``, ``.pytest_cache``, ...).

    Feature: repo-audit, Property 7: scanner exclusion rules
    Validates: Requirements 1.1

    Each example creates real files in a temporary directory, so Hypothesis's
    per-example deadline is disabled to avoid spurious failures on slow disks.
    """

    @given(
        normal_names=st.lists(_safe_name, min_size=0, max_size=5),
        excluded_dirs=st.lists(_excluded_dir_name, min_size=1, max_size=3),
        include_pyc=st.booleans(),
    )
    @settings(max_examples=100, deadline=None)
    def test_excluded_entries_never_in_results(
        self,
        normal_names: list[str],
        excluded_dirs: list[str],
        include_pyc: bool,
    ) -> None:
        """No scanned entry may match any of the default exclusion patterns."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp = Path(tmpdir)
            _build_tree(tmp, normal_names, excluded_dirs, include_pyc)
            entries = scan_repo(tmp)
            for entry in entries:
                # Check every path segment against every exclusion pattern.
                parts = entry.rel_path.split("/")
                for part in parts:
                    for pat in EXCLUDED_PATTERNS:
                        assert not fnmatch.fnmatch(part, pat), (
                            f"排除模式 '{pat}' 不应出现在结果中,"
                            f"但发现 rel_path='{entry.rel_path}' 包含 '{part}'"
                        )

    @given(
        excluded_dir=_excluded_dir_name,
        depth=st.integers(min_value=1, max_value=3),
    )
    @settings(max_examples=100, deadline=None)
    def test_excluded_dirs_at_any_depth(
        self,
        excluded_dir: str,
        depth: int,
    ) -> None:
        """An excluded directory is absent from the results at any depth."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp = Path(tmpdir)
            # Build level0/level1/.../<excluded_dir>/secret.txt
            current = tmp
            for i in range(depth):
                current = current / f"level{i}"
                current.mkdir(exist_ok=True)
                # A regular file keeps each parent directory non-empty.
                (current / "keep.txt").write_text("ok", encoding="utf-8")
            # Place the excluded directory at the deepest level.
            excluded = current / excluded_dir
            excluded.mkdir(exist_ok=True)
            (excluded / "secret.txt").write_text("hidden", encoding="utf-8")
            entries = scan_repo(tmp)
            for entry in entries:
                parts = entry.rel_path.split("/")
                assert excluded_dir not in parts, (
                    f"被排除目录 '{excluded_dir}' 不应出现在结果中,"
                    f"但发现 rel_path='{entry.rel_path}'"
                )

    @given(
        custom_patterns=st.lists(
            st.sampled_from(["*.log", "*.tmp", "*.bak", "node_modules", ".venv"]),
            min_size=1,
            max_size=3,
        ),
    )
    @settings(max_examples=100, deadline=None)
    def test_custom_exclude_patterns_respected(
        self,
        custom_patterns: list[str],
    ) -> None:
        """Caller-supplied exclusion patterns are honoured by ``scan_repo``."""
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp = Path(tmpdir)
            # One ordinary file that should survive the scan.
            (tmp / "main.py").write_text("pass", encoding="utf-8")
            # Create a matching file or directory for each custom pattern.
            for pat in custom_patterns:
                if pat.startswith("*."):
                    # Wildcard pattern -> create a matching file.
                    ext = pat[1:]  # e.g. ".log"
                    (tmp / f"data{ext}").write_text("x", encoding="utf-8")
                else:
                    # Exact-name pattern -> create a directory with one child.
                    d = tmp / pat
                    d.mkdir(exist_ok=True)
                    (d / "inner.txt").write_text("x", encoding="utf-8")
            entries = scan_repo(tmp, exclude=custom_patterns)
            for entry in entries:
                parts = entry.rel_path.split("/")
                for part in parts:
                    for pat in custom_patterns:
                        assert not fnmatch.fnmatch(part, pat), (
                            f"自定义排除模式 '{pat}' 不应出现在结果中,"
                            f"但发现 rel_path='{entry.rel_path}' 包含 '{part}'"
                        )