初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

View File

@@ -0,0 +1,309 @@
# -*- coding: utf-8 -*-
"""
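Property tests — classify completeness
Feature: repo-audit, Property 1: classify completeness
Validates: Requirements 1.2, 1.3
For any FileEntry, the InventoryItem returned by the classify function must
have a category that belongs to the Category enum, a disposition that belongs
to the Disposition enum, and a description that is a non-empty string.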
"""
from __future__ import annotations
import string
from hypothesis import given, settings
from hypothesis import strategies as st
from scripts.audit import Category, Disposition, FileEntry, InventoryItem
from scripts.audit.inventory_analyzer import classify
# ---------------------------------------------------------------------------
# Generator strategies
# ---------------------------------------------------------------------------

# Common file extensions; the empty string stands for "no extension".
_EXTENSIONS = st.sampled_from(
    [
        "", ".py", ".sql", ".md", ".txt", ".json", ".csv", ".xlsx",
        ".bat", ".sh", ".ps1", ".lnk", ".rar", ".log", ".ini", ".cfg",
        ".toml", ".yaml", ".yml", ".html", ".css", ".js",
    ]
)

# Alphabet of a single path segment: ASCII letters, digits, "_", "-" and ".".
_PATH_CHARS = "".join((string.ascii_letters, string.digits, "_-."))

# One path segment of 1 to 20 characters taken from _PATH_CHARS.
# Because "." is in the alphabet, segments such as "." or ".." can be produced.
_path_segment = st.text(alphabet=_PATH_CHARS, min_size=1, max_size=20)

# Relative path built from 1 to 4 segments joined with "/".
_rel_path = st.lists(_path_segment, min_size=1, max_size=4).map("/".join)
def _file_entry_strategy() -> st.SearchStrategy[FileEntry]:
    """Return a hypothesis strategy that builds arbitrary ``FileEntry`` values.

    Every field is drawn independently: the path from ``_rel_path``, the
    extension from ``_EXTENSIONS``, the size from 0 to 10,000,000 and both
    boolean flags from ``st.booleans()``. The flags and the extension are
    therefore not tied to each other or to the path, so combinations such as
    ``is_dir=False`` with ``is_empty_dir=True`` can be generated.
    """
    # Insertion order matches the keyword order used for FileEntry.
    field_strategies = {
        "rel_path": _rel_path,
        "is_dir": st.booleans(),
        "size_bytes": st.integers(min_value=0, max_value=10_000_000),
        "extension": _EXTENSIONS,
        "is_empty_dir": st.booleans(),
    }
    return st.builds(FileEntry, **field_strategies)
# ---------------------------------------------------------------------------
# Property 1: classify completeness
# ---------------------------------------------------------------------------


@given(entry=_file_entry_strategy())
@settings(max_examples=100)
def test_classify_completeness(entry: FileEntry) -> None:
    """Property 1: classify completeness.

    Feature: repo-audit, Property 1: classify completeness
    Validates: Requirements 1.2, 1.3

    For any generated ``FileEntry``, the value returned by ``classify`` must:

    - be an ``InventoryItem``;
    - carry a ``category`` that is a ``Category`` member;
    - carry a ``disposition`` that is a ``Disposition`` member;
    - carry a ``description`` that is a non-empty ``str``.
    """
    result = classify(entry)

    # The return value itself has to be an InventoryItem.
    assert isinstance(result, InventoryItem), (
        f"classify 应返回 InventoryItem实际返回 {type(result)}"
    )

    # Both classification fields must be members of their enums.
    assert isinstance(result.category, Category), (
        f"category 应为 Category 枚举成员,实际为 {result.category!r}"
    )
    assert isinstance(result.disposition, Disposition), (
        f"disposition 应为 Disposition 枚举成员,实际为 {result.disposition!r}"
    )

    # The type check comes first so len() is only applied to a str.
    is_non_empty_text = (
        isinstance(result.description, str) and len(result.description) > 0
    )
    assert is_non_empty_text, (
        f"description 应为非空字符串,实际为 {result.description!r}"
    )
# ---------------------------------------------------------------------------
# Helpers: high-priority directory prefixes (to be kept out of the inputs of
# the lower-priority property tests)
# ---------------------------------------------------------------------------
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether it is still needed.
_HIGH_PRIORITY_PREFIXES = (
    "tmp/",
    "logs/",
    "export/",
)

# Top-level directory names considered safe, i.e. intended not to trigger the
# high-priority rules; none of them matches a prefix listed above.
_SAFE_TOP_DIRS = st.sampled_from(
    [
        "src", "lib", "data", "misc", "vendor", "tools", "archive",
        "assets", "resources", "contrib", "extras",
    ]
)

# Extension pool that leaves out ".lnk" and ".rar".
_SAFE_EXTENSIONS = st.sampled_from(
    [
        "", ".py", ".sql", ".md", ".txt", ".json", ".csv", ".xlsx",
        ".bat", ".sh", ".ps1", ".log", ".ini", ".cfg",
        ".toml", ".yaml", ".yml", ".html", ".css", ".js",
    ]
)
def _safe_rel_path() -> st.SearchStrategy[str]:
    """Return a strategy for relative paths rooted at a ``_SAFE_TOP_DIRS`` name.

    The result is either the top-level name alone or that name followed by up
    to three ``_path_segment`` values, everything joined with "/".
    """

    def _attach(top: str, rest: str) -> str:
        # An empty tail means no sub-path was drawn: the top name stands alone.
        if not rest:
            return top
        return f"{top}/{rest}"

    # "/".join([]) is already "", so the empty list needs no special case.
    tail = st.lists(_path_segment, min_size=0, max_size=3).map("/".join)
    return st.builds(_attach, top=_SAFE_TOP_DIRS, rest=tail)
# ---------------------------------------------------------------------------
# Property 3: empty directories are marked as delete candidates
# ---------------------------------------------------------------------------


@given(data=st.data())
@settings(max_examples=100)
def test_empty_dir_candidate_delete(data: st.DataObject) -> None:
    """Property 3: empty directories are marked as delete candidates.

    Feature: repo-audit, Property 3: empty directories are marked as delete
    candidates
    Validates: Requirements 1.5

    Inputs are directory entries with ``is_empty_dir=True``. Their path comes
    from ``_safe_rel_path()`` (so it never starts with ``tmp/``, ``logs/``,
    ``reports/`` or ``export/``) and their extension from
    ``_SAFE_EXTENSIONS`` (so it is never ``.lnk`` or ``.rar``). For each of
    them ``classify`` must return ``Disposition.CANDIDATE_DELETE``.
    """
    # Draw order: path first, then extension.
    path = data.draw(_safe_rel_path())
    ext = data.draw(_SAFE_EXTENSIONS)

    entry = FileEntry(
        rel_path=path,
        is_dir=True,
        size_bytes=0,
        extension=ext,
        is_empty_dir=True,
    )
    result = classify(entry)

    assert result.disposition == Disposition.CANDIDATE_DELETE, (
        f"空目录 '{entry.rel_path}' 应标记为候选删除,"
        f"实际为 {result.disposition.value}"
    )
# ---------------------------------------------------------------------------
# Property 4: .lnk/.rar files are marked as delete candidates
# ---------------------------------------------------------------------------


@given(data=st.data())
@settings(max_examples=100)
def test_lnk_rar_candidate_delete(data: st.DataObject) -> None:
    """Property 4: .lnk/.rar files are marked as delete candidates.

    Feature: repo-audit, Property 4: .lnk/.rar files are marked as delete
    candidates
    Validates: Requirements 1.6

    Inputs are file entries (``is_dir=False``, ``is_empty_dir=False``) whose
    extension is ``.lnk`` or ``.rar`` and whose path comes from
    ``_safe_rel_path()`` (so it never starts with ``tmp/``, ``logs/``,
    ``reports/`` or ``export/``). For each of them ``classify`` must return
    ``Disposition.CANDIDATE_DELETE``.
    """
    # Draw order: path, extension, size.
    path = data.draw(_safe_rel_path())
    ext = data.draw(st.sampled_from([".lnk", ".rar"]))
    size = data.draw(st.integers(min_value=0, max_value=10_000_000))

    entry = FileEntry(
        rel_path=path,
        is_dir=False,
        size_bytes=size,
        extension=ext,
        is_empty_dir=False,
    )
    result = classify(entry)

    assert result.disposition == Disposition.CANDIDATE_DELETE, (
        f"文件 '{entry.rel_path}' (ext={ext}) 应标记为候选删除,"
        f"实际为 {result.disposition.value}"
    )
# ---------------------------------------------------------------------------
# Property 5: disposition range for entries under tmp/
# ---------------------------------------------------------------------------

# Extension pool for tmp/ entries; includes ".lnk", ".rar", ".tmp" and ".bak".
_TMP_EXTENSIONS = st.sampled_from(
    [
        "", ".py", ".sql", ".md", ".txt", ".json", ".csv", ".xlsx",
        ".bat", ".sh", ".ps1", ".lnk", ".rar", ".log", ".ini", ".cfg",
        ".toml", ".yaml", ".yml", ".html", ".css", ".js", ".tmp", ".bak",
    ]
)
def _tmp_rel_path() -> st.SearchStrategy[str]:
    """Return a strategy for relative paths that start with ``tmp/``.

    The prefix is followed by one to three ``_path_segment`` values joined
    with "/".
    """

    def _under_tmp(rest: str) -> str:
        return f"tmp/{rest}"

    below_tmp = st.lists(_path_segment, min_size=1, max_size=3).map("/".join)
    return st.builds(_under_tmp, rest=below_tmp)
@given(data=st.data())
@settings(max_examples=100)
def test_tmp_disposition_range(data: st.DataObject) -> None:
    """Property 5: disposition range for entries under tmp/.

    Feature: repo-audit, Property 5: disposition range for entries under tmp/
    Validates: Requirements 1.7

    For any ``FileEntry`` whose ``rel_path`` starts with ``tmp/``, whatever
    its extension, size and flags, ``classify`` must return either
    ``Disposition.CANDIDATE_DELETE`` or ``Disposition.CANDIDATE_ARCHIVE``.
    """
    # Draw order: path, extension, is_dir, size, is_empty_dir.
    path = data.draw(_tmp_rel_path())
    ext = data.draw(_TMP_EXTENSIONS)
    is_directory = data.draw(st.booleans())
    size = data.draw(st.integers(min_value=0, max_value=10_000_000))
    empty_flag = data.draw(st.booleans())

    entry = FileEntry(
        rel_path=path,
        is_dir=is_directory,
        size_bytes=size,
        extension=ext,
        is_empty_dir=empty_flag,
    )
    result = classify(entry)

    acceptable = {Disposition.CANDIDATE_DELETE, Disposition.CANDIDATE_ARCHIVE}
    assert result.disposition in acceptable, (
        f"tmp/ 下文件 '{entry.rel_path}' 的处置应为候选删除或候选归档,"
        f"实际为 {result.disposition.value}"
    )
# ---------------------------------------------------------------------------
# Property 6: runtime output directories are marked as archive candidates
# ---------------------------------------------------------------------------

# Top-level directories treated as runtime output in this test.
_RUNTIME_DIRS = st.sampled_from(["logs", "export"])

# File names of 1 to 20 _PATH_CHARS characters, rejecting "__init__.py".
_NON_INIT_BASENAME = st.text(alphabet=_PATH_CHARS, min_size=1, max_size=20).filter(
    lambda name: name != "__init__.py"
)
def _runtime_output_rel_path() -> st.SearchStrategy[str]:
    """Return a strategy for relative paths under a ``_RUNTIME_DIRS`` directory.

    The path has the form ``<top>/<name>`` or ``<top>/<mid...>/<name>`` with
    at most two intermediate ``_path_segment`` values. ``name`` is drawn from
    ``_NON_INIT_BASENAME``, so the basename is never ``__init__.py``.
    """

    def _assemble(top: str, mid: list[str], name: str) -> str:
        # With no intermediate segments this collapses to "<top>/<name>".
        return "/".join([top, *mid, name])

    return st.builds(
        _assemble,
        top=_RUNTIME_DIRS,
        mid=st.lists(_path_segment, min_size=0, max_size=2),
        name=_NON_INIT_BASENAME,
    )
@given(data=st.data())
@settings(max_examples=100)
def test_runtime_output_candidate_archive(data: st.DataObject) -> None:
    """Property 6: runtime output directories are marked as archive candidates.

    Feature: repo-audit, Property 6: runtime output directories are marked as
    archive candidates
    Validates: Requirements 1.8

    For any ``FileEntry`` whose ``rel_path`` starts with ``logs/`` or
    ``export/`` and whose basename is not ``__init__.py``, ``classify`` must
    return ``Disposition.CANDIDATE_ARCHIVE``. Requirement 1.8 covers only the
    ``logs/`` and ``export/`` directories (``reports/`` is not included).
    """
    # Draw order: path, extension, is_dir, size, is_empty_dir.
    path = data.draw(_runtime_output_rel_path())
    ext = data.draw(_EXTENSIONS)
    is_directory = data.draw(st.booleans())
    size = data.draw(st.integers(min_value=0, max_value=10_000_000))
    empty_flag = data.draw(st.booleans())

    entry = FileEntry(
        rel_path=path,
        is_dir=is_directory,
        size_bytes=size,
        extension=ext,
        is_empty_dir=empty_flag,
    )
    result = classify(entry)

    assert result.disposition == Disposition.CANDIDATE_ARCHIVE, (
        f"运行时产出 '{entry.rel_path}' 应标记为候选归档,"
        f"实际为 {result.disposition.value}"
    )