初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

View File

@@ -0,0 +1,304 @@
# -*- coding: utf-8 -*-
"""Property-based tests for PipelineRunner.

Uses hypothesis to verify general correctness properties of the pipeline
orchestrator: pipeline-name-to-layer resolution, processing-mode flow
control, and result-summary completeness.
"""
import string
from datetime import datetime, timedelta
from unittest.mock import MagicMock, patch
import pytest
from hypothesis import given, settings
from hypothesis import strategies as st
from orchestration.pipeline_runner import PipelineRunner
# run() imports TaskLogger lazily, so patches must target the source module path.
_TASK_LOGGER_PATH = "utils.task_logger.TaskLogger"
# NOTE(review): FILE_VERSION appears unused within this chunk — confirm before removing.
FILE_VERSION = "v1_shell"
# ── Strategy definitions ──────────────────────────────────────────
# Valid pipeline names are taken directly from the runner's own layer map,
# so the strategy stays in sync with PIPELINE_LAYERS automatically.
pipeline_name_st = st.sampled_from(list(PipelineRunner.PIPELINE_LAYERS.keys()))
processing_mode_st = st.sampled_from(["increment_only", "verify_only", "increment_verify"])
data_source_st = st.sampled_from(["online", "offline", "hybrid"])
_TASK_PREFIXES = ["ODS_", "DWD_", "DWS_", "INDEX_"]
# Task codes look like "<LAYER>_<1-12 chars of A-Z, digits, underscore>".
task_code_st = st.builds(
    lambda prefix, suffix: prefix + suffix,
    prefix=st.sampled_from(_TASK_PREFIXES),
    suffix=st.text(
        alphabet=string.ascii_uppercase + string.digits + "_",
        min_size=1, max_size=12,
    ),
)
# Generator for a single task-result dict, mirroring the shape returned by
# the task executor (status plus per-category row counts; no dump directory).
task_result_st = st.fixed_dictionaries({
    "task_code": task_code_st,
    "status": st.sampled_from(["SUCCESS", "FAIL", "SKIP"]),
    "counts": st.fixed_dictionaries({
        "fetched": st.integers(min_value=0, max_value=10000),
        "inserted": st.integers(min_value=0, max_value=10000),
        "updated": st.integers(min_value=0, max_value=10000),
        "skipped": st.integers(min_value=0, max_value=10000),
        "errors": st.integers(min_value=0, max_value=100),
    }),
    "dump_dir": st.none(),
})
task_results_st = st.lists(task_result_st, min_size=0, max_size=10)
# ── 辅助函数 ──────────────────────────────────────────────────────
def _make_config():
"""创建 mock 配置对象。"""
config = MagicMock()
config.get = MagicMock(side_effect=lambda key, default=None: {
"app.timezone": "Asia/Shanghai",
"verification.ods_use_local_json": False,
"verification.skip_ods_when_fetch_before_verify": True,
"run.ods_tasks": [],
"run.dws_tasks": [],
"run.index_tasks": [],
}.get(key, default))
return config
def _make_runner(task_executor=None, task_registry=None):
    """Create a PipelineRunner wired up with mock collaborators.

    When executor/registry are omitted, stubs are supplied: the executor
    yields an empty result list and the registry resolves every layer to
    a single fake task code.
    """
    executor = task_executor
    if executor is None:
        executor = MagicMock()
        executor.run_tasks.return_value = []
    registry = task_registry
    if registry is None:
        registry = MagicMock()
        registry.get_tasks_by_layer.return_value = ["FAKE_TASK"]
    return PipelineRunner(
        config=_make_config(),
        task_executor=executor,
        task_registry=registry,
        db_conn=MagicMock(),
        api_client=MagicMock(),
        logger=MagicMock(),
    )
# ── Property 5: 管道名称→层列表映射 ──────────────────────────────
# Feature: scheduler-refactor, Property 5: 管道名称→层列表映射
# **Validates: Requirements 2.1**
class TestProperty5PipelineNameToLayers:
    """For every valid pipeline name, the layer list resolved by
    PipelineRunner must match the PIPELINE_LAYERS definition exactly."""

    @given(pipeline=pipeline_name_st)
    @settings(max_examples=100)
    def test_layers_match_pipeline_definition(self, pipeline):
        """run() reports a ``layers`` field identical to PIPELINE_LAYERS[pipeline]."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = []
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH):
            outcome = runner.run(
                pipeline=pipeline,
                processing_mode="increment_only",
                data_source="offline",
            )
        assert outcome["layers"] == PipelineRunner.PIPELINE_LAYERS[pipeline]

    @given(pipeline=pipeline_name_st)
    @settings(max_examples=100)
    def test_resolve_tasks_called_with_correct_layers(self, pipeline):
        """_resolve_tasks receives exactly the layer list defined in PIPELINE_LAYERS."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = []
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH), \
             patch.object(runner, "_resolve_tasks", wraps=runner._resolve_tasks) as spy:
            runner.run(
                pipeline=pipeline,
                processing_mode="increment_only",
                data_source="offline",
            )
        spy.assert_called_once_with(PipelineRunner.PIPELINE_LAYERS[pipeline])
# ── Property 6: processing_mode 控制执行流程 ─────────────────────
# Feature: scheduler-refactor, Property 6: processing_mode 控制执行流程
# **Validates: Requirements 2.3, 2.4**
class TestProperty6ProcessingModeControlsFlow:
    """For any processing_mode: incremental ETL runs iff the mode contains
    'increment'; verification runs iff the mode contains 'verify'."""

    @given(
        pipeline=pipeline_name_st,
        mode=processing_mode_st,
        data_source=data_source_st,
    )
    @settings(max_examples=100)
    def test_increment_executes_iff_mode_contains_increment(self, pipeline, mode, data_source):
        """Incremental ETL (task_executor.run_tasks) fires exactly when mode has 'increment'."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = []
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH), \
             patch.object(runner, "_run_verification", return_value={"status": "COMPLETED"}):
            runner.run(
                pipeline=pipeline,
                processing_mode=mode,
                data_source=data_source,
            )
        if "increment" in mode:
            assert stub_executor.run_tasks.called, (
                f"mode={mode} 包含 'increment',但 run_tasks 未被调用"
            )
        else:
            # verify_only with fetch_before_verify at its default (False):
            # no ETL execution is expected.
            assert not stub_executor.run_tasks.called, (
                f"mode={mode} 不包含 'increment',但 run_tasks 被调用了"
            )

    @given(
        pipeline=pipeline_name_st,
        mode=processing_mode_st,
        data_source=data_source_st,
    )
    @settings(max_examples=100)
    def test_verification_executes_iff_mode_contains_verify(self, pipeline, mode, data_source):
        """Verification (_run_verification) fires exactly when mode has 'verify'."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = []
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH), \
             patch.object(
                 runner, "_run_verification", return_value={"status": "COMPLETED"}
             ) as mock_verify:
            runner.run(
                pipeline=pipeline,
                processing_mode=mode,
                data_source=data_source,
            )
        if "verify" in mode:
            assert mock_verify.called, (
                f"mode={mode} 包含 'verify',但 _run_verification 未被调用"
            )
        else:
            assert not mock_verify.called, (
                f"mode={mode} 不包含 'verify',但 _run_verification 被调用了"
            )
# ── Property 7: 管道结果汇总完整性 ──────────────────────────────
# Feature: scheduler-refactor, Property 7: 管道结果汇总完整性
# **Validates: Requirements 2.6**
class TestProperty7PipelineSummaryCompleteness:
    """For any set of task-execution results, the summary dict returned by
    PipelineRunner carries status/pipeline/layers/results fields, and the
    results list is exactly as long as the number of executed tasks."""

    @given(
        pipeline=pipeline_name_st,
        task_results=task_results_st,
    )
    @settings(max_examples=100)
    def test_summary_has_required_fields(self, pipeline, task_results):
        """The summary must contain status, pipeline, layers, results, verification_summary."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = task_results
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH):
            result = runner.run(
                pipeline=pipeline,
                processing_mode="increment_only",
                data_source="offline",
            )
        required_keys = {"status", "pipeline", "layers", "results", "verification_summary"}
        missing = required_keys - result.keys()
        assert not missing, (
            f"缺少必要字段: {required_keys - result.keys()}"
        )

    @given(
        pipeline=pipeline_name_st,
        task_results=task_results_st,
    )
    @settings(max_examples=100)
    def test_results_length_equals_executed_tasks(self, pipeline, task_results):
        """len(results) equals the number of entries task_executor.run_tasks returned."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = task_results
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH):
            result = runner.run(
                pipeline=pipeline,
                processing_mode="increment_only",
                data_source="offline",
            )
        assert len(result["results"]) == len(task_results), (
            f"results 长度 {len(result['results'])} != 实际任务数 {len(task_results)}"
        )

    @given(
        pipeline=pipeline_name_st,
        task_results=task_results_st,
    )
    @settings(max_examples=100)
    def test_pipeline_and_layers_match_input(self, pipeline, task_results):
        """Echoed pipeline/layers fields agree with the requested pipeline."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = task_results
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH):
            result = runner.run(
                pipeline=pipeline,
                processing_mode="increment_only",
                data_source="offline",
            )
        assert result["pipeline"] == pipeline
        assert result["layers"] == PipelineRunner.PIPELINE_LAYERS[pipeline]

    @given(
        pipeline=pipeline_name_st,
        task_results=task_results_st,
    )
    @settings(max_examples=100)
    def test_increment_only_has_no_verification(self, pipeline, task_results):
        """Under increment_only, verification_summary must be None."""
        stub_executor = MagicMock()
        stub_executor.run_tasks.return_value = task_results
        runner = _make_runner(task_executor=stub_executor)
        with patch(_TASK_LOGGER_PATH):
            result = runner.run(
                pipeline=pipeline,
                processing_mode="increment_only",
                data_source="offline",
            )
        assert result["verification_summary"] is None