Commit all pending changes before preparing the environment.

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

File diff suppressed because it is too large


@@ -0,0 +1,572 @@
# -*- coding: utf-8 -*-
"""
DWD Phase 1 Refactor: Property Tests
Feature: dwd-phase1-refactor
Test location: tests/ at the monorepo root
Uses hypothesis + FakeCursor to verify the core correctness properties of the refactored DwdLoadTask.
"""
from __future__ import annotations
import re
import sys
import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Dict, List, Sequence, Tuple
from unittest.mock import MagicMock
from hypothesis import given, settings, assume
import hypothesis.strategies as st
# ── Add the ETL module to sys.path ──
_ETL_ROOT = Path(__file__).resolve().parent.parent / "apps" / "etl" / "connectors" / "feiqiu"
if str(_ETL_ROOT) not in sys.path:
sys.path.insert(0, str(_ETL_ROOT))
from tasks.dwd.dwd_load_task import DwdLoadTask
# ── FakeCursor for tests (extends task_test_utils with fetchmany support) ──
class PropertyTestCursor:
"""捕获 SQL 语句的极简游标,用于属性测试验证 SQL 结构。"""
def __init__(self):
self.executed: List[Dict[str, Any]] = []
self._fetchall_rows: List[Tuple] = []
self.rowcount = 0
self.connection = SimpleNamespace(encoding="UTF8")
def execute(self, sql: str, params=None):
sql_text = sql.decode("utf-8", errors="ignore") if isinstance(sql, (bytes, bytearray)) else str(sql)
self.executed.append({"sql": sql_text.strip(), "params": params})
self._fetchall_rows = []
lowered = sql_text.lower()
# information_schema queries return fake column info
if "from information_schema.columns" in lowered:
table_name = params[1] if params and len(params) >= 2 else None
self._fetchall_rows = self._fake_columns(table_name)
return
if "from information_schema.table_constraints" in lowered:
self._fetchall_rows = []
return
self.rowcount = 0
def fetchone(self):
return None
def fetchall(self):
return list(self._fetchall_rows)
def fetchmany(self, size=None):
"""_merge_fact_increment 使用 fetchmany 读取 RETURNING 结果。"""
# 返回空列表表示无更多行,终止循环
return []
def mogrify(self, template, args):
return b"(?)"
@staticmethod
def _fake_columns(table_name: str | None) -> List[Tuple[str, str, str]]:
"""返回假列信息,确保包含 fetched_at 和常见列。"""
return [
("id", "bigint", "int8"),
("site_id", "bigint", "int8"),
("record_index", "integer", "int4"),
("content_hash", "text", "text"),
("fetched_at", "timestamp with time zone", "timestamptz"),
("payload", "jsonb", "jsonb"),
]
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
# ── Helper: build a minimal usable DwdLoadTask instance ──
def _make_task(fact_upsert: bool = True) -> DwdLoadTask:
"""构造一个用于测试的 DwdLoadTask使用 mock config/db/api/logger。"""
config = MagicMock()
config.get = lambda key, default=None: {
"app.store_id": 1,
"app.timezone": "Asia/Shanghai",
"dwd.fact_upsert": fact_upsert,
}.get(key, default)
db = MagicMock()
api = MagicMock()
logger = logging.getLogger("test_dwd_property")
logger.setLevel(logging.WARNING) # suppress test noise
return DwdLoadTask(config, db, api, logger)
# ── Hypothesis strategies ──
@st.composite
def window_pair(draw):
"""生成有效的 (window_start, window_end) 时间对,确保 start < end。"""
base = draw(st.datetimes(
min_value=datetime(2020, 1, 1),
max_value=datetime(2030, 12, 31),
timezones=st.just(timezone.utc),
))
delta_minutes = draw(st.integers(min_value=1, max_value=10080)) # 1 minute to 7 days
end = base + timedelta(minutes=delta_minutes)
return base, end
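# Example draw (illustrative values only): base=2024-05-01T00:00Z with delta=30
# yields (datetime(2024, 5, 1, tzinfo=timezone.utc),
#         datetime(2024, 5, 1, 0, 30, tzinfo=timezone.utc)).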
# Extract all fact tables (dwd_ prefix) from TABLE_MAP
_FACT_TABLES = [
(dwd, ods)
for dwd, ods in DwdLoadTask.TABLE_MAP.items()
if dwd.split(".")[-1].startswith("dwd_")
]
# ══════════════════════════════════════════════════════════════
# Property 1: fact-table incremental SQL always uses a window-range condition
# Feature: dwd-phase1-refactor, Property 1: fact-table incremental SQL always uses a window-range condition
# ══════════════════════════════════════════════════════════════
# Matches the two-sided window condition: fetched_at >= %s AND fetched_at < %s
_WINDOW_PATTERN = re.compile(
r'"fetched_at"\s*>=\s*%s\s+AND\s+"fetched_at"\s*<\s*%s',
re.IGNORECASE,
)
# Matches the one-sided watermark condition: fetched_at > %s (not followed by AND fetched_at < %s)
_SINGLE_WATERMARK_PATTERN = re.compile(
r'"fetched_at"\s*>\s*%s(?!\s+AND\s+"fetched_at"\s*<\s*%s)',
re.IGNORECASE,
)
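# Illustration (hypothetical SQL fragments, not taken from the task source):
#   accepted by _WINDOW_PATTERN:
#     WHERE "fetched_at" >= %s AND "fetched_at" < %s
#   flagged by _SINGLE_WATERMARK_PATTERN (the shape Property 1 forbids):
#     WHERE "fetched_at" > %s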
@given(
table_pair=st.sampled_from(_FACT_TABLES),
window=window_pair(),
)
@settings(max_examples=100)
def test_fact_increment_sql_uses_window_range(
table_pair: Tuple[str, str],
window: Tuple[datetime, datetime],
):
"""
Property 1: fact-table incremental SQL always uses a window-range condition
For any fact table and any window_start/window_end pair, the WHERE clause
of the SQL generated by _merge_fact_increment() must:
- contain fetched_at >= %s AND fetched_at < %s
- not contain the one-sided watermark condition fetched_at > %s
**Validates: Requirements 1.1, 1.2**
"""
dwd_table, ods_table = table_pair
window_start, window_end = window
task = _make_task()
cur = PropertyTestCursor()
# Column names returned by the FakeCursor (simulating information_schema results)
fake_cols = [row[0] for row in PropertyTestCursor._fake_columns(None)]
# Target columns from this table's FACT_MAPPINGS entries
mapping_entries = task.FACT_MAPPINGS.get(dwd_table) or []
mapping_dest = [dst for dst, _, _ in mapping_entries]
# Build dwd_cols and ods_cols (merge mapped columns with the fake columns)
dwd_cols = list(set(fake_cols + mapping_dest))
ods_cols = list(fake_cols)
# Build type dicts
fake_types = {row[0]: row[1] for row in PropertyTestCursor._fake_columns(None)}
dwd_types = dict(fake_types)
ods_types = dict(fake_types)
# Fill in types for mapped target columns
for col in mapping_dest:
if col not in dwd_types:
dwd_types[col] = "text"
result = task._merge_fact_increment(
cur=cur,
dwd_table=dwd_table,
ods_table=ods_table,
dwd_cols=dwd_cols,
ods_cols=ods_cols,
dwd_types=dwd_types,
ods_types=ods_types,
window_start=window_start,
window_end=window_end,
)
# Find the main INSERT SQL (excluding information_schema queries)
insert_sqls = [
entry["sql"]
for entry in cur.executed
if entry["sql"].upper().startswith("INSERT INTO")
]
assert len(insert_sqls) == 1, (
f"期望恰好 1 条 INSERT SQL实际 {len(insert_sqls)} 条,"
f"表: {dwd_table}"
)
sql = insert_sqls[0]
# Verify: contains the two-sided window condition
assert _WINDOW_PATTERN.search(sql), (
f"INSERT SQL is missing the two-sided window condition (fetched_at >= %s AND fetched_at < %s), "
f"table: {dwd_table}\nSQL: {sql[:500]}"
)
# Verify: does not contain the one-sided watermark condition
assert not _SINGLE_WATERMARK_PATTERN.search(sql), (
f"INSERT SQL contains the one-sided watermark condition (fetched_at > %s), "
f"table: {dwd_table}\nSQL: {sql[:500]}"
)
# Verify: SQL params contain window_start and window_end
insert_entry = [
entry for entry in cur.executed
if entry["sql"].upper().startswith("INSERT INTO")
][0]
params = insert_entry["params"]
assert params is not None, f"INSERT SQL params are None, table: {dwd_table}"
assert window_start in params, (
f"SQL params are missing window_start={window_start}, "
f"table: {dwd_table}, params={params}"
)
assert window_end in params, (
f"SQL params are missing window_end={window_end}, "
f"table: {dwd_table}, params={params}"
)
# ══════════════════════════════════════════════════════════════
# Property 2: fact-table increments never execute backfill SQL
# Feature: dwd-phase1-refactor, Property 2: fact-table increments never execute backfill SQL
# ══════════════════════════════════════════════════════════════
# Matches the backfill SQL signature: LEFT JOIN ... WHERE ... IS NULL
_BACKFILL_PATTERN = re.compile(
r"LEFT\s+JOIN\b.*?\bWHERE\b.*?\bIS\s+NULL\b",
re.IGNORECASE | re.DOTALL,
)
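# Illustration (hypothetical backfill SQL this pattern is meant to catch):
#   INSERT INTO dwd.t SELECT o.* FROM ods.t o
#   LEFT JOIN dwd.t d ON d.id = o.id WHERE d.id IS NULL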
@given(
table_pair=st.sampled_from(_FACT_TABLES),
window=window_pair(),
)
@settings(max_examples=100)
def test_fact_increment_no_backfill(
table_pair: Tuple[str, str],
window: Tuple[datetime, datetime],
):
"""
Property 2: fact-table increments never execute backfill SQL
For any fact table and any window_start/window_end pair, none of the SQL
statements executed by _merge_fact_increment() may contain the
LEFT JOIN ... WHERE ... IS NULL pattern (the backfill signature).
**Validates: Requirements 2.3**
"""
dwd_table, ods_table = table_pair
window_start, window_end = window
task = _make_task()
cur = PropertyTestCursor()
# Build column info (reusing Property 1's logic)
fake_cols = [row[0] for row in PropertyTestCursor._fake_columns(None)]
mapping_entries = task.FACT_MAPPINGS.get(dwd_table) or []
mapping_dest = [dst for dst, _, _ in mapping_entries]
dwd_cols = list(set(fake_cols + mapping_dest))
ods_cols = list(fake_cols)
fake_types = {row[0]: row[1] for row in PropertyTestCursor._fake_columns(None)}
dwd_types = dict(fake_types)
ods_types = dict(fake_types)
for col in mapping_dest:
if col not in dwd_types:
dwd_types[col] = "text"
result = task._merge_fact_increment(
cur=cur,
dwd_table=dwd_table,
ods_table=ods_table,
dwd_cols=dwd_cols,
ods_cols=ods_cols,
dwd_types=dwd_types,
ods_types=ods_types,
window_start=window_start,
window_end=window_end,
)
# Check all executed SQL (excluding information_schema metadata queries)
all_sqls = [
entry["sql"]
for entry in cur.executed
if "information_schema" not in entry["sql"].lower()
]
for sql in all_sqls:
assert not _BACKFILL_PATTERN.search(sql), (
f"检测到回补 SQLLEFT JOIN ... IS NULL 模式),"
f"表: {dwd_table}\nSQL: {sql[:500]}"
)
# ══════════════════════════════════════════════════════════════
# Property 4: dimension tables always take the SCD2 path
# Feature: dwd-phase1-refactor, Property 4: dimension tables always take the SCD2 path
# ══════════════════════════════════════════════════════════════
# Extract all dimension tables (dim_ prefix) from TABLE_MAP
_DIM_TABLES = [
(dwd, ods)
for dwd, ods in DwdLoadTask.TABLE_MAP.items()
if dwd.split(".")[-1].startswith("dim_")
]
@given(
table_pair=st.sampled_from(_DIM_TABLES),
now=st.datetimes(
min_value=datetime(2020, 1, 1),
max_value=datetime(2030, 12, 31),
timezones=st.just(timezone.utc),
),
)
@settings(max_examples=100)
def test_dim_always_scd2(
table_pair: Tuple[str, str],
now: datetime,
):
"""
Property 4: dimension tables always take the SCD2 path
For any dimension table (dim_ prefix in TABLE_MAP), calling _merge_dim()
always delegates to _merge_dim_scd2(), with no conditional branching.
**Validates: Requirements 3.8, 5.1**
"""
from unittest.mock import patch
dwd_table, ods_table = table_pair
task = _make_task()
cur = PropertyTestCursor()
# Build a minimal column set
fake_cols = [row[0] for row in PropertyTestCursor._fake_columns(None)]
dwd_cols = list(fake_cols)
ods_cols = list(fake_cols)
# Mock _merge_dim_scd2 and have it return a fixed result to verify delegation
expected_return = {"processed": 0, "inserted": 0, "updated": 0, "skipped": 0}
with patch.object(task, "_merge_dim_scd2", return_value=expected_return) as mock_scd2:
result = task._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
# Verify _merge_dim_scd2 was called exactly once
assert mock_scd2.call_count == 1, (
f"expected _merge_dim_scd2 to be called once, got {mock_scd2.call_count} calls, "
f"table: {dwd_table}"
)
# Verify the arguments are passed through correctly
call_args = mock_scd2.call_args
assert call_args[0] == (cur, dwd_table, ods_table, dwd_cols, ods_cols, now), (
f"_merge_dim_scd2 called with mismatched arguments, table: {dwd_table}\n"
f"expected: (cur, {dwd_table}, {ods_table}, {dwd_cols}, {ods_cols}, {now})\n"
f"actual: {call_args[0]}"
)
# Verify the return value is passed through unchanged
assert result is expected_return, (
f"_merge_dim() did not pass through _merge_dim_scd2()'s result directly, "
f"table: {dwd_table}"
)
# ══════════════════════════════════════════════════════════════
# Property 3: structural equivalence of the main fact-increment SQL
# Feature: dwd-phase1-refactor, Property 3: structural equivalence of the main fact-increment SQL
# ══════════════════════════════════════════════════════════════
class _PkAwareCursor(PropertyTestCursor):
"""扩展 PropertyTestCursor对 table_constraints 查询返回主键信息。
_merge_fact_increment 通过 _get_primary_keys 查询 information_schema
获取主键列。基础 PropertyTestCursor 返回空列表,导致无法生成
ON CONFLICT 子句。此子类注入 pk_cols 以完整测试 SQL 结构。
"""
def __init__(self, pk_cols: Sequence[str]):
super().__init__()
self._pk_cols = list(pk_cols)
def execute(self, sql: str, params=None):
sql_text = sql.decode("utf-8", errors="ignore") if isinstance(sql, (bytes, bytearray)) else str(sql)
lowered = sql_text.lower()
# Intercept the primary-key query and return the injected pk_cols
if "from information_schema.table_constraints" in lowered:
self.executed.append({"sql": sql_text.strip(), "params": params})
self._fetchall_rows = [
{"column_name": col} for col in self._pk_cols
]
return
super().execute(sql, params)
# INSERT INTO "dwd"."<table>" (...) (the actual SQL quotes schema and table names)
_INSERT_INTO_PATTERN = re.compile(
r'INSERT\s+INTO\s+"?dwd"?\."?\w+"?\s*\(',
re.IGNORECASE,
)
# SELECT ... FROM "ods"."<table>"
_SELECT_FROM_ODS_PATTERN = re.compile(
r'SELECT\s+.+?\s+FROM\s+"?ods"?\."?\w+"?',
re.IGNORECASE | re.DOTALL,
)
# ON CONFLICT (...) DO UPDATE SET or DO NOTHING
_ON_CONFLICT_PATTERN = re.compile(
r"ON\s+CONFLICT\s*\([^)]+\)\s+DO\s+(UPDATE\s+SET|NOTHING)",
re.IGNORECASE,
)
# IS DISTINCT FROM (change detection)
_IS_DISTINCT_FROM_PATTERN = re.compile(
r"IS\s+DISTINCT\s+FROM",
re.IGNORECASE,
)
# RETURNING (xmax = 0) AS inserted
_RETURNING_PATTERN = re.compile(
r"RETURNING\s+\(xmax\s*=\s*0\)\s+AS\s+inserted",
re.IGNORECASE,
)
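# Illustration: a minimal, hypothetical upsert skeleton that satisfies all five
# structural patterns above (not the actual generated SQL):
#   INSERT INTO "dwd"."dwd_example" ("id", "payload")
#   SELECT "id", "payload" FROM "ods"."ods_example"
#   WHERE "fetched_at" >= %s AND "fetched_at" < %s
#   ON CONFLICT ("id") DO UPDATE SET "payload" = EXCLUDED."payload"
#   WHERE dwd_example."payload" IS DISTINCT FROM EXCLUDED."payload"
#   RETURNING (xmax = 0) AS inserted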
@given(
table_pair=st.sampled_from(_FACT_TABLES),
window=window_pair(),
fact_upsert=st.booleans(),
)
@settings(max_examples=100)
def test_fact_increment_sql_structure(
table_pair: Tuple[str, str],
window: Tuple[datetime, datetime],
fact_upsert: bool,
):
"""
Property 3: structural equivalence of the main fact-increment SQL
For any fact table, window parameters, and fact_upsert setting, the main
incremental SQL generated by _merge_fact_increment() must contain:
1. an INSERT INTO <dwd_table> structure
2. a SELECT ... FROM <ods_table> structure
3. an ON CONFLICT (<pk_cols>) DO UPDATE SET or DO NOTHING structure
4. IS DISTINCT FROM change detection (when DO UPDATE SET is present)
5. a RETURNING (xmax = 0) AS inserted structure
**Validates: Requirements 5.2, 5.5**
"""
dwd_table, ods_table = table_pair
window_start, window_end = window
task = _make_task(fact_upsert=fact_upsert)
# Build column info
fake_cols = [row[0] for row in PropertyTestCursor._fake_columns(None)]
mapping_entries = task.FACT_MAPPINGS.get(dwd_table) or []
mapping_dest = [dst for dst, _, _ in mapping_entries]
dwd_cols = list(set(fake_cols + mapping_dest))
ods_cols = list(fake_cols)
fake_types = {row[0]: row[1] for row in PropertyTestCursor._fake_columns(None)}
dwd_types = dict(fake_types)
ods_types = dict(fake_types)
for col in mapping_dest:
if col not in dwd_types:
dwd_types[col] = "text"
# Use the pk-injecting cursor so the ON CONFLICT clause is generated
# "id" is chosen as the primary key (it exists among the _fake_columns columns)
cur = _PkAwareCursor(pk_cols=["id"])
result = task._merge_fact_increment(
cur=cur,
dwd_table=dwd_table,
ods_table=ods_table,
dwd_cols=dwd_cols,
ods_cols=ods_cols,
dwd_types=dwd_types,
ods_types=ods_types,
window_start=window_start,
window_end=window_end,
)
# Extract the main INSERT SQL
insert_sqls = [
entry["sql"]
for entry in cur.executed
if entry["sql"].upper().startswith("INSERT INTO")
]
assert len(insert_sqls) == 1, (
f"期望恰好 1 条 INSERT SQL实际 {len(insert_sqls)} 条,"
f"表: {dwd_table}"
)
sql = insert_sqls[0]
# 1. Verify the INSERT INTO dwd.<table> structure
assert _INSERT_INTO_PATTERN.search(sql), (
f"SQL is missing the INSERT INTO dwd.<table> structure, "
f"table: {dwd_table}\nSQL: {sql[:500]}"
)
# 2. Verify the SELECT ... FROM ods.<table> structure
assert _SELECT_FROM_ODS_PATTERN.search(sql), (
f"SQL is missing the SELECT ... FROM ods.<table> structure, "
f"table: {dwd_table}\nSQL: {sql[:500]}"
)
# 3. Verify the ON CONFLICT structure
assert _ON_CONFLICT_PATTERN.search(sql), (
f"SQL is missing the ON CONFLICT (...) DO UPDATE SET / DO NOTHING structure, "
f"table: {dwd_table}\nSQL: {sql[:500]}"
)
# 4. Verify the IS DISTINCT FROM change-detection logic
# When DO UPDATE SET is present, IS DISTINCT FROM change detection is required
has_do_update = re.search(
r"ON\s+CONFLICT\s*\([^)]+\)\s+DO\s+UPDATE\s+SET",
sql, re.IGNORECASE,
)
if has_do_update:
assert _IS_DISTINCT_FROM_PATTERN.search(sql), (
f"SQL has DO UPDATE SET but is missing IS DISTINCT FROM change detection, "
f"table: {dwd_table}\nSQL: {sql[:500]}"
)
# 5. Verify the RETURNING (xmax = 0) AS inserted structure
assert _RETURNING_PATTERN.search(sql), (
f"SQL is missing the RETURNING (xmax = 0) AS inserted structure, "
f"table: {dwd_table}\nSQL: {sql[:500]}"
)


@@ -0,0 +1,325 @@
# -*- coding: utf-8 -*-
"""Monorepo 级属性测试 — ETL DWS/Flow 重构
集中存放跨模块的属性测试,验证重构后的正确性属性。
"""
import sys
import string
from pathlib import Path
import pytest
from hypothesis import given, settings, assume
from hypothesis import strategies as st
# Add the ETL feiqiu module to sys.path so it can be imported directly
_ETL_ROOT = Path(__file__).resolve().parent.parent / "apps" / "etl" / "connectors" / "feiqiu"
if str(_ETL_ROOT) not in sys.path:
sys.path.insert(0, str(_ETL_ROOT))
from cli.main import parse_layers, VALID_LAYERS # noqa: E402
# ---------------------------------------------------------------------------
# Property 5: --layers parsing correctness
# Feature: etl-dws-flow-refactor, Property 5: --layers parsing correctness
# Validates: Requirements 6.1, 6.2
# ---------------------------------------------------------------------------
_ALL_LAYERS = sorted(VALID_LAYERS)
_layer_subsets = st.lists(
st.sampled_from(_ALL_LAYERS),
min_size=1,
max_size=4,
unique=True,
)
_invalid_layer = st.text(
alphabet=string.ascii_letters,
min_size=1,
max_size=10,
).filter(lambda s: s.upper() not in VALID_LAYERS)
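# Contract exercised below (inferred from the properties, not from CLI docs):
#   parse_layers("ods,dwd")   -> ["ODS", "DWD"]
#   parse_layers("ods,bogus") -> raises ValueError (invalid layer name)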
class TestParseLayersProperty:
"""Property 5: --layers 解析正确性
# Feature: etl-dws-flow-refactor, Property 5: --layers 解析正确性
"""
@given(subset=_layer_subsets)
@settings(max_examples=100)
def test_valid_subset_parsed_correctly(self, subset: list[str]):
"""任意合法层子集经逗号拼接后parse_layers 返回恰好该子集(大写)。
**Validates: Requirements 6.1, 6.2**
"""
raw = ",".join(subset)
result = parse_layers(raw)
assert set(result) == set(subset)
assert all(l == l.upper() for l in result)
assert len(result) == len(subset)
@given(
valid=_layer_subsets,
bad=_invalid_layer,
)
@settings(max_examples=100)
def test_invalid_layer_raises(self, valid: list[str], bad: str):
"""包含无效层名时必须抛出 ValueError。
**Validates: Requirements 6.1, 6.2**
"""
assume(bad.upper() not in VALID_LAYERS)
raw = ",".join(valid + [bad])
with pytest.raises(ValueError, match="无效的层名"):
parse_layers(raw)
from orchestration.flow_runner import FlowRunner # noqa: E402
# ---------------------------------------------------------------------------
# Property 6: config precedence (config values take priority over the Registry)
# Feature: etl-dws-flow-refactor, Property 6: config precedence
# Validates: Requirements 7.2
# ---------------------------------------------------------------------------
_LAYER_CONFIG_KEY = {
"ODS": "run.ods_tasks",
"DWD": "run.dwd_tasks",
"DWS": "run.dws_tasks",
"INDEX": "run.index_tasks",
}
class _DictConfig:
"""模拟 AppConfig.get(),支持点号路径查找。"""
def __init__(self, data: dict):
self._data = data
def get(self, key: str, default=None):
keys = key.split(".")
node = self._data
for k in keys:
if isinstance(node, dict) and k in node:
node = node[k]
else:
return default
return node
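# Example: _DictConfig({"run": {"dwd_tasks": ["T1"]}}).get("run.dwd_tasks")
# returns ["T1"]; a missing path such as .get("run.missing", []) returns [].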
def _make_runner(config_data: dict, registry_by_layer: dict) -> FlowRunner:
"""构造最小 FlowRunner 实例,仅满足 _resolve_tasks 所需。"""
import logging
from unittest.mock import MagicMock
config = _DictConfig(config_data)
registry = MagicMock()
registry.get_tasks_by_layer = MagicMock(
side_effect=lambda layer: registry_by_layer.get(layer.upper(), [])
)
runner = object.__new__(FlowRunner)
runner.config = config
runner.task_registry = registry
runner.logger = logging.getLogger("test_prop6")
return runner
_task_code_p6 = st.from_regex(r"[A-Z][A-Z0-9_]{2,20}", fullmatch=True)
_task_list_p6 = st.lists(_task_code_p6, min_size=1, max_size=8, unique=True)
_layer_p6 = st.sampled_from(["ODS", "DWD", "DWS", "INDEX"])
class TestConfigPriorityProperty:
"""Property 6: 配置优先级——配置值优先于 Registry
# Feature: etl-dws-flow-refactor, Property 6: 配置优先级
"""
@given(layer=_layer_p6, config_tasks=_task_list_p6, registry_tasks=_task_list_p6)
@settings(max_examples=100)
def test_config_overrides_registry(
self, layer: str, config_tasks: list[str], registry_tasks: list[str]
):
"""任意层名 + 非空配置任务列表 → 返回配置值,忽略 Registry。
**Validates: Requirements 7.2**
"""
assume(set(config_tasks) != set(registry_tasks))
key = _LAYER_CONFIG_KEY[layer]
parts = key.split(".")
config_data = {parts[0]: {parts[1]: config_tasks}}
runner = _make_runner(config_data, {layer: registry_tasks})
result = runner._resolve_tasks([layer])
assert result == config_tasks
@given(layer=_layer_p6, registry_tasks=_task_list_p6)
@settings(max_examples=100)
def test_empty_config_falls_back_to_registry(
self, layer: str, registry_tasks: list[str]
):
"""配置为空时回退到 Registry 结果。
**Validates: Requirements 7.2**
"""
runner = _make_runner({}, {layer: registry_tasks})
result = runner._resolve_tasks([layer])
assert result == registry_tasks
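# A minimal sketch of the behavior these two properties pin down (an assumption
# about FlowRunner._resolve_tasks, not its actual source):
#
#   def _resolve_tasks(self, layers):
#       tasks = []
#       for layer in layers:
#           configured = self.config.get(_LAYER_CONFIG_KEY[layer.upper()]) or []
#           tasks.extend(configured or self.task_registry.get_tasks_by_layer(layer))
#       return tasks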
from orchestration.topological_sort import topological_sort # noqa: E402
# ---------------------------------------------------------------------------
# Helpers for Properties 7 & 8: a lightweight TaskMeta stand-in + a simple Registry
# ---------------------------------------------------------------------------
from dataclasses import dataclass, field as dc_field
@dataclass
class _FakeMeta:
depends_on: list[str] = dc_field(default_factory=list)
class _FakeRegistry:
"""最小 Registry 替身,仅提供 get_metadata()。"""
def __init__(self, deps: dict[str, list[str]]):
self._deps = deps
def get_metadata(self, code: str):
if code in self._deps:
return _FakeMeta(depends_on=self._deps[code])
return _FakeMeta()
# ---------------------------------------------------------------------------
# Hypothesis strategies: DAG generation & cyclic-graph generation
# ---------------------------------------------------------------------------
@st.composite
def dag_strategy(draw):
"""生成随机 DAG(task_codes, deps_dict)。
通过仅允许依赖索引更小的任务来保证无环。
"""
n = draw(st.integers(min_value=1, max_value=10))
codes = [f"TASK_{i}" for i in range(n)]
deps: dict[str, list[str]] = {}
for i, code in enumerate(codes):
if i > 0:
possible = codes[:i]
chosen = draw(
st.lists(
st.sampled_from(possible),
max_size=min(3, i),
unique=True,
)
)
deps[code] = chosen
else:
deps[code] = []
return codes, deps
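# Example draw (illustrative): (["TASK_0", "TASK_1", "TASK_2"],
#   {"TASK_0": [], "TASK_1": ["TASK_0"], "TASK_2": ["TASK_0", "TASK_1"]})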
@st.composite
def cyclic_graph_strategy(draw):
"""生成一定包含至少一个环的有向图:(task_codes, deps_dict, cycle_tasks)。"""
n = draw(st.integers(min_value=2, max_value=8))
codes = [f"TASK_{i}" for i in range(n)]
deps: dict[str, list[str]] = {code: [] for code in codes}
# Pick 2-4 tasks to form a cycle
cycle_size = draw(st.integers(min_value=2, max_value=min(4, n)))
cycle_tasks = draw(
st.lists(
st.sampled_from(codes),
min_size=cycle_size,
max_size=cycle_size,
unique=True,
)
)
for i in range(len(cycle_tasks)):
next_task = cycle_tasks[(i + 1) % len(cycle_tasks)]
if next_task not in deps[cycle_tasks[i]]:
deps[cycle_tasks[i]].append(next_task)
return codes, deps, cycle_tasks
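# Example draw (illustrative): a two-task cycle TASK_1 -> TASK_3 -> TASK_1,
#   deps = {"TASK_1": ["TASK_3"], "TASK_3": ["TASK_1"], ...}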
# ---------------------------------------------------------------------------
# Property 7: topological sort correctness
# Feature: etl-dws-flow-refactor, Property 7: topological sort correctness
# Validates: Requirements 8.3
# ---------------------------------------------------------------------------
class TestTopologicalSortProperty:
"""Property 7: 拓扑排序正确性
# Feature: etl-dws-flow-refactor, Property 7: 拓扑排序正确性
"""
@given(data=dag_strategy())
@settings(max_examples=100)
def test_dependencies_precede_dependents(self, data):
"""对于任意 DAG排序结果中每个任务的依赖都排在它之前。
**Validates: Requirements 8.3**
"""
codes, deps = data
registry = _FakeRegistry(deps)
result = topological_sort(codes, registry)
# The result contains exactly the input tasks (none missing, no duplicates)
assert set(result) == set(codes)
assert len(result) == len(codes)
# Each task's dependencies (those in the list) appear before it
pos = {code: idx for idx, code in enumerate(result)}
for code in codes:
for dep in deps.get(code, []):
if dep in pos:
assert pos[dep] < pos[code], (
f"依赖 {dep} 应排在 {code} 之前,"
f"但位置为 {pos[dep]} vs {pos[code]}"
)
# ---------------------------------------------------------------------------
# Property 8: cyclic dependency detection
# Feature: etl-dws-flow-refactor, Property 8: cyclic dependency detection
# Validates: Requirements 8.4
# ---------------------------------------------------------------------------
class TestCyclicDependencyProperty:
"""Property 8: 循环依赖检测
# Feature: etl-dws-flow-refactor, Property 8: 循环依赖检测
"""
@given(data=cyclic_graph_strategy())
@settings(max_examples=100)
def test_cycle_raises_value_error(self, data):
"""对于任意含环有向图topological_sort 应抛出 ValueError。
**Validates: Requirements 8.4**
"""
codes, deps, cycle_tasks = data
registry = _FakeRegistry(deps)
with pytest.raises(ValueError, match="循环依赖"):
topological_sort(codes, registry)
@given(data=cyclic_graph_strategy())
@settings(max_examples=100)
def test_cycle_error_mentions_involved_tasks(self, data):
"""错误信息中应包含环中涉及的至少一个任务代码。
**Validates: Requirements 8.4**
"""
codes, deps, cycle_tasks = data
registry = _FakeRegistry(deps)
with pytest.raises(ValueError) as exc_info:
topological_sort(codes, registry)
msg = str(exc_info.value)
# At least one cycle task appears in the error message
assert any(t in msg for t in cycle_tasks), (
f"error message '{msg}' does not include any of the cycle tasks {cycle_tasks}"
)
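# ---------------------------------------------------------------------------
# Reference sketch (an assumption, for illustration only): a Kahn-style sort
# consistent with Properties 7 & 8, emitting dependencies first and raising a
# ValueError that mentions "循环依赖" plus the offending task codes on a cycle.
# The real implementation lives in orchestration.topological_sort.
# ---------------------------------------------------------------------------
def _reference_topological_sort(codes, registry):
    from collections import deque
    code_set = set(codes)
    # Only dependencies inside the input list count, mirroring the tested contract
    deps = {c: [d for d in registry.get_metadata(c).depends_on if d in code_set] for c in codes}
    dependents = {c: [] for c in codes}
    for c in codes:
        for d in deps[c]:
            dependents[d].append(c)
    indegree = {c: len(deps[c]) for c in codes}
    queue = deque(c for c in codes if indegree[c] == 0)
    order = []
    while queue:
        cur = queue.popleft()
        order.append(cur)
        for nxt in dependents[cur]:
            indegree[nxt] -= 1
            if indegree[nxt] == 0:
                queue.append(nxt)
    if len(order) != len(codes):
        # Tasks never reaching indegree 0 are on (or downstream of) a cycle
        remaining = [c for c in codes if c not in order]
        raise ValueError(f"循环依赖: {', '.join(remaining)}")
    return order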


@@ -1,12 +1,25 @@
# -*- coding: utf-8 -*-
"""
Property 5: file migration integrity
Property 5: file migration integrity (archived)
For any source-target directory mapping (ETL business code, database files, tests directory),
Original purpose: verify that, when migrating from C:\\ZQYY\\FQ-ETL\\ to the NeoZQYY monorepo,
every file in the source directory exists at the corresponding target location with identical content.
Reason for archiving: the migration was completed in 2025; subsequent refactors (dwd-phase1-refactor,
etl-dws-flow-refactor, ods-dedup-standardize, etc.) made extensive structural changes to the target
code, so the 1:1 source-target comparison premise no longer holds. A scan shows 50+ files have
legitimately diverged. To re-enable, remove the module-level skip marker.
**Validates: Requirements 5.1, 5.2, 5.3**
"""
import pytest
# The migration is complete and the target code has legitimately diverged through several refactors; skip this entire test module
pytestmark = pytest.mark.skip(
reason="File migration is complete; later refactors caused legitimate source-target divergence (50+ files). This test's mission is over."
)
import hashlib
import os
from typing import List, Tuple
@@ -16,24 +29,20 @@ from hypothesis.strategies import sampled_from
# Source-target directory mappings (Req 5.1: ETL business code; 5.2: database; 5.3: tests)
MIGRATION_MAPPINGS: List[Tuple[str, str]] = [
# ETL business code directories (Req 5.1)
(r"C:\ZQYY\FQ-ETL\api", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\api"),
(r"C:\ZQYY\FQ-ETL\cli", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\cli"),
(r"C:\ZQYY\FQ-ETL\config", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\config"),
(r"C:\ZQYY\FQ-ETL\loaders", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\loaders"),
(r"C:\ZQYY\FQ-ETL\models", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\models"),
(r"C:\ZQYY\FQ-ETL\orchestration", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\orchestration"),
(r"C:\ZQYY\FQ-ETL\scd", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\scd"),
(r"C:\ZQYY\FQ-ETL\tasks", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\tasks"),
(r"C:\ZQYY\FQ-ETL\utils", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\utils"),
(r"C:\ZQYY\FQ-ETL\quality", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\quality"),
# tests subdirectories (Req 5.3): only the ETL's own unit/integration are mapped
# Monorepo-level property tests (test_property_*.py) live in C:\NeoZQYY\tests\ by design
(r"C:\ZQYY\FQ-ETL\tests\unit", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\tests\unit"),
(r"C:\ZQYY\FQ-ETL\tests\integration", r"C:\NeoZQYY\apps\etl\pipelines\feiqiu\tests\integration"),
(r"C:\ZQYY\FQ-ETL\api", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\api"),
(r"C:\ZQYY\FQ-ETL\cli", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\cli"),
(r"C:\ZQYY\FQ-ETL\config", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\config"),
(r"C:\ZQYY\FQ-ETL\loaders", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\loaders"),
(r"C:\ZQYY\FQ-ETL\models", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\models"),
(r"C:\ZQYY\FQ-ETL\orchestration", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\orchestration"),
(r"C:\ZQYY\FQ-ETL\scd", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\scd"),
(r"C:\ZQYY\FQ-ETL\tasks", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\tasks"),
(r"C:\ZQYY\FQ-ETL\utils", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\utils"),
(r"C:\ZQYY\FQ-ETL\quality", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\quality"),
(r"C:\ZQYY\FQ-ETL\tests\unit", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\tests\unit"),
(r"C:\ZQYY\FQ-ETL\tests\integration", r"C:\NeoZQYY\apps\etl\connectors\feiqiu\tests\integration"),
]
# Exclusion patterns: __pycache__ and friends are excluded from comparison
EXCLUDE_DIRS = {"__pycache__", ".pytest_cache", ".hypothesis"}
@@ -67,7 +76,6 @@ def test_all_source_files_exist_in_target(mapping: Tuple[str, str]) -> None:
**Validates: Requirements 5.1, 5.2, 5.3**
"""
src_dir, dst_dir = mapping
assert os.path.isdir(src_dir), f"source directory does not exist: {src_dir}"
assert os.path.isdir(dst_dir), f"target directory does not exist: {dst_dir}"
@@ -96,7 +104,6 @@ def test_source_and_target_file_content_identical(mapping: Tuple[str, str]) -> N
**Validates: Requirements 5.1, 5.2, 5.3**
"""
src_dir, dst_dir = mapping
assert os.path.isdir(src_dir), f"source directory does not exist: {src_dir}"
assert os.path.isdir(dst_dir), f"target directory does not exist: {dst_dir}"
@@ -106,17 +113,13 @@ def test_source_and_target_file_content_identical(mapping: Tuple[str, str]) -> N
for rel_path in src_files:
src_path = os.path.join(src_dir, rel_path)
dst_path = os.path.join(dst_dir, rel_path)
if not os.path.isfile(dst_path):
continue
src_hash = _file_hash(src_path)
dst_hash = _file_hash(dst_path)
if src_hash != dst_hash:
if _file_hash(src_path) != _file_hash(dst_path):
mismatched.append(rel_path)
assert not mismatched, (
f"源目录 {src_dir} 与目标目录 {dst_dir}{len(mismatched)} 个文件内容不一致:\n"
+ "\n".join(f" - {f}" for f in mismatched[:10])
+ (f"\n ... 及其他 {len(mismatched) - 10}" if len(mismatched) > 10 else "")
)
)


@@ -15,10 +15,9 @@ from hypothesis.strategies import sampled_from
# Python subproject members declared by the uv workspace
WORKSPACE_MEMBERS = [
"apps/etl/pipelines/feiqiu",
"apps/etl/connectors/feiqiu",
"apps/backend",
"packages/shared",
"gui",
]
MONOREPO_ROOT = r"C:\NeoZQYY"


@@ -17,9 +17,9 @@ from hypothesis.strategies import sampled_from
MONOREPO_ROOT = r"C:\NeoZQYY"
# Top-level directory list (defined by Req 1.5)
# Only includes top-level directories that actually exist
TOP_LEVEL_DIRS = [
"apps",
"gui",
"packages",
"db",
"docs",