# -*- coding: utf-8 -*-
"""
Dataflow structure analysis — property-based tests.

Feature: dataflow-structure-audit
"""
from __future__ import annotations

import json
import sys
from pathlib import Path

# scripts/ops is not a Python package; make it importable via sys.path.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "scripts" / "ops"))

from dataflow_analyzer import (  # noqa: E402
    AnalyzerConfig,
    ColumnInfo,
    FieldInfo,
    TableCollectionResult,
    collect_all_tables,
    dump_collection_results,
    flatten_json_tree,
)

from hypothesis import given, settings, assume
import hypothesis.strategies as st


# ---------------------------------------------------------------------------
# Strategies: generate arbitrary nested JSON (dict / list / scalar mixes)
# ---------------------------------------------------------------------------
_scalar = st.one_of(
    st.none(),
    st.booleans(),
    st.integers(min_value=-10_000, max_value=10_000),
    st.floats(allow_nan=False, allow_infinity=False),
    st.text(min_size=0, max_size=20),
)

# Restrict keys to non-empty ASCII identifiers (avoids '.' / '[]' and other
# characters that would interfere with path parsing).
_json_key = st.from_regex(r"[A-Za-z][A-Za-z0-9_]{0,9}", fullmatch=True)

def _json_value(max_depth: int = 3):
    """Recursive strategy: nested JSON values with bounded depth.

    Depth is capped to keep generated examples from exploding in size.
    """
    if max_depth <= 0:
        return _scalar
    return st.one_of(
        _scalar,
        st.dictionaries(_json_key, st.deferred(lambda: _json_value(max_depth - 1)),
                        min_size=0, max_size=4),
        st.lists(st.deferred(lambda: _json_value(max_depth - 1)),
                 min_size=0, max_size=3),
    )

# The top level must be a dict (matches flatten_json_tree's input contract).
_json_record = st.dictionaries(_json_key, _json_value(3), min_size=0, max_size=5)

# ---------------------------------------------------------------------------
|
||
# 参考实现:收集所有叶子节点路径(用于对比验证)
|
||
# ---------------------------------------------------------------------------
|
||
def _collect_leaves(obj, prefix: str = "") -> set[str]:
|
||
"""
|
||
独立于被测代码的参考叶子收集器。
|
||
返回所有标量叶子节点的路径集合。
|
||
"""
|
||
if isinstance(obj, dict):
|
||
paths: set[str] = set()
|
||
for key, val in obj.items():
|
||
child = f"{prefix}.{key}" if prefix else key
|
||
paths |= _collect_leaves(val, child)
|
||
return paths
|
||
elif isinstance(obj, list):
|
||
arr = f"{prefix}[]" if prefix else "[]"
|
||
paths = set()
|
||
for item in obj:
|
||
paths |= _collect_leaves(item, arr)
|
||
return paths
|
||
else:
|
||
# 标量叶子 — 空前缀的裸标量不会被记录(与实现一致)
|
||
return {prefix} if prefix else set()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# Property 1: correctness of recursive JSON flattening paths
# Validates: Requirements 1.1, 1.3, 1.4
# ---------------------------------------------------------------------------
@given(record=_json_record)
@settings(max_examples=200)
def test_flatten_json_tree_path_correctness(record: dict):
    """
    Feature: dataflow-structure-audit, Property 1: JSON path flattening.

    For an arbitrary nested JSON object, flatten_json_tree's output must:
    1. Separate levels with '.'
    2. Mark array levels with '[]'
    3. Have depth == number of '.' in the path after stripping '[]'
    4. Miss no leaf node

    **Validates: Requirements 1.1, 1.3, 1.4**
    """
    result = flatten_json_tree([record])

    # --- Reference leaf set ---
    expected_leaves = _collect_leaves(record)

    # --- Property 4: no leaf node is missed ---
    actual_paths = set(result.keys())
    assert actual_paths == expected_leaves, (
        f"叶子节点不一致。\n"
        f"  遗漏: {expected_leaves - actual_paths}\n"
        f"  多余: {actual_paths - expected_leaves}"
    )

    for path, fi in result.items():
        # --- Property 1: levels separated with '.' ---
        # No consecutive '.', and paths never start or end with '.'
        assert not path.startswith("."), f"路径不应以 '.' 开头: {path}"
        stripped = path.replace("[]", "")
        assert not stripped.endswith("."), f"路径不应以 '.' 结尾: {path}"
        assert ".." not in stripped, f"路径不应含连续 '..': {path}"

        # --- Property 2: array marker placement ---
        # '[]' is only legal right after a key or another '[]';
        # valid shapes: key[], key[][].child (nested arrays), key[].child
        if "[]" in path:
            # A path must not start with '[]' (a key must precede the first '[]')
            assert not path.startswith("[]"), (
                f"路径不应以 '[]' 开头: {path}"
            )

        # --- Property 3: depth consistency ---
        expected_depth = stripped.count(".")
        assert fi.depth == expected_depth, (
            f"depth 不一致: path={path}, "
            f"expected={expected_depth}, actual={fi.depth}"
        )

        # --- Basic field integrity ---
        assert fi.path == path
        assert fi.occurrence >= 1
        assert fi.total_records == 1  # single record

# ---------------------------------------------------------------------------
# Property 2: multi-record field-merge completeness and occurrence accuracy
# Validates: Requirements 1.2, 1.5
# ---------------------------------------------------------------------------
@given(records=st.lists(_json_record, min_size=1, max_size=5))
@settings(max_examples=200)
def test_flatten_json_tree_field_merge_completeness(records: list[dict]):
    """
    Feature: dataflow-structure-audit, Property 2: field merge across records.

    For any list of JSON records, flatten_json_tree's output must satisfy:
    1. merged field-path set == union of every single record's field paths
    2. each field's occurrence == number of records actually containing it
    3. each field's occurrence <= total_records
    4. total_records == len(records)

    **Validates: Requirements 1.2, 1.5**
    """
    result = flatten_json_tree(records)

    # --- Reference computation: leaf-path set per record ---
    per_record_leaves: list[set[str]] = [
        _collect_leaves(rec) for rec in records
    ]
    expected_union = set().union(*per_record_leaves) if per_record_leaves else set()

    # --- Property 1: union completeness ---
    actual_paths = set(result.keys())
    assert actual_paths == expected_union, (
        f"字段路径并集不一致。\n"
        f"  遗漏: {expected_union - actual_paths}\n"
        f"  多余: {actual_paths - expected_union}"
    )

    # --- Property 2: occurrence accuracy ---
    for path, fi in result.items():
        expected_occ = sum(1 for leaves in per_record_leaves if path in leaves)
        assert fi.occurrence == expected_occ, (
            f"occurrence 不一致: path={path}, "
            f"expected={expected_occ}, actual={fi.occurrence}"
        )

        # --- Property 3: occurrence <= total_records ---
        assert fi.occurrence <= fi.total_records, (
            f"occurrence 超过 total_records: path={path}, "
            f"occurrence={fi.occurrence}, total_records={fi.total_records}"
        )

    # --- Property 4: total_records consistency ---
    for fi in result.values():
        assert fi.total_records == len(records), (
            f"total_records 不一致: expected={len(records)}, "
            f"actual={fi.total_records}"
        )

# ---------------------------------------------------------------------------
# Unit tests: flatten_json_tree edge cases
# Requirements: 1.1, 1.2, 1.3, 1.4
# ---------------------------------------------------------------------------
from collections import OrderedDict


def test_empty_records():
    """An empty record list yields an empty OrderedDict.

    Requirements: 1.2 (boundary)
    """
    result = flatten_json_tree([])
    assert isinstance(result, OrderedDict)
    assert len(result) == 0

def test_flat_json():
    """Single-level JSON (no nesting): every field has depth=0.

    Requirements: 1.1
    """
    record = {"name": "Alice", "age": 30, "active": True}
    result = flatten_json_tree([record])

    assert set(result.keys()) == {"name", "age", "active"}
    for path, fi in result.items():
        assert fi.depth == 0, f"顶层字段 {path} 的 depth 应为 0,实际为 {fi.depth}"
        assert "." not in fi.path, f"顶层字段路径不应含 '.': {fi.path}"
        assert fi.occurrence == 1
        assert fi.total_records == 1

    assert result["name"].json_type == "string"
    assert result["name"].sample == "Alice"
    assert result["age"].json_type == "integer"
    assert result["age"].sample == "30"
    assert result["active"].json_type == "boolean"

def test_deep_nesting():
    """Deep nesting (5+ levels): path and depth must be correct.

    Requirements: 1.1, 1.4
    """
    # Build 6 levels of nesting: a.b.c.d.e.f = "leaf"
    record = {
        "a": {
            "b": {
                "c": {
                    "d": {
                        "e": {
                            "f": "leaf"
                        }
                    }
                }
            }
        }
    }
    result = flatten_json_tree([record])

    assert "a.b.c.d.e.f" in result
    fi = result["a.b.c.d.e.f"]
    assert fi.depth == 5, f"6 层嵌套叶子 depth 应为 5,实际为 {fi.depth}"
    assert fi.json_type == "string"
    assert fi.sample == "leaf"
    assert fi.occurrence == 1
    assert fi.total_records == 1
    # Intermediate dict nodes must not appear in the result (leaves only).
    assert len(result) == 1

def test_array_with_nested_objects():
    """Objects nested inside arrays: verify '[]' markers and path format.

    Requirements: 1.3
    """
    record = {
        "items": [
            {"name": "apple", "price": 3.5},
            {"name": "banana", "tags": ["fruit", "yellow"]},
        ]
    }
    result = flatten_json_tree([record])

    # items[].name and items[].price must both exist
    assert "items[].name" in result
    assert "items[].price" in result

    # items[].name: depth = "items.name".count(".") = 1
    fi_name = result["items[].name"]
    assert fi_name.depth == 1
    assert fi_name.json_type == "string"

    fi_price = result["items[].price"]
    assert fi_price.depth == 1
    assert fi_price.json_type == "number"

    # Scalars inside a nested array: items[].tags[] is a leaf path
    assert "items[].tags[]" in result
    fi_tags = result["items[].tags[]"]
    assert fi_tags.depth == 1  # "items.tags".count(".") = 1
    assert fi_tags.json_type == "string"

# ---------------------------------------------------------------------------
|
||
# 单元测试:collect_all_tables 编排逻辑
|
||
# Requirements: 2.1, 2.2, 2.3, 2.4
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
# 测试用 specs(精简版,只保留 collect_all_tables 需要的字段)
|
||
_TEST_SPECS = [
|
||
{
|
||
"code": "ODS_TABLE_A",
|
||
"table": "table_a",
|
||
"endpoint": "/api/table_a",
|
||
"description": "测试表 A",
|
||
},
|
||
{
|
||
"code": "ODS_TABLE_B",
|
||
"table": "table_b",
|
||
"endpoint": "/api/table_b",
|
||
"description": "测试表 B",
|
||
},
|
||
{
|
||
"code": "ODS_TABLE_C",
|
||
"table": "table_c",
|
||
"endpoint": "/api/table_c",
|
||
"description": "测试表 C",
|
||
},
|
||
]
|
||
|
||
|
||
def _fake_fetch(spec: dict, limit: int) -> list[dict]:
|
||
"""返回简单的假数据,用于验证编排逻辑。"""
|
||
return [{"id": 1, "name": "test", "value": 42}]
|
||
|
||
|
||
def test_collect_all_tables_happy_path():
    """All tables collected successfully, with no database connection.

    Requirements: 2.1, 2.2
    """
    config = AnalyzerConfig(limit=10)
    results = collect_all_tables(config, _TEST_SPECS, fetch_fn=_fake_fetch)

    assert len(results) == 3
    for r in results:
        assert isinstance(r, TableCollectionResult)
        assert r.error is None
        assert r.record_count == 1
        assert len(r.json_fields) > 0
        # Without a DB connection the ODS/DWD column lists are empty.
        assert r.ods_columns == []
        assert r.dwd_columns == []

def test_collect_all_tables_filter_by_name():
    """config.tables filter collects only the named tables.

    Requirements: 2.2
    """
    config = AnalyzerConfig(tables=["table_b"])
    results = collect_all_tables(config, _TEST_SPECS, fetch_fn=_fake_fetch)

    assert len(results) == 1
    assert results[0].table_name == "table_b"
    assert results[0].task_code == "ODS_TABLE_B"

def test_collect_all_tables_filter_case_insensitive():
    """Table-name filtering is case-insensitive.

    Requirements: 2.2
    """
    config = AnalyzerConfig(tables=["TABLE_A", "Table_C"])
    results = collect_all_tables(config, _TEST_SPECS, fetch_fn=_fake_fetch)

    assert len(results) == 2
    names = {r.table_name for r in results}
    assert names == {"table_a", "table_c"}

def test_collect_all_tables_filter_no_match():
    """A filter with no matches returns an empty list.

    Requirements: 2.2
    """
    config = AnalyzerConfig(tables=["nonexistent"])
    results = collect_all_tables(config, _TEST_SPECS, fetch_fn=_fake_fetch)

    assert results == []

def test_collect_all_tables_single_table_error_isolation():
    """A single table's fetch failure does not abort the remaining tables.

    Requirements: 2.4
    """
    call_count = 0

    def _failing_fetch(spec: dict, limit: int) -> list[dict]:
        # Fail only for table_b; count every call to prove no early exit.
        nonlocal call_count
        call_count += 1
        if spec["table"] == "table_b":
            raise RuntimeError("模拟 API 超时")
        return [{"id": 1}]

    config = AnalyzerConfig(limit=5)
    results = collect_all_tables(config, _TEST_SPECS, fetch_fn=_failing_fetch)

    assert len(results) == 3
    assert call_count == 3

    # table_a succeeds
    assert results[0].table_name == "table_a"
    assert results[0].error is None
    assert results[0].record_count == 1

    # table_b fails but still has a result entry
    assert results[1].table_name == "table_b"
    assert results[1].error is not None
    assert "模拟 API 超时" in results[1].error
    assert results[1].record_count == 0

    # table_c succeeds (unaffected by table_b's failure)
    assert results[2].table_name == "table_c"
    assert results[2].error is None
    assert results[2].record_count == 1

def test_collect_all_tables_empty_specs():
    """An empty specs list returns an empty result.

    Requirements: 2.2
    """
    config = AnalyzerConfig()
    results = collect_all_tables(config, [], fetch_fn=_fake_fetch)

    assert results == []

def test_collect_all_tables_preserves_metadata():
    """Collection results retain the metadata fields from the spec.

    Requirements: 2.2
    """
    config = AnalyzerConfig(limit=50)
    results = collect_all_tables(config, _TEST_SPECS[:1], fetch_fn=_fake_fetch)

    r = results[0]
    assert r.table_name == "table_a"
    assert r.task_code == "ODS_TABLE_A"
    assert r.description == "测试表 A"
    assert r.endpoint == "/api/table_a"

# ---------------------------------------------------------------------------
# dump_collection_results unit tests
# Requirements: 5.4, 5.6
# ---------------------------------------------------------------------------
import json as _json


def _make_result(
    table: str = "test_table",
    task_code: str = "ODS_TEST",
    record_count: int = 2,
    error: str | None = None,
) -> TableCollectionResult:
    """Build a minimal TableCollectionResult for testing.

    When *error* is set, fields and columns are left empty to mimic a
    failed collection.
    """
    fields = OrderedDict()
    if not error:
        fields["id"] = FieldInfo(
            path="id", json_type="integer", sample="1",
            depth=0, occurrence=2, total_records=record_count,
            samples=["1", "2"],
        )
        fields["name"] = FieldInfo(
            path="name", json_type="string", sample="测试",
            depth=0, occurrence=1, total_records=record_count,
            samples=["测试"],
        )
    ods_cols = [
        ColumnInfo(name="id", data_type="bigint", is_nullable=False,
                   column_default=None, comment="主键", ordinal_position=1),
    ] if not error else []
    dwd_cols_list = [
        ColumnInfo(name="id", data_type="bigint", is_nullable=False,
                   column_default=None, comment="主键", ordinal_position=1),
        ColumnInfo(name="store_id", data_type="integer", is_nullable=False,
                   column_default=None, comment="门店", ordinal_position=2),
    ] if not error else []
    # dwd_tables: simulate one DWD table (named "dwd_{table}")
    dwd_tables_dict: dict[str, list[ColumnInfo]] = {}
    if not error and dwd_cols_list:
        dwd_tables_dict[f"dwd_{table}"] = dwd_cols_list
    return TableCollectionResult(
        table_name=table,
        task_code=task_code,
        description="测试表",
        endpoint="/api/test",
        record_count=record_count,
        json_fields=fields,
        ods_columns=ods_cols,
        dwd_columns=dwd_cols_list,
        dwd_tables=dwd_tables_dict,
        error=error,
    )

def test_dump_creates_subdirectories(tmp_path):
    """dump_collection_results auto-creates the json_trees/, db_schemas/, field_mappings/ and bd_descriptions/ subdirectories.

    Requirements: 5.4
    """
    results = [_make_result()]
    paths = dump_collection_results(results, tmp_path)

    assert (tmp_path / "json_trees").is_dir()
    assert (tmp_path / "db_schemas").is_dir()
    assert (tmp_path / "field_mappings").is_dir()
    assert (tmp_path / "bd_descriptions").is_dir()
    assert paths["json_trees"] == tmp_path / "json_trees"
    assert paths["db_schemas"] == tmp_path / "db_schemas"
    assert paths["field_mappings"] == tmp_path / "field_mappings"
    assert paths["bd_descriptions"] == tmp_path / "bd_descriptions"
    assert paths["manifest"] == tmp_path

def test_dump_json_trees_format(tmp_path):
    """json_trees/{table}.json holds the correct field structure (incl. samples).

    Requirements: 5.4
    """
    results = [_make_result(table="orders", record_count=10)]
    dump_collection_results(results, tmp_path)

    tree_file = tmp_path / "json_trees" / "orders.json"
    assert tree_file.exists()
    data = _json.loads(tree_file.read_text(encoding="utf-8"))

    assert data["table"] == "orders"
    assert data["total_records"] == 10
    assert isinstance(data["fields"], list)
    assert len(data["fields"]) == 2
    # Verify the field structure
    f0 = data["fields"][0]
    assert f0["path"] == "id"
    assert f0["json_type"] == "integer"
    assert f0["depth"] == 0
    # Verify the samples field is present
    assert "samples" in f0
    assert isinstance(f0["samples"], list)

def test_dump_db_schemas_format(tmp_path):
    """db_schemas/ods_{table}.json and dwd_{dwd_short}.json hold the correct column structure.

    Requirements: 5.4
    """
    results = [_make_result(table="members")]
    dump_collection_results(results, tmp_path)

    ods_file = tmp_path / "db_schemas" / "ods_members.json"
    # The DWD filename now uses the DWD table name (dwd_members), not the ODS name
    dwd_file = tmp_path / "db_schemas" / "dwd_dwd_members.json"
    assert ods_file.exists()
    assert dwd_file.exists()

    ods = _json.loads(ods_file.read_text(encoding="utf-8"))
    assert ods["schema"] == "ods"
    assert ods["table"] == "members"
    assert len(ods["columns"]) == 1
    assert ods["columns"][0]["name"] == "id"

    dwd = _json.loads(dwd_file.read_text(encoding="utf-8"))
    assert dwd["schema"] == "dwd"
    assert dwd["table"] == "dwd_members"
    assert dwd["ods_source"] == "members"
    assert len(dwd["columns"]) == 2

def test_dump_manifest_format(tmp_path):
    """collection_manifest.json contains a timestamp and the table inventory.

    Requirements: 5.6
    """
    results = [
        _make_result(table="t1", task_code="ODS_T1", record_count=100),
        _make_result(table="t2", task_code="ODS_T2", record_count=0, error="连接失败"),
    ]
    dump_collection_results(results, tmp_path)

    manifest_file = tmp_path / "collection_manifest.json"
    assert manifest_file.exists()
    manifest = _json.loads(manifest_file.read_text(encoding="utf-8"))

    assert "timestamp" in manifest
    assert isinstance(manifest["tables"], list)
    assert len(manifest["tables"]) == 2

    t1 = manifest["tables"][0]
    assert t1["table"] == "t1"
    assert t1["task_code"] == "ODS_T1"
    assert t1["record_count"] == 100
    assert t1["json_field_count"] == 2  # id + name
    assert t1["ods_column_count"] == 1
    assert t1["dwd_column_count"] == 2
    assert t1["dwd_tables"] == ["dwd_t1"]
    assert t1["error"] is None

    t2 = manifest["tables"][1]
    assert t2["error"] == "连接失败"
    assert t2["record_count"] == 0

def test_dump_utf8_chinese_content(tmp_path):
    """JSON files are UTF-8 encoded; Chinese characters are not escaped.

    Requirements: 5.4
    """
    results = [_make_result()]
    dump_collection_results(results, tmp_path)

    tree_file = tmp_path / "json_trees" / "test_table.json"
    raw = tree_file.read_text(encoding="utf-8")
    # ensure_ascii=False means Chinese is written verbatim, never as \uXXXX
    assert "测试" in raw
    assert "\\u" not in raw

def test_dump_empty_results(tmp_path):
    """An empty result list still produces the manifest and subdirectories.

    Requirements: 5.4
    """
    paths = dump_collection_results([], tmp_path)

    assert (tmp_path / "json_trees").is_dir()
    assert (tmp_path / "db_schemas").is_dir()
    assert (tmp_path / "field_mappings").is_dir()
    assert (tmp_path / "bd_descriptions").is_dir()
    manifest = _json.loads(
        (tmp_path / "collection_manifest.json").read_text(encoding="utf-8")
    )
    assert manifest["tables"] == []
    assert "timestamp" in manifest
    assert "table_map" in manifest
    assert paths["json_trees"] == tmp_path / "json_trees"

# ---------------------------------------------------------------------------
# CLI entry point: build_parser / resolve_output_dir unit tests
# ---------------------------------------------------------------------------
from analyze_dataflow import build_parser, resolve_output_dir

class TestBuildParser:
    """build_parser argument-parsing tests. Requirements: 5.1, 5.2, 5.3"""

    def test_defaults(self):
        """Defaults are used when no arguments are given."""
        parser = build_parser()
        args = parser.parse_args([])
        assert args.date_from is None
        assert args.date_to is None
        assert args.limit == 200
        assert args.tables is None

    def test_date_range(self):
        """--date-from / --date-to parse correctly."""
        parser = build_parser()
        args = parser.parse_args(["--date-from", "2025-01-01", "--date-to", "2025-01-15"])
        assert args.date_from == "2025-01-01"
        assert args.date_to == "2025-01-15"

    def test_limit(self):
        """--limit parses as an int."""
        parser = build_parser()
        args = parser.parse_args(["--limit", "50"])
        assert args.limit == 50

    def test_tables(self):
        """--tables parses as a comma-separated string."""
        parser = build_parser()
        args = parser.parse_args(["--tables", "settlement_records,payment_transactions"])
        assert args.tables == "settlement_records,payment_transactions"
        # Callers can split(',') to get a list
        assert args.tables.split(",") == ["settlement_records", "payment_transactions"]

    def test_all_params_combined(self):
        """All parameters passed at once."""
        parser = build_parser()
        args = parser.parse_args([
            "--date-from", "2025-06-01",
            "--date-to", "2025-06-30",
            "--limit", "100",
            "--tables", "orders,members",
        ])
        assert args.date_from == "2025-06-01"
        assert args.date_to == "2025-06-30"
        assert args.limit == 100
        assert args.tables == "orders,members"

class TestResolveOutputDir:
    """resolve_output_dir output-directory fallback tests. Requirements: 5.4, 5.5"""

    def test_env_var_takes_priority(self, tmp_path, monkeypatch):
        """SYSTEM_ANALYZE_ROOT takes priority when set."""
        target = tmp_path / "custom_output"
        monkeypatch.setenv("SYSTEM_ANALYZE_ROOT", str(target))
        result = resolve_output_dir()
        assert result == target
        assert target.is_dir()

    def test_fallback_to_docs_reports(self, monkeypatch):
        """Falls back to docs/reports/ when SYSTEM_ANALYZE_ROOT is unset."""
        monkeypatch.delenv("SYSTEM_ANALYZE_ROOT", raising=False)
        result = resolve_output_dir()
        assert result == Path("docs/reports")

    def test_creates_directory(self, tmp_path, monkeypatch):
        """Missing directories are created automatically."""
        target = tmp_path / "nested" / "deep" / "dir"
        monkeypatch.setenv("SYSTEM_ANALYZE_ROOT", str(target))
        result = resolve_output_dir()
        assert result.is_dir()

    def test_existing_directory_no_error(self, tmp_path, monkeypatch):
        """No error when the directory already exists."""
        target = tmp_path / "already_exists"
        target.mkdir()
        monkeypatch.setenv("SYSTEM_ANALYZE_ROOT", str(target))
        result = resolve_output_dir()
        assert result == target
        assert target.is_dir()

# ---------------------------------------------------------------------------
# Property 3: output filename format correctness
# Feature: dataflow-structure-audit, Property 3
# Validates: Requirements 5.6
# ---------------------------------------------------------------------------
import re
from datetime import datetime

from analyze_dataflow import generate_output_filename

@settings(max_examples=100)
@given(dt=st.datetimes(
    min_value=datetime(1970, 1, 1),
    max_value=datetime(9999, 12, 31, 23, 59, 59),
))
def test_output_filename_format(dt: datetime):
    """
    Property 3: output filename format correctness.

    For any valid datetime, the filename produced by generate_output_filename:
    1. matches the pattern dataflow_YYYY-MM-DD_HHMMSS.md
    2. encodes a date/time identical to the input datetime

    Validates: Requirements 5.6
    """
    filename = generate_output_filename(dt)

    # 1. Format match
    # NOTE(review): the failure messages previously contained the literal
    # '(unknown)' where the filename should be interpolated; restored.
    pattern = r"^dataflow_\d{4}-\d{2}-\d{2}_\d{6}\.md$"
    assert re.match(pattern, filename), (
        f"文件名 '{filename}' 不匹配模式 {pattern}"
    )

    # 2. Date/time consistency: extract from the filename and compare
    match = re.match(
        r"^dataflow_(\d{4})-(\d{2})-(\d{2})_(\d{2})(\d{2})(\d{2})\.md$",
        filename,
    )
    assert match, f"无法从文件名 '{filename}' 中提取日期时间"

    year, month, day, hour, minute, second = (int(g) for g in match.groups())
    assert year == dt.year
    assert month == dt.month
    assert day == dt.day
    assert hour == dt.hour
    assert minute == dt.minute
    assert second == dt.second

# ---------------------------------------------------------------------------
# Unit tests: hook configuration file format validation
# Validates: Requirements 6.1
# ---------------------------------------------------------------------------

class TestHookConfigFile:
    """Validate the format of .kiro/hooks/dataflow-analyze.kiro.hook."""

    HOOK_PATH = Path(__file__).resolve().parent.parent / ".kiro" / "hooks" / "dataflow-analyze.kiro.hook"

    def test_hook_file_is_valid_json(self):
        """The hook file must be valid JSON."""
        text = self.HOOK_PATH.read_text(encoding="utf-8")
        data = json.loads(text)  # raises JSONDecodeError on invalid JSON
        assert isinstance(data, dict)

    def test_hook_required_fields_exist(self):
        """The hook file must contain name, version, when.type, then.type, then.prompt."""
        data = json.loads(self.HOOK_PATH.read_text(encoding="utf-8"))
        assert "name" in data
        assert "version" in data
        assert "when" in data and "type" in data["when"]
        assert "then" in data and "type" in data["then"]
        assert "prompt" in data["then"]

    def test_hook_when_type_is_user_triggered(self):
        """when.type must be userTriggered."""
        data = json.loads(self.HOOK_PATH.read_text(encoding="utf-8"))
        assert data["when"]["type"] == "userTriggered"

    def test_hook_then_type_is_ask_agent(self):
        """then.type must be askAgent."""
        data = json.loads(self.HOOK_PATH.read_text(encoding="utf-8"))
        assert data["then"]["type"] == "askAgent"

# ---------------------------------------------------------------------------
# Additional tests: parse_table_map / parse_fact_mappings / build_field_mappings
# Validates: correctness of the three-layer field-mapping construction
# ---------------------------------------------------------------------------
from dataflow_analyzer import (
    build_field_mappings,
    parse_table_map,
    parse_fact_mappings,
)

class TestParseTableMap:
    """Parse TABLE_MAP out of the ETL source code."""

    def test_parse_real_file(self):
        """Parsing the real dwd_load_task.py should yield a non-empty dict."""
        result = parse_table_map()
        # When the real file exists there should be mappings
        if Path("apps/etl/connectors/feiqiu/tasks/dwd/dwd_load_task.py").exists():
            assert len(result) > 0
            # Every key must start with "dwd." and every value with "ods."
            for dwd_t, ods_t in result.items():
                assert dwd_t.startswith("dwd."), f"DWD 表名应以 dwd. 开头: {dwd_t}"
                assert ods_t.startswith("ods."), f"ODS 表名应以 ods. 开头: {ods_t}"

    def test_parse_nonexistent_file(self, tmp_path):
        """A missing file yields an empty dict."""
        result = parse_table_map(tmp_path / "nonexistent.py")
        assert result == {}

    def test_parse_file_without_table_map(self, tmp_path):
        """A file without a TABLE_MAP definition yields an empty dict."""
        py_file = tmp_path / "empty.py"
        py_file.write_text("# 空文件\n", encoding="utf-8")
        result = parse_table_map(py_file)
        assert result == {}

    def test_parse_synthetic_table_map(self, tmp_path):
        """Parse a synthetic TABLE_MAP definition."""
        py_file = tmp_path / "task.py"
        py_file.write_text(
            'TABLE_MAP: dict[str, str] = {\n'
            '    "dwd.dim_foo": "ods.foo_master",\n'
            '    "dwd.dwd_bar": "ods.bar_records",\n'
            '}\n',
            encoding="utf-8",
        )
        result = parse_table_map(py_file)
        assert result == {
            "dwd.dim_foo": "ods.foo_master",
            "dwd.dwd_bar": "ods.bar_records",
        }

class TestParseFactMappings:
    """Parse FACT_MAPPINGS out of the ETL source code."""

    def test_parse_real_file(self):
        """Parsing the real dwd_load_task.py should yield a non-empty dict."""
        result = parse_fact_mappings()
        if Path("apps/etl/connectors/feiqiu/tasks/dwd/dwd_load_task.py").exists():
            assert len(result) > 0
            for dwd_t, mappings in result.items():
                assert dwd_t.startswith("dwd."), f"DWD 表名应以 dwd. 开头: {dwd_t}"
                for entry in mappings:
                    assert len(entry) == 3, f"映射条目应为 3 元组: {entry}"

    def test_parse_nonexistent_file(self, tmp_path):
        """A missing file yields an empty dict."""
        result = parse_fact_mappings(tmp_path / "nonexistent.py")
        assert result == {}

class TestBuildFieldMappings:
|
||
"""build_field_mappings 三层映射构建。"""
|
||
|
||
def _make_test_data(self):
|
||
"""构造测试用的 TableCollectionResult + TABLE_MAP + FACT_MAPPINGS。"""
|
||
fields = OrderedDict()
|
||
fields["id"] = FieldInfo(
|
||
path="id", json_type="integer", sample="123",
|
||
depth=0, occurrence=10, total_records=10,
|
||
samples=["123", "456", "789"],
|
||
)
|
||
fields["name"] = FieldInfo(
|
||
path="name", json_type="string", sample="测试",
|
||
depth=0, occurrence=10, total_records=10,
|
||
samples=["测试", "示例"],
|
||
)
|
||
fields["extra.nested"] = FieldInfo(
|
||
path="extra.nested", json_type="string", sample="嵌套值",
|
||
depth=1, occurrence=5, total_records=10,
|
||
samples=["嵌套值"],
|
||
)
|
||
|
||
ods_cols = [
|
||
ColumnInfo(name="id", data_type="BIGINT", is_nullable=False,
|
||
column_default=None, comment="主键", ordinal_position=1),
|
||
ColumnInfo(name="name", data_type="TEXT", is_nullable=True,
|
||
column_default=None, comment="名称", ordinal_position=2),
|
||
]
|
||
|
||
dwd_cols_main = [
|
||
ColumnInfo(name="foo_id", data_type="BIGINT", is_nullable=False,
|
||
column_default=None, comment="主键", ordinal_position=1),
|
||
ColumnInfo(name="name", data_type="TEXT", is_nullable=True,
|
||
column_default=None, comment="名称", ordinal_position=2),
|
||
ColumnInfo(name="scd2_start_time", data_type="TIMESTAMPTZ", is_nullable=True,
|
||
column_default=None, comment=None, ordinal_position=3),
|
||
]
|
||
|
||
result = TableCollectionResult(
|
||
table_name="foo_master",
|
||
task_code="ODS_FOO",
|
||
description="测试表",
|
||
endpoint="/api/foo",
|
||
record_count=10,
|
||
json_fields=fields,
|
||
ods_columns=ods_cols,
|
||
dwd_columns=dwd_cols_main,
|
||
dwd_tables={"dim_foo": dwd_cols_main},
|
||
)
|
||
|
||
table_map = {"dwd.dim_foo": "ods.foo_master"}
|
||
fact_mappings = {
|
||
"dwd.dim_foo": [("foo_id", "id", None)],
|
||
}
|
||
all_dwd_cols = {"dim_foo": dwd_cols_main}
|
||
|
||
return result, table_map, fact_mappings, all_dwd_cols
|
||
|
||
def test_anchors_generated(self):
|
||
"""锚点 ID 格式正确。"""
|
||
result, tm, fm, dwd = self._make_test_data()
|
||
mapping = build_field_mappings(result, tm, fm, dwd)
|
||
|
||
assert mapping["anchors"]["api"] == "api-foo-master"
|
||
assert mapping["anchors"]["ods"] == "ods-foo-master"
|
||
assert "dim_foo" in mapping["anchors"]["dwd"]
|
||
assert mapping["anchors"]["dwd"]["dim_foo"] == "dwd-dim-foo"
|
||
|
||
def test_json_to_ods_mapping(self):
|
||
"""JSON→ODS 映射:已映射和未映射字段正确标注。"""
|
||
result, tm, fm, dwd = self._make_test_data()
|
||
mapping = build_field_mappings(result, tm, fm, dwd)
|
||
|
||
j2o = mapping["json_to_ods"]
|
||
assert len(j2o) == 3
|
||
|
||
# id → 已映射
|
||
id_entry = next(e for e in j2o if e["json_path"] == "id")
|
||
assert id_entry["ods_col"] == "id"
|
||
assert id_entry["match_type"] in ("exact", "case_insensitive")
|
||
|
||
# name → 已映射
|
||
name_entry = next(e for e in j2o if e["json_path"] == "name")
|
||
assert name_entry["ods_col"] == "name"
|
||
|
||
# extra.nested → 未映射(ODS 无此列)
|
||
nested_entry = next(e for e in j2o if e["json_path"] == "extra.nested")
|
||
assert nested_entry["ods_col"] is None
|
||
assert nested_entry["match_type"] == "unmapped"
|
||
|
||
def test_ods_to_dwd_mapping(self):
|
||
"""ODS→DWD 映射:id 列映射到 dim_foo.foo_id。"""
|
||
result, tm, fm, dwd = self._make_test_data()
|
||
mapping = build_field_mappings(result, tm, fm, dwd)
|
||
|
||
o2d = mapping["ods_to_dwd"]
|
||
# id → foo_id(通过 FACT_MAPPINGS)
|
||
assert "id" in o2d
|
||
targets = o2d["id"]
|
||
assert any(t["dwd_col"] == "foo_id" and t["dwd_table"] == "dim_foo" for t in targets)
|
||
|
||
# name → name(同名直传)
|
||
assert "name" in o2d
|
||
name_targets = o2d["name"]
|
||
assert any(t["dwd_col"] == "name" and t["dwd_table"] == "dim_foo" for t in name_targets)
|
||
|
||
def test_dwd_to_ods_mapping(self):
|
||
"""DWD→ODS 映射:包含直接映射和 SCD2 列。"""
|
||
result, tm, fm, dwd = self._make_test_data()
|
||
mapping = build_field_mappings(result, tm, fm, dwd)
|
||
|
||
d2o = mapping["dwd_to_ods"]
|
||
assert "dim_foo" in d2o
|
||
cols = d2o["dim_foo"]
|
||
|
||
# foo_id ← id(FACT_MAPPINGS 显式映射)
|
||
foo_id_entry = next(c for c in cols if c["dwd_col"] == "foo_id")
|
||
assert foo_id_entry["ods_source"] == "id"
|
||
assert foo_id_entry["mapping_type"] == "直接"
|
||
|
||
# name ← name(同名直传)
|
||
name_entry = next(c for c in cols if c["dwd_col"] == "name")
|
||
assert name_entry["ods_source"] == "name"
|
||
assert name_entry["mapping_type"] == "直接"
|
||
assert name_entry["note"] == "同名直传"
|
||
|
||
# scd2_start_time(SCD2 元数据)
|
||
scd2_entry = next(c for c in cols if c["dwd_col"] == "scd2_start_time")
|
||
assert scd2_entry["mapping_type"] == "SCD2"
|
||
assert scd2_entry["ods_source"] == "—"
|
||
|
||
def test_field_mappings_dumped_to_file(self, tmp_path):
|
||
"""dump_collection_results 生成 field_mappings JSON 文件。"""
|
||
result, _, _, _ = self._make_test_data()
|
||
dump_collection_results([result], tmp_path)
|
||
|
||
fm_file = tmp_path / "field_mappings" / "foo_master.json"
|
||
assert fm_file.exists()
|
||
data = _json.loads(fm_file.read_text(encoding="utf-8"))
|
||
assert data["ods_table"] == "foo_master"
|
||
assert "anchors" in data
|
||
assert "json_to_ods" in data
|
||
assert "ods_to_dwd" in data
|
||
assert "dwd_to_ods" in data
|
||
|
||
def test_no_dwd_tables_produces_empty_mappings(self):
|
||
"""ODS 表无 DWD 映射时,ods_to_dwd 和 dwd_to_ods 为空。"""
|
||
fields = OrderedDict()
|
||
fields["id"] = FieldInfo(
|
||
path="id", json_type="integer", sample="1",
|
||
depth=0, occurrence=1, total_records=1,
|
||
samples=["1"],
|
||
)
|
||
ods_cols = [
|
||
ColumnInfo(name="id", data_type="BIGINT", is_nullable=False,
|
||
column_default=None, comment=None, ordinal_position=1),
|
||
]
|
||
result = TableCollectionResult(
|
||
table_name="orphan_table",
|
||
task_code="ODS_ORPHAN",
|
||
description="无 DWD 映射的表",
|
||
endpoint="/api/orphan",
|
||
record_count=1,
|
||
json_fields=fields,
|
||
ods_columns=ods_cols,
|
||
dwd_columns=[],
|
||
dwd_tables={},
|
||
)
|
||
mapping = build_field_mappings(result, {}, {}, {})
|
||
assert mapping["ods_to_dwd"] == {}
|
||
assert mapping["dwd_to_ods"] == {}
|
||
assert mapping["anchors"]["dwd"] == {}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 新增测试:多示例值采集(samples 字段)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def test_flatten_json_tree_collects_multiple_samples():
    """flatten_json_tree gathers several distinct sample values per field."""
    records = [
        {"status": 1, "name": "Alice"},
        {"status": 2, "name": "Bob"},
        {"status": 1, "name": "Charlie"},
        {"status": 3, "name": "Alice"},
    ]
    tree = flatten_json_tree(records)

    # "status" saw three distinct values.
    assert "status" in tree
    assert set(tree["status"].samples) == {"1", "2", "3"}

    # "name" saw three distinct values as well.
    assert set(tree["name"].samples) == {"Alice", "Bob", "Charlie"}
def test_flatten_json_tree_samples_limit():
    """No field ever records more than MAX_SAMPLES sample values."""
    from dataflow_analyzer import MAX_SAMPLES

    # Feed in more distinct values than the cap allows.
    records = [{"val": i} for i in range(MAX_SAMPLES + 5)]
    tree = flatten_json_tree(records)

    assert len(tree["val"].samples) <= MAX_SAMPLES
def test_flatten_json_tree_samples_skip_none():
    """None values are never collected into samples."""
    records = [
        {"x": None},
        {"x": 42},
        {"x": None},
        {"x": 99},
    ]
    tree = flatten_json_tree(records)

    info = tree["x"]
    assert "None" not in info.samples
    assert set(info.samples) == {"42", "99"}
# ---------------------------------------------------------------------------
|
||
# 新增测试:parse_bd_manual_fields / load_bd_descriptions / dump_bd_descriptions
|
||
# ---------------------------------------------------------------------------
|
||
from dataflow_analyzer import (
|
||
parse_bd_manual_fields,
|
||
dump_bd_descriptions,
|
||
)
|
||
|
||
|
||
class TestParseBdManualFields:
    """Parsing field descriptions out of BD_manual Markdown documents."""

    def test_parse_standard_format(self, tmp_path):
        """A standard ODS-style field table parses into name→description."""
        doc = tmp_path / "BD_manual_test.md"
        lines = [
            "# test 测试表",
            "",
            "## 字段说明",
            "",
            "| 序号 | 字段名 | 类型 | 可空 | 说明 |",
            "|------|--------|------|------|------|",
            "| 1 | id | BIGINT | NO | 主键 ID |",
            "| 2 | name | TEXT | YES | 名称字段 |",
            "| 3 | status | INTEGER | YES | 状态:1=启用,0=禁用 |",
        ]
        doc.write_text("\n".join(lines) + "\n", encoding="utf-8")

        parsed = parse_bd_manual_fields(doc)
        assert parsed["id"] == "主键 ID"
        assert parsed["name"] == "名称字段"
        assert parsed["status"] == "状态:1=启用,0=禁用"

    def test_parse_nonexistent_file(self, tmp_path):
        """A missing file yields an empty dict rather than an error."""
        assert parse_bd_manual_fields(tmp_path / "nonexistent.md") == {}

    def test_parse_no_field_table(self, tmp_path):
        """A document without a field-description table yields an empty dict."""
        doc = tmp_path / "empty.md"
        doc.write_text("# 空文档\n\n没有字段说明。\n", encoding="utf-8")
        assert parse_bd_manual_fields(doc) == {}

    def test_parse_dwd_format_with_pk_column(self, tmp_path):
        """The DWD layout (extra primary-key column) parses correctly too."""
        doc = tmp_path / "BD_manual_dim_test.md"
        lines = [
            "# dim_test 测试维表",
            "",
            "## 字段说明",
            "",
            "| 序号 | 字段名 | 类型 | 可空 | 主键 | 说明 |",
            "|------|--------|------|------|------|------|",
            "| 1 | test_id | BIGINT | NO | PK | 唯一标识 |",
            "| 2 | label | TEXT | YES | | 标签名 |",
        ]
        doc.write_text("\n".join(lines) + "\n", encoding="utf-8")

        parsed = parse_bd_manual_fields(doc)
        assert parsed["test_id"] == "唯一标识"
        assert parsed["label"] == "标签名"

    def test_parse_real_ods_doc(self):
        """Smoke-parse a real ODS BD_manual document when it is present."""
        doc = Path("apps/etl/connectors/feiqiu/docs/database/ODS/main/BD_manual_assistant_accounts_master.md")
        if not doc.exists():
            return
        parsed = parse_bd_manual_fields(doc)
        assert len(parsed) > 0
        # The id column should always be documented.
        assert "id" in parsed

    def test_parse_real_dwd_doc(self):
        """Smoke-parse a real DWD BD_manual document when it is present."""
        doc = Path("apps/etl/connectors/feiqiu/docs/database/DWD/main/BD_manual_dim_assistant.md")
        if not doc.exists():
            return
        parsed = parse_bd_manual_fields(doc)
        assert len(parsed) > 0
        assert "assistant_id" in parsed
class TestDumpBdDescriptions:
    """dump_bd_descriptions writes per-table BD description JSON files."""

    def test_dump_creates_files(self, tmp_path):
        """One bd_descriptions JSON file is produced for each table."""
        dump_bd_descriptions([_make_result(table="test_table")], tmp_path)

        out_file = tmp_path / "bd_descriptions" / "test_table.json"
        assert out_file.exists()
        payload = _json.loads(out_file.read_text(encoding="utf-8"))
        assert payload["ods_table"] == "test_table"
        assert "ods_fields" in payload
        assert "dwd_fields" in payload

    def test_dump_empty_results(self, tmp_path):
        """An empty result list still creates the output directory."""
        dump_bd_descriptions([], tmp_path)
        assert (tmp_path / "bd_descriptions").is_dir()
# ---------------------------------------------------------------------------
|
||
# 新增测试:gen_dataflow_report.py 报告生成器
|
||
# ---------------------------------------------------------------------------
|
||
from gen_dataflow_report import generate_report, _esc, _format_samples, _is_enum_like
|
||
|
||
|
||
class TestReportHelpers:
    """Unit tests for the report generator's small helper functions."""

    def test_esc_pipe(self):
        """Pipe characters are backslash-escaped for Markdown tables."""
        assert _esc("a|b") == "a\\|b"

    def test_esc_newline(self):
        """Newlines collapse to a single space."""
        assert _esc("a\nb") == "a b"

    def test_esc_none(self):
        """An empty string stays empty."""
        assert _esc("") == ""

    def test_format_samples_basic(self):
        """Each sample value appears back-ticked in the rendered string."""
        rendered = _format_samples(["a", "b", "c"])
        for sample in ("a", "b", "c"):
            assert f"`{sample}`" in rendered

    def test_format_samples_truncate(self):
        """Over-long sample values get truncated with an ellipsis."""
        rendered = _format_samples(["x" * 50])
        assert "..." in rendered

    def test_format_samples_empty(self):
        """An empty sample list renders as the empty string."""
        assert _format_samples([]) == ""

    def test_is_enum_like_true(self):
        """A few distinct values over enough records → enum-like."""
        assert _is_enum_like(["1", "2", "3"], 100) is True

    def test_is_enum_like_false_too_many(self):
        """More than 8 distinct values → not enum-like."""
        assert _is_enum_like([str(i) for i in range(10)], 100) is False

    def test_is_enum_like_false_single(self):
        """A single distinct value → not enum-like."""
        assert _is_enum_like(["1"], 100) is False

    def test_is_enum_like_false_few_records(self):
        """Too few records → not enum-like."""
        assert _is_enum_like(["1", "2"], 3) is False
class TestGenerateReport:
    """End-to-end tests for generate_report."""

    def _setup_data_dir(self, tmp_path):
        """Build the minimal on-disk layout the report generator consumes."""

        def _write(path, payload):
            # Every artifact is UTF-8 JSON without ASCII escaping.
            path.write_text(_json.dumps(payload, ensure_ascii=False),
                            encoding="utf-8")

        # collection_manifest.json
        _write(tmp_path / "collection_manifest.json", {
            "timestamp": "2026-02-16T22:00:00+08:00",
            "table_map": {"dwd.dwd_test": "ods.test_table"},
            "tables": [{
                "table": "test_table",
                "task_code": "ODS_TEST",
                "description": "测试表",
                "record_count": 10,
                "json_field_count": 2,
                "ods_column_count": 2,
                "dwd_tables": ["dwd_test"],
                "dwd_column_count": 3,
                "error": None,
            }],
        })

        # json_trees/test_table.json
        jt_dir = tmp_path / "json_trees"
        jt_dir.mkdir()
        _write(jt_dir / "test_table.json", {
            "table": "test_table", "total_records": 10,
            "fields": [
                {"path": "id", "json_type": "integer", "sample": "1",
                 "depth": 0, "occurrence": 10, "total_records": 10,
                 "samples": ["1", "2", "3"]},
                {"path": "status", "json_type": "integer", "sample": "1",
                 "depth": 0, "occurrence": 10, "total_records": 10,
                 "samples": ["0", "1", "2"]},
            ],
        })

        # db_schemas/
        db_dir = tmp_path / "db_schemas"
        db_dir.mkdir()
        _write(db_dir / "ods_test_table.json", {
            "schema": "ods", "table": "test_table", "columns": [
                {"name": "id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": "主键", "ordinal_position": 1},
                {"name": "status", "data_type": "INTEGER", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 2},
            ],
        })
        _write(db_dir / "dwd_dwd_test.json", {
            "schema": "dwd", "table": "dwd_test", "ods_source": "test_table", "columns": [
                {"name": "test_id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": None, "ordinal_position": 1},
                {"name": "status", "data_type": "INTEGER", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 2},
                {"name": "scd2_start_time", "data_type": "TIMESTAMPTZ", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 3},
            ],
        })

        # field_mappings/test_table.json
        fm_dir = tmp_path / "field_mappings"
        fm_dir.mkdir()
        _write(fm_dir / "test_table.json", {
            "ods_table": "test_table",
            "anchors": {
                "api": "api-test-table", "ods": "ods-test-table",
                "dwd": {"dwd_test": "dwd-dwd-test"},
            },
            "json_to_ods": [
                {"json_path": "id", "ods_col": "id", "match_type": "exact",
                 "json_type": "integer", "occurrence_pct": 100.0},
                {"json_path": "status", "ods_col": "status", "match_type": "exact",
                 "json_type": "integer", "occurrence_pct": 100.0},
            ],
            "ods_to_dwd": {
                "id": [{"dwd_table": "dwd_test", "dwd_col": "test_id", "cast": None, "note": "字段重命名"}],
                "status": [{"dwd_table": "dwd_test", "dwd_col": "status", "cast": None, "note": "同名直传"}],
            },
            "dwd_to_ods": {
                "dwd_test": [
                    {"dwd_col": "test_id", "type": "BIGINT", "ods_source": "id", "mapping_type": "直接", "note": "字段重命名"},
                    {"dwd_col": "status", "type": "INTEGER", "ods_source": "status", "mapping_type": "直接", "note": "同名直传"},
                    {"dwd_col": "scd2_start_time", "type": "TIMESTAMPTZ", "ods_source": "—", "mapping_type": "SCD2", "note": "SCD2 元数据"},
                ],
            },
        })

        # bd_descriptions/test_table.json
        bd_dir = tmp_path / "bd_descriptions"
        bd_dir.mkdir()
        _write(bd_dir / "test_table.json", {
            "ods_table": "test_table",
            "ods_fields": {"id": "主键 ID", "status": "状态:0=禁用,1=启用"},
            "dwd_fields": {"dwd_test": {"test_id": "唯一标识", "status": "状态"}},
        })

        return tmp_path

    def test_report_contains_json_field_count(self, tmp_path):
        """The overview table carries an API JSON field-count column."""
        report = generate_report(self._setup_data_dir(tmp_path))
        assert "API JSON 字段数" in report

    def test_report_contains_field_diff(self, tmp_path):
        """The report contains the field-diff comparison section."""
        report = generate_report(self._setup_data_dir(tmp_path))
        assert "API↔ODS↔DWD 字段对比差异" in report

    def test_report_contains_biz_desc_in_coverage(self, tmp_path):
        """The coverage table header includes a business-description column."""
        report = generate_report(self._setup_data_dir(tmp_path))
        # The section-2.3 coverage header must mention both columns.
        matches = [
            line for line in report.split("\n")
            if "业务描述" in line and "JSON 字段数" in line
        ]
        assert len(matches) > 0

    def test_report_contains_biz_desc_in_api_section(self, tmp_path):
        """The API source-field table shows business descriptions."""
        report = generate_report(self._setup_data_dir(tmp_path))
        # This description comes from bd_descriptions.
        assert "主键 ID" in report

    def test_report_contains_biz_desc_in_ods_section(self, tmp_path):
        """The ODS table layout includes a business-description column."""
        report = generate_report(self._setup_data_dir(tmp_path))
        assert "| # | ODS 列名 | 类型 | ← JSON 源 | → DWD 目标 | 业务描述 |" in report

    def test_report_contains_biz_desc_in_dwd_section(self, tmp_path):
        """The DWD table layout includes a business-description column."""
        report = generate_report(self._setup_data_dir(tmp_path))
        assert "| # | DWD 列名 | 类型 | ← ODS 来源 | 转换 | 业务描述 |" in report

    def test_report_contains_enum_samples(self, tmp_path):
        """Enum-like fields surface their value list."""
        report = generate_report(self._setup_data_dir(tmp_path))
        # "status" has 3 distinct values, so it should be flagged as an enum.
        assert "枚举值" in report

    def test_report_anchors_present(self, tmp_path):
        """The report embeds the HTML anchor tags."""
        report = generate_report(self._setup_data_dir(tmp_path))
        for anchor in ("api-test-table", "ods-test-table", "dwd-dwd-test"):
            assert f'<a id="{anchor}"></a>' in report
class TestFieldDiffSubTables:
    """Generation logic for the section-1.1 field-diff sub-tables."""

    def _setup_diff_data_dir(self, tmp_path):
        """Build test data covering every kind of field difference."""
        import json as _json

        def _write(path, payload):
            path.write_text(_json.dumps(payload, ensure_ascii=False),
                            encoding="utf-8")

        _write(tmp_path / "collection_manifest.json", {
            "timestamp": "2026-02-17T22:00:00+08:00",
            "table_map": {"dwd.dwd_alpha": "ods.alpha_table"},
            "tables": [{
                "table": "alpha_table",
                "task_code": "ODS_ALPHA",
                "description": "差异测试表",
                "record_count": 50,
                "json_field_count": 6,
                "ods_column_count": 5,
                "dwd_tables": ["dwd_alpha"],
                "dwd_column_count": 5,
                "error": None,
            }],
        })

        # json_trees
        jt_dir = tmp_path / "json_trees"
        jt_dir.mkdir()
        _write(jt_dir / "alpha_table.json", {
            "table": "alpha_table", "total_records": 50,
            "fields": [
                {"path": "id", "json_type": "integer", "sample": "1",
                 "depth": 0, "occurrence": 50, "total_records": 50, "samples": ["1"]},
                {"path": "name", "json_type": "string", "sample": "foo",
                 "depth": 0, "occurrence": 50, "total_records": 50, "samples": ["foo"]},
                {"path": "extra_flat", "json_type": "string", "sample": "x",
                 "depth": 0, "occurrence": 50, "total_records": 50, "samples": ["x"]},
                {"path": "nested.a", "json_type": "string", "sample": "na",
                 "depth": 1, "occurrence": 50, "total_records": 50, "samples": ["na"]},
                {"path": "nested.b", "json_type": "string", "sample": "nb",
                 "depth": 1, "occurrence": 50, "total_records": 50, "samples": ["nb"]},
                {"path": "status", "json_type": "integer", "sample": "1",
                 "depth": 0, "occurrence": 50, "total_records": 50, "samples": ["0", "1"]},
            ],
        })

        # db_schemas
        db_dir = tmp_path / "db_schemas"
        db_dir.mkdir()
        _write(db_dir / "ods_alpha_table.json", {
            "schema": "ods", "table": "alpha_table", "columns": [
                {"name": "id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": None, "ordinal_position": 1},
                {"name": "name", "data_type": "TEXT", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 2},
                {"name": "status", "data_type": "INTEGER", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 3},
                {"name": "ods_only_col", "data_type": "TEXT", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 4},
                {"name": "no_dwd_col", "data_type": "TEXT", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 5},
            ],
        })
        _write(db_dir / "dwd_dwd_alpha.json", {
            "schema": "dwd", "table": "dwd_alpha", "ods_source": "alpha_table", "columns": [
                {"name": "alpha_id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": None, "ordinal_position": 1},
                {"name": "name", "data_type": "TEXT", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 2},
                {"name": "status", "data_type": "INTEGER", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 3},
                {"name": "scd2_ver", "data_type": "INTEGER", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 4},
                {"name": "derived_flag", "data_type": "BOOLEAN", "is_nullable": True,
                 "column_default": None, "comment": None, "ordinal_position": 5},
            ],
        })

        # field_mappings — one of each kind of difference
        fm_dir = tmp_path / "field_mappings"
        fm_dir.mkdir()
        _write(fm_dir / "alpha_table.json", {
            "ods_table": "alpha_table",
            "anchors": {
                "api": "api-alpha-table", "ods": "ods-alpha-table",
                "dwd": {"dwd_alpha": "dwd-dwd-alpha"},
            },
            "json_to_ods": [
                {"json_path": "id", "ods_col": "id", "match_type": "exact",
                 "json_type": "integer", "occurrence_pct": 100.0},
                {"json_path": "name", "ods_col": "name", "match_type": "exact",
                 "json_type": "string", "occurrence_pct": 100.0},
                {"json_path": "status", "ods_col": "status", "match_type": "exact",
                 "json_type": "integer", "occurrence_pct": 100.0},
                # flat field with no ODS column
                {"json_path": "extra_flat", "ods_col": None, "match_type": "unmapped",
                 "json_type": "string", "occurrence_pct": 100.0},
                # nested fields with no ODS column
                {"json_path": "nested.a", "ods_col": None, "match_type": "unmapped",
                 "json_type": "string", "occurrence_pct": 100.0},
                {"json_path": "nested.b", "ods_col": None, "match_type": "unmapped",
                 "json_type": "string", "occurrence_pct": 100.0},
            ],
            "ods_to_dwd": {
                "id": [{"dwd_table": "dwd_alpha", "dwd_col": "alpha_id", "cast": None, "note": "重命名"}],
                "name": [{"dwd_table": "dwd_alpha", "dwd_col": "name", "cast": None, "note": ""}],
                "status": [{"dwd_table": "dwd_alpha", "dwd_col": "status", "cast": None, "note": ""}],
                # ods_only_col / no_dwd_col deliberately absent → unmapped ODS→DWD
            },
            "dwd_to_ods": {
                "dwd_alpha": [
                    {"dwd_col": "alpha_id", "type": "BIGINT", "ods_source": "id",
                     "mapping_type": "直接", "note": "重命名"},
                    {"dwd_col": "name", "type": "TEXT", "ods_source": "name",
                     "mapping_type": "直接", "note": ""},
                    {"dwd_col": "status", "type": "INTEGER", "ods_source": "status",
                     "mapping_type": "直接", "note": ""},
                    # DWD columns with no ODS source
                    {"dwd_col": "scd2_ver", "type": "INTEGER", "ods_source": "—",
                     "mapping_type": "SCD2", "note": "SCD2 元数据"},
                    {"dwd_col": "derived_flag", "type": "BOOLEAN", "ods_source": "—",
                     "mapping_type": "派生", "note": "派生列"},
                ],
            },
        })

        # bd_descriptions
        bd_dir = tmp_path / "bd_descriptions"
        bd_dir.mkdir()
        _write(bd_dir / "alpha_table.json", {
            "ods_table": "alpha_table",
            "ods_fields": {"id": "主键", "name": "名称"},
            "dwd_fields": {"dwd_alpha": {"alpha_id": "标识", "name": "名称"}},
        })

        return tmp_path

    def test_summary_table_has_links(self, tmp_path):
        """Non-zero diff counts in the summary table are rendered as links."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        # API→ODS unmapped = 3 (1 flat + 2 nested) → linked count.
        assert "[3](#diff-alpha-table)" in report

    def test_summary_table_zero_no_link(self, tmp_path):
        """Zero diff counts in the summary table must not be links."""
        import json as _json

        def _write(path, payload):
            path.write_text(_json.dumps(payload, ensure_ascii=False),
                            encoding="utf-8")

        data_dir = self._setup_diff_data_dir(tmp_path)
        # Add a fully-mapped table so all of its diff columns are zero.
        manifest = _json.loads(
            (data_dir / "collection_manifest.json").read_text(encoding="utf-8"))
        manifest["tables"].append({
            "table": "zero_table", "task_code": "ODS_ZERO", "description": "零差异",
            "record_count": 5, "json_field_count": 1, "ods_column_count": 1,
            "dwd_tables": ["dwd_zero"], "dwd_column_count": 1, "error": None,
        })
        _write(data_dir / "collection_manifest.json", manifest)
        _write(data_dir / "field_mappings" / "zero_table.json", {
            "ods_table": "zero_table",
            "anchors": {"api": "api-zero-table", "ods": "ods-zero-table",
                        "dwd": {"dwd_zero": "dwd-dwd-zero"}},
            "json_to_ods": [{"json_path": "id", "ods_col": "id", "match_type": "exact",
                             "json_type": "integer", "occurrence_pct": 100.0}],
            "ods_to_dwd": {"id": [{"dwd_table": "dwd_zero", "dwd_col": "id", "cast": None, "note": ""}]},
            "dwd_to_ods": {"dwd_zero": [{"dwd_col": "id", "type": "BIGINT", "ods_source": "id",
                                         "mapping_type": "直接", "note": ""}]},
        })
        _write(data_dir / "db_schemas" / "ods_zero_table.json", {
            "schema": "ods", "table": "zero_table", "columns": [
                {"name": "id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": None, "ordinal_position": 1}]})
        _write(data_dir / "db_schemas" / "dwd_dwd_zero.json", {
            "schema": "dwd", "table": "dwd_zero", "ods_source": "zero_table", "columns": [
                {"name": "id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": None, "ordinal_position": 1}]})
        _write(data_dir / "json_trees" / "zero_table.json", {
            "table": "zero_table", "total_records": 5, "fields": [
                {"path": "id", "json_type": "integer", "sample": "1",
                 "depth": 0, "occurrence": 5, "total_records": 5, "samples": ["1"]}]})
        _write(data_dir / "bd_descriptions" / "zero_table.json",
               {"ods_table": "zero_table", "ods_fields": {}, "dwd_fields": {}})

        report = generate_report(data_dir)
        # The zero_table row must show "| 0 |" cells without any diff link.
        zero_rows = [l for l in report.split("\n")
                     if "zero_table" in l and "| 0 |" in l]
        assert len(zero_rows) > 0
        assert "#diff-zero-table" not in report

    def test_diff_anchor_present(self, tmp_path):
        """Each diff sub-table carries an HTML anchor tag."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        assert '<a id="diff-alpha-table"></a>' in report

    def test_diff_subtable_title(self, tmp_path):
        """Sub-tables are titled with an incrementing section number."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        assert "#### 1.1.1 alpha_table 字段差异明细" in report
        # The literal placeholder "1.1.x" must never leak into output.
        assert "1.1.x" not in report

    def test_api_unmapped_flat_bold(self, tmp_path):
        """Flat unmapped field rows are rendered in bold."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        assert "**[`extra_flat`](#api-alpha-table)**" in report
        assert "**⚠️ 未映射**" in report

    def test_api_unmapped_nested_in_details(self, tmp_path):
        """Nested unmapped fields live inside a <details> fold."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        assert "<details><summary>API→ODS 未映射(嵌套对象)" in report
        assert "nested.a" in report
        assert "nested.b" in report
        assert "</details>" in report

    def test_ods_no_dwd_fields_bold(self, tmp_path):
        """ODS columns with no DWD mapping are rendered in bold."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        # Neither column appears in ods_to_dwd.
        assert "**[`ods_only_col`](#ods-alpha-table)**" in report
        assert "**[`no_dwd_col`](#ods-alpha-table)**" in report

    def test_dwd_no_ods_fields_bold(self, tmp_path):
        """DWD columns with no ODS source are rendered in bold."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        assert "**[`scd2_ver`](#dwd-dwd-alpha)**" in report
        assert "**[`derived_flag`](#dwd-dwd-alpha)**" in report
        assert "**⚠️ 无 ODS 源**" in report

    def test_no_diff_table_skipped(self, tmp_path):
        """Tables without any differences get no diff sub-table."""
        import json as _json

        def _write(path, payload):
            path.write_text(_json.dumps(payload, ensure_ascii=False),
                            encoding="utf-8")

        data_dir = self._setup_diff_data_dir(tmp_path)
        # Register a table whose mappings all line up perfectly.
        manifest = _json.loads(
            (data_dir / "collection_manifest.json").read_text(encoding="utf-8"))
        manifest["tables"].append({
            "table": "clean_table",
            "task_code": "ODS_CLEAN",
            "description": "无差异表",
            "record_count": 10,
            "json_field_count": 1,
            "ods_column_count": 1,
            "dwd_tables": ["dwd_clean"],
            "dwd_column_count": 1,
            "error": None,
        })
        _write(data_dir / "collection_manifest.json", manifest)
        _write(data_dir / "field_mappings" / "clean_table.json", {
            "ods_table": "clean_table",
            "anchors": {"api": "api-clean-table", "ods": "ods-clean-table",
                        "dwd": {"dwd_clean": "dwd-dwd-clean"}},
            "json_to_ods": [
                {"json_path": "id", "ods_col": "id", "match_type": "exact",
                 "json_type": "integer", "occurrence_pct": 100.0},
            ],
            "ods_to_dwd": {"id": [{"dwd_table": "dwd_clean", "dwd_col": "id",
                                   "cast": None, "note": ""}]},
            "dwd_to_ods": {"dwd_clean": [
                {"dwd_col": "id", "type": "BIGINT", "ods_source": "id",
                 "mapping_type": "直接", "note": ""},
            ]},
        })
        _write(data_dir / "db_schemas" / "ods_clean_table.json", {
            "schema": "ods", "table": "clean_table", "columns": [
                {"name": "id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": None, "ordinal_position": 1},
            ]})
        _write(data_dir / "db_schemas" / "dwd_dwd_clean.json", {
            "schema": "dwd", "table": "dwd_clean", "ods_source": "clean_table", "columns": [
                {"name": "id", "data_type": "BIGINT", "is_nullable": False,
                 "column_default": None, "comment": None, "ordinal_position": 1},
            ]})
        _write(data_dir / "json_trees" / "clean_table.json", {
            "table": "clean_table", "total_records": 10,
            "fields": [{"path": "id", "json_type": "integer", "sample": "1",
                        "depth": 0, "occurrence": 10, "total_records": 10, "samples": ["1"]}],
        })
        _write(data_dir / "bd_descriptions" / "clean_table.json",
               {"ods_table": "clean_table", "ods_fields": {}, "dwd_fields": {}})

        report = generate_report(data_dir)
        # clean_table has no diff sub-table, alpha_table still does.
        assert "clean_table 字段差异明细" not in report
        assert "alpha_table 字段差异明细" in report

    def test_reason_includes_nested_and_scd2(self, tmp_path):
        """The diff reason column mentions nested-object and SCD2 counts."""
        report = generate_report(self._setup_diff_data_dir(tmp_path))
        assert "嵌套对象 2 个" in report
        assert "SCD2/派生列 2 个" in report
class TestGuessFieldPurpose:
    """Heuristic field-purpose guessing via _guess_field_purpose."""

    def test_scd2_field(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("scd2_start_time", "test", "DWD")
        assert "SCD2" in purpose
        assert confidence == "高"

    def test_id_field(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("id", "test", "ODS")
        assert "主键" in purpose
        assert confidence == "高"

    def test_foreign_key(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("tenant_id", "test", "ODS")
        assert "租户" in purpose
        assert confidence == "高"

    def test_nested_site_profile(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("siteProfile.shop_name", "test", "API")
        assert "门店" in purpose
        assert confidence == "高"

    def test_unknown_field(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("xyzzy_foo_bar", "test", "ODS")
        assert "待分析" in purpose
        assert confidence == "低"

    def test_price_field(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("cx_unit_price", "test", "ODS")
        # Either wording is acceptable for monetary fields.
        assert "金额" in purpose or "价格" in purpose

    def test_derived_field(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("derived_flag", "test", "DWD")
        assert "派生" in purpose
        assert confidence == "高"

    def test_is_delete_field(self):
        from gen_dataflow_report import _guess_field_purpose
        purpose, confidence = _guess_field_purpose("is_delete", "test", "ODS")
        assert "删除" in purpose
        assert confidence == "高"
class TestDiffSubTablePurposeColumn:
    """Tests for the guessed-purpose column emitted in the diff sub-tables."""
def test_purpose_column_in_flat_unmapped(self, tmp_path):
|
||
"""平层未映射分表应包含推测用途、置信度、示例值、说明列。"""
|
||
# 复用 TestFieldDiffSubTables 的数据构造
|
||
from test_dataflow_analyzer import TestFieldDiffSubTables
|
||
inst = TestFieldDiffSubTables()
|
||
data_dir = inst._setup_diff_data_dir(tmp_path)
|
||
report = generate_report(data_dir)
|
||
# 表头应有推测用途 + 示例值 + 说明列
|
||
assert "| # | JSON 字段 | 推测用途 | 置信度 | 示例值 | 说明 | 状态 |" in report
|
||
|
||
def test_purpose_column_in_dwd_no_ods(self, tmp_path):
|
||
"""DWD 无 ODS 源分表应包含推测用途列。"""
|
||
from test_dataflow_analyzer import TestFieldDiffSubTables
|
||
inst = TestFieldDiffSubTables()
|
||
data_dir = inst._setup_diff_data_dir(tmp_path)
|
||
report = generate_report(data_dir)
|
||
# scd2_ver 应被推测为 SCD2 元数据
|
||
assert "SCD2" in report
|
||
# derived_flag 应被推测为派生列
|
||
assert "派生" in report
|
||
|
||
def test_purpose_column_in_nested(self, tmp_path):
|
||
"""嵌套对象分表也应包含推测用途列。"""
|
||
from test_dataflow_analyzer import TestFieldDiffSubTables
|
||
inst = TestFieldDiffSubTables()
|
||
data_dir = inst._setup_diff_data_dir(tmp_path)
|
||
report = generate_report(data_dir)
|
||
# 嵌套对象表头
|
||
lines = report.split("\n")
|
||
nested_headers = [l for l in lines if "推测用途" in l and "置信度" in l]
|
||
assert len(nested_headers) >= 1
|
||
|
||
def test_section_numbering_incremental(self, tmp_path):
|
||
"""多个差异分表应有递增编号 1.1.1, 1.1.2, ...。"""
|
||
import json as _json
|
||
from test_dataflow_analyzer import TestFieldDiffSubTables
|
||
inst = TestFieldDiffSubTables()
|
||
data_dir = inst._setup_diff_data_dir(tmp_path)
|
||
# 添加第二个有差异的表
|
||
manifest = _json.loads((data_dir / "collection_manifest.json").read_text(encoding="utf-8"))
|
||
manifest["tables"].append({
|
||
"table": "beta_table", "task_code": "ODS_BETA", "description": "第二表",
|
||
"record_count": 10, "json_field_count": 1, "ods_column_count": 1,
|
||
"dwd_tables": ["dwd_beta"], "dwd_column_count": 2, "error": None,
|
||
})
|
||
(data_dir / "collection_manifest.json").write_text(
|
||
_json.dumps(manifest, ensure_ascii=False), encoding="utf-8"
|
||
)
|
||
fm_b = {
|
||
"ods_table": "beta_table",
|
||
"anchors": {"api": "api-beta-table", "ods": "ods-beta-table",
|
||
"dwd": {"dwd_beta": "dwd-dwd-beta"}},
|
||
"json_to_ods": [{"json_path": "id", "ods_col": "id", "match_type": "exact",
|
||
"json_type": "integer", "occurrence_pct": 100.0}],
|
||
"ods_to_dwd": {"id": [{"dwd_table": "dwd_beta", "dwd_col": "id", "cast": None, "note": ""}]},
|
||
"dwd_to_ods": {"dwd_beta": [
|
||
{"dwd_col": "id", "type": "BIGINT", "ods_source": "id", "mapping_type": "直接", "note": ""},
|
||
{"dwd_col": "scd2_flag", "type": "INTEGER", "ods_source": "—", "mapping_type": "SCD2", "note": ""},
|
||
]},
|
||
}
|
||
(data_dir / "field_mappings" / "beta_table.json").write_text(
|
||
_json.dumps(fm_b, ensure_ascii=False), encoding="utf-8"
|
||
)
|
||
ods_b = {"schema": "ods", "table": "beta_table", "columns": [
|
||
{"name": "id", "data_type": "BIGINT", "is_nullable": False,
|
||
"column_default": None, "comment": None, "ordinal_position": 1}]}
|
||
(data_dir / "db_schemas" / "ods_beta_table.json").write_text(
|
||
_json.dumps(ods_b, ensure_ascii=False), encoding="utf-8"
|
||
)
|
||
dwd_b = {"schema": "dwd", "table": "dwd_beta", "ods_source": "beta_table", "columns": [
|
||
{"name": "id", "data_type": "BIGINT", "is_nullable": False,
|
||
"column_default": None, "comment": None, "ordinal_position": 1},
|
||
{"name": "scd2_flag", "data_type": "INTEGER", "is_nullable": True,
|
||
"column_default": None, "comment": None, "ordinal_position": 2},
|
||
]}
|
||
(data_dir / "db_schemas" / "dwd_dwd_beta.json").write_text(
|
||
_json.dumps(dwd_b, ensure_ascii=False), encoding="utf-8"
|
||
)
|
||
jt_b = {"table": "beta_table", "total_records": 10, "fields": [
|
||
{"path": "id", "json_type": "integer", "sample": "1",
|
||
"depth": 0, "occurrence": 10, "total_records": 10, "samples": ["1"]}]}
|
||
(data_dir / "json_trees" / "beta_table.json").write_text(
|
||
_json.dumps(jt_b, ensure_ascii=False), encoding="utf-8"
|
||
)
|
||
bd_b = {"ods_table": "beta_table", "ods_fields": {}, "dwd_fields": {}}
|
||
(data_dir / "bd_descriptions" / "beta_table.json").write_text(
|
||
_json.dumps(bd_b, ensure_ascii=False), encoding="utf-8"
|
||
)
|
||
|
||
report = generate_report(data_dir)
|
||
assert "#### 1.1.1 alpha_table 字段差异明细" in report
|
||
assert "#### 1.1.2 beta_table 字段差异明细" in report
|
||
|
||
def test_sample_value_in_flat_unmapped(self, tmp_path):
|
||
"""平层未映射字段应显示来自 json_trees 的示例值。"""
|
||
from test_dataflow_analyzer import TestFieldDiffSubTables
|
||
inst = TestFieldDiffSubTables()
|
||
data_dir = inst._setup_diff_data_dir(tmp_path)
|
||
report = generate_report(data_dir)
|
||
# extra_flat 在 json_trees 中有 samples=["x"],应出现在差异子表的 extra_flat 行
|
||
lines = report.split("\n")
|
||
flat_rows = [l for l in lines if "extra_flat" in l and "未映射" in l]
|
||
assert len(flat_rows) >= 1
|
||
assert "`x`" in flat_rows[0]
|
||
|
||
def test_bd_desc_in_ods_no_json(self, tmp_path):
|
||
"""ODS 无 JSON 源子表应显示来自 bd_descriptions 的说明。"""
|
||
import json as _json
|
||
from test_dataflow_analyzer import TestFieldDiffSubTables
|
||
inst = TestFieldDiffSubTables()
|
||
data_dir = inst._setup_diff_data_dir(tmp_path)
|
||
# 注入 bd_descriptions 中 ods_only_col 的说明
|
||
bd = {"ods_table": "alpha_table",
|
||
"ods_fields": {"ods_only_col": "仅ODS存在的测试列"},
|
||
"dwd_fields": {}}
|
||
(data_dir / "bd_descriptions" / "alpha_table.json").write_text(
|
||
_json.dumps(bd, ensure_ascii=False), encoding="utf-8"
|
||
)
|
||
report = generate_report(data_dir)
|
||
# 说明应出现在 ods_only_col 所在行
|
||
lines = report.split("\n")
|
||
ods_only_rows = [l for l in lines if "ods_only_col" in l and "无 JSON 源" in l]
|
||
assert len(ods_only_rows) >= 1
|
||
assert "仅ODS存在的测试列" in ods_only_rows[0]
|
||
|
||
def test_dwd_no_ods_has_desc_column(self, tmp_path):
|
||
"""DWD 无 ODS 源子表应包含说明列。"""
|
||
from test_dataflow_analyzer import TestFieldDiffSubTables
|
||
inst = TestFieldDiffSubTables()
|
||
data_dir = inst._setup_diff_data_dir(tmp_path)
|
||
report = generate_report(data_dir)
|
||
assert "| # | DWD 表 | DWD 列 | 推测用途 | 置信度 | 说明 | 状态 |" in report
|