886 lines
32 KiB
Python
886 lines
32 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
数据一致性检查器(黑盒测试)
|
||
|
||
以黑盒测试者角度,检查 ETL 数据流各层之间的一致性:
|
||
1. API 源数据 vs ODS 落库数据 — 字段完整性对比
|
||
2. ODS 数据 vs DWD 落库数据 — 映射正确性对比
|
||
|
||
输出 Markdown 格式的黑盒测试报告。
|
||
|
||
Requirements: 16.1, 16.2, 16.3, 16.4
|
||
"""
|
||
from __future__ import annotations

import json
import os
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Sequence, Tuple
from zoneinfo import ZoneInfo

from tasks.dwd.dwd_load_task import DwdLoadTask
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 数据结构
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class FieldCheckResult:
    """Check result for a single field."""
    field_name: str
    status: str  # "pass" | "missing" | "mismatch" | "type_diff" | "skip"
    detail: str = ""  # human-readable explanation of the verdict
|
||
|
||
|
||
@dataclass
class TableCheckResult:
    """Check result for a single table."""
    table_name: str
    check_type: str  # "api_vs_ods" | "ods_vs_dwd"
    passed: bool = True
    total_fields: int = 0
    passed_fields: int = 0
    missing_fields: int = 0  # fields with no counterpart on the target side
    mismatch_fields: int = 0  # fields whose explicit mapping failed validation
    field_results: List[FieldCheckResult] = field(default_factory=list)
    row_count_source: int = 0  # row count of the source (ODS) table
    row_count_target: int = 0  # row count of the target (DWD) table
    sample_mismatches: List[Dict[str, Any]] = field(default_factory=list)  # sampled value diffs
    error: str | None = None  # set when the check itself could not run
|
||
|
||
|
||
@dataclass
class ConsistencyReport:
    """Full consistency-check report covering both check stages."""
    generated_at: str = ""  # ISO-8601 timestamp of report generation
    api_vs_ods_results: List[TableCheckResult] = field(default_factory=list)
    ods_vs_dwd_results: List[TableCheckResult] = field(default_factory=list)

    @property
    def all_passed(self) -> bool:
        # True only when every table in both stages passed.
        return all(r.passed for r in self.api_vs_ods_results + self.ods_vs_dwd_results)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
# ODS table name -> API JSON file name (aligned with json_store.ENDPOINT_FILENAME_MAP)
# ---------------------------------------------------------------------------

ODS_TABLE_TO_JSON_FILE: Dict[str, str] = {
    "assistant_accounts_master": "assistant_accounts_master.json",
    "assistant_service_records": "assistant_service_records.json",
    "member_profiles": "member_profiles.json",
    "member_stored_value_cards": "member_stored_value_cards.json",
    "member_balance_changes": "member_balance_changes.json",
    "recharge_settlements": "recharge_settlements.json",
    "settlement_records": "settlement_records.json",
    "table_fee_transactions": "table_fee_transactions.json",
    "table_fee_discount_records": "table_fee_discount_records.json",
    "store_goods_sales_records": "store_goods_sales_records.json",
    "store_goods_master": "store_goods_master.json",
    "tenant_goods_master": "tenant_goods_master.json",
    "site_tables_master": "site_tables_master.json",
    "group_buy_packages": "group_buy_packages.json",
    "group_buy_redemption_records": "group_buy_redemption_records.json",
    "platform_coupon_redemption_records": "platform_coupon_redemption_records.json",
    "payment_transactions": "payment_transactions.json",
    "refund_transactions": "refund_transactions.json",
    "goods_stock_summary": "goods_stock_summary.json",
    "goods_stock_movements": "goods_stock_movements.json",
    "stock_goods_category_tree": "stock_goods_category_tree.json",
}
|
||
|
||
# CHANGE 2026-02-26 | ODS table name -> task_code, used to locate paged JSON under FETCH_ROOT
# FETCH_ROOT layout: {task_code}/{task_code}-{run_id}-{date}-{time}/{ods_table}.json
ODS_TABLE_TO_TASK_CODE: Dict[str, str] = {
    "assistant_accounts_master": "ODS_ASSISTANT_ACCOUNT",
    "assistant_service_records": "ODS_ASSISTANT_LEDGER",
    "member_profiles": "ODS_MEMBER",
    "member_stored_value_cards": "ODS_MEMBER_CARD",
    "member_balance_changes": "ODS_MEMBER_BALANCE",
    "recharge_settlements": "ODS_RECHARGE_SETTLE",
    "settlement_records": "ODS_SETTLEMENT_RECORDS",
    "table_fee_transactions": "ODS_TABLE_USE",
    "table_fee_discount_records": "ODS_TABLE_FEE_DISCOUNT",
    "store_goods_sales_records": "ODS_STORE_GOODS_SALES",
    "store_goods_master": "ODS_STORE_GOODS",
    "tenant_goods_master": "ODS_TENANT_GOODS",
    "site_tables_master": "ODS_TABLES",
    "group_buy_packages": "ODS_GROUP_PACKAGE",
    "group_buy_redemption_records": "ODS_GROUP_BUY_REDEMPTION",
    "platform_coupon_redemption_records": "ODS_PLATFORM_COUPON",
    "payment_transactions": "ODS_PAYMENT",
    "refund_transactions": "ODS_REFUND",
    "goods_stock_summary": "ODS_INVENTORY_STOCK",
    "goods_stock_movements": "ODS_INVENTORY_CHANGE",
    "stock_goods_category_tree": "ODS_GOODS_CATEGORY",
    "staff_info_master": "ODS_STAFF_INFO",
    "settlement_ticket_records": "ODS_SETTLEMENT_TICKET",
    "json_archive_records": "ODS_JSON_ARCHIVE",
}
|
||
|
||
# ODS metadata columns — not part of the API payload; filled in by the ETL framework.
ODS_META_COLUMNS = frozenset({
    "payload", "source_file", "source_endpoint",
    "fetched_at", "content_hash", "record_index",
    "site_id", "tenant_id", "siteprofile", "site_profile",
})
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 核心检查逻辑(纯函数,不依赖数据库连接)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def extract_api_fields_from_json(json_path: Path) -> set[str] | None:
|
||
"""从 API JSON 缓存文件中提取第一条记录的所有字段名"""
|
||
if not json_path.exists():
|
||
return None
|
||
try:
|
||
with json_path.open("r", encoding="utf-8") as f:
|
||
data = json.load(f)
|
||
except (json.JSONDecodeError, OSError):
|
||
return None
|
||
|
||
# 支持多种 JSON 结构:直接列表 / {"data": [...]} / {"data": {"xxxList": [...]}}
|
||
records = _extract_records(data)
|
||
if not records:
|
||
return None
|
||
|
||
# 合并前 N 条记录的字段(避免单条记录字段不全)
|
||
all_fields: set[str] = set()
|
||
for rec in records[:10]:
|
||
if isinstance(rec, dict):
|
||
all_fields.update(rec.keys())
|
||
return all_fields
|
||
|
||
|
||
def _extract_records(data: Any) -> list[dict]:
|
||
"""从 API 响应中提取记录列表"""
|
||
if isinstance(data, list):
|
||
return data
|
||
if isinstance(data, dict):
|
||
# 尝试 data 键
|
||
inner = data.get("data")
|
||
if isinstance(inner, list):
|
||
return inner
|
||
if isinstance(inner, dict):
|
||
# 尝试各种 list_key
|
||
for key, val in inner.items():
|
||
if isinstance(val, list) and val:
|
||
return val
|
||
return []
|
||
|
||
|
||
def extract_api_fields_from_fetch_root(
    fetch_root: Path,
    ods_table: str,
) -> set[str] | None:
    """Extract raw API field names from the paged JSON under FETCH_ROOT.

    CHANGE 2026-02-26 | Replaces the API_SAMPLE_CACHE_ROOT dependency of
    extract_api_fields_from_json: reads the paged JSON the ETL actually
    fetched, so no manually generated cache is needed.

    Layout: FETCH_ROOT/{task_code}/{task_code}-{run_id}-{date}-{time}/{ods_table}.json
    Paged JSON: { "pages": [{ "response": { "data": { "{listKey}": [...] } } }] }

    Returns None when the task/run/file cannot be located or holds no records.
    """
    task_code = ODS_TABLE_TO_TASK_CODE.get(ods_table)
    if not task_code:
        return None

    task_dir = fetch_root / task_code
    if not task_dir.is_dir():
        return None

    # Run directory names embed a timestamp, so the lexicographically
    # largest name is the most recent run.
    run_dirs = [entry for entry in task_dir.iterdir() if entry.is_dir()]
    if not run_dirs:
        return None
    latest_run = max(run_dirs, key=lambda entry: entry.name)

    # The paged JSON is named after the ODS table inside the run directory.
    json_file = latest_run / f"{ods_table}.json"
    if not json_file.exists():
        return None

    try:
        with json_file.open("r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except (json.JSONDecodeError, OSError):
        return None

    records = _extract_records_from_paged_json(payload)
    if not records:
        return None

    # Union the keys of the leading records so optional fields are captured.
    merged: set[str] = set()
    for record in records[:10]:
        if isinstance(record, dict):
            merged.update(record.keys())
    return merged
|
||
|
||
|
||
def _extract_records_from_paged_json(data: Any) -> list[dict]:
    """Pull business records out of an ETL paged JSON document.

    Paged shape:
        { "pages": [{ "response": { "data": { "{listKey}": [record, ...] } } }] }
    Flat cache shapes (bare list / {"data": [...]}) from gen_full_dataflow_doc
    are also accepted as a fallback.
    """
    if not isinstance(data, dict):
        return _extract_records(data)

    pages = data.get("pages")
    if not (isinstance(pages, list) and pages):
        # No usable "pages" key: fall back to the flat layouts.
        return _extract_records(data)

    # Return records from the first page that actually carries any.
    for page in pages:
        if not isinstance(page, dict):
            continue
        response = page.get("response")
        if isinstance(response, dict):
            found = _extract_records(response)
            if found:
                return found

    return []
|
||
|
||
|
||
def check_api_vs_ods_fields(
    api_fields: set[str],
    ods_columns: set[str],
) -> TableCheckResult:
    """
    Compare API JSON field coverage against ODS table columns.

    Every API field must have a same-named ODS column (case-insensitive).
    ODS metadata columns (payload, fetched_at, ...) are excluded.
    The caller fills in ``table_name`` afterwards.
    """
    result = TableCheckResult(table_name="", check_type="api_vs_ods")

    # Normalise BOTH sides to lower case.  Previously only the API side was
    # lowered, so mixed-case ODS column names (and meta-column subtraction)
    # could silently fail to match despite the documented case-insensitive
    # comparison.
    api_lower = {f.lower() for f in api_fields}
    ods_lower = {c.lower() for c in ods_columns} - ODS_META_COLUMNS

    result.total_fields = len(api_lower)

    for api_field in sorted(api_lower):
        if api_field in ods_lower:
            result.field_results.append(
                FieldCheckResult(api_field, "pass", "ODS 中存在同名列")
            )
            result.passed_fields += 1
        else:
            # Nested-object fields (camelCase names) may be flattened or
            # stored inside the payload column instead.
            result.field_results.append(
                FieldCheckResult(api_field, "missing", "ODS 中无对应列")
            )
            result.missing_fields += 1

    result.passed = result.missing_fields == 0
    return result
|
||
|
||
|
||
def check_ods_vs_dwd_mappings(
    dwd_table: str,
    ods_table: str,
    dwd_columns: set[str],
    ods_columns: set[str],
    fact_mappings: list[tuple[str, str, str | None]] | None,
) -> TableCheckResult:
    """
    Validate the ODS -> DWD column mapping for one DWD table.

    Checks:
    1. Every non-SCD2 DWD column has either an explicit FACT_MAPPINGS entry
       or a same-named ODS column (auto-mapping).
    2. Explicit mappings reference ODS columns/expressions that pass static
       validation (see _validate_ods_expression).

    ``ods_table`` is accepted for interface symmetry; only the column sets
    are inspected here.
    """
    # CHANGE [2026-02-20] intent: fields the upstream API does not provide
    # are flagged as known-no-source instead of reported as errors.
    KNOWN_NO_SOURCE: Dict[str, set[str]] = {
        "dwd.dim_member": {"update_time"},
    }

    result = TableCheckResult(
        table_name=dwd_table,
        check_type="ods_vs_dwd",
    )

    scd_cols = {c.lower() for c in DwdLoadTask.SCD_COLS}
    ods_lower = {c.lower() for c in ods_columns}

    # Explicit mapping lookup: dwd_col -> (ods_expr, cast_type)
    explicit_map: Dict[str, Tuple[str, str | None]] = {}
    if fact_mappings:
        for dwd_col, ods_expr, cast_type in fact_mappings:
            explicit_map[dwd_col.lower()] = (ods_expr, cast_type)

    check_cols = sorted(c for c in dwd_columns if c.lower() not in scd_cols)
    result.total_fields = len(check_cols)

    for col in check_cols:
        col_lower = col.lower()

        if col_lower in explicit_map:
            ods_expr, cast_type = explicit_map[col_lower]
            # Verify the columns referenced by the ODS expression exist.
            if _validate_ods_expression(ods_expr, ods_lower):
                result.field_results.append(
                    FieldCheckResult(col, "pass", f"显式映射: {ods_expr}")
                )
                result.passed_fields += 1
            else:
                result.field_results.append(
                    FieldCheckResult(
                        col, "mismatch",
                        f"显式映射引用的 ODS 列/表达式无法验证: {ods_expr}"
                    )
                )
                result.mismatch_fields += 1

        elif col_lower in ods_lower:
            # Auto-mapping: identical name in ODS and DWD.  This also covers
            # ETL metadata columns such as fetched_at — a previous dedicated
            # fetched_at branch below this one was unreachable and has been
            # removed.
            result.field_results.append(
                FieldCheckResult(col, "pass", "自动映射(同名列)")
            )
            result.passed_fields += 1

        else:
            # CHANGE [2026-02-20] intent: whitelist known source-less fields
            # to avoid false positives.
            if col_lower in KNOWN_NO_SOURCE.get(dwd_table, set()):
                result.field_results.append(
                    FieldCheckResult(col, "pass", "已知无源字段(上游 API 不提供)")
                )
                result.passed_fields += 1
            else:
                result.field_results.append(
                    FieldCheckResult(
                        col, "missing",
                        "DWD 列无 ODS 映射源(无显式映射且无同名 ODS 列)"
                    )
                )
                result.missing_fields += 1

    result.passed = result.missing_fields == 0 and result.mismatch_fields == 0
    return result
|
||
|
||
|
||
def _validate_ods_expression(expr: str, ods_columns: set[str]) -> bool:
|
||
"""
|
||
验证 FACT_MAPPINGS 中的 ODS 表达式是否合法。
|
||
|
||
简单列名:检查是否在 ODS 列集合中
|
||
JSON 表达式(含 ->>、#>>):检查基础列名
|
||
SQL 表达式(含 CASE、COALESCE 等):视为合法(无法静态验证)
|
||
NULL 字面量:合法
|
||
"""
|
||
if expr.upper() == "NULL":
|
||
return True
|
||
|
||
# 带引号的列名(如 "siteGoodsId")
|
||
stripped = expr.strip('"')
|
||
|
||
# JSON 路径表达式
|
||
if "->>" in expr or "#>>" in expr:
|
||
base_col = expr.split("->>")[0].split("#>>")[0].strip().strip('"').lower()
|
||
return base_col in ods_columns
|
||
|
||
# SQL 表达式(CASE WHEN, COALESCE, 函数调用等)
|
||
sql_keywords = {"case", "when", "coalesce", "nullif", "cast", "concat"}
|
||
if any(kw in expr.lower() for kw in sql_keywords):
|
||
return True
|
||
|
||
# 简单列名
|
||
return stripped.lower() in ods_columns
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 数据库交互层
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _fetch_table_columns(cur, schema: str, table: str) -> set[str]:
|
||
"""从 information_schema 获取表的列名集合(小写)"""
|
||
cur.execute(
|
||
"""
|
||
SELECT column_name
|
||
FROM information_schema.columns
|
||
WHERE table_schema = %s AND table_name = %s
|
||
ORDER BY ordinal_position
|
||
""",
|
||
(schema, table),
|
||
)
|
||
return {row[0].lower() for row in cur.fetchall()}
|
||
|
||
|
||
def _fetch_row_count(cur, schema: str, table: str) -> int:
|
||
"""获取表的行数"""
|
||
cur.execute(f'SELECT COUNT(1) FROM "{schema}"."{table}"')
|
||
row = cur.fetchone()
|
||
return int(row[0]) if row else 0
|
||
|
||
|
||
def _split_table(name: str, default_schema: str) -> Tuple[str, str]:
|
||
if "." in name:
|
||
schema, table = name.split(".", 1)
|
||
return schema, table
|
||
return default_schema, name
|
||
|
||
|
||
def _sample_value_mismatches(
    cur,
    dwd_schema: str,
    dwd_table: str,
    ods_schema: str,
    ods_table: str,
    dwd_pk_cols: list[str],
    ods_pk_cols: list[str],
    explicit_map: Dict[str, Tuple[str, str | None]],
    dwd_columns: set[str],
    ods_columns: set[str],
    limit: int = 5,
) -> list[dict]:
    """
    Sample actual value differences between ODS and DWD rows.

    JOINs the two tables on derivable primary-key pairs and compares the
    mapped columns textually (``IS DISTINCT FROM`` so NULLs compare sanely).
    Returns up to *limit* mismatching rows as dicts; returns [] whenever a
    usable JOIN/compare set cannot be built or the generated SQL fails
    (best-effort by design).
    """
    scd_cols = {c.lower() for c in DwdLoadTask.SCD_COLS}
    ods_meta = {"payload", "source_file", "source_endpoint",
                "fetched_at", "content_hash", "record_index"}
    ods_lower = {c.lower() for c in ods_columns}

    # --- JOIN keys -------------------------------------------------------
    # For each DWD pk column, find an ODS-side expression to join on:
    # explicit simple-column mapping > same-named ODS column > "id" fallback
    # for *_id keys.
    join_pairs: list[Tuple[str, str]] = []  # (dwd_col, ods_expr)
    for pk in dwd_pk_cols:
        pk_lower = pk.lower()
        if pk_lower in scd_cols:
            continue
        if pk_lower in explicit_map:
            ods_expr, cast_type = explicit_map[pk_lower]
            # Only a plain column name is safe to JOIN on.
            stripped = ods_expr.strip('"')
            if stripped.lower() in ods_lower and stripped.isidentifier():
                join_pairs.append((pk_lower, ods_expr))
        elif pk_lower in ods_lower:
            join_pairs.append((pk_lower, f'"{pk_lower}"'))
        elif "id" in ods_lower and pk_lower.endswith("_id"):
            join_pairs.append((pk_lower, '"id"'))

    if not join_pairs:
        return []

    # --- comparison columns ---------------------------------------------
    compare_cols: list[Tuple[str, str]] = []  # (dwd_col, ods_expr)
    join_keys = {jp[0] for jp in join_pairs}
    for col in sorted(dwd_columns):
        col_lower = col.lower()
        if col_lower in scd_cols or col_lower in ods_meta:
            continue
        if col_lower in join_keys:
            continue
        if col_lower in explicit_map:
            ods_expr, _ = explicit_map[col_lower]
            # Skip complex SQL expressions — they cannot be compared textually.
            if any(kw in ods_expr.lower() for kw in ("case", "coalesce", "nullif")):
                continue
            compare_cols.append((col_lower, ods_expr))
        elif col_lower in ods_lower:
            compare_cols.append((col_lower, f'"{col_lower}"'))

    if not compare_cols:
        return []

    # --- SQL assembly ----------------------------------------------------
    # NOTE: a previous if/else here had two byte-identical branches; it was
    # collapsed into this single expression (behavior unchanged).
    join_cond = " AND ".join(
        f'd."{dwd_col}" = o.{ods_expr}' for dwd_col, ods_expr in join_pairs
    )

    # A row mismatches when ANY compared column differs (NULL-safe).
    diff_conditions = [
        f'd."{dwd_col}"::text IS DISTINCT FROM o.{ods_expr}::text'
        for dwd_col, ods_expr in compare_cols
    ]
    if not diff_conditions:
        return []
    where_diff = " OR ".join(diff_conditions)

    # Deduplicate ODS snapshots: keep the latest fetched_at per business key.
    ods_subquery = f'"{ods_schema}"."{ods_table}" o'
    if "content_hash" in ods_lower and ods_pk_cols:
        ods_biz_pks = [c for c in ods_pk_cols if c.lower() != "content_hash"]
        if ods_biz_pks:
            distinct_on = ", ".join(f'"{c}"' for c in ods_biz_pks)
            ods_subquery = (
                f'(SELECT DISTINCT ON ({distinct_on}) * '
                f'FROM "{ods_schema}"."{ods_table}" '
                f'ORDER BY {distinct_on}, "fetched_at" DESC NULLS LAST) o'
            )

    # Restrict DWD to current SCD2 versions when the flag column exists.
    dwd_where = ""
    if any(c.lower() == "scd2_is_current" for c in dwd_columns):
        dwd_where = "WHERE COALESCE(scd2_is_current, 1) = 1"

    select_parts = []
    for dwd_col, _ in join_pairs:
        select_parts.append(f'd."{dwd_col}"')
    for dwd_col, ods_expr in compare_cols[:10]:  # cap the number of compared columns
        select_parts.append(f'd."{dwd_col}" AS "dwd_{dwd_col}"')
        select_parts.append(f'o.{ods_expr}::text AS "ods_{dwd_col}"')

    select_sql = ", ".join(select_parts)

    sql = (
        f"SELECT {select_sql} "
        f'FROM (SELECT * FROM "{dwd_schema}"."{dwd_table}" {dwd_where}) d '
        f"JOIN {ods_subquery} ON {join_cond} "
        f"WHERE {where_diff} "
        f"LIMIT %s"
    )

    try:
        cur.execute(sql, (limit,))
        rows = cur.fetchall()
        if not rows:
            return []
        columns = [desc[0] for desc in (cur.description or [])]
        return [dict(zip(columns, r)) for r in rows]
    except Exception:
        # Complex expressions may produce invalid SQL; skip silently by design.
        return []
|
||
|
||
|
||
def _fetch_pk_columns(cur, schema: str, table: str) -> list[str]:
|
||
"""获取表的主键列"""
|
||
cur.execute(
|
||
"""
|
||
SELECT kcu.column_name
|
||
FROM information_schema.table_constraints tc
|
||
JOIN information_schema.key_column_usage kcu
|
||
ON tc.constraint_name = kcu.constraint_name
|
||
AND tc.table_schema = kcu.table_schema
|
||
WHERE tc.constraint_type = 'PRIMARY KEY'
|
||
AND tc.table_schema = %s
|
||
AND tc.table_name = %s
|
||
ORDER BY kcu.ordinal_position
|
||
""",
|
||
(schema, table),
|
||
)
|
||
return [r[0] for r in cur.fetchall()]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 主入口:运行完整一致性检查
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def run_consistency_check(
    db_conn,
    *,
    api_sample_dir: Path | None = None,
    fetch_root: Path | None = None,
    include_api_vs_ods: bool = True,
    include_ods_vs_dwd: bool = True,
    sample_limit: int = 5,
    tz: ZoneInfo | None = None,
) -> ConsistencyReport:
    """
    Run the full data-consistency check.

    Args:
        db_conn: Database wrapper exposing a psycopg2 connection via ``.conn``.
        api_sample_dir: Legacy API JSON cache directory (kept for compatibility).
        fetch_root: FETCH_ROOT directory; preferred — fields are taken from
            the paged JSON the ETL actually fetched.
        include_api_vs_ods: Whether to run the API vs ODS field check.
        include_ods_vs_dwd: Whether to run the ODS vs DWD mapping check.
        sample_limit: Number of rows to sample when values disagree.
        tz: Report timezone; defaults to Asia/Shanghai.

    Returns:
        A ConsistencyReport holding per-table results for both stages.
    """
    if tz is None:
        tz = ZoneInfo("Asia/Shanghai")

    report = ConsistencyReport(
        generated_at=datetime.now(tz).isoformat(),
    )

    with db_conn.conn.cursor() as cur:
        # --- 1. API vs ODS field-coverage check ---
        # CHANGE 2026-02-26 | Prefer FETCH_ROOT (real fetched data); fall back
        # to the api_sample_dir cache.
        if include_api_vs_ods and (fetch_root or api_sample_dir):
            for ods_table, json_file in sorted(ODS_TABLE_TO_JSON_FILE.items()):
                # Try FETCH_ROOT first (paged JSON produced by the ETL run).
                api_fields = None
                if fetch_root:
                    api_fields = extract_api_fields_from_fetch_root(fetch_root, ods_table)

                # Fall back to the gen_full_dataflow_doc cache.
                if api_fields is None and api_sample_dir:
                    api_fields = extract_api_fields_from_json(api_sample_dir / json_file)

                if api_fields is None:
                    # No JSON available anywhere: record as skipped, not failed.
                    report.api_vs_ods_results.append(TableCheckResult(
                        table_name=f"ods.{ods_table}",
                        check_type="api_vs_ods",
                        passed=True,
                        error="无可用 JSON 数据(FETCH_ROOT 和 API 缓存均未找到)",
                    ))
                    continue

                ods_columns = _fetch_table_columns(cur, "ods", ods_table)
                if not ods_columns:
                    report.api_vs_ods_results.append(TableCheckResult(
                        table_name=f"ods.{ods_table}",
                        check_type="api_vs_ods",
                        passed=False,
                        error="ODS 表不存在或无列",
                    ))
                    continue

                result = check_api_vs_ods_fields(api_fields, ods_columns)
                result.table_name = f"ods.{ods_table}"
                report.api_vs_ods_results.append(result)

        # --- 2. ODS vs DWD mapping-correctness check ---
        if include_ods_vs_dwd:
            table_map = DwdLoadTask.TABLE_MAP
            fact_mappings = DwdLoadTask.FACT_MAPPINGS

            for dwd_full, ods_full in sorted(table_map.items()):
                dwd_schema, dwd_table = _split_table(dwd_full, "dwd")
                ods_schema, ods_table = _split_table(ods_full, "ods")

                try:
                    dwd_columns = _fetch_table_columns(cur, dwd_schema, dwd_table)
                    ods_columns = _fetch_table_columns(cur, ods_schema, ods_table)

                    if not dwd_columns:
                        report.ods_vs_dwd_results.append(TableCheckResult(
                            table_name=dwd_full,
                            check_type="ods_vs_dwd",
                            passed=False,
                            error=f"DWD 表 {dwd_full} 不存在或无列",
                        ))
                        continue

                    mappings = fact_mappings.get(dwd_full)
                    result = check_ods_vs_dwd_mappings(
                        dwd_full, ods_full,
                        dwd_columns, ods_columns,
                        mappings,
                    )

                    # Attach row counts for the report.
                    result.row_count_source = _fetch_row_count(cur, ods_schema, ods_table)
                    result.row_count_target = _fetch_row_count(cur, dwd_schema, dwd_table)

                    # Sample actual value mismatches when requested.
                    if sample_limit > 0:
                        explicit_map: Dict[str, Tuple[str, str | None]] = {}
                        if mappings:
                            for dwd_col, ods_expr, cast_type in mappings:
                                explicit_map[dwd_col.lower()] = (ods_expr, cast_type)

                        dwd_pk = _fetch_pk_columns(cur, dwd_schema, dwd_table)
                        ods_pk = _fetch_pk_columns(cur, ods_schema, ods_table)

                        result.sample_mismatches = _sample_value_mismatches(
                            cur,
                            dwd_schema, dwd_table,
                            ods_schema, ods_table,
                            dwd_pk, ods_pk,
                            explicit_map,
                            dwd_columns, ods_columns,
                            limit=sample_limit,
                        )

                    report.ods_vs_dwd_results.append(result)

                except Exception as exc:
                    # CHANGE 2026-02-24 | rollback so an InFailedSqlTransaction
                    # does not cascade into the remaining table checks.
                    try:
                        db_conn.conn.rollback()
                    except Exception:
                        pass
                    report.ods_vs_dwd_results.append(TableCheckResult(
                        table_name=dwd_full,
                        check_type="ods_vs_dwd",
                        passed=False,
                        error=f"{type(exc).__name__}: {exc}",
                    ))

    return report
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 报告生成
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def generate_markdown_report(report: ConsistencyReport) -> str:
    """Render a ConsistencyReport as a Markdown black-box test report."""
    lines: list[str] = []
    lines.append("# 数据一致性黑盒测试报告")
    lines.append("")
    lines.append(f"生成时间: {report.generated_at}")
    lines.append(f"总体结果: **{'✅ 全部通过' if report.all_passed else '❌ 存在异常'}**")
    lines.append("")

    # --- Summary section ---
    lines.append("## 汇总")
    lines.append("")

    if report.api_vs_ods_results:
        api_pass = sum(1 for r in report.api_vs_ods_results if r.passed)
        api_total = len(report.api_vs_ods_results)
        lines.append(f"- API vs ODS 字段完整性: {api_pass}/{api_total} 张表通过")

    if report.ods_vs_dwd_results:
        dwd_pass = sum(1 for r in report.ods_vs_dwd_results if r.passed)
        dwd_total = len(report.ods_vs_dwd_results)
        lines.append(f"- ODS vs DWD 映射正确性: {dwd_pass}/{dwd_total} 张表通过")

    lines.append("")

    # --- API vs ODS detailed results ---
    if report.api_vs_ods_results:
        lines.append("## API vs ODS 字段完整性检查")
        lines.append("")
        lines.append("| 表名 | 状态 | 总字段 | 通过 | 缺失 | 备注 |")
        lines.append("|------|------|--------|------|------|------|")

        for r in report.api_vs_ods_results:
            status = "✅" if r.passed else "❌"
            note = r.error or ""
            lines.append(
                f"| {r.table_name} | {status} | {r.total_fields} "
                f"| {r.passed_fields} | {r.missing_fields} | {note} |"
            )

        lines.append("")

        # Per-field details for failing tables.
        failed = [r for r in report.api_vs_ods_results if not r.passed and r.field_results]
        for r in failed:
            lines.append(f"### {r.table_name} — 缺失字段明细")
            lines.append("")
            missing = [f for f in r.field_results if f.status == "missing"]
            for f in missing:
                lines.append(f"- `{f.field_name}`: {f.detail}")
            lines.append("")

    # --- ODS vs DWD detailed results ---
    if report.ods_vs_dwd_results:
        lines.append("## ODS vs DWD 映射正确性检查")
        lines.append("")
        lines.append("| DWD 表 | 状态 | 总字段 | 通过 | 缺失 | 不一致 | ODS 行数 | DWD 行数 | 备注 |")
        lines.append("|--------|------|--------|------|------|--------|----------|----------|------|")

        for r in report.ods_vs_dwd_results:
            status = "✅" if r.passed else "❌"
            note = r.error or ""
            lines.append(
                f"| {r.table_name} | {status} | {r.total_fields} "
                f"| {r.passed_fields} | {r.missing_fields} | {r.mismatch_fields} "
                f"| {r.row_count_source} | {r.row_count_target} | {note} |"
            )

        lines.append("")

        # Per-field details (and value samples) for failing tables.
        failed_dwd = [r for r in report.ods_vs_dwd_results if not r.passed and r.field_results]
        for r in failed_dwd:
            lines.append(f"### {r.table_name} — 映射异常明细")
            lines.append("")
            issues = [f for f in r.field_results if f.status in ("missing", "mismatch")]
            for f in issues:
                lines.append(f"- `{f.field_name}` [{f.status}]: {f.detail}")
            lines.append("")

            if r.sample_mismatches:
                lines.append(f"#### 值不一致采样(前 {len(r.sample_mismatches)} 条)")
                lines.append("")
                lines.append("```json")
                # Convert non-JSON-serializable values before dumping.
                safe_samples = _safe_serialize(r.sample_mismatches)
                lines.append(json.dumps(safe_samples, ensure_ascii=False, indent=2))
                lines.append("```")
                lines.append("")

    return "\n".join(lines)
|
||
|
||
|
||
def _safe_serialize(obj: Any) -> Any:
|
||
"""将不可 JSON 序列化的类型转为字符串"""
|
||
if isinstance(obj, list):
|
||
return [_safe_serialize(item) for item in obj]
|
||
if isinstance(obj, dict):
|
||
return {k: _safe_serialize(v) for k, v in obj.items()}
|
||
if isinstance(obj, (datetime,)):
|
||
return obj.isoformat()
|
||
if isinstance(obj, bytes):
|
||
return obj.hex()
|
||
try:
|
||
json.dumps(obj)
|
||
return obj
|
||
except (TypeError, ValueError):
|
||
return str(obj)
|
||
|
||
|
||
def write_consistency_report(
    report: ConsistencyReport,
    *,
    report_path: Path | None = None,
) -> str:
    """
    Write the consistency report to disk and return the file path.

    When *report_path* is omitted, the output directory comes from the
    ETL_REPORT_ROOT environment variable and the file name is timestamped.

    Raises:
        KeyError: when *report_path* is None and ETL_REPORT_ROOT is unset.
    """
    if report_path is None:
        env_root = os.environ.get("ETL_REPORT_ROOT")
        if not env_root:
            raise KeyError(
                "环境变量 ETL_REPORT_ROOT 未定义。"
                "请在根 .env 中配置,参考 docs/deployment/EXPORT-PATHS.md"
            )
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = Path(env_root) / f"consistency_report_{stamp}.md"

    markdown = generate_markdown_report(report)
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(markdown, encoding="utf-8")
    return str(report_path)
|