Files
Neo-ZQYY/apps/etl/connectors/feiqiu/scripts/debug/debug_blackbox.py

1204 lines
45 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""黑盒数据完整性校验脚本。
以独立视角从 API 源数据出发,逐层对比各 Schema 各表的数据完整性:
1. API → ODS:逐端点拉取数据与 ODS 表记录数对比
2. ODS → DWD:按 TABLE_MAP 逐对比较记录数和金额列汇总
3. DWD → DWS:验证汇总表聚合结果与明细数据一致性
4. 可疑值检测:扫描各表中的边缘值、空值、重复记录
5. 抽样比对:随机抽样 100 条记录,逐字段与上游 API 源数据比对
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.debug_blackbox [--sample-size 100]
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
import time
import traceback
from dataclasses import asdict, dataclass, field
from datetime import datetime
from decimal import Decimal
from pathlib import Path
from typing import Any
from zoneinfo import ZoneInfo
# ── Make sure the connector root directory is importable (sys.path bootstrap) ──
# parents[2] climbs from scripts/debug/debug_blackbox.py up to the feiqiu root.
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
    sys.path.insert(0, str(_FEIQIU_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from api.client import APIClient
from tasks.ods.ods_tasks import ODS_TASK_SPECS, OdsTaskSpec
from tasks.dwd.dwd_load_task import DwdLoadTask
from orchestration.task_registry import default_registry
# ── 数据模型 ──────────────────────────────────────────────────
@dataclass
class BlackboxCheckResult:
    """Outcome of one black-box consistency check between a source and a target."""
    layer: str = ""  # which comparison layer: "API_ODS" / "ODS_DWD" / "DWD_DWS"
    source_table: str = ""  # upstream side (API endpoint label or source table name)
    target_table: str = ""  # downstream table being validated
    source_count: int = 0  # record count observed on the source side
    target_count: int = 0  # row count observed on the target side
    count_diff: int = 0  # target_count - source_count
    amount_diffs: list = field(default_factory=list)  # per-column SUM() discrepancies
    missing_keys: list = field(default_factory=list)  # reserved; not populated by current checks
    mismatch_count: int = 0  # reserved; not populated by current checks
    status: str = ""  # PASS / WARN / FAIL / ERROR / SKIP
    message: str = ""  # human-readable one-line summary
    details: dict = field(default_factory=dict)  # extra structured info (e.g. date samples)
@dataclass
class SuspectRecord:
    """One finding from the suspicious-value scan (step 4)."""
    table: str = ""  # fully-qualified table the finding belongs to
    check_type: str = ""  # "null_pk" / "duplicate" / "edge_value" / "negative_amount"
    count: int = 0  # number of offending rows (or duplicate groups)
    sample_keys: list = field(default_factory=list)  # up to 5 example key dicts
    message: str = ""  # human-readable description
@dataclass
class SampleMismatch:
    """A field-level difference found while sampling API vs ODS (step 5)."""
    table: str = ""  # ODS table the sampled record came from
    pk_value: Any = None  # primary-key value of the sampled record
    field_name: str = ""  # mismatching field; "__missing__" when absent from the API
    api_value: Any = None  # value from the API (stringified, truncated to 200 chars)
    ods_value: Any = None  # value from the ODS payload (stringified, truncated to 200 chars)
# ── 工具函数 ──────────────────────────────────────────────────
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("debug_blackbox")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
def _build_components(config: AppConfig, logger: logging.Logger):
    """Build the database connection and API client from *config*.

    Returns a ``(db_conn, api_client)`` tuple.
    """
    db_cfg = config["db"]
    api_cfg = config["api"]
    db_conn = DatabaseConnection(
        dsn=db_cfg["dsn"],
        session=db_cfg.get("session"),
        connect_timeout=db_cfg.get("connect_timeout_sec"),
    )
    api_client = APIClient(
        base_url=api_cfg["base_url"],
        token=api_cfg["token"],
        timeout=api_cfg.get("timeout_sec", 20),
        retry_max=api_cfg.get("retries", {}).get("max_attempts", 3),
        headers_extra=api_cfg.get("headers_extra"),
    )
    return db_conn, api_client
def _query_count(db_conn: DatabaseConnection, table: str) -> int:
"""查询表的总行数。"""
rows = db_conn.query(f"SELECT COUNT(*) AS cnt FROM {table}")
return int(rows[0]["cnt"]) if rows else 0
def _table_exists(db_conn: DatabaseConnection, table: str) -> bool:
"""检查表是否存在。"""
rows = db_conn.query("SELECT to_regclass(%s) AS reg", (table,))
return bool(rows and rows[0].get("reg"))
def _has_column(db_conn: DatabaseConnection, table: str, column: str) -> bool:
"""检查表是否包含指定列。"""
sql = """
SELECT 1 FROM information_schema.columns
WHERE table_schema || '.' || table_name = %s
AND column_name = %s
LIMIT 1
"""
rows = db_conn.query(sql, (table, column))
return bool(rows)
def _get_numeric_columns(db_conn: DatabaseConnection, table: str) -> list[str]:
"""获取表中所有 numeric/decimal 类型的列名(金额列候选)。"""
sql = """
SELECT column_name FROM information_schema.columns
WHERE table_schema || '.' || table_name = %s
AND data_type IN ('numeric', 'decimal', 'money')
ORDER BY ordinal_position
"""
rows = db_conn.query(sql, (table,))
return [r["column_name"] for r in rows]
def _get_pk_columns(db_conn: DatabaseConnection, table: str) -> list[str]:
"""获取表的主键列。"""
parts = table.split(".")
schema = parts[0] if len(parts) == 2 else "public"
tbl = parts[1] if len(parts) == 2 else parts[0]
sql = """
SELECT a.attname
FROM pg_index i
JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
WHERE i.indrelid = %s::regclass AND i.indisprimary
ORDER BY array_position(i.indkey, a.attnum)
"""
rows = db_conn.query(sql, (table,))
return [r["attname"] for r in rows]
def _sanitize_for_json(obj):
"""递归处理不可序列化的值。"""
if isinstance(obj, dict):
return {k: _sanitize_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [_sanitize_for_json(v) for v in obj]
if isinstance(obj, datetime):
return obj.isoformat()
if isinstance(obj, Decimal):
return float(obj)
if isinstance(obj, set):
return list(obj)
return obj
# ── ODS task specs currently enabled (kept in sync with the task registry) ──
# Filters ODS_TASK_SPECS down to the codes the registry reports for the ODS layer.
_ENABLED_ODS_SPECS: list[OdsTaskSpec] = [
    spec for spec in ODS_TASK_SPECS
    if spec.code in {m.upper() for m in default_registry.get_tasks_by_layer("ODS")}
]
# ══════════════════════════════════════════════════════════════
# Step 1: API → ODS record-count comparison
# ══════════════════════════════════════════════════════════════
def _check_api_vs_ods(
    db_conn: DatabaseConnection,
    api_client: APIClient,
    config: AppConfig,
    logger: logging.Logger,
) -> list[BlackboxCheckResult]:
    """Pull each enabled API endpoint and compare its record count with the ODS table.

    For every enabled ODS spec: count the ODS table, fetch the full
    (unwindowed) record list from the API, then classify the difference as
    PASS (equal), WARN (within 1% or 5 records) or FAIL. ODS_SETTLEMENT_TICKET
    is skipped because it is loaded record-by-record.

    Returns one BlackboxCheckResult per enabled spec.
    """
    logger.info("=" * 60)
    logger.info("步骤 1: API → ODS 记录数对比")
    logger.info("=" * 60)
    store_id = int(config.get("app.store_id"))
    results: list[BlackboxCheckResult] = []
    for spec in _ENABLED_ODS_SPECS:
        result = BlackboxCheckResult(
            layer="API_ODS",
            source_table=f"API:{spec.endpoint}",
            target_table=spec.table_name,
        )
        logger.info("%s%s", spec.code, spec.table_name)
        # Count rows in the ODS table first; any failure here short-circuits the spec.
        try:
            if not _table_exists(db_conn, spec.table_name):
                result.status = "ERROR"
                result.message = f"ODS 表不存在: {spec.table_name}"
                logger.warning("%s", result.message)
                results.append(result)
                continue
            ods_count = _query_count(db_conn, spec.table_name)
            result.target_count = ods_count
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"查询 ODS 表失败: {exc}"
            logger.error("%s", result.message)
            results.append(result)
            continue
        # Fetch the record count from the API.
        # ODS_SETTLEMENT_TICKET is a special record-by-record task: skip API comparison.
        if spec.code == "ODS_SETTLEMENT_TICKET":
            result.status = "SKIP"
            result.message = f"特殊任务跳过 API 对比, ODS 行数={ods_count}"
            logger.info("%s", result.message)
            results.append(result)
            continue
        try:
            params: dict[str, Any] = {}
            if spec.include_site_id:
                params["siteId"] = store_id
            # No time-window parameters: pull everything so the comparison
            # is against the full ODS table.
            all_records, _ = api_client.get_paginated(
                endpoint=spec.endpoint,
                params=params,
                page_size=200,
                data_path=spec.data_path,
                list_key=spec.list_key,
            )
            api_count = len(all_records)
            result.source_count = api_count
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"API 拉取失败: {exc}"
            logger.error("%s", result.message)
            results.append(result)
            continue
        # Compare the two counts and classify.
        diff = ods_count - api_count
        result.count_diff = diff
        if diff == 0:
            result.status = "PASS"
            result.message = f"记录数一致: API={api_count}, ODS={ods_count}"
        elif abs(diff) <= max(5, int(api_count * 0.01)):
            # Tolerate up to 1% or 5 records (historical data / delete flags etc.).
            result.status = "WARN"
            result.message = f"记录数微差: API={api_count}, ODS={ods_count}, diff={diff}"
        else:
            result.status = "FAIL"
            result.message = f"记录数差异较大: API={api_count}, ODS={ods_count}, diff={diff}"
        icon = {"PASS": "", "WARN": "", "FAIL": ""}.get(result.status, "?")
        logger.info(" %s %s", icon, result.message)
        results.append(result)
    db_conn.ensure_open()  # presumably revalidates the DB connection after the loop — confirm in DatabaseConnection
    return results
# ══════════════════════════════════════════════════════════════
# Step 2: ODS → DWD record-count + amount comparison
# ══════════════════════════════════════════════════════════════
def _check_ods_vs_dwd(
    db_conn: DatabaseConnection,
    logger: logging.Logger,
) -> list[BlackboxCheckResult]:
    """Compare each ODS/DWD table pair from DwdLoadTask.TABLE_MAP.

    For every (dwd_table, ods_table) pair: compare row counts and the SUM()
    of up to 5 shared numeric columns. Dimension tables ("dim_" in the name)
    are allowed to have DWD >= ODS rows (SCD2 history versions).

    Returns one BlackboxCheckResult per mapped pair.
    """
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 2: ODS → DWD 记录数 + 金额对比")
    logger.info("=" * 60)
    table_map = DwdLoadTask.TABLE_MAP
    results: list[BlackboxCheckResult] = []
    # Iterate pairs sorted by DWD table name (several DWD tables may map to one ODS table).
    for dwd_table, ods_table in sorted(table_map.items()):
        result = BlackboxCheckResult(
            layer="ODS_DWD",
            source_table=ods_table,
            target_table=dwd_table,
        )
        logger.info("%s%s", ods_table, dwd_table)
        # Existence check for both sides; first missing table wins.
        for tbl, label in [(ods_table, "ODS"), (dwd_table, "DWD")]:
            if not _table_exists(db_conn, tbl):
                result.status = "ERROR"
                result.message = f"{label} 表不存在: {tbl}"
                logger.warning("%s", result.message)
                break
        if result.status == "ERROR":
            results.append(result)
            continue
        try:
            ods_count = _query_count(db_conn, ods_table)
            dwd_count = _query_count(db_conn, dwd_table)
            result.source_count = ods_count
            result.target_count = dwd_count
            result.count_diff = dwd_count - ods_count
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"查询记录数失败: {exc}"
            logger.error("%s", result.message)
            results.append(result)
            continue
        # Dimension tables (dim_*) may legitimately have DWD rows >= ODS rows (SCD2).
        is_dim = "dim_" in dwd_table.split(".")[-1]
        # Amount comparison: SUM() over numeric columns both tables share.
        try:
            ods_num_cols = set(_get_numeric_columns(db_conn, ods_table))
            dwd_num_cols = set(_get_numeric_columns(db_conn, dwd_table))
            common_amount_cols = sorted(ods_num_cols & dwd_num_cols)
            amount_diffs = []
            for col in common_amount_cols[:5]:  # compare at most 5 amount columns
                try:
                    ods_sum_rows = db_conn.query(
                        f'SELECT COALESCE(SUM("{col}"), 0) AS s FROM {ods_table}'
                    )
                    dwd_sum_rows = db_conn.query(
                        f'SELECT COALESCE(SUM("{col}"), 0) AS s FROM {dwd_table}'
                    )
                    ods_sum = float(ods_sum_rows[0]["s"]) if ods_sum_rows else 0.0
                    dwd_sum = float(dwd_sum_rows[0]["s"]) if dwd_sum_rows else 0.0
                    diff = round(dwd_sum - ods_sum, 2)
                    if abs(diff) > 0.01:
                        amount_diffs.append({
                            "column": col,
                            "ods_sum": ods_sum,
                            "dwd_sum": dwd_sum,
                            "diff": diff,
                        })
                except Exception:
                    pass  # skip columns that cannot be aggregated
            result.amount_diffs = amount_diffs
        except Exception:
            pass
        # Status decision.
        if is_dim:
            # Dimension table: DWD >= ODS is expected (SCD2 history versions).
            if dwd_count >= ods_count:
                result.status = "PASS"
                result.message = (
                    f"维度表 SCD2: ODS={ods_count}, DWD={dwd_count} "
                    f"(+{dwd_count - ods_count} 历史版本)"
                )
            else:
                result.status = "WARN"
                result.message = f"维度表 DWD < ODS: ODS={ods_count}, DWD={dwd_count}"
        else:
            # Fact table: row counts should roughly agree (2% or 5 rows tolerance).
            if result.count_diff == 0:
                result.status = "PASS"
                result.message = f"记录数一致: {ods_count}"
            elif abs(result.count_diff) <= max(5, int(ods_count * 0.02)):
                result.status = "WARN"
                result.message = (
                    f"记录数微差: ODS={ods_count}, DWD={dwd_count}, "
                    f"diff={result.count_diff}"
                )
            else:
                result.status = "FAIL"
                result.message = (
                    f"记录数差异较大: ODS={ods_count}, DWD={dwd_count}, "
                    f"diff={result.count_diff}"
                )
        if result.amount_diffs:
            # Escalate status to at least WARN (keep FAIL/ERROR if already worse).
            result.status = max(result.status, "WARN", key=lambda s: ["PASS", "WARN", "FAIL", "ERROR"].index(s) if s in ["PASS", "WARN", "FAIL", "ERROR"] else 0)
            result.message += f" | {len(result.amount_diffs)} 个金额列有差异"
        icon = {"PASS": "", "WARN": "", "FAIL": "", "ERROR": ""}.get(result.status, "?")
        logger.info(" %s %s", icon, result.message)
        results.append(result)
    return results
# ══════════════════════════════════════════════════════════════
# Step 3: DWD → DWS aggregation consistency
# ══════════════════════════════════════════════════════════════
# Known DWS→DWD aggregation relationships.
# Business-day convention: "dwd_date_cast" uses dws.biz_date() instead of a
# plain ::date calendar-day cast, where applicable.
_DWS_DWD_MAP: dict[str, dict] = {
    "dws.dws_assistant_daily_detail": {
        "dwd_source": "dwd.dwd_assistant_service_log",
        "dws_date_col": "stat_date",
        "dwd_date_col": "service_date",
        "description": "助教日度明细 vs DWD 服务流水",
    },
    "dws.dws_finance_daily_summary": {
        "dwd_source": "dwd.dwd_settlement_head",
        "dws_date_col": "stat_date",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "dws.biz_date(%col%)",
        "description": "财务日度汇总 vs DWD 结账记录",
    },
    "dws.dws_member_visit_detail": {
        "dwd_source": "dwd.dwd_settlement_head",
        "dws_date_col": "visit_date",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "dws.biz_date(%col%)",
        "description": "会员到店明细 vs DWD 结账记录",
    },
    "dws.dws_member_consumption_summary": {
        "dwd_source": "dwd.dwd_settlement_head",
        "dws_date_col": "stat_month",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "date_trunc('month', dws.biz_date(%col%))::date",
        "description": "会员消费汇总 vs DWD 结账记录",
    },
    "dws.dws_finance_recharge_summary": {
        "dwd_source": "dwd.dwd_recharge_order",
        "dws_date_col": "stat_date",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "dws.biz_date(%col%)",
        "description": "充值汇总 vs DWD 充值订单",
    },
}
def _check_dwd_vs_dws(
    db_conn: DatabaseConnection,
    config: AppConfig,
    logger: logging.Logger,
) -> list[BlackboxCheckResult]:
    """Check DWS aggregate tables against their DWD detail sources.

    DWS target tables are discovered from the task registry. Tables present
    in _DWS_DWD_MAP get a DWD-vs-DWS row-count comparison plus per-day row
    samples for the 3 most recent DWS dates; unmapped tables only get a basic
    non-empty check.
    """
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 3: DWD → DWS 聚合一致性")
    logger.info("=" * 60)
    results: list[BlackboxCheckResult] = []
    # Discover target tables of all DWS-layer tasks via the registry.
    dws_codes = default_registry.get_tasks_by_layer("DWS")
    dws_tables: list[str] = []
    for code in sorted(dws_codes):
        meta = default_registry.get_metadata(code)
        if meta is None:
            continue
        try:
            # Instantiate the task only to ask for its target table name.
            inst = meta.task_class(config, db_conn, None, logging.getLogger("noop"))
            raw = inst.get_target_table()
            # Qualify bare table names with the dws schema.
            full = f"dws.{raw}" if raw and "." not in raw else raw
            if full:
                dws_tables.append(full)
        except Exception:
            pass
    for dws_table in sorted(set(dws_tables)):
        result = BlackboxCheckResult(layer="DWD_DWS", target_table=dws_table)
        if not _table_exists(db_conn, dws_table):
            result.status = "SKIP"
            result.message = f"DWS 表不存在: {dws_table}"
            logger.info("%s", result.message)
            results.append(result)
            continue
        mapping = _DWS_DWD_MAP.get(dws_table)
        if not mapping:
            # No known DWD mapping: only check that the table has rows.
            try:
                dws_count = _query_count(db_conn, dws_table)
                result.target_count = dws_count
                result.status = "PASS" if dws_count > 0 else "WARN"
                result.message = f"无映射关系, DWS 行数={dws_count}"
            except Exception as exc:
                result.status = "ERROR"
                result.message = f"查询失败: {exc}"
            logger.info(" %s: %s", dws_table, result.message)
            results.append(result)
            continue
        dwd_source = mapping["dwd_source"]
        result.source_table = dwd_source
        logger.info("%s%s (%s)", dwd_source, dws_table, mapping["description"])
        if not _table_exists(db_conn, dwd_source):
            result.status = "ERROR"
            result.message = f"DWD 源表不存在: {dwd_source}"
            logger.warning("%s", result.message)
            results.append(result)
            continue
        try:
            dws_count = _query_count(db_conn, dws_table)
            dwd_count = _query_count(db_conn, dwd_source)
            result.source_count = dwd_count
            result.target_count = dws_count
            result.count_diff = dws_count - dwd_count
            # DWS is an aggregate, so its row count is usually <= DWD.
            if dws_count == 0 and dwd_count > 0:
                result.status = "WARN"
                result.message = f"DWS 无数据但 DWD 有 {dwd_count}"
            elif dws_count > 0 and dwd_count == 0:
                result.status = "WARN"
                result.message = f"DWS 有 {dws_count} 行但 DWD 无数据"
            else:
                result.status = "PASS"
                result.message = f"DWD={dwd_count}, DWS={dws_count}"
            # Date sampling: record per-day DWS row counts for the 3 latest dates.
            dws_date_col = mapping.get("dws_date_col")
            if dws_date_col and _has_column(db_conn, dws_table, dws_date_col):
                sample_sql = f"""
                SELECT DISTINCT "{dws_date_col}" AS d
                FROM {dws_table}
                ORDER BY d DESC LIMIT 3
                """
                date_rows = db_conn.query(sample_sql)
                date_checks = []
                for dr in date_rows:
                    d = dr["d"]
                    dws_day_rows = db_conn.query(
                        f'SELECT COUNT(*) AS cnt FROM {dws_table} WHERE "{dws_date_col}" = %s',
                        (d,),
                    )
                    dws_day = int(dws_day_rows[0]["cnt"]) if dws_day_rows else 0
                    date_checks.append({"date": str(d), "dws_rows": dws_day})
                result.details["date_samples"] = date_checks
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"对比失败: {exc}"
        icon = {"PASS": "", "WARN": "", "FAIL": "", "ERROR": "", "SKIP": ""}.get(result.status, "?")
        logger.info(" %s %s", icon, result.message)
        results.append(result)
    return results
# ══════════════════════════════════════════════════════════════
# Step 4: suspicious-value detection
# ══════════════════════════════════════════════════════════════
def _detect_suspects(
    db_conn: DatabaseConnection,
    logger: logging.Logger,
) -> list[SuspectRecord]:
    """Scan ODS and DWD tables for NULL primary keys, duplicates and odd amounts.

    Per table: (4a) NULL values in PK columns, (4b) duplicate PK groups
    (first 10, with up to 5 sample keys), (4c) negative values and values
    above one million in amount-looking numeric columns (max 5 columns).
    All individual checks are best-effort and swallow their own errors.
    """
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 4: 可疑值检测")
    logger.info("=" * 60)
    suspects: list[SuspectRecord] = []
    # Scan every enabled ODS table plus every DWD table from TABLE_MAP.
    tables_to_scan: list[str] = []
    for spec in _ENABLED_ODS_SPECS:
        tables_to_scan.append(spec.table_name)
    for dwd_table in sorted(DwdLoadTask.TABLE_MAP.keys()):
        tables_to_scan.append(dwd_table)
    for table in sorted(set(tables_to_scan)):
        if not _table_exists(db_conn, table):
            continue
        logger.info(" 扫描 %s ...", table)
        pk_cols = _get_pk_columns(db_conn, table)
        # 4a. NULL values in primary-key columns.
        if pk_cols:
            for pk in pk_cols:
                try:
                    null_sql = f'SELECT COUNT(*) AS cnt FROM {table} WHERE "{pk}" IS NULL'
                    rows = db_conn.query(null_sql)
                    null_count = int(rows[0]["cnt"]) if rows else 0
                    if null_count > 0:
                        suspects.append(SuspectRecord(
                            table=table,
                            check_type="null_pk",
                            count=null_count,
                            message=f"主键列 {pk}{null_count} 个 NULL 值",
                        ))
                        logger.warning("%s: 主键 %s%d 个 NULL", table, pk, null_count)
                except Exception:
                    pass
        # 4b. Duplicate records grouped by the full primary key.
        if pk_cols:
            pk_expr = ", ".join(f'"{c}"' for c in pk_cols)
            try:
                dup_sql = f"""
                SELECT {pk_expr}, COUNT(*) AS cnt
                FROM {table}
                GROUP BY {pk_expr}
                HAVING COUNT(*) > 1
                LIMIT 10
                """
                dup_rows = db_conn.query(dup_sql)
                if dup_rows:
                    dup_count = len(dup_rows)
                    sample_keys = [
                        {c: r[c] for c in pk_cols if c in r}
                        for r in dup_rows[:5]
                    ]
                    suspects.append(SuspectRecord(
                        table=table,
                        check_type="duplicate",
                        count=dup_count,
                        sample_keys=sample_keys,
                        message=f"发现 {dup_count} 组重复主键",
                    ))
                    logger.warning("%s: %d 组重复主键", table, dup_count)
            except Exception:
                pass
        # 4c. Negative / extreme values in amount-like numeric columns.
        try:
            num_cols = _get_numeric_columns(db_conn, table)
            # Only columns whose name contains an amount-ish keyword.
            amount_keywords = ("amount", "money", "price", "fee", "sum", "balance", "cost")
            amount_cols = [c for c in num_cols if any(k in c.lower() for k in amount_keywords)]
            for col in amount_cols[:5]:
                try:
                    neg_sql = f'SELECT COUNT(*) AS cnt FROM {table} WHERE "{col}" < 0'
                    neg_rows = db_conn.query(neg_sql)
                    neg_count = int(neg_rows[0]["cnt"]) if neg_rows else 0
                    if neg_count > 0:
                        suspects.append(SuspectRecord(
                            table=table,
                            check_type="negative_amount",
                            count=neg_count,
                            message=f"金额列 {col}{neg_count} 个负值",
                        ))
                        logger.info(" %s.%s: %d 个负值", table, col, neg_count)
                except Exception:
                    pass
                # Extreme values: absolute amount above one million.
                try:
                    edge_sql = f'SELECT COUNT(*) AS cnt FROM {table} WHERE ABS("{col}") > 1000000'
                    edge_rows = db_conn.query(edge_sql)
                    edge_count = int(edge_rows[0]["cnt"]) if edge_rows else 0
                    if edge_count > 0:
                        suspects.append(SuspectRecord(
                            table=table,
                            check_type="edge_value",
                            count=edge_count,
                            message=f"金额列 {col}{edge_count} 个超百万值",
                        ))
                        logger.warning("%s.%s: %d 个超百万值", table, col, edge_count)
                except Exception:
                    pass
        except Exception:
            pass
    db_conn.ensure_open()  # presumably revalidates the DB connection after the scan — confirm in DatabaseConnection
    logger.info(" 可疑值检测完成, 共发现 %d", len(suspects))
    return suspects
# ══════════════════════════════════════════════════════════════
# Step 5: sampled comparison API vs ODS
# ══════════════════════════════════════════════════════════════
def _sample_compare_api_vs_ods(
    db_conn: DatabaseConnection,
    api_client: APIClient,
    config: AppConfig,
    logger: logging.Logger,
    sample_size: int = 100,
) -> list[SampleMismatch]:
    """Randomly sample ODS rows and compare them field-by-field to API data.

    Only specs with an ``id`` PK column (and not ODS_SETTLEMENT_TICKET) are
    sampled; the sample budget is split evenly across them. For each sampled
    id, the ODS JSON payload is compared against the API record using the
    lenient _values_match(); a missing API record is reported under the
    pseudo field name "__missing__".
    """
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 5: 抽样比对 API vs ODS (%d 条)", sample_size)
    logger.info("=" * 60)
    store_id = int(config.get("app.store_id"))
    mismatches: list[SampleMismatch] = []
    total_sampled = 0
    # Specs suitable for sampling: a plain "id" PK and not the special ticket task.
    sample_specs = [
        s for s in _ENABLED_ODS_SPECS
        if s.code != "ODS_SETTLEMENT_TICKET"
        and any(pk.column == "id" for pk in s.pk_columns)
    ]
    # Sample budget allocated per table.
    per_table = max(1, sample_size // max(len(sample_specs), 1))
    remaining = sample_size
    for spec in sample_specs:
        if remaining <= 0:
            break
        n = min(per_table, remaining)
        logger.info("%s: 抽样 %d", spec.table_name, n)
        if not _table_exists(db_conn, spec.table_name):
            continue
        # Draw n random ids from the ODS table.
        try:
            sample_sql = f"""
            SELECT id FROM {spec.table_name}
            ORDER BY RANDOM()
            LIMIT %s
            """
            id_rows = db_conn.query(sample_sql, (n,))
            if not id_rows:
                logger.info(" 表为空,跳过")
                continue
            sample_ids = [r["id"] for r in id_rows]
        except Exception as exc:
            logger.warning(" ⚠ 抽样失败: %s", exc)
            continue
        # Pull the full API dataset for this endpoint and index it in memory by id.
        try:
            params: dict[str, Any] = {}
            if spec.include_site_id:
                params["siteId"] = store_id
            api_records, _ = api_client.get_paginated(
                endpoint=spec.endpoint,
                params=params,
                page_size=200,
                data_path=spec.data_path,
                list_key=spec.list_key,
            )
            api_by_id: dict[int, dict] = {}
            for rec in api_records:
                rec_id = rec.get("id")
                if rec_id is not None:
                    try:
                        api_by_id[int(rec_id)] = rec
                    except (ValueError, TypeError):
                        pass
        except Exception as exc:
            logger.warning(" ⚠ API 拉取失败: %s", exc)
            continue
        # Compare each sampled record.
        table_mismatches = 0
        for sid in sample_ids:
            try:
                sid_int = int(sid)
            except (ValueError, TypeError):
                continue
            # Read this record's raw payload from ODS.
            try:
                ods_row = db_conn.query(
                    f"SELECT payload FROM {spec.table_name} WHERE id = %s LIMIT 1",
                    (sid_int,),
                )
                if not ods_row or not ods_row[0].get("payload"):
                    continue
                ods_payload = ods_row[0]["payload"]
                # payload may be a JSON string or an already-parsed dict.
                if isinstance(ods_payload, str):
                    ods_payload = json.loads(ods_payload)
            except Exception:
                continue
            api_rec = api_by_id.get(sid_int)
            if api_rec is None:
                # Present in ODS but absent from the API dataset.
                mismatches.append(SampleMismatch(
                    table=spec.table_name,
                    pk_value=sid_int,
                    field_name="__missing__",
                    api_value=None,
                    ods_value="exists",
                ))
                table_mismatches += 1
                continue
            # Field-by-field comparison (top-level API fields only).
            for key, api_val in api_rec.items():
                ods_val = ods_payload.get(key)
                if not _values_match(api_val, ods_val):
                    mismatches.append(SampleMismatch(
                        table=spec.table_name,
                        pk_value=sid_int,
                        field_name=key,
                        api_value=str(api_val)[:200],
                        ods_value=str(ods_val)[:200],
                    ))
                    table_mismatches += 1
            total_sampled += 1
        remaining -= len(sample_ids)
        if table_mismatches > 0:
            logger.info("%d 个字段不一致", table_mismatches)
        else:
            logger.info(" ✓ 抽样一致")
    db_conn.ensure_open()  # presumably revalidates the DB connection — confirm in DatabaseConnection
    logger.info(" 抽样比对完成: 共抽样 %d 条, 发现 %d 处不一致", total_sampled, len(mismatches))
    return mismatches
def _values_match(api_val: Any, ods_val: Any) -> bool:
"""宽松比较两个值是否一致。
处理常见的类型差异int vs str、None vs 空字符串、浮点精度等。
"""
if api_val is None and ods_val is None:
return True
if api_val is None and ods_val in ("", 0, "0", False):
return True
if ods_val is None and api_val in ("", 0, "0", False):
return True
# 都转字符串比较(处理 int/str 差异)
str_api = str(api_val).strip() if api_val is not None else ""
str_ods = str(ods_val).strip() if ods_val is not None else ""
if str_api == str_ods:
return True
# 数值比较(处理浮点精度)
try:
if abs(float(str_api) - float(str_ods)) < 0.01:
return True
except (ValueError, TypeError):
pass
# 嵌套对象/列表:转 JSON 比较
if isinstance(api_val, (dict, list)) or isinstance(ods_val, (dict, list)):
try:
return json.dumps(api_val, sort_keys=True, default=str) == json.dumps(ods_val, sort_keys=True, default=str)
except Exception:
pass
return False
# ══════════════════════════════════════════════════════════════
# 报告生成
# ══════════════════════════════════════════════════════════════
def _generate_report(
api_ods_results: list[BlackboxCheckResult],
ods_dwd_results: list[BlackboxCheckResult],
dwd_dws_results: list[BlackboxCheckResult],
suspects: list[SuspectRecord],
sample_mismatches: list[SampleMismatch],
config: AppConfig,
tz: ZoneInfo,
) -> dict:
"""汇总所有校验结果为结构化报告。"""
now = datetime.now(tz)
all_results = api_ods_results + ods_dwd_results + dwd_dws_results
pass_count = sum(1 for r in all_results if r.status == "PASS")
warn_count = sum(1 for r in all_results if r.status == "WARN")
fail_count = sum(1 for r in all_results if r.status == "FAIL")
error_count = sum(1 for r in all_results if r.status == "ERROR")
skip_count = sum(1 for r in all_results if r.status == "SKIP")
report = {
"title": "黑盒数据完整性校验报告",
"generated_at": now.isoformat(),
"environment": {
"store_id": config.get("app.store_id"),
"db_name": config.get("db.name", ""),
"api_base": config.get("api.base_url", ""),
},
"summary": {
"total_checks": len(all_results),
"pass": pass_count,
"warn": warn_count,
"fail": fail_count,
"error": error_count,
"skip": skip_count,
"suspect_count": len(suspects),
"sample_mismatch_count": len(sample_mismatches),
},
"api_ods": [asdict(r) for r in api_ods_results],
"ods_dwd": [asdict(r) for r in ods_dwd_results],
"dwd_dws": [asdict(r) for r in dwd_dws_results],
"suspects": [asdict(s) for s in suspects],
"sample_mismatches": [asdict(m) for m in sample_mismatches[:200]],
}
return report
def _generate_markdown_report(report: dict) -> str:
    """Render the structured report dict as a Markdown document string."""
    md: list[str] = []
    summary = report["summary"]
    md.extend([f"# {report['title']}", "", f"生成时间: {report['generated_at']}", ""])
    # Environment section.
    env = report["environment"]
    md.extend([
        "## 环境信息",
        f"- 门店 ID: {env.get('store_id')}",
        f"- 数据库: {env.get('db_name')}",
        f"- API: {env.get('api_base')}",
        "",
    ])
    # Summary table.
    md.extend([
        "## 校验汇总",
        "| 指标 | 数值 |",
        "|------|------|",
        f"| 总检查项 | {summary['total_checks']} |",
        f"| ✓ PASS | {summary['pass']} |",
        f"| ⚠ WARN | {summary['warn']} |",
        f"| ✗ FAIL | {summary['fail']} |",
        f"| ✗ ERROR | {summary['error']} |",
        f"| ⊘ SKIP | {summary['skip']} |",
        f"| 可疑值 | {summary['suspect_count']} |",
        f"| 抽样不一致 | {summary['sample_mismatch_count']} |",
        "",
    ])
    # Step 1: API → ODS.
    md.extend(["## 步骤 1: API → ODS 记录数对比", ""])
    _append_result_table(md, report.get("api_ods", []))
    # Step 2: ODS → DWD.
    md.extend(["## 步骤 2: ODS → DWD 记录数 + 金额对比", ""])
    _append_result_table(md, report.get("ods_dwd", []))
    # Amount-difference details, only for pairs that reported any.
    amount_issues = [entry for entry in report.get("ods_dwd", []) if entry.get("amount_diffs")]
    if amount_issues:
        md.extend(["### 金额差异详情", ""])
        for entry in amount_issues:
            md.extend([
                f"**{entry['source_table']}{entry['target_table']}**",
                "",
                "| 列名 | ODS 汇总 | DWD 汇总 | 差异 |",
                "|------|----------|----------|------|",
            ])
            for ad in entry["amount_diffs"]:
                md.append(
                    f"| {ad['column']} | {ad['ods_sum']:.2f} | {ad['dwd_sum']:.2f} | {ad['diff']:.2f} |"
                )
            md.append("")
    # Step 3: DWD → DWS.
    md.extend(["## 步骤 3: DWD → DWS 聚合一致性", ""])
    _append_result_table(md, report.get("dwd_dws", []))
    # Step 4: suspicious values.
    md.extend(["## 步骤 4: 可疑值检测", ""])
    suspects_data = report.get("suspects", [])
    if suspects_data:
        md.extend(["| 表 | 类型 | 数量 | 说明 |", "|---|------|------|------|"])
        for item in suspects_data:
            md.append(f"| {item['table']} | {item['check_type']} | {item['count']} | {item['message']} |")
        md.append("")
    else:
        md.extend(["未发现可疑值。", ""])
    # Step 5: sampled comparison.
    md.extend(["## 步骤 5: 抽样比对 API vs ODS", ""])
    sample_data = report.get("sample_mismatches", [])
    if sample_data:
        md.extend([
            f"共发现 {len(sample_data)} 处不一致:",
            "",
            "| 表 | 主键 | 字段 | API 值 | ODS 值 |",
            "|---|------|------|--------|--------|",
        ])
        for item in sample_data[:50]:  # show at most 50 rows
            md.append(
                f"| {item['table']} | {item['pk_value']} | {item['field_name']} "
                f"| {str(item.get('api_value', ''))[:60]} | {str(item.get('ods_value', ''))[:60]} |"
            )
        if len(sample_data) > 50:
            md.append(f"| ... | 共 {len(sample_data)} 条,仅展示前 50 条 | | | |")
        md.append("")
    else:
        md.extend(["抽样比对全部一致。", ""])
    return "\n".join(md)
def _append_result_table(lines: list[str], results: list[dict]):
"""向 Markdown 追加校验结果表格。"""
if not results:
lines.append("无数据。")
lines.append("")
return
lines.append("| 状态 | 源 | 目标 | 源行数 | 目标行数 | 差异 | 说明 |")
lines.append("|------|---|------|--------|----------|------|------|")
for r in results:
icon = {"PASS": "", "WARN": "", "FAIL": "", "ERROR": "", "SKIP": ""}.get(r.get("status", ""), "?")
lines.append(
f"| {icon} {r.get('status', '')} "
f"| {r.get('source_table', '')} "
f"| {r.get('target_table', '')} "
f"| {r.get('source_count', '')} "
f"| {r.get('target_count', '')} "
f"| {r.get('count_diff', '')} "
f"| {r.get('message', '')} |"
)
lines.append("")
# ══════════════════════════════════════════════════════════════
# Main flow
# ══════════════════════════════════════════════════════════════
def run_blackbox_check(
    sample_size: int = 100,
) -> dict:
    """Run the full black-box data validation (steps 1-5) and write reports.

    Each step is wrapped in its own try/except so one failing step does not
    abort the rest. A JSON report is written under scripts/debug/output and
    a Markdown report under the directory named by $ETL_REPORT_ROOT.

    Args:
        sample_size: number of records to sample in step 5.

    Returns:
        Structured report dict (see _generate_report), plus "duration_sec".

    Raises:
        KeyError: when the ETL_REPORT_ROOT environment variable is unset.
    """
    logger = _setup_logging()
    logger.info("" + "" * 58 + "")
    logger.info("║ 黑盒数据完整性校验 ║")
    logger.info("" + "" * 58 + "")
    # Load configuration.
    config = AppConfig.load()
    tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
    logger.info("门店 ID: %s", config.get("app.store_id"))
    logger.info("数据库: %s", config.get("db.name", ""))
    logger.info("API: %s", config.get("api.base_url", ""))
    logger.info("")
    # Build DB / API components.
    db_conn, api_client = _build_components(config, logger)
    t0 = time.monotonic()
    # Step 1: API → ODS.
    try:
        api_ods_results = _check_api_vs_ods(db_conn, api_client, config, logger)
    except Exception as exc:
        logger.error("步骤 1 异常: %s", exc)
        logger.error(traceback.format_exc())
        api_ods_results = []
    db_conn.ensure_open()  # presumably revalidates the connection between steps — confirm in DatabaseConnection
    # Step 2: ODS → DWD.
    try:
        ods_dwd_results = _check_ods_vs_dwd(db_conn, logger)
    except Exception as exc:
        logger.error("步骤 2 异常: %s", exc)
        logger.error(traceback.format_exc())
        ods_dwd_results = []
    db_conn.ensure_open()
    # Step 3: DWD → DWS.
    try:
        dwd_dws_results = _check_dwd_vs_dws(db_conn, config, logger)
    except Exception as exc:
        logger.error("步骤 3 异常: %s", exc)
        logger.error(traceback.format_exc())
        dwd_dws_results = []
    db_conn.ensure_open()
    # Step 4: suspicious-value detection.
    try:
        suspects = _detect_suspects(db_conn, logger)
    except Exception as exc:
        logger.error("步骤 4 异常: %s", exc)
        logger.error(traceback.format_exc())
        suspects = []
    db_conn.ensure_open()
    # Step 5: sampled comparison.
    try:
        sample_mismatches = _sample_compare_api_vs_ods(
            db_conn, api_client, config, logger, sample_size=sample_size,
        )
    except Exception as exc:
        logger.error("步骤 5 异常: %s", exc)
        logger.error(traceback.format_exc())
        sample_mismatches = []
    total_sec = round(time.monotonic() - t0, 1)
    # Build the structured report.
    report = _generate_report(
        api_ods_results, ods_dwd_results, dwd_dws_results,
        suspects, sample_mismatches, config, tz,
    )
    report["duration_sec"] = total_sec
    # Write the JSON report.
    output_dir = _FEIQIU_ROOT / "scripts" / "debug" / "output"
    output_dir.mkdir(parents=True, exist_ok=True)
    ts = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
    json_path = output_dir / f"blackbox_{ts}.json"
    json_path.write_text(
        json.dumps(_sanitize_for_json(report), ensure_ascii=False, indent=2, default=str),
        encoding="utf-8",
    )
    logger.info("")
    logger.info("JSON 报告: %s", json_path)
    # Write the Markdown report (destination comes from the environment).
    _report_root = os.environ.get("ETL_REPORT_ROOT")
    if not _report_root:
        raise KeyError("环境变量 ETL_REPORT_ROOT 未定义。请在根 .env 中配置。")
    reports_dir = Path(_report_root)
    reports_dir.mkdir(parents=True, exist_ok=True)
    md_path = reports_dir / f"blackbox_report_{ts}.md"
    md_content = _generate_markdown_report(report)
    md_path.write_text(md_content, encoding="utf-8")
    logger.info("Markdown 报告: %s", md_path)
    # Log the summary.
    logger.info("")
    logger.info("=" * 60)
    logger.info("黑盒校验汇总")
    logger.info("=" * 60)
    s = report["summary"]
    logger.info(" 总检查项: %d", s["total_checks"])
    logger.info(" ✓ PASS: %d", s["pass"])
    logger.info(" ⚠ WARN: %d", s["warn"])
    logger.info(" ✗ FAIL: %d", s["fail"])
    logger.info(" ✗ ERROR: %d", s["error"])
    logger.info(" ⊘ SKIP: %d", s["skip"])
    logger.info(" 可疑值: %d", s["suspect_count"])
    logger.info(" 抽样不一致: %d", s["sample_mismatch_count"])
    logger.info(" 总耗时: %.1f", total_sec)
    # Cleanup.
    db_conn.close()
    return report
# ── CLI entry point ──────────────────────────────────────────
def parse_args():
    """Parse the command-line arguments of the blackbox check CLI."""
    cli = argparse.ArgumentParser(description="黑盒数据完整性校验")
    cli.add_argument(
        "--sample-size",
        type=int,
        default=100,
        help="抽样比对记录数(默认 100",
    )
    return cli.parse_args()
def main():
    """CLI entry point: run the full check and set the process exit code."""
    cli_args = parse_args()
    report = run_blackbox_check(sample_size=cli_args.sample_size)
    # Exit non-zero when at least one FAIL was recorded.
    sys.exit(1 if report["summary"]["fail"] > 0 else 0)
# Script entry guard: only run when executed directly, not on import.
if __name__ == "__main__":
    main()