# -*- coding: utf-8 -*- """黑盒数据完整性校验脚本。 以独立视角从 API 源数据出发,逐层对比各 Schema 各表的数据完整性: 1. API → ODS:逐端点拉取数据,与 ODS 表记录数对比 2. ODS → DWD:按 TABLE_MAP 逐对比较记录数和金额列汇总 3. DWD → DWS:验证汇总表聚合结果与明细数据一致性 4. 可疑值检测:扫描各表中的边缘值、空值、重复记录 5. 抽样比对:随机抽样 100 条记录,逐字段与上游 API 源数据比对 用法: cd apps/etl/connectors/feiqiu python -m scripts.debug.debug_blackbox [--sample-size 100] """ from __future__ import annotations import argparse import json import logging import sys import time import traceback from dataclasses import asdict, dataclass, field from datetime import datetime from decimal import Decimal from pathlib import Path from typing import Any from zoneinfo import ZoneInfo # ── 确保项目根目录在 sys.path ── _FEIQIU_ROOT = Path(__file__).resolve().parents[2] if str(_FEIQIU_ROOT) not in sys.path: sys.path.insert(0, str(_FEIQIU_ROOT)) from config.settings import AppConfig from database.connection import DatabaseConnection from api.client import APIClient from tasks.ods.ods_tasks import ODS_TASK_SPECS, OdsTaskSpec from tasks.dwd.dwd_load_task import DwdLoadTask from orchestration.task_registry import default_registry # ── 数据模型 ────────────────────────────────────────────────── @dataclass class BlackboxCheckResult: """黑盒校验单表结果""" layer: str = "" # "API_ODS" / "ODS_DWD" / "DWD_DWS" source_table: str = "" target_table: str = "" source_count: int = 0 target_count: int = 0 count_diff: int = 0 amount_diffs: list = field(default_factory=list) missing_keys: list = field(default_factory=list) mismatch_count: int = 0 status: str = "" # PASS / WARN / FAIL / ERROR / SKIP message: str = "" details: dict = field(default_factory=dict) @dataclass class SuspectRecord: """可疑值检测结果""" table: str = "" check_type: str = "" # "null_pk" / "duplicate" / "edge_value" / "negative_amount" count: int = 0 sample_keys: list = field(default_factory=list) message: str = "" @dataclass class SampleMismatch: """抽样比对不一致记录""" table: str = "" pk_value: Any = None field_name: str = "" api_value: Any = None ods_value: Any = None # ── 工具函数 
def _setup_logging() -> logging.Logger:
    """Create (once) and return the script logger writing to stdout."""
    logger = logging.getLogger("debug_blackbox")
    logger.setLevel(logging.INFO)
    # Guard against duplicate handlers when called repeatedly in one process.
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter(
            "%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
        ))
        logger.addHandler(handler)
    return logger


def _build_components(config: AppConfig, logger: logging.Logger):
    """Build the DB connection and API client from config.

    Returns:
        (DatabaseConnection, APIClient) tuple.
    """
    db_conn = DatabaseConnection(
        dsn=config["db"]["dsn"],
        session=config["db"].get("session"),
        connect_timeout=config["db"].get("connect_timeout_sec"),
    )
    api_client = APIClient(
        base_url=config["api"]["base_url"],
        token=config["api"]["token"],
        timeout=config["api"].get("timeout_sec", 20),
        retry_max=config["api"].get("retries", {}).get("max_attempts", 3),
        headers_extra=config["api"].get("headers_extra"),
    )
    return db_conn, api_client


def _query_count(db_conn: DatabaseConnection, table: str) -> int:
    """Return the total row count of *table* (0 when the query yields no rows)."""
    rows = db_conn.query(f"SELECT COUNT(*) AS cnt FROM {table}")
    return int(rows[0]["cnt"]) if rows else 0


def _table_exists(db_conn: DatabaseConnection, table: str) -> bool:
    """Return True when *table* resolves via PostgreSQL to_regclass."""
    rows = db_conn.query("SELECT to_regclass(%s) AS reg", (table,))
    return bool(rows and rows[0].get("reg"))


def _has_column(db_conn: DatabaseConnection, table: str, column: str) -> bool:
    """Return True when *table* has *column*.

    NOTE: matches on schema || '.' || table_name, so *table* must be
    schema-qualified ("schema.table") for this to find anything.
    """
    sql = """
        SELECT 1
        FROM information_schema.columns
        WHERE table_schema || '.' || table_name = %s
          AND column_name = %s
        LIMIT 1
    """
    rows = db_conn.query(sql, (table, column))
    return bool(rows)


def _get_numeric_columns(db_conn: DatabaseConnection, table: str) -> list[str]:
    """Return the names of numeric/decimal/money columns (amount-column candidates).

    Like _has_column, expects a schema-qualified table name.
    """
    sql = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema || '.' || table_name = %s
          AND data_type IN ('numeric', 'decimal', 'money')
        ORDER BY ordinal_position
    """
    rows = db_conn.query(sql, (table,))
    return [r["column_name"] for r in rows]


def _get_pk_columns(db_conn: DatabaseConnection, table: str) -> list[str]:
    """Return the primary-key column names of *table* in key order.

    The (possibly schema-qualified) name is cast to regclass directly in SQL,
    so no client-side schema/table splitting is needed.
    """
    sql = """
        SELECT a.attname
        FROM pg_index i
        JOIN pg_attribute a
          ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
        WHERE i.indrelid = %s::regclass
          AND i.indisprimary
        ORDER BY array_position(i.indkey, a.attnum)
    """
    rows = db_conn.query(sql, (table,))
    return [r["attname"] for r in rows]


def _sanitize_for_json(obj):
    """Recursively convert values that json cannot serialize natively."""
    if isinstance(obj, dict):
        return {k: _sanitize_for_json(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_sanitize_for_json(v) for v in obj]
    if isinstance(obj, datetime):
        return obj.isoformat()
    if isinstance(obj, Decimal):
        return float(obj)
    if isinstance(obj, set):
        return list(obj)
    return obj


# ── Enabled ODS task specs (kept in sync with task_registry) ──────────
_ENABLED_ODS_SPECS: list[OdsTaskSpec] = [
    spec for spec in ODS_TASK_SPECS
    if spec.code in {m.upper() for m in default_registry.get_tasks_by_layer("ODS")}
]


# ══════════════════════════════════════════════════════════════
# Step 1: API → ODS row-count comparison
# ══════════════════════════════════════════════════════════════


def _check_api_vs_ods(
    db_conn: DatabaseConnection,
    api_client: APIClient,
    config: AppConfig,
    logger: logging.Logger,
) -> list[BlackboxCheckResult]:
    """Pull every enabled endpoint from the API and compare row counts with ODS."""
    logger.info("=" * 60)
    logger.info("步骤 1: API → ODS 记录数对比")
    logger.info("=" * 60)
    store_id = int(config.get("app.store_id"))
    results: list[BlackboxCheckResult] = []
    for spec in _ENABLED_ODS_SPECS:
        result = BlackboxCheckResult(
            layer="API_ODS",
            source_table=f"API:{spec.endpoint}",
            target_table=spec.table_name,
        )
        logger.info(" ▶ %s → %s", spec.code, spec.table_name)
        # Query the ODS row count first; API work is pointless if this fails.
        try:
            if not _table_exists(db_conn, spec.table_name):
                result.status = "ERROR"
                result.message = f"ODS 表不存在: {spec.table_name}"
                logger.warning(" ✗ %s", result.message)
                results.append(result)
                continue
            ods_count = _query_count(db_conn, spec.table_name)
            result.target_count = ods_count
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"查询 ODS 表失败: {exc}"
            logger.error(" ✗ %s", result.message)
            results.append(result)
            continue
        # ODS_SETTLEMENT_TICKET is a special per-record task; skip API comparison.
        if spec.code == "ODS_SETTLEMENT_TICKET":
            result.status = "SKIP"
            result.message = f"特殊任务跳过 API 对比, ODS 行数={ods_count}"
            logger.info(" ⊘ %s", result.message)
            results.append(result)
            continue
        # Pull the full record set from the API (no time-window params, so it
        # is comparable with the whole ODS table).
        try:
            params: dict[str, Any] = {}
            if spec.include_site_id:
                params["siteId"] = store_id
            all_records, _ = api_client.get_paginated(
                endpoint=spec.endpoint,
                params=params,
                page_size=200,
                data_path=spec.data_path,
                list_key=spec.list_key,
            )
            api_count = len(all_records)
            result.source_count = api_count
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"API 拉取失败: {exc}"
            logger.error(" ✗ %s", result.message)
            results.append(result)
            continue
        # Compare; tolerate 1% or up to 5 rows of drift (history/soft deletes).
        diff = ods_count - api_count
        result.count_diff = diff
        if diff == 0:
            result.status = "PASS"
            result.message = f"记录数一致: API={api_count}, ODS={ods_count}"
        elif abs(diff) <= max(5, int(api_count * 0.01)):
            result.status = "WARN"
            result.message = f"记录数微差: API={api_count}, ODS={ods_count}, diff={diff}"
        else:
            result.status = "FAIL"
            result.message = f"记录数差异较大: API={api_count}, ODS={ods_count}, diff={diff}"
        icon = {"PASS": "✓", "WARN": "⚠", "FAIL": "✗"}.get(result.status, "?")
        logger.info(" %s %s", icon, result.message)
        results.append(result)
    db_conn.ensure_open()
    return results


# ══════════════════════════════════════════════════════════════
# Step 2: ODS → DWD row counts + amount sums
# ══════════════════════════════════════════════════════════════


def _check_ods_vs_dwd(
    db_conn: DatabaseConnection,
    logger: logging.Logger,
) -> list[BlackboxCheckResult]:
    """Compare ODS vs DWD row counts and amount-column sums per TABLE_MAP pair."""
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 2: ODS → DWD 记录数 + 金额对比")
    logger.info("=" * 60)
    table_map = DwdLoadTask.TABLE_MAP
    results: list[BlackboxCheckResult] = []
    # Iterate DWD → ODS pairs (several DWD tables may map to the same ODS table).
    for dwd_table, ods_table in sorted(table_map.items()):
        result = BlackboxCheckResult(
            layer="ODS_DWD",
            source_table=ods_table,
            target_table=dwd_table,
        )
        logger.info(" ▶ %s → %s", ods_table, dwd_table)
        # Both tables must exist before any comparison.
        for tbl, label in [(ods_table, "ODS"), (dwd_table, "DWD")]:
            if not _table_exists(db_conn, tbl):
                result.status = "ERROR"
                result.message = f"{label} 表不存在: {tbl}"
                logger.warning(" ✗ %s", result.message)
                break
        if result.status == "ERROR":
            results.append(result)
            continue
        try:
            ods_count = _query_count(db_conn, ods_table)
            dwd_count = _query_count(db_conn, dwd_table)
            result.source_count = ods_count
            result.target_count = dwd_count
            result.count_diff = dwd_count - ods_count
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"查询记录数失败: {exc}"
            logger.error(" ✗ %s", result.message)
            results.append(result)
            continue
        # Dimension tables (dim_*) may have DWD rows >= ODS rows due to SCD2.
        is_dim = "dim_" in dwd_table.split(".")[-1]
        # Amount comparison: sum the numeric columns both tables share.
        try:
            ods_num_cols = set(_get_numeric_columns(db_conn, ods_table))
            dwd_num_cols = set(_get_numeric_columns(db_conn, dwd_table))
            common_amount_cols = sorted(ods_num_cols & dwd_num_cols)
            amount_diffs = []
            for col in common_amount_cols[:5]:  # compare at most 5 amount columns
                try:
                    ods_sum_rows = db_conn.query(
                        f'SELECT COALESCE(SUM("{col}"), 0) AS s FROM {ods_table}'
                    )
                    dwd_sum_rows = db_conn.query(
                        f'SELECT COALESCE(SUM("{col}"), 0) AS s FROM {dwd_table}'
                    )
                    ods_sum = float(ods_sum_rows[0]["s"]) if ods_sum_rows else 0.0
                    dwd_sum = float(dwd_sum_rows[0]["s"]) if dwd_sum_rows else 0.0
                    diff = round(dwd_sum - ods_sum, 2)
                    if abs(diff) > 0.01:
                        amount_diffs.append({
                            "column": col,
                            "ods_sum": ods_sum,
                            "dwd_sum": dwd_sum,
                            "diff": diff,
                        })
                except Exception:
                    pass  # best-effort: skip columns that cannot be aggregated
            result.amount_diffs = amount_diffs
        except Exception:
            pass  # amount comparison is optional; count check still stands
        # Status decision.
        if is_dim:
            # Dimension table: DWD >= ODS is expected (SCD2 history versions).
            if dwd_count >= ods_count:
                result.status = "PASS"
                result.message = (
                    f"维度表 SCD2: ODS={ods_count}, DWD={dwd_count} "
                    f"(+{dwd_count - ods_count} 历史版本)"
                )
            else:
                result.status = "WARN"
                result.message = f"维度表 DWD < ODS: ODS={ods_count}, DWD={dwd_count}"
        else:
            # Fact table: counts should roughly match (2% / 5-row tolerance).
            if result.count_diff == 0:
                result.status = "PASS"
                result.message = f"记录数一致: {ods_count}"
            elif abs(result.count_diff) <= max(5, int(ods_count * 0.02)):
                result.status = "WARN"
                result.message = (
                    f"记录数微差: ODS={ods_count}, DWD={dwd_count}, "
                    f"diff={result.count_diff}"
                )
            else:
                result.status = "FAIL"
                result.message = (
                    f"记录数差异较大: ODS={ods_count}, DWD={dwd_count}, "
                    f"diff={result.count_diff}"
                )
        if result.amount_diffs:
            # Escalate to at least WARN when any amount column disagrees.
            result.status = max(result.status, "WARN",
                                key=lambda s: ["PASS", "WARN", "FAIL", "ERROR"].index(s)
                                if s in ["PASS", "WARN", "FAIL", "ERROR"] else 0)
            result.message += f" | {len(result.amount_diffs)} 个金额列有差异"
        icon = {"PASS": "✓", "WARN": "⚠", "FAIL": "✗", "ERROR": "✗"}.get(result.status, "?")
        logger.info(" %s %s", icon, result.message)
        results.append(result)
    return results
# ══════════════════════════════════════════════════════════════
# Step 3: DWD → DWS aggregation consistency
# ══════════════════════════════════════════════════════════════

# Known DWS→DWD aggregation mappings.
# NOTE(review): only "dwd_source" and "dws_date_col" are read by
# _check_dwd_vs_dws below; "dwd_date_col"/"dwd_date_cast" look intended for a
# per-date DWD comparison that is not implemented yet — confirm.
_DWS_DWD_MAP: dict[str, dict] = {
    "dws.dws_assistant_daily_detail": {
        "dwd_source": "dwd.dwd_assistant_service_log",
        "dws_date_col": "stat_date",
        "dwd_date_col": "service_date",
        "description": "助教日度明细 vs DWD 服务流水",
    },
    "dws.dws_finance_daily_summary": {
        "dwd_source": "dwd.dwd_settlement_head",
        "dws_date_col": "stat_date",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "::date",
        "description": "财务日度汇总 vs DWD 结账记录",
    },
    "dws.dws_member_visit_detail": {
        "dwd_source": "dwd.dwd_settlement_head",
        "dws_date_col": "visit_date",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "::date",
        "description": "会员到店明细 vs DWD 结账记录",
    },
    "dws.dws_member_consumption_summary": {
        "dwd_source": "dwd.dwd_settlement_head",
        "dws_date_col": "stat_month",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "date_trunc('month', %col%)::date",
        "description": "会员消费汇总 vs DWD 结账记录",
    },
    "dws.dws_finance_recharge_summary": {
        "dwd_source": "dwd.dwd_recharge_order",
        "dws_date_col": "stat_date",
        "dwd_date_col": "pay_time",
        "dwd_date_cast": "::date",
        "description": "充值汇总 vs DWD 充值订单",
    },
}


def _check_dwd_vs_dws(
    db_conn: DatabaseConnection,
    config: AppConfig,
    logger: logging.Logger,
) -> list[BlackboxCheckResult]:
    """Verify that DWS summary tables are consistent with their DWD detail sources."""
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 3: DWD → DWS 聚合一致性")
    logger.info("=" * 60)
    results: list[BlackboxCheckResult] = []
    # Discover the target tables of all registered DWS-layer tasks.
    dws_codes = default_registry.get_tasks_by_layer("DWS")
    dws_tables: list[str] = []
    for code in sorted(dws_codes):
        meta = default_registry.get_metadata(code)
        if meta is None:
            continue
        try:
            # Instantiate the task only to ask it for its target table name.
            inst = meta.task_class(config, db_conn, None, logging.getLogger("noop"))
            raw = inst.get_target_table()
            # Default to the "dws" schema when the name is unqualified.
            full = f"dws.{raw}" if raw and "." not in raw else raw
            if full:
                dws_tables.append(full)
        except Exception:
            pass  # best-effort discovery: skip tasks that fail to instantiate
    for dws_table in sorted(set(dws_tables)):
        result = BlackboxCheckResult(layer="DWD_DWS", target_table=dws_table)
        if not _table_exists(db_conn, dws_table):
            result.status = "SKIP"
            result.message = f"DWS 表不存在: {dws_table}"
            logger.info(" ⊘ %s", result.message)
            results.append(result)
            continue
        mapping = _DWS_DWD_MAP.get(dws_table)
        if not mapping:
            # No known mapping: only check that the table has rows at all.
            try:
                dws_count = _query_count(db_conn, dws_table)
                result.target_count = dws_count
                result.status = "PASS" if dws_count > 0 else "WARN"
                result.message = f"无映射关系, DWS 行数={dws_count}"
            except Exception as exc:
                result.status = "ERROR"
                result.message = f"查询失败: {exc}"
            logger.info(" ℹ %s: %s", dws_table, result.message)
            results.append(result)
            continue
        dwd_source = mapping["dwd_source"]
        result.source_table = dwd_source
        logger.info(" ▶ %s → %s (%s)", dwd_source, dws_table, mapping["description"])
        if not _table_exists(db_conn, dwd_source):
            result.status = "ERROR"
            result.message = f"DWD 源表不存在: {dwd_source}"
            logger.warning(" ✗ %s", result.message)
            results.append(result)
            continue
        try:
            dws_count = _query_count(db_conn, dws_table)
            dwd_count = _query_count(db_conn, dwd_source)
            result.source_count = dwd_count
            result.target_count = dws_count
            result.count_diff = dws_count - dwd_count
            # DWS is an aggregate, so its row count is normally <= DWD's;
            # only flag the clearly inconsistent empty/non-empty combinations.
            if dws_count == 0 and dwd_count > 0:
                result.status = "WARN"
                result.message = f"DWS 无数据但 DWD 有 {dwd_count} 行"
            elif dws_count > 0 and dwd_count == 0:
                result.status = "WARN"
                result.message = f"DWS 有 {dws_count} 行但 DWD 无数据"
            else:
                result.status = "PASS"
                result.message = f"DWD={dwd_count}, DWS={dws_count}"
            # Date sampling: record row counts for the 3 most recent DWS dates.
            dws_date_col = mapping.get("dws_date_col")
            if dws_date_col and _has_column(db_conn, dws_table, dws_date_col):
                sample_sql = f"""
                    SELECT DISTINCT "{dws_date_col}" AS d
                    FROM {dws_table}
                    ORDER BY d DESC
                    LIMIT 3
                """
                date_rows = db_conn.query(sample_sql)
                date_checks = []
                for dr in date_rows:
                    d = dr["d"]
                    dws_day_rows = db_conn.query(
                        f'SELECT COUNT(*) AS cnt FROM {dws_table} WHERE "{dws_date_col}" = %s',
                        (d,),
                    )
                    dws_day = int(dws_day_rows[0]["cnt"]) if dws_day_rows else 0
                    date_checks.append({"date": str(d), "dws_rows": dws_day})
                result.details["date_samples"] = date_checks
        except Exception as exc:
            result.status = "ERROR"
            result.message = f"对比失败: {exc}"
        icon = {"PASS": "✓", "WARN": "⚠", "FAIL": "✗", "ERROR": "✗", "SKIP": "⊘"}.get(result.status, "?")
        logger.info(" %s %s", icon, result.message)
        results.append(result)
    return results
# ══════════════════════════════════════════════════════════════
# Step 4: suspect-value detection
# ══════════════════════════════════════════════════════════════


def _detect_suspects(
    db_conn: DatabaseConnection,
    logger: logging.Logger,
) -> list[SuspectRecord]:
    """Scan ODS and DWD tables for edge values, NULL keys, and duplicate rows."""
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 4: 可疑值检测")
    logger.info("=" * 60)
    suspects: list[SuspectRecord] = []
    # Scan every enabled ODS table plus every DWD target table.
    tables_to_scan: list[str] = []
    for spec in _ENABLED_ODS_SPECS:
        tables_to_scan.append(spec.table_name)
    for dwd_table in sorted(DwdLoadTask.TABLE_MAP.keys()):
        tables_to_scan.append(dwd_table)
    for table in sorted(set(tables_to_scan)):
        if not _table_exists(db_conn, table):
            continue
        logger.info(" 扫描 %s ...", table)
        pk_cols = _get_pk_columns(db_conn, table)
        # 4a. NULL values in primary-key columns.
        if pk_cols:
            for pk in pk_cols:
                try:
                    null_sql = f'SELECT COUNT(*) AS cnt FROM {table} WHERE "{pk}" IS NULL'
                    rows = db_conn.query(null_sql)
                    null_count = int(rows[0]["cnt"]) if rows else 0
                    if null_count > 0:
                        suspects.append(SuspectRecord(
                            table=table,
                            check_type="null_pk",
                            count=null_count,
                            message=f"主键列 {pk} 有 {null_count} 个 NULL 值",
                        ))
                        logger.warning(" ⚠ %s: 主键 %s 有 %d 个 NULL", table, pk, null_count)
                except Exception:
                    pass  # best-effort scan: a failing check must not stop the rest
        # 4b. Duplicate rows keyed on the primary key.
        if pk_cols:
            pk_expr = ", ".join(f'"{c}"' for c in pk_cols)
            try:
                dup_sql = f"""
                    SELECT {pk_expr}, COUNT(*) AS cnt
                    FROM {table}
                    GROUP BY {pk_expr}
                    HAVING COUNT(*) > 1
                    LIMIT 10
                """
                dup_rows = db_conn.query(dup_sql)
                if dup_rows:
                    dup_count = len(dup_rows)
                    sample_keys = [
                        {c: r[c] for c in pk_cols if c in r}
                        for r in dup_rows[:5]
                    ]
                    suspects.append(SuspectRecord(
                        table=table,
                        check_type="duplicate",
                        count=dup_count,
                        sample_keys=sample_keys,
                        message=f"发现 {dup_count} 组重复主键",
                    ))
                    logger.warning(" ⚠ %s: %d 组重复主键", table, dup_count)
            except Exception:
                pass
        # 4c. Negative / extreme values in amount-like numeric columns.
        try:
            num_cols = _get_numeric_columns(db_conn, table)
            # Only consider columns whose name suggests an amount.
            amount_keywords = ("amount", "money", "price", "fee", "sum", "balance", "cost")
            amount_cols = [c for c in num_cols if any(k in c.lower() for k in amount_keywords)]
            for col in amount_cols[:5]:
                try:
                    neg_sql = f'SELECT COUNT(*) AS cnt FROM {table} WHERE "{col}" < 0'
                    neg_rows = db_conn.query(neg_sql)
                    neg_count = int(neg_rows[0]["cnt"]) if neg_rows else 0
                    if neg_count > 0:
                        suspects.append(SuspectRecord(
                            table=table,
                            check_type="negative_amount",
                            count=neg_count,
                            message=f"金额列 {col} 有 {neg_count} 个负值",
                        ))
                        logger.info(" ℹ %s.%s: %d 个负值", table, col, neg_count)
                except Exception:
                    pass
                # Extreme values: absolute amount above one million.
                try:
                    edge_sql = f'SELECT COUNT(*) AS cnt FROM {table} WHERE ABS("{col}") > 1000000'
                    edge_rows = db_conn.query(edge_sql)
                    edge_count = int(edge_rows[0]["cnt"]) if edge_rows else 0
                    if edge_count > 0:
                        suspects.append(SuspectRecord(
                            table=table,
                            check_type="edge_value",
                            count=edge_count,
                            message=f"金额列 {col} 有 {edge_count} 个超百万值",
                        ))
                        logger.warning(" ⚠ %s.%s: %d 个超百万值", table, col, edge_count)
                except Exception:
                    pass
        except Exception:
            pass
    db_conn.ensure_open()
    logger.info(" 可疑值检测完成, 共发现 %d 项", len(suspects))
    return suspects


# ══════════════════════════════════════════════════════════════
# Step 5: sample comparison API vs ODS
# ══════════════════════════════════════════════════════════════


def _sample_compare_api_vs_ods(
    db_conn: DatabaseConnection,
    api_client: APIClient,
    config: AppConfig,
    logger: logging.Logger,
    sample_size: int = 100,
) -> list[SampleMismatch]:
    """Randomly sample ODS rows and compare them field by field with API data."""
    logger.info("")
    logger.info("=" * 60)
    logger.info("步骤 5: 抽样比对 API vs ODS (%d 条)", sample_size)
    logger.info("=" * 60)
    store_id = int(config.get("app.store_id"))
    mismatches: list[SampleMismatch] = []
    total_sampled = 0
    # Only sample specs with a plain "id" primary key and no special handling.
    sample_specs = [
        s for s in _ENABLED_ODS_SPECS
        if s.code != "ODS_SETTLEMENT_TICKET"
        and any(pk.column == "id" for pk in s.pk_columns)
    ]
    # Spread the sample budget evenly over the eligible tables.
    per_table = max(1, sample_size // max(len(sample_specs), 1))
    remaining = sample_size
    for spec in sample_specs:
        if remaining <= 0:
            break
        n = min(per_table, remaining)
        logger.info(" ▶ %s: 抽样 %d 条", spec.table_name, n)
        if not _table_exists(db_conn, spec.table_name):
            continue
        # Randomly pick ids from ODS.
        try:
            sample_sql = f"""
                SELECT id FROM {spec.table_name}
                ORDER BY RANDOM()
                LIMIT %s
            """
            id_rows = db_conn.query(sample_sql, (n,))
            if not id_rows:
                logger.info(" ℹ 表为空,跳过")
                continue
            sample_ids = [r["id"] for r in id_rows]
        except Exception as exc:
            logger.warning(" ⚠ 抽样失败: %s", exc)
            continue
        # Pull the full API dataset once and index it in memory by id.
        try:
            params: dict[str, Any] = {}
            if spec.include_site_id:
                params["siteId"] = store_id
            api_records, _ = api_client.get_paginated(
                endpoint=spec.endpoint,
                params=params,
                page_size=200,
                data_path=spec.data_path,
                list_key=spec.list_key,
            )
            api_by_id: dict[int, dict] = {}
            for rec in api_records:
                rec_id = rec.get("id")
                if rec_id is not None:
                    try:
                        api_by_id[int(rec_id)] = rec
                    except (ValueError, TypeError):
                        pass  # skip records whose id is not coercible to int
        except Exception as exc:
            logger.warning(" ⚠ API 拉取失败: %s", exc)
            continue
        # Compare each sampled record.
        table_mismatches = 0
        for sid in sample_ids:
            try:
                sid_int = int(sid)
            except (ValueError, TypeError):
                continue
            # Read the stored payload for this row from ODS.
            try:
                ods_row = db_conn.query(
                    f"SELECT payload FROM {spec.table_name} WHERE id = %s LIMIT 1",
                    (sid_int,),
                )
                if not ods_row or not ods_row[0].get("payload"):
                    continue
                ods_payload = ods_row[0]["payload"]
                # payload may be a JSON string or an already-parsed dict.
                if isinstance(ods_payload, str):
                    ods_payload = json.loads(ods_payload)
            except Exception:
                continue
            api_rec = api_by_id.get(sid_int)
            if api_rec is None:
                # Row exists in ODS but not in the current API dataset.
                mismatches.append(SampleMismatch(
                    table=spec.table_name,
                    pk_value=sid_int,
                    field_name="__missing__",
                    api_value=None,
                    ods_value="exists",
                ))
                table_mismatches += 1
                continue
            # Compare field by field (only the API record's top-level keys).
            for key, api_val in api_rec.items():
                ods_val = ods_payload.get(key)
                if not _values_match(api_val, ods_val):
                    mismatches.append(SampleMismatch(
                        table=spec.table_name,
                        pk_value=sid_int,
                        field_name=key,
                        api_value=str(api_val)[:200],
                        ods_value=str(ods_val)[:200],
                    ))
                    table_mismatches += 1
            total_sampled += 1
        remaining -= len(sample_ids)
        if table_mismatches > 0:
            logger.info(" ⚠ %d 个字段不一致", table_mismatches)
        else:
            logger.info(" ✓ 抽样一致")
    db_conn.ensure_open()
    logger.info(" 抽样比对完成: 共抽样 %d 条, 发现 %d 处不一致", total_sampled, len(mismatches))
    return mismatches
def _values_match(api_val: Any, ods_val: Any) -> bool:
    """Loosely compare two values for equality.

    Tolerates common representation differences: int vs str, None vs
    empty-ish values, float rounding, and nested dict/list ordering.
    """
    if api_val is None and ods_val is None:
        return True
    # Treat None as equivalent to common "empty" markers on either side.
    if api_val is None and ods_val in ("", 0, "0", False):
        return True
    if ods_val is None and api_val in ("", 0, "0", False):
        return True
    # String comparison handles int/str representation differences.
    str_api = str(api_val).strip() if api_val is not None else ""
    str_ods = str(ods_val).strip() if ods_val is not None else ""
    if str_api == str_ods:
        return True
    # Numeric comparison with a small tolerance for float precision.
    try:
        if abs(float(str_api) - float(str_ods)) < 0.01:
            return True
    except (ValueError, TypeError):
        pass
    # Nested objects/lists: compare canonical JSON encodings.
    if isinstance(api_val, (dict, list)) or isinstance(ods_val, (dict, list)):
        try:
            return json.dumps(api_val, sort_keys=True, default=str) == json.dumps(ods_val, sort_keys=True, default=str)
        except Exception:
            pass
    return False


# ══════════════════════════════════════════════════════════════
# Report generation
# ══════════════════════════════════════════════════════════════


def _generate_report(
    api_ods_results: list[BlackboxCheckResult],
    ods_dwd_results: list[BlackboxCheckResult],
    dwd_dws_results: list[BlackboxCheckResult],
    suspects: list[SuspectRecord],
    sample_mismatches: list[SampleMismatch],
    config: AppConfig,
    tz: ZoneInfo,
) -> dict:
    """Aggregate all check results into one structured report dict."""
    now = datetime.now(tz)
    all_results = api_ods_results + ods_dwd_results + dwd_dws_results
    pass_count = sum(1 for r in all_results if r.status == "PASS")
    warn_count = sum(1 for r in all_results if r.status == "WARN")
    fail_count = sum(1 for r in all_results if r.status == "FAIL")
    error_count = sum(1 for r in all_results if r.status == "ERROR")
    skip_count = sum(1 for r in all_results if r.status == "SKIP")
    report = {
        "title": "黑盒数据完整性校验报告",
        "generated_at": now.isoformat(),
        "environment": {
            "store_id": config.get("app.store_id"),
            "db_name": config.get("db.name", ""),
            "api_base": config.get("api.base_url", ""),
        },
        "summary": {
            "total_checks": len(all_results),
            "pass": pass_count,
            "warn": warn_count,
            "fail": fail_count,
            "error": error_count,
            "skip": skip_count,
            "suspect_count": len(suspects),
            "sample_mismatch_count": len(sample_mismatches),
        },
        "api_ods": [asdict(r) for r in api_ods_results],
        "ods_dwd": [asdict(r) for r in ods_dwd_results],
        "dwd_dws": [asdict(r) for r in dwd_dws_results],
        "suspects": [asdict(s) for s in suspects],
        # Cap the mismatch list to keep the report file bounded.
        "sample_mismatches": [asdict(m) for m in sample_mismatches[:200]],
    }
    return report


def _generate_markdown_report(report: dict) -> str:
    """Render the structured report as a Markdown document."""
    lines: list[str] = []
    summary = report["summary"]
    lines.append(f"# {report['title']}")
    lines.append("")
    lines.append(f"生成时间: {report['generated_at']}")
    lines.append("")
    # Environment section.
    env = report["environment"]
    lines.append("## 环境信息")
    lines.append(f"- 门店 ID: {env.get('store_id')}")
    lines.append(f"- 数据库: {env.get('db_name')}")
    lines.append(f"- API: {env.get('api_base')}")
    lines.append("")
    # Summary section.
    lines.append("## 校验汇总")
    lines.append(f"| 指标 | 数值 |")
    lines.append(f"|------|------|")
    lines.append(f"| 总检查项 | {summary['total_checks']} |")
    lines.append(f"| ✓ PASS | {summary['pass']} |")
    lines.append(f"| ⚠ WARN | {summary['warn']} |")
    lines.append(f"| ✗ FAIL | {summary['fail']} |")
    lines.append(f"| ✗ ERROR | {summary['error']} |")
    lines.append(f"| ⊘ SKIP | {summary['skip']} |")
    lines.append(f"| 可疑值 | {summary['suspect_count']} |")
    lines.append(f"| 抽样不一致 | {summary['sample_mismatch_count']} |")
    lines.append("")
    # Step 1: API → ODS.
    lines.append("## 步骤 1: API → ODS 记录数对比")
    lines.append("")
    _append_result_table(lines, report.get("api_ods", []))
    # Step 2: ODS → DWD.
    lines.append("## 步骤 2: ODS → DWD 记录数 + 金额对比")
    lines.append("")
    _append_result_table(lines, report.get("ods_dwd", []))
    # Amount-difference detail tables.
    amount_issues = [r for r in report.get("ods_dwd", []) if r.get("amount_diffs")]
    if amount_issues:
        lines.append("### 金额差异详情")
        lines.append("")
        for r in amount_issues:
            lines.append(f"**{r['source_table']} → {r['target_table']}**")
            lines.append("")
            lines.append("| 列名 | ODS 汇总 | DWD 汇总 | 差异 |")
            lines.append("|------|----------|----------|------|")
            for ad in r["amount_diffs"]:
                lines.append(
                    f"| {ad['column']} | {ad['ods_sum']:.2f} | {ad['dwd_sum']:.2f} | {ad['diff']:.2f} |"
                )
            lines.append("")
    # Step 3: DWD → DWS.
    lines.append("## 步骤 3: DWD → DWS 聚合一致性")
    lines.append("")
    _append_result_table(lines, report.get("dwd_dws", []))
    # Step 4: suspect values.
    lines.append("## 步骤 4: 可疑值检测")
    lines.append("")
    suspects_data = report.get("suspects", [])
    if suspects_data:
        lines.append("| 表 | 类型 | 数量 | 说明 |")
        lines.append("|---|------|------|------|")
        for s in suspects_data:
            lines.append(f"| {s['table']} | {s['check_type']} | {s['count']} | {s['message']} |")
        lines.append("")
    else:
        lines.append("未发现可疑值。")
        lines.append("")
    # Step 5: sample comparison.
    lines.append("## 步骤 5: 抽样比对 API vs ODS")
    lines.append("")
    sample_data = report.get("sample_mismatches", [])
    if sample_data:
        lines.append(f"共发现 {len(sample_data)} 处不一致:")
        lines.append("")
        lines.append("| 表 | 主键 | 字段 | API 值 | ODS 值 |")
        lines.append("|---|------|------|--------|--------|")
        for m in sample_data[:50]:  # show at most 50 rows
            lines.append(
                f"| {m['table']} | {m['pk_value']} | {m['field_name']} "
                f"| {str(m.get('api_value', ''))[:60]} | {str(m.get('ods_value', ''))[:60]} |"
            )
        if len(sample_data) > 50:
            lines.append(f"| ... | 共 {len(sample_data)} 条,仅展示前 50 条 | | | |")
        lines.append("")
    else:
        lines.append("抽样比对全部一致。")
        lines.append("")
    return "\n".join(lines)
def _append_result_table(lines: list[str], results: list[dict]):
    """Append a Markdown table of check results to *lines* (in place)."""
    if not results:
        lines.append("无数据。")
        lines.append("")
        return
    lines.append("| 状态 | 源 | 目标 | 源行数 | 目标行数 | 差异 | 说明 |")
    lines.append("|------|---|------|--------|----------|------|------|")
    for r in results:
        icon = {"PASS": "✓", "WARN": "⚠", "FAIL": "✗", "ERROR": "✗", "SKIP": "⊘"}.get(r.get("status", ""), "?")
        lines.append(
            f"| {icon} {r.get('status', '')} "
            f"| {r.get('source_table', '')} "
            f"| {r.get('target_table', '')} "
            f"| {r.get('source_count', '')} "
            f"| {r.get('target_count', '')} "
            f"| {r.get('count_diff', '')} "
            f"| {r.get('message', '')} |"
        )
    lines.append("")


# ══════════════════════════════════════════════════════════════
# Main flow
# ══════════════════════════════════════════════════════════════


def run_blackbox_check(
    sample_size: int = 100,
) -> dict:
    """Run the full black-box data verification.

    Each step is individually fenced with try/except so a failing step only
    empties its own section of the report. The DB connection is always closed,
    even when a step or the report writing raises (fixed: close() used to be
    unconditional-but-skippable on exception).

    Args:
        sample_size: number of records for the step-5 sample comparison.

    Returns:
        The structured verification report dict.
    """
    logger = _setup_logging()
    logger.info("╔" + "═" * 58 + "╗")
    logger.info("║ 黑盒数据完整性校验 ║")
    logger.info("╚" + "═" * 58 + "╝")
    # Load configuration.
    config = AppConfig.load()
    tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
    logger.info("门店 ID: %s", config.get("app.store_id"))
    logger.info("数据库: %s", config.get("db.name", ""))
    logger.info("API: %s", config.get("api.base_url", ""))
    logger.info("")
    # Build components.
    db_conn, api_client = _build_components(config, logger)
    t0 = time.monotonic()
    try:
        # Step 1: API → ODS.
        try:
            api_ods_results = _check_api_vs_ods(db_conn, api_client, config, logger)
        except Exception as exc:
            logger.error("步骤 1 异常: %s", exc)
            logger.error(traceback.format_exc())
            api_ods_results = []
        db_conn.ensure_open()
        # Step 2: ODS → DWD.
        try:
            ods_dwd_results = _check_ods_vs_dwd(db_conn, logger)
        except Exception as exc:
            logger.error("步骤 2 异常: %s", exc)
            logger.error(traceback.format_exc())
            ods_dwd_results = []
        db_conn.ensure_open()
        # Step 3: DWD → DWS.
        try:
            dwd_dws_results = _check_dwd_vs_dws(db_conn, config, logger)
        except Exception as exc:
            logger.error("步骤 3 异常: %s", exc)
            logger.error(traceback.format_exc())
            dwd_dws_results = []
        db_conn.ensure_open()
        # Step 4: suspect-value detection.
        try:
            suspects = _detect_suspects(db_conn, logger)
        except Exception as exc:
            logger.error("步骤 4 异常: %s", exc)
            logger.error(traceback.format_exc())
            suspects = []
        db_conn.ensure_open()
        # Step 5: sample comparison.
        try:
            sample_mismatches = _sample_compare_api_vs_ods(
                db_conn, api_client, config, logger, sample_size=sample_size,
            )
        except Exception as exc:
            logger.error("步骤 5 异常: %s", exc)
            logger.error(traceback.format_exc())
            sample_mismatches = []
        total_sec = round(time.monotonic() - t0, 1)
        # Build the report.
        report = _generate_report(
            api_ods_results, ods_dwd_results, dwd_dws_results,
            suspects, sample_mismatches, config, tz,
        )
        report["duration_sec"] = total_sec
        # Write the JSON report.
        output_dir = _FEIQIU_ROOT / "scripts" / "debug" / "output"
        output_dir.mkdir(parents=True, exist_ok=True)
        ts = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
        json_path = output_dir / f"blackbox_{ts}.json"
        json_path.write_text(
            json.dumps(_sanitize_for_json(report), ensure_ascii=False, indent=2, default=str),
            encoding="utf-8",
        )
        logger.info("")
        logger.info("JSON 报告: %s", json_path)
        # Write the Markdown report.
        reports_dir = _FEIQIU_ROOT / "docs" / "reports"
        reports_dir.mkdir(parents=True, exist_ok=True)
        md_path = reports_dir / f"blackbox_report_{ts}.md"
        md_content = _generate_markdown_report(report)
        md_path.write_text(md_content, encoding="utf-8")
        logger.info("Markdown 报告: %s", md_path)
        # Print the summary.
        logger.info("")
        logger.info("=" * 60)
        logger.info("黑盒校验汇总")
        logger.info("=" * 60)
        s = report["summary"]
        logger.info(" 总检查项: %d", s["total_checks"])
        logger.info(" ✓ PASS: %d", s["pass"])
        logger.info(" ⚠ WARN: %d", s["warn"])
        logger.info(" ✗ FAIL: %d", s["fail"])
        logger.info(" ✗ ERROR: %d", s["error"])
        logger.info(" ⊘ SKIP: %d", s["skip"])
        logger.info(" 可疑值: %d 项", s["suspect_count"])
        logger.info(" 抽样不一致: %d 处", s["sample_mismatch_count"])
        logger.info(" 总耗时: %.1f 秒", total_sec)
        return report
    finally:
        # Always release the DB connection, even if a step or report write raised.
        db_conn.close()


# ── CLI entry point ──────────────────────────────────────────────


def parse_args():
    """Parse command-line arguments for the black-box check."""
    parser = argparse.ArgumentParser(description="黑盒数据完整性校验")
    parser.add_argument("--sample-size", type=int, default=100,
                        help="抽样比对记录数(默认 100)")
    return parser.parse_args()


def main():
    """CLI entry: run the check and exit non-zero when any check FAILs."""
    args = parse_args()
    report = run_blackbox_check(sample_size=args.sample_size)
    # Exit code: non-zero when any FAIL is present.
    has_fail = report["summary"]["fail"] > 0
    sys.exit(1 if has_fail else 0)


if __name__ == "__main__":
    main()