# -*- coding: utf-8 -*-
"""
Black-box data-consistency check across the whole ETL chain.

Black-box approach: take the most recent successful ETL run and compare real
data (JSON dumps + database rows) table by table and field by field along
API -> ODS -> DWD -> DWS/INDEX, then emit a detailed difference report.

Usage:
    cd C:\\NeoZQYY
    uv run python scripts/ops/etl_consistency_check.py

Output:
    ETL_REPORT_ROOT / consistency_check_<timestamp>.md
"""
from __future__ import annotations

import json
import os
import re
import sys
from datetime import datetime
from decimal import Decimal
from pathlib import Path
from zoneinfo import ZoneInfo

import psycopg2
import psycopg2.extras
from dotenv import load_dotenv

# Repository root: two directory levels above the directory holding this file.
_ROOT = Path(__file__).resolve().parents[2]
# override=False: variables already present in the process environment win.
load_dotenv(_ROOT / ".env", override=False)


# ── Environment variables (a missing one is an error) ──
def _env(var: str) -> Path:
    """Return the directory named by environment variable *var*.

    The directory (and any missing parents) is created as a side effect.

    Raises:
        KeyError: if the variable is unset or empty.
    """
    val = os.environ.get(var)
    if not val:
        raise KeyError(f"环境变量 {var} 未定义")
    p = Path(val)
    p.mkdir(parents=True, exist_ok=True)
    return p


REPORT_ROOT = _env("ETL_REPORT_ROOT")
JSON_ROOT = _env("FETCH_ROOT")
LOG_ROOT = _env("LOG_ROOT")

PG_DSN = os.environ.get("TEST_DB_DSN")
if not PG_DSN:
    raise RuntimeError("TEST_DB_DSN 未在 .env 中定义")

TZ = ZoneInfo("Asia/Shanghai")
# Captured once at import time; used for the report header and the file name.
NOW = datetime.now(TZ)
TS = NOW.strftime("%Y%m%d_%H%M%S")

# Maximum number of rows sampled per table.
SAMPLE_LIMIT = 5
# Row-count threshold for per-field statistics: tables with more rows than
# this skip the COUNT(DISTINCT ...) queries.
FIELD_STATS_DISTINCT_THRESHOLD = 3000

# ── ODS task name -> ODS table name (kept in sync with blackbox_test_report.py) ──
# CHANGE 2026-03-04 | added ODS_STAFF_INFO; it was missing before, so the staff
# table did not take part in the consistency check.
ODS_TASK_TO_TABLE = {
    "ODS_ASSISTANT_ACCOUNT": "assistant_accounts_master",
    "ODS_ASSISTANT_LEDGER": "assistant_service_records",
    "ODS_SETTLEMENT_RECORDS": "settlement_records",
    "ODS_TABLE_USE": "table_fee_transactions",
    "ODS_TABLE_FEE_DISCOUNT": "table_fee_discount_records",
    "ODS_TABLES": "site_tables_master",
    "ODS_PAYMENT": "payment_transactions",
    "ODS_REFUND": "refund_transactions",
    "ODS_PLATFORM_COUPON": "platform_coupon_redemption_records",
    "ODS_MEMBER": "member_profiles",
    "ODS_MEMBER_CARD": "member_stored_value_cards",
    "ODS_MEMBER_BALANCE": "member_balance_changes",
    "ODS_RECHARGE_SETTLE": "recharge_settlements",
    "ODS_GROUP_PACKAGE": "group_buy_packages",
    "ODS_GROUP_BUY_REDEMPTION": "group_buy_redemption_records",
    "ODS_INVENTORY_STOCK": "goods_stock_summary",
    "ODS_INVENTORY_CHANGE": "goods_stock_movements",
    "ODS_GOODS_CATEGORY": "stock_goods_category_tree",
    "ODS_STORE_GOODS": "store_goods_master",
    "ODS_STORE_GOODS_SALES": "store_goods_sales_records",
    "ODS_TENANT_GOODS": "tenant_goods_master",
    "ODS_STAFF_INFO": "staff_info_master",
}

# DWD table -> ODS source table
# CHANGE 2026-03-04 | added dim_staff, dim_site,
#   dwd_goods_stock_movement and dwd_goods_stock_summary, which were missing.
# CHANGE 2026-03-04 | removed dwd_assistant_trash_event (the table was DROPped
#   on 2026-02-22 and must not be resurrected).
DWD_TO_ODS = {
    "dwd.dim_assistant": "ods.assistant_accounts_master",
    "dwd.dim_member": "ods.member_profiles",
    "dwd.dim_member_card_account": "ods.member_stored_value_cards",
    "dwd.dim_table": "ods.site_tables_master",
    "dwd.dim_groupbuy_package": "ods.group_buy_packages",
    "dwd.dim_store_goods": "ods.store_goods_master",
    "dwd.dim_tenant_goods": "ods.tenant_goods_master",
    "dwd.dim_goods_category": "ods.stock_goods_category_tree",
    "dwd.dim_staff": "ods.staff_info_master",
    "dwd.dim_site": "ods.site_tables_master",
    "dwd.dwd_assistant_service_log": "ods.assistant_service_records",
    "dwd.dwd_member_balance_change": "ods.member_balance_changes",
    "dwd.dwd_recharge_order": "ods.recharge_settlements",
    "dwd.dwd_settlement_head": "ods.settlement_records",
    "dwd.dwd_table_fee_log": "ods.table_fee_transactions",
    "dwd.dwd_table_fee_adjust": "ods.table_fee_discount_records",
    "dwd.dwd_payment": "ods.payment_transactions",
    "dwd.dwd_refund": "ods.refund_transactions",
    "dwd.dwd_platform_coupon_redemption": "ods.platform_coupon_redemption_records",
    "dwd.dwd_groupbuy_redemption": "ods.group_buy_redemption_records",
    "dwd.dwd_store_goods_sale": "ods.store_goods_sales_records",
    "dwd.dwd_goods_stock_movement": "ods.goods_stock_movements",
    "dwd.dwd_goods_stock_summary": "ods.goods_stock_summary",
}

# ETL metadata columns (excluded from value comparison).
ETL_META_COLS = {"source_file", "source_endpoint", "fetched_at", "payload", "content_hash"}

# SCD2 bookkeeping columns (excluded from the ODS -> DWD value comparison).
SCD2_COLS = {
    "valid_from", "valid_to", "is_current", "etl_loaded_at", "etl_batch_id",
    "scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version",
}
# ── Database connection ──
def get_conn():
    """Open a read-only PostgreSQL connection whose cursors return dict rows.

    The DSN comes from the module-level ``PG_DSN``. The session is switched
    to read-only so the checker cannot modify the database.
    """
    conn = psycopg2.connect(PG_DSN, cursor_factory=psycopg2.extras.RealDictCursor)
    conn.set_session(readonly=True)
    return conn


# ── ETL log parsing ──
def find_latest_log() -> Path | None:
    """Return the most recently modified ``*.log`` file in LOG_ROOT, or None."""
    return max(LOG_ROOT.glob("*.log"), key=lambda p: p.stat().st_mtime, default=None)


def parse_etl_log(log_path: Path) -> dict:
    """Parse an ETL log file and collect the outcome of every task it mentions.

    Args:
        log_path: path of the log file (read as UTF-8; undecodable bytes are
            replaced rather than aborting the run).

    Returns:
        A dict keyed by task name. Each value holds ``status``
        (``SUCC`` / ``FAIL`` / ``SKIP``) and ``layer``, plus, depending on the
        matched line, ``start``, ``end``, ``counts`` and ``error``.
    """
    # Local import: this is the only place the module is needed.
    import ast

    ts = r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})"
    # Compiled once instead of being re-looked-up for every log line.
    re_ods_start = re.compile(ts + r".*开始执行(\w+) \(ODS\)")
    re_ods_done = re.compile(ts + r".*?(\w+) ODS 任务完成: (\{.*\})")
    re_dwd_done = re.compile(ts + r".*DWD_LOAD_FROM_ODS.*完成.*?(\d+).*表")
    re_util_ok = re.compile(ts + r".*?(\w+): 工具类任务执行成功")
    re_util_fail = re.compile(ts + r".*?(\w+): 工具类任务执行失败: (.*)")
    re_disabled = re.compile(r".*任务 (\w+) 未启用或不存在")

    results: dict = {}
    task_start_times: dict = {}

    with open(log_path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            m = re_ods_start.match(line)
            if m:
                task_start_times[m.group(2)] = m.group(1)
                continue

            m = re_ods_done.match(line)
            if m:
                task_name = m.group(2)
                # The counters are logged as a Python dict repr. literal_eval
                # only accepts literals, so log content can never execute code
                # (the previous eval() could).
                try:
                    counts = ast.literal_eval(m.group(3))
                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                    counts = {}
                if not isinstance(counts, dict):
                    counts = {}
                results[task_name] = {
                    "status": "SUCC",
                    "layer": "ODS",
                    "start": task_start_times.get(task_name, ""),
                    "end": m.group(1),
                    "counts": counts,
                }
                continue

            m = re_dwd_done.match(line)
            if m:
                results["DWD_LOAD_FROM_ODS"] = {"status": "SUCC", "layer": "DWD", "end": m.group(1)}
                continue

            m = re_util_ok.match(line)
            if m:
                results[m.group(2)] = {"status": "SUCC", "layer": "DWS/INDEX", "end": m.group(1)}
                continue

            m = re_util_fail.match(line)
            if m:
                results[m.group(2)] = {
                    "status": "FAIL",
                    "layer": "DWS/INDEX",
                    "end": m.group(1),
                    "error": m.group(3)[:120],  # keep the report line short
                }
                continue

            m = re_disabled.match(line)
            if m:
                results[m.group(1)] = {"status": "SKIP", "layer": "DWS", "error": "未注册"}

    return results
# ── Reading API JSON dumps ──
def _extract_api_records(data) -> list:
    """Pull the record list out of one decoded JSON dump, whatever its layout."""
    if isinstance(data, list):
        return list(data)
    if not isinstance(data, dict):
        return []

    if "pages" in data:
        # ETL framework wrapper: pages[].response.data.<list_key>
        records: list = []
        pages = data["pages"]
        if not isinstance(pages, list):
            return records
        for page in pages:
            if not isinstance(page, dict):
                continue
            resp = page.get("response", {})
            if not isinstance(resp, dict):
                continue
            resp_data = resp.get("data", {})
            if isinstance(resp_data, dict):
                # The list key differs per endpoint: take the first list value.
                for value in resp_data.values():
                    if isinstance(value, list):
                        records.extend(value)
                        break
            elif isinstance(resp_data, list):
                records.extend(resp_data)
        return records

    # Legacy layouts: {data: {list: [...]}} or {data: [...]}.
    # The type is checked before calling .get(): the previous code called
    # .get("list") on a list/None and raised AttributeError.
    payload = data.get("data", [])
    if isinstance(payload, dict):
        payload = payload.get("list", [])
    return list(payload) if isinstance(payload, list) else []


def load_api_json_records(task_name: str) -> list[dict] | None:
    """Read all records of the newest JSON dump written for an ODS task.

    Dumps live in ``JSON_ROOT/<task_name>/<run_dir>/*.json``. Run directories
    are tried newest first (descending name order); the first one yielding
    at least one record wins.

    The JSON is normally the ETL framework wrapper
    ``{task_code, run_id, pages: [{page, request, response: {data: {<list_key>: [...]}}}]}``.
    The list key differs per endpoint (e.g. ``tenantMemberInfos``, ``list``),
    so the first list-typed value under ``response.data`` is used. A bare
    list and the legacy ``{data: {list: [...]}}`` / ``{data: [...]}`` shapes
    are accepted too.

    Returns:
        The concatenated records, or None when the task has no directory or
        no run directory contains any record.
    """
    task_dir = JSON_ROOT / task_name
    if not task_dir.is_dir():
        return None

    for run_dir in sorted(task_dir.iterdir(), key=lambda p: p.name, reverse=True):
        if not run_dir.is_dir():
            continue
        records: list = []
        for json_file in sorted(run_dir.glob("*.json")):
            try:
                data = json.loads(json_file.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # Unreadable or malformed dump: skip it, keep the others.
                continue
            records.extend(_extract_api_records(data))
        if records:
            return records
    return None


# ── Table structure queries ──
def get_table_columns(conn, schema: str, table: str) -> list[str]:
    """Return the column names of ``schema.table`` in ordinal order.

    An unknown table yields an empty list.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT column_name FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
            ORDER BY ordinal_position
        """, (schema, table))
        return [r["column_name"] for r in cur.fetchall()]


def get_table_row_count(conn, schema: str, table: str) -> int:
    """Return the row count of ``schema.table``, or -1 when the query fails."""
    with conn.cursor() as cur:
        try:
            cur.execute(f"SELECT COUNT(*) AS cnt FROM {schema}.{table}")
            return cur.fetchone()["cnt"]
        except Exception:
            # Best effort (e.g. the table does not exist): roll back so the
            # connection stays usable, and signal failure with -1.
            conn.rollback()
            return -1
def sample_rows(conn, schema: str, table: str, limit: int = SAMPLE_LIMIT) -> list[dict]:
    """Return up to *limit* randomly chosen rows of ``schema.table`` as dicts.

    ``Decimal`` values become ``float`` and ``datetime`` values become ``str``;
    every other value (including JSON dicts/lists) is passed through as-is.
    A failing query yields an empty list.
    """
    with conn.cursor() as cur:
        try:
            # LIMIT is bound as a parameter instead of being formatted in.
            cur.execute(
                f"SELECT * FROM {schema}.{table} ORDER BY random() LIMIT %s",
                (limit,),
            )
            rows = cur.fetchall()
        except Exception:
            conn.rollback()
            return []

    def _plain(value):
        if isinstance(value, Decimal):
            return float(value)
        if isinstance(value, datetime):
            return str(value)
        return value

    return [{k: _plain(v) for k, v in dict(row).items()} for row in rows]


# ── Per-field statistics (similar to the per-field stats of field_level_report) ──
def get_field_stats(conn, schema: str, table: str) -> list[dict]:
    """Collect per-column statistics for ``schema.table`` in one aggregate query.

    Every column gets ``column``, ``type``, ``total``, ``null_count``,
    ``null_pct`` and ``distinct``. Depending on its type it also gets
    ``min``/``max``/``avg`` (numeric), ``earliest``/``latest`` (date/time),
    ``min_len``/``max_len`` (text) or ``true_count``/``false_count`` (bool).
    JSON/bytea columns and ETL metadata columns only get a ``note``.

    Returns:
        One dict per column in ordinal order; ``[]`` for an unknown table.
        For an empty or uncountable table only ``null_pct == "—"`` is
        reported. If the aggregate query fails, every entry carries
        ``null_pct == "?"`` and ``error == True``.
    """
    with conn.cursor() as cur:
        cur.execute("""
            SELECT column_name, udt_name
            FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
            ORDER BY ordinal_position
        """, (schema, table))
        cols_meta = [(r["column_name"], r["udt_name"]) for r in cur.fetchall()]
    if not cols_meta:
        return []

    row_count = get_table_row_count(conn, schema, table)
    if row_count <= 0:
        return [{"column": c, "type": t, "total": row_count, "null_pct": "—"} for c, t in cols_meta]

    opaque_types = ("jsonb", "json", "bytea")
    numeric_types = ("int2", "int4", "int8", "float4", "float8", "numeric")
    temporal_types = ("date", "timestamp", "timestamptz")
    text_types = ("text", "varchar", "bpchar", "name")
    skip_stats = {"payload", "content_hash", "record_index", "source_file", "source_endpoint"}

    def _quote(identifier: str) -> str:
        return '"' + identifier.replace('"', '""') + '"'

    # Aliases are positional (null_0, min_3, ...) rather than derived from the
    # column name: an unquoted name-based alias is case-folded by PostgreSQL
    # and truncated at 63 bytes, so the result key would no longer match and
    # the statistic would silently read as 0/None.
    select_parts: list[str] = []
    plans: list[tuple[str, str, str]] = []  # (column, udt, kind)
    for idx, (cname, udt) in enumerate(cols_meta):
        safe = _quote(cname)
        select_parts.append(f"COUNT(*) FILTER (WHERE {safe} IS NULL) AS null_{idx}")
        if udt in opaque_types or cname in skip_stats:
            kind = "skip"
        elif udt in numeric_types:
            select_parts.append(f"MIN({safe}) AS min_{idx}")
            select_parts.append(f"MAX({safe}) AS max_{idx}")
            select_parts.append(f"ROUND(AVG({safe})::numeric, 2) AS avg_{idx}")
            kind = "numeric"
        elif udt in temporal_types:
            # The text range filter is kept from the original query; presumably
            # it excludes out-of-range values such as 'infinity' — TODO confirm.
            select_parts.append(
                f"MIN({safe}::text) FILTER (WHERE {safe}::text >= '0001') AS min_{idx}"
            )
            select_parts.append(
                f"MAX({safe}::text) FILTER (WHERE {safe}::text <= '9999') AS max_{idx}"
            )
            kind = "temporal"
        elif udt in text_types:
            select_parts.append(f"MIN(LENGTH({safe})) AS minlen_{idx}")
            select_parts.append(f"MAX(LENGTH({safe})) AS maxlen_{idx}")
            kind = "text"
        elif udt == "bool":
            select_parts.append(f"COUNT(*) FILTER (WHERE {safe} = TRUE) AS true_{idx}")
            select_parts.append(f"COUNT(*) FILTER (WHERE {safe} = FALSE) AS false_{idx}")
            kind = "bool"
        else:
            kind = "other"
        plans.append((cname, udt, kind))

    with conn.cursor() as cur:
        try:
            cur.execute(f"SELECT {', '.join(select_parts)} FROM {schema}.{table}")
            agg = cur.fetchone()
        except Exception:
            conn.rollback()
            return [
                {"column": c, "type": t, "total": row_count, "null_pct": "?", "error": True}
                for c, t in cols_meta
            ]

    results = []
    for idx, (cname, udt, kind) in enumerate(plans):
        null_cnt = agg.get(f"null_{idx}", 0) or 0
        null_pct = round(null_cnt / row_count * 100, 1)
        r = {
            "column": cname,
            "type": udt,
            "total": row_count,
            "null_count": null_cnt,
            "null_pct": f"{null_pct}%",
        }
        if kind == "skip":
            r["note"] = f"({udt.upper()})" if udt in opaque_types else "(ETL元数据)"
        elif kind == "numeric":
            r["min"] = agg.get(f"min_{idx}")
            r["max"] = agg.get(f"max_{idx}")
            r["avg"] = agg.get(f"avg_{idx}")
        elif kind == "temporal":
            r["earliest"] = str(agg.get(f"min_{idx}") or "")
            r["latest"] = str(agg.get(f"max_{idx}") or "")
        elif kind == "text":
            r["min_len"] = agg.get(f"minlen_{idx}")
            r["max_len"] = agg.get(f"maxlen_{idx}")
        elif kind == "bool":
            r["true_count"] = agg.get(f"true_{idx}")
            r["false_count"] = agg.get(f"false_{idx}")
        results.append(r)

    # Distinct counts only for small tables. One query per column, so a type
    # without an equality operator only loses its own count.
    if row_count <= FIELD_STATS_DISTINCT_THRESHOLD:
        for r in results:
            if r.get("note"):
                r["distinct"] = "-"
                continue
            with conn.cursor() as cur:
                try:
                    cur.execute(
                        f"SELECT COUNT(DISTINCT {_quote(r['column'])}) AS d FROM {schema}.{table}"
                    )
                    r["distinct"] = cur.fetchone()["d"]
                except Exception:
                    conn.rollback()
                    r["distinct"] = "?"
    else:
        for r in results:
            r["distinct"] = "-"

    return results


# ── Data cut-off date ──
# Column used as the cut-off timestamp for each ODS table. Tables not listed
# here use create_time; two dimension tables use fetched_at.
_CUTOFF_DATE_COLUMN: dict[str, str] = {
    "goods_stock_summary": "fetched_at",
    "stock_goods_category_tree": "fetched_at",
    # These tables spell the column createtime (no underscore).
    "goods_stock_movements": "createtime",
    "settlement_records": "createtime",
    "recharge_settlements": "createtime",
}


def get_data_cutoff_date(conn, schema: str, table: str) -> str | None:
    """Return the latest date (``YYYY-MM-DD``) found in the table's time column.

    The column is looked up in ``_CUTOFF_DATE_COLUMN`` (default
    ``create_time``). Returns None when the table is empty, the column holds
    only NULLs, or the query fails.
    """
    col = _CUTOFF_DATE_COLUMN.get(table, "create_time")
    with conn.cursor() as cur:
        try:
            cur.execute(
                f'SELECT MAX("{col}")::date::text AS cutoff FROM {schema}.{table}'
            )
            row = cur.fetchone()
            return row["cutoff"] if row and row["cutoff"] else None
        except Exception:
            conn.rollback()
            return None
# ── API vs ODS field-level comparison ──
def check_api_vs_ods(conn, task_name: str, ods_table: str) -> dict:
    """Compare the API JSON dump of a task with its ODS table.

    Two checks are made: field names (top-level JSON keys of the first 50
    records against ODS columns, case-insensitively, ETL metadata columns
    excluded) and sampled values (the first ``SAMPLE_LIMIT`` API records
    against the newest ODS row with the same ``id``, at most 10 fields).

    Returns:
        A result dict with counts, ``field_check``, ``value_diffs`` (at most
        20 entries) and ``status``. When ODS is reachable it also carries
        ``ods_field_stats`` and ``data_cutoff``. Without an API dump the
        status is ``"⚠️ 无 API JSON"`` and nothing else is filled in.
    """
    result = {
        "task": task_name,
        "ods_table": ods_table,
        "api_records": 0,
        "ods_rows": 0,
        "ods_distinct_ids": 0,
        "field_check": {"api_fields": 0, "ods_cols": 0, "matched": 0, "api_only": [], "ods_only": []},
        "value_diffs": [],
        "status": "✅",
    }

    # API JSON
    api_records = load_api_json_records(task_name)
    if not api_records:
        result["status"] = "⚠️ 无 API JSON"
        return result
    result["api_records"] = len(api_records)

    # ODS row counts
    ods_cols = get_table_columns(conn, "ods", ods_table)
    result["ods_rows"] = get_table_row_count(conn, "ods", ods_table)
    with conn.cursor() as cur:
        try:
            cur.execute(f"SELECT COUNT(DISTINCT id) AS cnt FROM ods.{ods_table}")
            result["ods_distinct_ids"] = cur.fetchone()["cnt"]
        except Exception:
            conn.rollback()
            result["ods_distinct_ids"] = -1

    # Field comparison: union of top-level keys of the first 50 records.
    api_fields: set = set()
    for rec in api_records[:50]:
        if isinstance(rec, dict):
            api_fields.update(rec.keys())

    ods_col_set = set(ods_cols) - ETL_META_COLS
    api_lower = {f.lower(): f for f in api_fields}
    ods_lower = {c.lower(): c for c in ods_col_set}

    matched = api_lower.keys() & ods_lower.keys()
    api_only = sorted(api_lower.keys() - ods_lower.keys())
    ods_only = sorted(ods_lower.keys() - api_lower.keys())

    field_check = result["field_check"]
    field_check["api_fields"] = len(api_fields)
    field_check["ods_cols"] = len(ods_col_set)
    field_check["matched"] = len(matched)
    field_check["api_only"] = [api_lower[k] for k in api_only]
    field_check["ods_only"] = [ods_lower[k] for k in ods_only]

    # Value sampling. The sampled record is kept next to its id, so nothing is
    # re-scanned per sample and non-dict entries cannot raise. An id that
    # appears twice in the sample is compared only once.
    samples: list = []
    for rec in api_records[:SAMPLE_LIMIT]:
        if not (isinstance(rec, dict) and "id" in rec):
            continue
        if any(rec["id"] == seen_id for seen_id, _ in samples):
            continue
        samples.append((rec["id"], rec))

    value_diffs = []
    if samples and matched:
        compare_cols = sorted(matched)[:10]  # compare at most 10 fields
        with conn.cursor() as cur:
            for sid, api_rec in samples:
                try:
                    cur.execute(
                        f"SELECT * FROM ods.{ods_table} WHERE id = %s ORDER BY fetched_at DESC LIMIT 1",
                        (sid,),
                    )
                    ods_row = cur.fetchone()
                except Exception:
                    conn.rollback()
                    ods_row = None
                if not ods_row:
                    continue

                for col_lower in compare_cols:
                    api_val = api_rec.get(api_lower[col_lower])
                    ods_val = ods_row.get(ods_lower[col_lower])
                    is_diff, reason = _values_differ(api_val, ods_val)
                    if is_diff or reason == "whitelist":
                        value_diffs.append({
                            "id": sid,
                            "field": col_lower,
                            "api_val": _fmt_val(api_val),
                            "ods_val": _fmt_val(ods_val),
                            "whitelist": reason == "whitelist",
                        })

    result["value_diffs"] = value_diffs[:20]  # report at most 20 differences

    # Only real (non-whitelisted) differences mark the table as failing.
    real_diffs = [d for d in value_diffs if not d.get("whitelist")]
    if api_only or ods_only or real_diffs:
        result["status"] = "❌ 存在差异"

    # Per-field statistics of the ODS table
    result["ods_field_stats"] = get_field_stats(conn, "ods", ods_table)
    # Data cut-off date
    result["data_cutoff"] = get_data_cutoff_date(conn, "ods", ods_table)

    return result
"ods", ods_table) return result def _values_differ(api_val, ods_val) -> tuple[bool, str]: """比较两个值是否实质不同(容忍类型差异)。 返回 (is_different, reason): - (False, "") — 值相同 - (False, "whitelist") — API 空字符串 vs DB None,视为等价(白名单) - (True, "") — 值确实不同 注意:0 与 None 绝不相等! """ if api_val is None and ods_val is None: return False, "" # API 空字符串 "" vs DB None → 白名单(等价但标记) if api_val is not None and ods_val is None: if isinstance(api_val, str) and api_val.strip() == "": return False, "whitelist" return True, "" if api_val is None and ods_val is not None: if isinstance(ods_val, str) and str(ods_val).strip() == "": return False, "whitelist" return True, "" # 都转字符串比较(容忍 int vs str、Decimal vs float 等) a = str(api_val).strip() b = str(ods_val).strip() if a == b: return False, "" # 数值比较 try: if float(a) == float(b): return False, "" except (ValueError, TypeError): pass return True, "" def _fmt_val(val) -> str: """完整展示差异值,不截断""" return str(val) # ── ODS vs DWD 字段级比对 ── def _find_ex_table(conn, dwd_schema: str, dwd_table: str) -> str | None: """查找 DWD 主表对应的 EX 表(如 dim_assistant → dim_assistant_ex)""" ex_name = dwd_table + "_ex" with conn.cursor() as cur: cur.execute(""" SELECT table_name FROM information_schema.tables WHERE table_schema = %s AND table_name = %s AND table_type = 'BASE TABLE' """, (dwd_schema, ex_name)) row = cur.fetchone() return ex_name if row else None def _get_dwd_pk(conn, dwd_schema: str, dwd_table: str) -> str | None: """获取 DWD 主表的第一列(即 PK 列名)""" with conn.cursor() as cur: cur.execute(""" SELECT column_name FROM information_schema.columns WHERE table_schema = %s AND table_name = %s ORDER BY ordinal_position LIMIT 1 """, (dwd_schema, dwd_table)) row = cur.fetchone() return row["column_name"] if row else None def check_ods_vs_dwd(conn, dwd_full: str, ods_full: str) -> dict: """比对 ODS 与 DWD(主表 + EX 表合并)的行数和字段值""" dwd_s, dwd_t = dwd_full.split(".") ods_s, ods_t = ods_full.split(".") is_dim = dwd_t.startswith("dim_") result = { "dwd_table": dwd_full, "ods_table": ods_full, 
def check_ods_vs_dwd(conn, dwd_full: str, ods_full: str) -> dict:
    """Compare an ODS table with its DWD table (main table merged with EX table).

    Row counts are collected, column names are compared case-insensitively
    (SCD2 bookkeeping and ETL metadata columns excluded), and up to
    ``SAMPLE_LIMIT`` random DWD rows are compared against the newest ODS row
    with the same id on at most 15 common columns. For ``dim_`` tables only
    current SCD2 versions are sampled.

    Args:
        dwd_full: ``schema.table`` of the DWD table.
        ods_full: ``schema.table`` of the ODS source table.

    Returns:
        A result dict with row counts, column sets, ``value_diffs`` (at most
        30 entries), ``status``, ``dwd_field_stats`` and ``data_cutoff``.
        When the DWD table has no columns at all, the status is
        ``"⚠️ DWD 表不存在"`` and the comparison is skipped.
    """
    dwd_s, dwd_t = dwd_full.split(".")
    ods_s, ods_t = ods_full.split(".")
    is_dim = dwd_t.startswith("dim_")

    result = {
        "dwd_table": dwd_full,
        "ods_table": ods_full,
        "type": "维度(SCD2)" if is_dim else "事实",
        "ods_rows": 0,
        "ods_distinct_ids": 0,
        "dwd_rows": 0,
        "dwd_cols": [],
        "ods_cols": [],
        "common_cols": [],
        "dwd_only_cols": [],
        "ods_only_cols": [],
        "value_diffs": [],
        "status": "✅",
        "ex_table": None,
    }

    # Row counts
    result["ods_rows"] = get_table_row_count(conn, ods_s, ods_t)
    result["dwd_rows"] = get_table_row_count(conn, dwd_s, dwd_t)
    with conn.cursor() as cur:
        try:
            cur.execute(f"SELECT COUNT(DISTINCT id) AS cnt FROM {ods_s}.{ods_t}")
            result["ods_distinct_ids"] = cur.fetchone()["cnt"]
        except Exception:
            conn.rollback()
            # Same failure marker as check_api_vs_ods, instead of a bogus 0.
            result["ods_distinct_ids"] = -1

    # DWD key column (first column)
    dwd_pk = _get_dwd_pk(conn, dwd_s, dwd_t)
    if not dwd_pk:
        # No columns means the table is missing; that must not pass as ✅.
        result["status"] = "⚠️ DWD 表不存在"
        return result

    # EX companion table
    ex_table = _find_ex_table(conn, dwd_s, dwd_t)
    result["ex_table"] = f"{dwd_s}.{ex_table}" if ex_table else None

    # Column comparison over the union of main-table and EX-table columns.
    dwd_main_cols = set(get_table_columns(conn, dwd_s, dwd_t))
    dwd_ex_cols = set(get_table_columns(conn, dwd_s, ex_table)) if ex_table else set()
    dwd_all_cols = dwd_main_cols | dwd_ex_cols
    dwd_biz_cols = dwd_all_cols - SCD2_COLS  # drop SCD2 bookkeeping columns
    ods_cols = set(get_table_columns(conn, ods_s, ods_t)) - ETL_META_COLS

    result["dwd_cols"] = sorted(dwd_all_cols)
    result["ods_cols"] = sorted(ods_cols)

    # Common columns, ignoring case.
    dwd_lower = {c.lower(): c for c in dwd_biz_cols}
    ods_lower = {c.lower(): c for c in ods_cols}
    common = dwd_lower.keys() & ods_lower.keys()
    dwd_only = sorted(dwd_lower.keys() - ods_lower.keys())
    ods_only = sorted(ods_lower.keys() - dwd_lower.keys())

    result["common_cols"] = sorted(common)
    result["dwd_only_cols"] = [dwd_lower[k] for k in dwd_only]
    result["ods_only_cols"] = [ods_lower[k] for k in ods_only]

    value_diffs = []
    if common:
        compare_cols = sorted(common)[:15]  # compare at most 15 fields
        # For SCD2 dimensions the EX table can hold several versions per key;
        # pick the current one when the flag column exists.
        ex_current = (
            " AND scd2_is_current = true"
            if is_dim and "scd2_is_current" in dwd_ex_cols
            else ""
        )

        with conn.cursor() as cur:
            # Sample rows from the DWD main table.
            try:
                if is_dim:
                    cur.execute(f"""
                        SELECT * FROM {dwd_s}.{dwd_t}
                        WHERE scd2_is_current = true
                        ORDER BY random() LIMIT {SAMPLE_LIMIT}
                    """)
                else:
                    cur.execute(f"""
                        SELECT * FROM {dwd_s}.{dwd_t}
                        ORDER BY random() LIMIT {SAMPLE_LIMIT}
                    """)
                dwd_samples = cur.fetchall()
            except Exception:
                conn.rollback()
                dwd_samples = []

            for dwd_row in dwd_samples:
                pk_val = dwd_row.get(dwd_pk)
                if pk_val is None:
                    continue

                # Merge the EX row (if any) over the main row.
                dwd_merged = dict(dwd_row)
                if ex_table:
                    try:
                        cur.execute(
                            f'SELECT * FROM {dwd_s}.{ex_table} WHERE "{dwd_pk}" = %s{ex_current} LIMIT 1',
                            (pk_val,),
                        )
                        ex_row = cur.fetchone()
                        if ex_row:
                            dwd_merged.update(dict(ex_row))
                    except Exception:
                        conn.rollback()

                # Newest ODS snapshot of the same entity (ODS key column is id).
                try:
                    cur.execute(
                        f"SELECT * FROM {ods_s}.{ods_t} WHERE id = %s ORDER BY fetched_at DESC LIMIT 1",
                        (pk_val,),
                    )
                    ods_row = cur.fetchone()
                except Exception:
                    conn.rollback()
                    ods_row = None

                if not ods_row:
                    value_diffs.append({
                        "id": pk_val,
                        "field": "(全行)",
                        "dwd_val": "存在",
                        "ods_val": "ODS 中未找到",
                        "whitelist": False,
                    })
                    continue

                for col_lower in compare_cols:
                    dwd_val = dwd_merged.get(dwd_lower[col_lower])
                    ods_val = ods_row.get(ods_lower[col_lower])
                    is_diff, reason = _values_differ(dwd_val, ods_val)
                    if is_diff or reason == "whitelist":
                        value_diffs.append({
                            "id": pk_val,
                            "field": col_lower,
                            "dwd_val": _fmt_val(dwd_val),
                            "ods_val": _fmt_val(ods_val),
                            "whitelist": reason == "whitelist",
                        })

    result["value_diffs"] = value_diffs[:30]

    real_diffs = [d for d in value_diffs if not d.get("whitelist")]
    if dwd_only or real_diffs:
        result["status"] = "❌ 存在差异"

    # Per-field statistics of the DWD main table
    result["dwd_field_stats"] = get_field_stats(conn, dwd_s, dwd_t)
    # Data cut-off date, taken from the ODS source table
    result["data_cutoff"] = get_data_cutoff_date(conn, ods_s, ods_t)

    return result
# ── DWS sanity checks ──
def check_dwd_vs_dws(conn) -> list[dict]:
    """Run basic sanity checks on every base table of the ``dws`` schema.

    For each table the row and column counts are recorded. For non-empty
    tables, up to 8 numeric columns are checked for NULL rate, negative
    values and min/max. No DWD data is actually read here: the checks look
    at the DWS tables only.

    Returns:
        One dict per DWS table with ``table``, ``rows``, ``col_count``,
        ``status`` and ``sample_checks`` (one entry per checked column).
    """
    results = []

    # All base tables of the DWS layer.
    with conn.cursor() as cur:
        cur.execute("""
            SELECT table_name FROM information_schema.tables
            WHERE table_schema = 'dws' AND table_type = 'BASE TABLE'
            ORDER BY table_name
        """)
        dws_tables = [r["table_name"] for r in cur.fetchall()]

    for dws_t in dws_tables:
        row_count = get_table_row_count(conn, "dws", dws_t)
        cols = get_table_columns(conn, "dws", dws_t)

        entry = {
            "table": f"dws.{dws_t}",
            "rows": row_count,
            "col_count": len(cols),
            "status": "✅" if row_count > 0 else "⚠️ 空表",
            "sample_checks": [],
        }

        if row_count > 0:
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT column_name, udt_name
                    FROM information_schema.columns
                    WHERE table_schema = 'dws' AND table_name = %s
                      AND udt_name IN ('int2','int4','int8','float4','float8','numeric')
                    ORDER BY ordinal_position
                """, (dws_t,))
                numeric_cols = [r["column_name"] for r in cur.fetchall()]

            for col_name in numeric_cols[:8]:  # check at most 8 numeric columns
                with conn.cursor() as cur:
                    try:
                        cur.execute(f"""
                            SELECT
                                COUNT(*) FILTER (WHERE "{col_name}" IS NULL) AS null_cnt,
                                COUNT(*) FILTER (WHERE "{col_name}" < 0) AS neg_cnt,
                                MIN("{col_name}") AS min_val,
                                MAX("{col_name}") AS max_val,
                                COUNT(*) AS total
                            FROM dws.{dws_t}
                        """)
                        stats = cur.fetchone()
                        total = stats["total"]
                        null_pct = round(stats["null_cnt"] / total * 100, 1) if total > 0 else 0

                        # Collect every warning; previously the NULL-rate
                        # warning overwrote the negative-amount warning.
                        warnings = []
                        if stats["neg_cnt"] > 0 and "amount" in col_name.lower():
                            warnings.append("⚠️ 金额负值")
                        if null_pct > 50:
                            warnings.append("⚠️ 高 NULL 率")

                        entry["sample_checks"].append({
                            "col": col_name,
                            "null_pct": f"{null_pct}%",
                            "neg_count": stats["neg_cnt"],
                            "min": str(stats["min_val"]),
                            "max": str(stats["max_val"]),
                            "status": " / ".join(warnings) if warnings else "✅",
                        })
                    except Exception:
                        conn.rollback()
                        # Surface the failure instead of silently dropping the column.
                        entry["sample_checks"].append({
                            "col": col_name,
                            "null_pct": "?",
                            "neg_count": "?",
                            "min": "?",
                            "max": "?",
                            "status": "⚠️ 查询失败",
                        })

        results.append(entry)

    return results
# ── Report generation ──
def generate_report(
    log_path: Path,
    log_results: dict,
    api_ods_checks: list[dict],
    ods_dwd_checks: list[dict],
    dws_checks: list[dict],
) -> str:
    """Render all check results as one Markdown report.

    Args:
        log_path: ETL log the run was taken from (only its name is printed).
        log_results: task results as returned by ``parse_etl_log``.
        api_ods_checks: results of ``check_api_vs_ods``.
        ods_dwd_checks: results of ``check_ods_vs_dwd``.
        dws_checks: results of ``check_dwd_vs_dws``.

    Returns:
        The report text. Field statistics and whitelist differences are
        wrapped in collapsible ``<details>`` blocks.
    """
    lines: list[str] = []

    def w(s: str = ""):
        lines.append(s)

    def _cell(value) -> str:
        """Make a value safe for a Markdown table cell."""
        text = str(value).replace("|", "\\|")
        return text.replace("\r\n", "<br>").replace("\n", "<br>")

    def _split_diffs(diffs: list[dict]) -> tuple[list[dict], list[dict]]:
        """Split differences into (real, whitelisted)."""
        real = [d for d in diffs if not d.get("whitelist")]
        wl = [d for d in diffs if d.get("whitelist")]
        return real, wl

    def _fmt_field_stat(fs: dict) -> str:
        """Format the statistics of one field as a table row."""
        col = fs.get("column", "?")
        typ = fs.get("type", "?")
        null_pct = fs.get("null_pct", "—")
        distinct = fs.get("distinct", "-")
        parts = []
        if "min" in fs and fs["min"] is not None:
            parts.append(f"min={fs['min']}, max={fs['max']}, avg={fs['avg']}")
        if "earliest" in fs and fs["earliest"]:
            parts.append(f"{fs['earliest']} ~ {fs['latest']}")
        if "min_len" in fs and fs["min_len"] is not None:
            parts.append(f"len={fs['min_len']}~{fs['max_len']}")
        if "true_count" in fs:
            parts.append(f"T={fs['true_count']}, F={fs['false_count']}")
        if fs.get("note"):
            parts.append(fs["note"])
        stats = "; ".join(parts) if parts else "—"
        return f"| `{col}` | {typ} | {null_pct} | {distinct} | {stats} |"

    def _write_field_stats_table(w, stats: list[dict], table_label: str):
        """Write the per-field statistics table, collapsed by default."""
        if not stats:
            return
        w(f"<details><summary>📊 {table_label} 字段级统计({len(stats)} 列)</summary>")
        w()
        w("| 字段 | 类型 | NULL率 | 唯一值 | 统计 |")
        w("|------|------|--------|--------|------|")
        for fs in stats:
            w(_fmt_field_stat(fs))
        w()
        w("</details>")
        w()

    def _write_diff_tables(diffs: list[dict], left_label: str, left_key: str, wl_note: str):
        """Write the real differences, then the whitelisted ones (collapsed)."""
        real, wl = _split_diffs(diffs)
        header = f"| ID | 字段 | {left_label} 值 | ODS 值 |"
        if real:
            w(f"**值差异采样** ({len(real)} 条)")
            w()
            w(header)
            w("|----|------|--------|--------|")
            for d in real:
                w(f"| {_cell(d['id'])} | `{d['field']}` | {_cell(d[left_key])} | {_cell(d['ods_val'])} |")
            w()
        if wl:
            w(f"<details><summary>🔕 白名单差异({len(wl)} 条)— {wl_note}</summary>")
            w()
            w(header)
            w("|----|------|--------|--------|")
            for d in wl:
                w(f"| {_cell(d['id'])} | `{d['field']}` | {_cell(d[left_key])} | {_cell(d['ods_val'])} |")
            w()
            w("</details>")
            w()

    w("# ETL 全链路数据一致性检查报告")
    w()
    w(f"生成时间: {NOW.strftime('%Y-%m-%d %H:%M:%S')} CST")
    w(f"ETL 日志: `{log_path.name}`")
    w()

    # ── 1. ETL run overview ──
    w("## 1. ETL 执行概览")
    w()
    succ = sum(1 for v in log_results.values() if v.get("status") == "SUCC")
    fail = sum(1 for v in log_results.values() if v.get("status") == "FAIL")
    skip = sum(1 for v in log_results.values() if v.get("status") == "SKIP")
    w("| 指标 | 值 |")
    w("|------|-----|")
    w(f"| 成功任务 | {succ} |")
    w(f"| 失败任务 | {fail} |")
    w(f"| 跳过任务 | {skip} |")
    w(f"| 总计 | {len(log_results)} |")
    w()

    if fail > 0:
        w("### 失败任务")
        w()
        for name, info in sorted(log_results.items()):
            if info.get("status") == "FAIL":
                w(f"- **{name}**: {info.get('error', '未知错误')}")
        w()

    # ── 2. API <-> ODS consistency ──
    w("## 2. API↔ODS 数据一致性")
    w()
    w("### 2.1 汇总")
    w()
    w("| ODS 表 | API 记录数 | ODS 行数 | ODS 去重ID | 数据截止 | 字段匹配 | API独有 | ODS独有 | 值差异 | 白名单 | 状态 |")
    w("|--------|-----------|---------|-----------|---------|---------|--------|--------|-------|--------|------|")
    for c in api_ods_checks:
        fc = c["field_check"]
        real_diffs, wl_diffs = _split_diffs(c["value_diffs"])
        cutoff = c.get("data_cutoff") or "—"
        w(f"| `{c['ods_table']}` | {c['api_records']} | {c['ods_rows']} | {c['ods_distinct_ids']} "
          f"| {cutoff} "
          f"| {fc['matched']}/{fc['api_fields']} | {len(fc['api_only'])} | {len(fc['ods_only'])} "
          f"| {len(real_diffs)} | {len(wl_diffs)} | {c['status']} |")
    w()

    # Per-table details + field statistics
    for c in api_ods_checks:
        fc = c["field_check"]
        has_diff = fc["api_only"] or fc["ods_only"] or c["value_diffs"]
        has_stats = c.get("ods_field_stats")
        if not has_diff and not has_stats:
            continue
        w(f"### 2.2 {c['ods_table']} 详情")
        w()
        if fc["api_only"]:
            names = "`, `".join(fc["api_only"][:15])
            w(f"**API 独有字段** ({len(fc['api_only'])} 个): `{names}`")
            w()
        if fc["ods_only"]:
            names = "`, `".join(fc["ods_only"][:15])
            w(f"**ODS 独有字段** ({len(fc['ods_only'])} 个): `{names}`")
            w()
        if c["value_diffs"]:
            _write_diff_tables(c["value_diffs"], "API", "api_val", "API 空字符串 vs DB None,视为等价")
        # ODS per-field statistics
        if has_stats:
            _write_field_stats_table(w, c["ods_field_stats"], f"ods.{c['ods_table']}")

    # ── 3. ODS <-> DWD consistency ──
    w("## 3. ODS↔DWD 数据一致性")
    w()
    w("### 3.1 汇总")
    w()
    w("| DWD 表 | EX 表 | ODS 表 | 类型 | ODS 行 | ODS 去重ID | DWD 行 | 数据截止 | 共同列 | DWD独有 | 值差异 | 白名单 | 状态 |")
    w("|--------|-------|--------|------|-------|-----------|-------|---------|-------|--------|-------|--------|------|")
    for c in ods_dwd_checks:
        real_diffs, wl_diffs = _split_diffs(c["value_diffs"])
        ex_label = c.get("ex_table", "—") or "—"
        cutoff = c.get("data_cutoff") or "—"
        w(f"| `{c['dwd_table']}` | `{ex_label}` | `{c['ods_table']}` | {c['type']} "
          f"| {c['ods_rows']} | {c['ods_distinct_ids']} | {c['dwd_rows']} "
          f"| {cutoff} "
          f"| {len(c['common_cols'])} | {len(c['dwd_only_cols'])} "
          f"| {len(real_diffs)} | {len(wl_diffs)} | {c['status']} |")
    w()

    # Per-table details + field statistics
    for c in ods_dwd_checks:
        has_diff = c["dwd_only_cols"] or c["value_diffs"]
        has_stats = c.get("dwd_field_stats")
        if not has_diff and not has_stats:
            continue
        w(f"### 3.2 {c['dwd_table']} 详情")
        w()
        if c["dwd_only_cols"]:
            names = "`, `".join(c["dwd_only_cols"][:15])
            w(f"**DWD 独有列** ({len(c['dwd_only_cols'])} 个): `{names}`")
            w()
        if c["ods_only_cols"]:
            names = "`, `".join(c["ods_only_cols"][:15])
            w(f"**ODS 独有列** ({len(c['ods_only_cols'])} 个): `{names}`")
            w()
        if c["value_diffs"]:
            _write_diff_tables(c["value_diffs"], "DWD", "dwd_val", "空字符串 vs None,视为等价")
        # DWD per-field statistics
        if has_stats:
            _write_field_stats_table(w, c["dwd_field_stats"], c["dwd_table"])

    # ── 4. DWD <-> DWS consistency ──
    w("## 4. DWD↔DWS 数据一致性")
    w()
    w("### 4.1 DWS 表概览")
    w()
    non_empty = sum(1 for c in dws_checks if c["rows"] > 0)
    w(f"DWS 层共 {len(dws_checks)} 张表,{non_empty} 张有数据,{len(dws_checks) - non_empty} 张为空。")
    w()
    w("| DWS 表 | 行数 | 列数 | 状态 |")
    w("|--------|------|------|------|")
    for c in dws_checks:
        w(f"| `{c['table']}` | {c['rows']} | {c['col_count']} | {c['status']} |")
    w()

    # DWS numeric-column sanity checks (only anomalies are listed)
    has_checks = [c for c in dws_checks if c["sample_checks"]]
    if has_checks:
        w("### 4.2 DWS 数值列健全性检查")
        w()
        for c in has_checks:
            anomalies = [sc for sc in c["sample_checks"] if sc["status"] != "✅"]
            if not anomalies:
                continue
            w(f"#### {c['table']}")
            w()
            w("| 列 | NULL率 | 负值数 | 最小值 | 最大值 | 状态 |")
            w("|----|--------|--------|--------|--------|------|")
            for sc in anomalies:
                w(f"| `{sc['col']}` | {sc['null_pct']} | {sc['neg_count']} | {sc['min']} | {sc['max']} | {sc['status']} |")
            w()

    # ── 5. Anomaly summary ──
    w("## 5. 异常汇总与建议")
    w()
    issues = []
    for c in api_ods_checks:
        if c["status"] != "✅":
            issues.append(f"API↔ODS `{c['ods_table']}`: {c['status']}")
    for c in ods_dwd_checks:
        if c["status"] != "✅":
            issues.append(f"ODS↔DWD `{c['dwd_table']}`: {c['status']}")
    for c in dws_checks:
        for sc in c.get("sample_checks", []):
            if sc["status"] != "✅":
                issues.append(f"DWS `{c['table']}.{sc['col']}`: {sc['status']}")

    if issues:
        w(f"共发现 {len(issues)} 项异常:")
        w()
        for i, issue in enumerate(issues, 1):
            w(f"{i}. {issue}")
    else:
        w("✅ 未发现数据一致性异常。")
    w()

    return "\n".join(lines)
# ── Entry point ──
def main():
    """Run every check against the newest ETL log and write the Markdown report.

    Exits with status 1 when no ETL log file can be found. The database
    connection is always closed, even if a check raises.
    """
    started = NOW.strftime('%H:%M:%S')
    print(f"[{started}] ETL 全链路数据一致性检查开始...")

    # 1. Locate the newest ETL log
    log_path = find_latest_log()
    if not log_path:
        print("❌ 未找到 ETL 日志文件")
        sys.exit(1)
    print(f" 日志: {log_path.name}")

    # 2. Parse it and keep the ODS tasks that succeeded
    log_results = parse_etl_log(log_path)
    succ_ods = {}
    for name, info in log_results.items():
        if info.get("status") == "SUCC" and info.get("layer") == "ODS":
            succ_ods[name] = info
    print(f" 成功 ODS 任务: {len(succ_ods)}")

    # 3. Connect to the database
    conn = get_conn()
    print(f" 数据库连接成功(只读模式)")

    try:
        # 4. API vs ODS, only for tasks that succeeded in this run
        print(f"\n[API↔ODS] 开始逐表检查...")
        api_ods_checks = []
        for task_name in sorted(ODS_TASK_TO_TABLE):
            if task_name not in succ_ods:
                continue
            ods_table = ODS_TASK_TO_TABLE[task_name]
            print(f" 检查 {task_name} → ods.{ods_table}...", end=" ")
            outcome = check_api_vs_ods(conn, task_name, ods_table)
            api_ods_checks.append(outcome)
            print(outcome["status"])

        # 5. ODS vs DWD, for every mapped table
        print(f"\n[ODS↔DWD] 开始逐表检查...")
        ods_dwd_checks = []
        for dwd_full in sorted(DWD_TO_ODS):
            ods_full = DWD_TO_ODS[dwd_full]
            print(f" 检查 {dwd_full} ← {ods_full}...", end=" ")
            outcome = check_ods_vs_dwd(conn, dwd_full, ods_full)
            ods_dwd_checks.append(outcome)
            print(outcome["status"])

        # 6. DWD vs DWS
        print(f"\n[DWD↔DWS] 开始检查...")
        dws_checks = check_dwd_vs_dws(conn)
        populated = len([c for c in dws_checks if c["rows"] > 0])
        print(f" DWS 表: {len(dws_checks)} 张,{populated} 张有数据")

        # 7. Write the report
        report_text = generate_report(
            log_path, log_results, api_ods_checks, ods_dwd_checks, dws_checks
        )
        out_file = REPORT_ROOT / f"consistency_check_{TS}.md"
        out_file.write_text(report_text, encoding="utf-8")
        print(f"\n✅ 报告已生成: {out_file}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()