1169 lines
44 KiB
Python
1169 lines
44 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
ETL 全链路数据一致性黑盒检查
|
||
|
||
黑盒手段:获取最近一次成功的 ETL 任务,对 API→ODS→DWD→DWS/INDEX
|
||
逐表逐字段进行实际数据(JSON + DB)比对,输出详细差异报告。
|
||
|
||
用法:
|
||
cd C:\\NeoZQYY
|
||
uv run python scripts/ops/etl_consistency_check.py
|
||
|
||
输出: ETL_REPORT_ROOT / consistency_check_<timestamp>.md
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import ast
import json
import os
import re
import sys
from datetime import datetime
from decimal import Decimal
from pathlib import Path
from zoneinfo import ZoneInfo

import psycopg2
import psycopg2.extras
from dotenv import load_dotenv
|
||
|
||
_ROOT = Path(__file__).resolve().parents[2]
|
||
load_dotenv(_ROOT / ".env", override=False)
|
||
|
||
# ── 环境变量(缺失即报错) ──
|
||
def _env(var: str) -> Path:
|
||
val = os.environ.get(var)
|
||
if not val:
|
||
raise KeyError(f"环境变量 {var} 未定义")
|
||
p = Path(val)
|
||
p.mkdir(parents=True, exist_ok=True)
|
||
return p
|
||
|
||
# Required directories, created at import time if missing.
REPORT_ROOT = _env("ETL_REPORT_ROOT")   # consistency-check reports are written here
JSON_ROOT = _env("FETCH_ROOT")          # raw API JSON dumps, one subdirectory per task
LOG_ROOT = _env("LOG_ROOT")             # ETL run logs

# Database DSN; fail fast at import time when not configured.
PG_DSN = os.environ.get("TEST_DB_DSN")
if not PG_DSN:
    raise RuntimeError("TEST_DB_DSN 未在 .env 中定义")

TZ = ZoneInfo("Asia/Shanghai")
NOW = datetime.now(TZ)
TS = NOW.strftime("%Y%m%d_%H%M%S")  # timestamp suffix for the report filename

# Maximum number of rows sampled per table for value comparison.
SAMPLE_LIMIT = 5

# Row-count threshold above which per-field COUNT(DISTINCT) is skipped
# (distinct counts on large tables would be too slow).
FIELD_STATS_DISTINCT_THRESHOLD = 3000
|
||
|
||
# ── ODS task name → ODS table name mapping (kept in sync with blackbox_test_report.py) ──
# CHANGE 2026-03-04 | added ODS_STAFF_INFO; it was previously missing, so the
# staff table did not take part in the consistency check
ODS_TASK_TO_TABLE = {
    "ODS_ASSISTANT_ACCOUNT": "assistant_accounts_master",
    "ODS_ASSISTANT_LEDGER": "assistant_service_records",
    "ODS_SETTLEMENT_RECORDS": "settlement_records",
    "ODS_TABLE_USE": "table_fee_transactions",
    "ODS_TABLE_FEE_DISCOUNT": "table_fee_discount_records",
    "ODS_TABLES": "site_tables_master",
    "ODS_PAYMENT": "payment_transactions",
    "ODS_REFUND": "refund_transactions",
    "ODS_PLATFORM_COUPON": "platform_coupon_redemption_records",
    "ODS_MEMBER": "member_profiles",
    "ODS_MEMBER_CARD": "member_stored_value_cards",
    "ODS_MEMBER_BALANCE": "member_balance_changes",
    "ODS_RECHARGE_SETTLE": "recharge_settlements",
    "ODS_GROUP_PACKAGE": "group_buy_packages",
    "ODS_GROUP_BUY_REDEMPTION": "group_buy_redemption_records",
    "ODS_INVENTORY_STOCK": "goods_stock_summary",
    "ODS_INVENTORY_CHANGE": "goods_stock_movements",
    "ODS_GOODS_CATEGORY": "stock_goods_category_tree",
    "ODS_STORE_GOODS": "store_goods_master",
    "ODS_STORE_GOODS_SALES": "store_goods_sales_records",
    "ODS_TENANT_GOODS": "tenant_goods_master",
    "ODS_STAFF_INFO": "staff_info_master",
}
|
||
|
||
# DWD table → source ODS table mapping.
# CHANGE 2026-03-04 | added dim_staff, dim_site,
# dwd_goods_stock_movement, dwd_goods_stock_summary (previously missing)
# CHANGE 2026-03-04 | removed dwd_assistant_trash_event (table was DROPped on
# 2026-02-22; must not be resurrected)
DWD_TO_ODS = {
    "dwd.dim_assistant": "ods.assistant_accounts_master",
    "dwd.dim_member": "ods.member_profiles",
    "dwd.dim_member_card_account": "ods.member_stored_value_cards",
    "dwd.dim_table": "ods.site_tables_master",
    "dwd.dim_groupbuy_package": "ods.group_buy_packages",
    "dwd.dim_store_goods": "ods.store_goods_master",
    "dwd.dim_tenant_goods": "ods.tenant_goods_master",
    "dwd.dim_goods_category": "ods.stock_goods_category_tree",
    "dwd.dim_staff": "ods.staff_info_master",
    "dwd.dim_site": "ods.site_tables_master",
    "dwd.dwd_assistant_service_log": "ods.assistant_service_records",
    "dwd.dwd_member_balance_change": "ods.member_balance_changes",
    "dwd.dwd_recharge_order": "ods.recharge_settlements",
    "dwd.dwd_settlement_head": "ods.settlement_records",
    "dwd.dwd_table_fee_log": "ods.table_fee_transactions",
    "dwd.dwd_table_fee_adjust": "ods.table_fee_discount_records",
    "dwd.dwd_payment": "ods.payment_transactions",
    "dwd.dwd_refund": "ods.refund_transactions",
    "dwd.dwd_platform_coupon_redemption": "ods.platform_coupon_redemption_records",
    "dwd.dwd_groupbuy_redemption": "ods.group_buy_redemption_records",
    "dwd.dwd_store_goods_sale": "ods.store_goods_sales_records",
    "dwd.dwd_goods_stock_movement": "ods.goods_stock_movements",
    "dwd.dwd_goods_stock_summary": "ods.goods_stock_summary",
}
|
||
|
||
# ETL metadata columns (excluded from value comparison).
ETL_META_COLS = {"source_file", "source_endpoint", "fetched_at", "payload", "content_hash"}
# SCD2 bookkeeping columns (excluded from the ODS→DWD value comparison).
SCD2_COLS = {
    "valid_from", "valid_to", "is_current", "etl_loaded_at", "etl_batch_id",
    "scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version",
}
|
||
|
||
|
||
# ── Database connection ──
def get_conn():
    """Open a read-only PostgreSQL connection with dict-style cursors."""
    connection = psycopg2.connect(PG_DSN, cursor_factory=psycopg2.extras.RealDictCursor)
    connection.set_session(readonly=True)
    return connection
|
||
|
||
|
||
# ── ETL log parsing ──
def find_latest_log() -> Path | None:
    """Return the most recently modified *.log file under LOG_ROOT, or None."""
    candidates = list(LOG_ROOT.glob("*.log"))
    if not candidates:
        return None
    return max(candidates, key=lambda p: p.stat().st_mtime)
|
||
|
||
|
||
def parse_etl_log(log_path: Path) -> dict:
    """Parse an ETL run log and extract per-task execution results.

    Recognizes these line shapes, in priority order:
      * ODS task start      -> timestamp remembered in task_start_times
      * ODS task completion -> SUCC entry with start/end and a counts dict
      * DWD load completion -> SUCC entry for DWD_LOAD_FROM_ODS
      * utility task success/failure -> SUCC/FAIL entry for DWS/INDEX layer
      * "task not enabled"  -> SKIP entry

    Returns {task_name: {status, layer, start?, end?, counts?, error?}}.
    """
    results = {}
    task_start_times = {}

    with open(log_path, "r", encoding="utf-8") as f:
        for line in f:
            # ODS task start: remember the timestamp so the completion line
            # can be paired with it.
            m = re.match(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*开始执行(\w+) \(ODS\)", line)
            if m:
                task_start_times[m.group(2)] = m.group(1)
                continue

            m = re.match(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*?(\w+) ODS 任务完成: (\{.*\})", line)
            if m:
                task_name = m.group(2)
                try:
                    # SECURITY FIX: ast.literal_eval instead of eval — the
                    # counts dict comes from a log file and must never be
                    # executed as arbitrary code.
                    counts = ast.literal_eval(m.group(3))
                except Exception:
                    counts = {}
                results[task_name] = {
                    "status": "SUCC", "layer": "ODS",
                    "start": task_start_times.get(task_name, ""),
                    "end": m.group(1), "counts": counts,
                }
                continue

            m = re.match(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*DWD_LOAD_FROM_ODS.*完成.*?(\d+).*表", line)
            if m:
                results["DWD_LOAD_FROM_ODS"] = {"status": "SUCC", "layer": "DWD", "end": m.group(1)}
                continue

            m = re.match(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*?(\w+): 工具类任务执行成功", line)
            if m:
                results[m.group(2)] = {"status": "SUCC", "layer": "DWS/INDEX", "end": m.group(1)}
                continue

            m = re.match(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*?(\w+): 工具类任务执行失败: (.*)", line)
            if m:
                results[m.group(2)] = {
                    "status": "FAIL", "layer": "DWS/INDEX",
                    # Error text is truncated to keep the report readable.
                    "end": m.group(1), "error": m.group(3)[:120],
                }
                continue

            m = re.match(r".*任务 (\w+) 未启用或不存在", line)
            if m:
                results[m.group(1)] = {"status": "SKIP", "layer": "DWS", "error": "未注册"}

    return results
|
||
|
||
|
||
# ── API JSON reading ──
def load_api_json_records(task_name: str) -> list[dict] | None:
    """Load all records of the most recent JSON dump for an ODS task.

    The JSON is normally in the ETL framework's wrapper format:
        {task_code, run_id, pages: [{page, request, response: {data: {<list_field>: [...]}}}]}
    The name of the data-list field varies per endpoint (tenantMemberInfos,
    list, ...), so the first list-typed value under response.data is taken.

    Subdirectories of JSON_ROOT/<task_name> are scanned newest-first (by
    name); the first one that yields any records wins. Returns None when no
    directory or no records exist.
    """
    task_dir = JSON_ROOT / task_name
    if not task_dir.exists():
        return None
    subdirs = sorted(task_dir.iterdir(), key=lambda p: p.name, reverse=True)
    for sd in subdirs:
        if not sd.is_dir():
            continue
        records = []
        for jf in sorted(sd.glob("*.json")):
            try:
                data = json.loads(jf.read_text(encoding="utf-8"))
            except Exception:
                # Unreadable/corrupt file: skip, keep scanning.
                continue

            # ETL framework wrapper format: pages[].response.data.<list_field>
            if isinstance(data, dict) and "pages" in data:
                for page in data["pages"]:
                    resp = page.get("response", {})
                    if not isinstance(resp, dict):
                        continue
                    resp_data = resp.get("data", {})
                    if isinstance(resp_data, dict):
                        # First list-typed value under response.data.
                        for v in resp_data.values():
                            if isinstance(v, list):
                                records.extend(v)
                                break
                    elif isinstance(resp_data, list):
                        records.extend(resp_data)
            # Legacy formats: a bare list, {data: [...]}, or {data: {list: [...]}}
            elif isinstance(data, list):
                records.extend(data)
            elif isinstance(data, dict):
                payload = data.get("data")
                if isinstance(payload, list):
                    # BUGFIX: previously `data.get("data", {}).get("list", ...)`
                    # raised AttributeError when "data" held a list directly.
                    records.extend(payload)
                elif isinstance(payload, dict):
                    lst = payload.get("list", [])
                    if isinstance(lst, list):
                        records.extend(lst)

        if records:
            return records
    return None
|
||
|
||
|
||
# ── Table structure queries ──
def get_table_columns(conn, schema: str, table: str) -> list[str]:
    """Return the column names of schema.table, in ordinal order."""
    with conn.cursor() as cur:
        cur.execute("""
            SELECT column_name FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
            ORDER BY ordinal_position
        """, (schema, table))
        fetched = cur.fetchall()
    return [entry["column_name"] for entry in fetched]
|
||
|
||
|
||
def get_table_row_count(conn, schema: str, table: str) -> int:
    """Return COUNT(*) for schema.table, or -1 when the query fails.

    Failures roll back the (read-only) transaction so later queries on the
    same connection keep working.
    """
    query = "SELECT COUNT(*) AS cnt FROM {}.{}".format(schema, table)
    with conn.cursor() as cur:
        try:
            cur.execute(query)
            return cur.fetchone()["cnt"]
        except Exception:
            conn.rollback()
            return -1
|
||
|
||
|
||
def sample_rows(conn, schema: str, table: str, limit: int = SAMPLE_LIMIT) -> list[dict]:
    """Randomly sample up to `limit` rows, coerced to JSON-friendly dicts.

    Decimal -> float and datetime -> str; all other values (including
    dict/list JSON columns) pass through unchanged. Returns [] on failure.
    """
    def _coerce(value):
        # Decimals and datetimes are not JSON-serializable as-is.
        if isinstance(value, Decimal):
            return float(value)
        if isinstance(value, datetime):
            return str(value)
        return value

    with conn.cursor() as cur:
        try:
            cur.execute(f"""
                SELECT * FROM {schema}.{table}
                ORDER BY random() LIMIT {limit}
            """)
            return [
                {key: _coerce(val) for key, val in dict(row).items()}
                for row in cur.fetchall()
            ]
        except Exception:
            conn.rollback()
            return []
|
||
|
||
|
||
# ── Per-field statistics (same per-field strategy as field_level_report) ──
def get_field_stats(conn, schema: str, table: str) -> list[dict]:
    """Collect per-column statistics for one table in a single aggregate scan.

    For every column: NULL count and NULL rate. Additionally, by udt type:
    numeric -> min/max/avg, date/timestamp -> earliest/latest (as text),
    text -> length range, bool -> true/false counts. Tables with at most
    FIELD_STATS_DISTINCT_THRESHOLD rows also get a COUNT(DISTINCT) per
    column; larger tables report "-".

    Returns one dict per column, in ordinal order. When the aggregate query
    fails, every column is reported with null_pct "?" and error=True.
    """
    cols_meta = []
    with conn.cursor() as cur:
        cur.execute("""
            SELECT column_name, udt_name
            FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
            ORDER BY ordinal_position
        """, (schema, table))
        cols_meta = [(r["column_name"], r["udt_name"]) for r in cur.fetchall()]

    if not cols_meta:
        return []

    row_count = get_table_row_count(conn, schema, table)
    if row_count <= 0:
        # Empty table (0) or count failure (-1): no stats worth computing.
        return [{"column": c, "type": t, "total": row_count, "null_pct": "—"} for c, t in cols_meta]

    # Build one bulk aggregate SELECT (similar strategy to field_level_report.py).
    select_parts = [f"{row_count} AS _total"]
    # ETL bookkeeping columns get only a NULL count, no value statistics.
    skip_stats = {"payload", "content_hash", "record_index", "source_file", "source_endpoint"}
    col_plan = []

    for cname, udt in cols_meta:
        safe = f'"{cname}"'
        # Aliases must be valid SQL identifiers: normalize spaces/dashes.
        alias = cname.replace(" ", "_").replace("-", "_")
        plan = {"column": cname, "type": udt, "stats": []}

        select_parts.append(f"COUNT(*) FILTER (WHERE {safe} IS NULL) AS null_{alias}")
        plan["stats"].append("null")

        # JSON/binary columns and ETL metadata: NULL count only.
        if udt in ("jsonb", "json", "bytea") or cname in skip_stats:
            col_plan.append(plan)
            continue

        if udt in ("int2", "int4", "int8", "float4", "float8", "numeric"):
            select_parts.append(f"MIN({safe}) AS min_{alias}")
            select_parts.append(f"MAX({safe}) AS max_{alias}")
            select_parts.append(f"ROUND(AVG({safe})::numeric, 2) AS avg_{alias}")
            plan["stats"].extend(["min", "max", "avg"])
        elif udt in ("date", "timestamp", "timestamptz"):
            # The ::text range guards exclude out-of-range sentinel dates.
            select_parts.append(f"MIN({safe}::text) FILTER (WHERE {safe}::text >= '0001') AS min_{alias}")
            select_parts.append(f"MAX({safe}::text) FILTER (WHERE {safe}::text <= '9999') AS max_{alias}")
            plan["stats"].extend(["earliest", "latest"])
        elif udt in ("text", "varchar", "bpchar", "name"):
            select_parts.append(f"MIN(LENGTH({safe})) AS minlen_{alias}")
            select_parts.append(f"MAX(LENGTH({safe})) AS maxlen_{alias}")
            plan["stats"].extend(["min_len", "max_len"])
        elif udt == "bool":
            select_parts.append(f"COUNT(*) FILTER (WHERE {safe} = TRUE) AS true_{alias}")
            select_parts.append(f"COUNT(*) FILTER (WHERE {safe} = FALSE) AS false_{alias}")
            plan["stats"].extend(["true_count", "false_count"])

        col_plan.append(plan)

    with conn.cursor() as cur:
        try:
            cur.execute(f"SELECT {', '.join(select_parts)} FROM {schema}.{table}")
            agg = cur.fetchone()
        except Exception:
            conn.rollback()
            return [{"column": c, "type": t, "total": row_count, "null_pct": "?", "error": True}
                    for c, t in cols_meta]

    results = []
    for plan in col_plan:
        cname = plan["column"]
        udt = plan["type"]
        alias = cname.replace(" ", "_").replace("-", "_")
        null_cnt = agg.get(f"null_{alias}", 0) or 0
        null_pct = round(null_cnt / row_count * 100, 1) if row_count > 0 else 0

        r = {"column": cname, "type": udt, "total": row_count,
             "null_count": null_cnt, "null_pct": f"{null_pct}%"}

        # Columns that got no value statistics are labelled instead.
        if udt in ("jsonb", "json", "bytea") or cname in skip_stats:
            r["note"] = f"({udt.upper()})" if udt in ("jsonb", "json", "bytea") else "(ETL元数据)"
            results.append(r)
            continue

        if "min" in plan["stats"]:
            r["min"] = agg.get(f"min_{alias}")
            r["max"] = agg.get(f"max_{alias}")
            r["avg"] = agg.get(f"avg_{alias}")
        if "earliest" in plan["stats"]:
            r["earliest"] = str(agg.get(f"min_{alias}") or "")
            r["latest"] = str(agg.get(f"max_{alias}") or "")
        if "min_len" in plan["stats"]:
            r["min_len"] = agg.get(f"minlen_{alias}")
            r["max_len"] = agg.get(f"maxlen_{alias}")
        if "true_count" in plan["stats"]:
            r["true_count"] = agg.get(f"true_{alias}")
            r["false_count"] = agg.get(f"false_{alias}")

        results.append(r)

    # Small tables additionally get a per-column distinct count.
    if row_count <= FIELD_STATS_DISTINCT_THRESHOLD:
        for r in results:
            cname = r["column"]
            if r.get("note"):
                r["distinct"] = "-"
                continue
            with conn.cursor() as cur:
                try:
                    cur.execute(f'SELECT COUNT(DISTINCT "{cname}") AS d FROM {schema}.{table}')
                    r["distinct"] = cur.fetchone()["d"]
                except Exception:
                    conn.rollback()
                    r["distinct"] = "?"
    else:
        for r in results:
            r["distinct"] = "-"

    return results
|
||
|
||
|
||
# ── Data cutoff-date lookup ──
# Per-ODS-table cutoff column: most tables use create_time (the default in
# get_data_cutoff_date); two dimension tables fall back to fetched_at, and a
# few use "createtime" (no underscore) in the source system.
_CUTOFF_DATE_COLUMN: dict[str, str] = {
    "goods_stock_summary": "fetched_at",
    "stock_goods_category_tree": "fetched_at",
    # These tables use "createtime" (no underscore).
    "goods_stock_movements": "createtime",
    "settlement_records": "createtime",
    "recharge_settlements": "createtime",
}
|
||
|
||
|
||
def get_data_cutoff_date(conn, schema: str, table: str) -> str | None:
    """Return the most recent date (ISO text) present in the table's time column.

    The column is looked up in _CUTOFF_DATE_COLUMN and defaults to
    create_time; any query failure rolls back and yields None.
    """
    cutoff_col = _CUTOFF_DATE_COLUMN.get(table, "create_time")
    query = f'SELECT MAX("{cutoff_col}")::date::text AS cutoff FROM {schema}.{table}'
    with conn.cursor() as cur:
        try:
            cur.execute(query)
            row = cur.fetchone()
            return row["cutoff"] if row and row["cutoff"] else None
        except Exception:
            conn.rollback()
            return None
|
||
|
||
|
||
# ── API vs ODS field-level comparison ──
def check_api_vs_ods(conn, task_name: str, ods_table: str) -> dict:
    """Compare API JSON fields with ODS table columns and sample row values.

    Steps: load the latest API JSON dump for the task; count ODS rows and
    distinct ids; match API field names against ODS columns
    (case-insensitively, ETL metadata columns excluded); then, for up to
    SAMPLE_LIMIT API records with an "id", fetch the newest ODS snapshot row
    and compare up to 10 matched fields value-by-value.

    Returns a summary dict; status is "❌ 存在差异" when unmatched fields or
    real (non-whitelisted) value differences exist.
    """
    result = {
        "task": task_name, "ods_table": ods_table,
        "api_records": 0, "ods_rows": 0, "ods_distinct_ids": 0,
        "field_check": {"api_fields": 0, "ods_cols": 0, "matched": 0,
                       "api_only": [], "ods_only": []},
        "value_diffs": [], "status": "✅",
    }

    # Load the API JSON; without it no comparison is possible.
    api_records = load_api_json_records(task_name)
    if not api_records:
        result["status"] = "⚠️ 无 API JSON"
        return result
    result["api_records"] = len(api_records)

    # ODS row counts (total and distinct ids — ODS keeps multiple snapshots).
    ods_cols = get_table_columns(conn, "ods", ods_table)
    result["ods_rows"] = get_table_row_count(conn, "ods", ods_table)
    with conn.cursor() as cur:
        try:
            cur.execute(f"SELECT COUNT(DISTINCT id) AS cnt FROM ods.{ods_table}")
            result["ods_distinct_ids"] = cur.fetchone()["cnt"]
        except Exception:
            conn.rollback()
            result["ods_distinct_ids"] = -1

    # Field comparison: top-level API JSON keys vs ODS columns.
    api_fields = set()
    for rec in api_records[:50]:  # union of keys over the first 50 records
        if isinstance(rec, dict):
            api_fields.update(rec.keys())

    ods_col_set = set(ods_cols) - ETL_META_COLS
    # Case-insensitive matching: map lowercase name -> original name.
    api_lower = {f.lower(): f for f in api_fields}
    ods_lower = {c.lower(): c for c in ods_col_set}

    matched = set(api_lower.keys()) & set(ods_lower.keys())
    api_only = sorted(set(api_lower.keys()) - set(ods_lower.keys()))
    ods_only = sorted(set(ods_lower.keys()) - set(api_lower.keys()))

    result["field_check"]["api_fields"] = len(api_fields)
    result["field_check"]["ods_cols"] = len(ods_col_set)
    result["field_check"]["matched"] = len(matched)
    result["field_check"]["api_only"] = [api_lower[k] for k in api_only]
    result["field_check"]["ods_only"] = [ods_lower[k] for k in ods_only]

    # Value sampling: take the ids of the first N API records, fetch the
    # matching ODS rows and compare field values.
    value_diffs = []
    sample_ids = []
    for rec in api_records[:SAMPLE_LIMIT]:
        if isinstance(rec, dict) and "id" in rec:
            sample_ids.append(rec["id"])

    if sample_ids and matched:
        compare_cols = sorted(matched)[:10]  # compare at most 10 fields
        with conn.cursor() as cur:
            for sid in sample_ids:
                try:
                    # Newest snapshot wins when ODS holds several per id.
                    cur.execute(
                        f"SELECT * FROM ods.{ods_table} WHERE id = %s ORDER BY fetched_at DESC LIMIT 1",
                        (sid,)
                    )
                    ods_row = cur.fetchone()
                except Exception:
                    conn.rollback()
                    ods_row = None

                if not ods_row:
                    continue

                api_rec = next((r for r in api_records if r.get("id") == sid), None)
                if not api_rec:
                    continue

                for col_lower in compare_cols:
                    api_key = api_lower[col_lower]
                    ods_key = ods_lower[col_lower]
                    api_val = api_rec.get(api_key)
                    ods_val = ods_row.get(ods_key)

                    is_diff, reason = _values_differ(api_val, ods_val)
                    if is_diff or reason == "whitelist":
                        value_diffs.append({
                            "id": sid, "field": col_lower,
                            "api_val": _fmt_val(api_val),
                            "ods_val": _fmt_val(ods_val),
                            "whitelist": reason == "whitelist",
                        })

    result["value_diffs"] = value_diffs[:20]  # report at most 20 differences
    # Only genuine (non-whitelisted) differences flag the table as abnormal.
    real_diffs = [d for d in value_diffs if not d.get("whitelist")]
    if api_only or ods_only or real_diffs:
        result["status"] = "❌ 存在差异"

    # Per-field statistics for the ODS table.
    result["ods_field_stats"] = get_field_stats(conn, "ods", ods_table)

    # Latest data date present in the ODS table.
    result["data_cutoff"] = get_data_cutoff_date(conn, "ods", ods_table)

    return result
|
||
|
||
|
||
def _values_differ(api_val, ods_val) -> tuple[bool, str]:
|
||
"""比较两个值是否实质不同(容忍类型差异)。
|
||
|
||
返回 (is_different, reason):
|
||
- (False, "") — 值相同
|
||
- (False, "whitelist") — API 空字符串 vs DB None,视为等价(白名单)
|
||
- (True, "") — 值确实不同
|
||
注意:0 与 None 绝不相等!
|
||
"""
|
||
if api_val is None and ods_val is None:
|
||
return False, ""
|
||
|
||
# API 空字符串 "" vs DB None → 白名单(等价但标记)
|
||
if api_val is not None and ods_val is None:
|
||
if isinstance(api_val, str) and api_val.strip() == "":
|
||
return False, "whitelist"
|
||
return True, ""
|
||
if api_val is None and ods_val is not None:
|
||
if isinstance(ods_val, str) and str(ods_val).strip() == "":
|
||
return False, "whitelist"
|
||
return True, ""
|
||
|
||
# 都转字符串比较(容忍 int vs str、Decimal vs float 等)
|
||
a = str(api_val).strip()
|
||
b = str(ods_val).strip()
|
||
if a == b:
|
||
return False, ""
|
||
# 数值比较
|
||
try:
|
||
if float(a) == float(b):
|
||
return False, ""
|
||
except (ValueError, TypeError):
|
||
pass
|
||
return True, ""
|
||
|
||
|
||
def _fmt_val(val) -> str:
|
||
"""完整展示差异值,不截断"""
|
||
return str(val)
|
||
|
||
|
||
# ── ODS vs DWD 字段级比对 ──
|
||
def _find_ex_table(conn, dwd_schema: str, dwd_table: str) -> str | None:
|
||
"""查找 DWD 主表对应的 EX 表(如 dim_assistant → dim_assistant_ex)"""
|
||
ex_name = dwd_table + "_ex"
|
||
with conn.cursor() as cur:
|
||
cur.execute("""
|
||
SELECT table_name FROM information_schema.tables
|
||
WHERE table_schema = %s AND table_name = %s AND table_type = 'BASE TABLE'
|
||
""", (dwd_schema, ex_name))
|
||
row = cur.fetchone()
|
||
return ex_name if row else None
|
||
|
||
|
||
def _get_dwd_pk(conn, dwd_schema: str, dwd_table: str) -> str | None:
|
||
"""获取 DWD 主表的第一列(即 PK 列名)"""
|
||
with conn.cursor() as cur:
|
||
cur.execute("""
|
||
SELECT column_name FROM information_schema.columns
|
||
WHERE table_schema = %s AND table_name = %s
|
||
ORDER BY ordinal_position LIMIT 1
|
||
""", (dwd_schema, dwd_table))
|
||
row = cur.fetchone()
|
||
return row["column_name"] if row else None
|
||
|
||
|
||
def check_ods_vs_dwd(conn, dwd_full: str, ods_full: str) -> dict:
    """Compare ODS with DWD (main table merged with its EX table).

    Compares row counts, column sets (case-insensitive, SCD2 bookkeeping
    and ETL metadata columns excluded) and, for up to SAMPLE_LIMIT random
    DWD rows (current versions only for SCD2 dimensions), the values of up
    to 15 shared columns against the newest ODS snapshot of the same id.

    dwd_full / ods_full are "schema.table" strings. Status becomes
    "❌ 存在差异" when DWD-only columns or real value differences exist.
    """
    dwd_s, dwd_t = dwd_full.split(".")
    ods_s, ods_t = ods_full.split(".")
    # dim_* tables are SCD2 dimensions; everything else is a fact table.
    is_dim = dwd_t.startswith("dim_")

    result = {
        "dwd_table": dwd_full, "ods_table": ods_full,
        "type": "维度(SCD2)" if is_dim else "事实",
        "ods_rows": 0, "ods_distinct_ids": 0, "dwd_rows": 0,
        "dwd_cols": [], "ods_cols": [],
        "common_cols": [], "dwd_only_cols": [], "ods_only_cols": [],
        "value_diffs": [], "status": "✅",
        "ex_table": None,
    }

    # Row counts (ODS may hold multiple snapshots per id, hence DISTINCT id).
    result["ods_rows"] = get_table_row_count(conn, ods_s, ods_t)
    result["dwd_rows"] = get_table_row_count(conn, dwd_s, dwd_t)
    with conn.cursor() as cur:
        try:
            cur.execute(f"SELECT COUNT(DISTINCT id) AS cnt FROM {ods_s}.{ods_t}")
            result["ods_distinct_ids"] = cur.fetchone()["cnt"]
        except Exception:
            conn.rollback()

    # DWD PK column name (the first column by convention).
    dwd_pk = _get_dwd_pk(conn, dwd_s, dwd_t)
    if not dwd_pk:
        return result

    # Optional "_ex" extension table holding extra columns.
    ex_table = _find_ex_table(conn, dwd_s, dwd_t)
    result["ex_table"] = f"{dwd_s}.{ex_table}" if ex_table else None

    # Column comparison: union of main + EX table columns.
    dwd_main_cols = set(get_table_columns(conn, dwd_s, dwd_t))
    dwd_ex_cols = set()
    if ex_table:
        dwd_ex_cols = set(get_table_columns(conn, dwd_s, ex_table))
    # Drop SCD2 bookkeeping columns from the business-column set.
    dwd_all_cols = dwd_main_cols | dwd_ex_cols
    dwd_biz_cols = dwd_all_cols - SCD2_COLS

    ods_cols = set(get_table_columns(conn, ods_s, ods_t)) - ETL_META_COLS

    result["dwd_cols"] = sorted(dwd_all_cols)
    result["ods_cols"] = sorted(ods_cols)

    # Shared columns, matched case-insensitively.
    dwd_lower = {c.lower(): c for c in dwd_biz_cols}
    ods_lower = {c.lower(): c for c in ods_cols}
    common = set(dwd_lower.keys()) & set(ods_lower.keys())
    dwd_only = sorted(set(dwd_lower.keys()) - set(ods_lower.keys()))
    ods_only = sorted(set(ods_lower.keys()) - set(dwd_lower.keys()))

    result["common_cols"] = sorted(common)
    result["dwd_only_cols"] = [dwd_lower[k] for k in dwd_only]
    result["ods_only_cols"] = [ods_lower[k] for k in ods_only]

    if not common:
        return result

    # Track whether each shared column lives in the main or the EX table.
    main_lower = {c.lower(): c for c in dwd_main_cols}
    ex_lower = {c.lower(): c for c in dwd_ex_cols} if ex_table else {}

    compare_cols = sorted(common)[:15]  # compare at most 15 columns
    value_diffs = []

    with conn.cursor() as cur:
        # Sample rows from the DWD main table (current SCD2 versions only
        # for dimensions).
        try:
            if is_dim:
                cur.execute(f"""
                    SELECT * FROM {dwd_s}.{dwd_t}
                    WHERE scd2_is_current = true
                    ORDER BY random() LIMIT {SAMPLE_LIMIT}
                """)
            else:
                cur.execute(f"""
                    SELECT * FROM {dwd_s}.{dwd_t}
                    ORDER BY random() LIMIT {SAMPLE_LIMIT}
                """)
            dwd_samples = cur.fetchall()
        except Exception:
            conn.rollback()
            dwd_samples = []

        for dwd_row in dwd_samples:
            pk_val = dwd_row.get(dwd_pk)
            if pk_val is None:
                continue

            # Merge the matching EX-table row into the sampled main row.
            dwd_merged = dict(dwd_row)
            if ex_table:
                try:
                    cur.execute(
                        f'SELECT * FROM {dwd_s}.{ex_table} WHERE "{dwd_pk}" = %s LIMIT 1',
                        (pk_val,)
                    )
                    ex_row = cur.fetchone()
                    if ex_row:
                        dwd_merged.update(dict(ex_row))
                except Exception:
                    conn.rollback()

            # Fetch the newest ODS snapshot for the same id (ODS keys on "id").
            try:
                cur.execute(
                    f"SELECT * FROM {ods_s}.{ods_t} WHERE id = %s ORDER BY fetched_at DESC LIMIT 1",
                    (pk_val,)
                )
                ods_row = cur.fetchone()
            except Exception:
                conn.rollback()
                ods_row = None

            if not ods_row:
                # The whole row is missing on the ODS side — report as one diff.
                value_diffs.append({
                    "id": pk_val, "field": "(全行)",
                    "dwd_val": "存在", "ods_val": "ODS 中未找到",
                    "whitelist": False,
                })
                continue

            for col_lower in compare_cols:
                dwd_key = dwd_lower.get(col_lower) or main_lower.get(col_lower) or ex_lower.get(col_lower)
                ods_key = ods_lower[col_lower]
                dwd_val = dwd_merged.get(dwd_key)
                ods_val = ods_row.get(ods_key)

                is_diff, reason = _values_differ(dwd_val, ods_val)
                if is_diff or reason == "whitelist":
                    value_diffs.append({
                        "id": pk_val, "field": col_lower,
                        "dwd_val": _fmt_val(dwd_val),
                        "ods_val": _fmt_val(ods_val),
                        "whitelist": reason == "whitelist",
                    })

    result["value_diffs"] = value_diffs[:30]
    # Only genuine (non-whitelisted) differences flag the pair as abnormal.
    real_diffs = [d for d in value_diffs if not d.get("whitelist")]
    if dwd_only or real_diffs:
        result["status"] = "❌ 存在差异"

    # Per-field statistics for the DWD main table.
    result["dwd_field_stats"] = get_field_stats(conn, dwd_s, dwd_t)

    # Latest data date, taken from the ODS source table.
    result["data_cutoff"] = get_data_cutoff_date(conn, ods_s, ods_t)

    return result
|
||
|
||
|
||
# ── DWD vs DWS aggregation sanity checks ──
def check_dwd_vs_dws(conn) -> list[dict]:
    """Run sanity checks over every DWS base table.

    For each table: row count and column count; non-empty tables
    additionally get basic health checks on up to 8 numeric columns (NULL
    rate, negative-value count, min/max). Negative values in *amount*
    columns and NULL rates above 50% are flagged.

    Returns one entry dict per DWS table, in table-name order.
    """
    results = []

    # Enumerate all base tables in the dws schema.
    with conn.cursor() as cur:
        cur.execute("""
            SELECT table_name FROM information_schema.tables
            WHERE table_schema = 'dws' AND table_type = 'BASE TABLE'
            ORDER BY table_name
        """)
        dws_tables = [r["table_name"] for r in cur.fetchall()]

    for dws_t in dws_tables:
        row_count = get_table_row_count(conn, "dws", dws_t)
        cols = get_table_columns(conn, "dws", dws_t)

        entry = {
            "table": f"dws.{dws_t}",
            "rows": row_count,
            "col_count": len(cols),
            "status": "✅" if row_count > 0 else "⚠️ 空表",
            "sample_checks": [],
        }

        if row_count > 0:
            # Basic sanity checks on numeric columns (negatives, NULL rate).
            numeric_cols = []
            with conn.cursor() as cur:
                cur.execute("""
                    SELECT column_name, udt_name FROM information_schema.columns
                    WHERE table_schema = 'dws' AND table_name = %s
                    AND udt_name IN ('int2','int4','int8','float4','float8','numeric')
                    ORDER BY ordinal_position
                """, (dws_t,))
                numeric_cols = [(r["column_name"], r["udt_name"]) for r in cur.fetchall()]

            for col_name, col_type in numeric_cols[:8]:  # check at most 8 numeric columns
                with conn.cursor() as cur:
                    try:
                        cur.execute(f"""
                            SELECT
                                COUNT(*) FILTER (WHERE "{col_name}" IS NULL) AS null_cnt,
                                COUNT(*) FILTER (WHERE "{col_name}" < 0) AS neg_cnt,
                                MIN("{col_name}") AS min_val,
                                MAX("{col_name}") AS max_val,
                                COUNT(*) AS total
                            FROM dws.{dws_t}
                        """)
                        stats = cur.fetchone()
                        total = stats["total"]
                        null_pct = round(stats["null_cnt"] / total * 100, 1) if total > 0 else 0
                        check = {
                            "col": col_name,
                            "null_pct": f"{null_pct}%",
                            "neg_count": stats["neg_cnt"],
                            "min": str(stats["min_val"]),
                            "max": str(stats["max_val"]),
                            "status": "✅",
                        }
                        # Negative values in amount-like columns are suspicious.
                        if stats["neg_cnt"] > 0 and "amount" in col_name.lower():
                            check["status"] = "⚠️ 金额负值"
                        if null_pct > 50:
                            check["status"] = "⚠️ 高 NULL 率"
                        entry["sample_checks"].append(check)
                    except Exception:
                        conn.rollback()

        results.append(entry)

    return results
|
||
|
||
|
||
# ── 报告生成 ──
|
||
def generate_report(
    log_path: Path,
    log_results: dict,
    api_ods_checks: list[dict],
    ods_dwd_checks: list[dict],
    dws_checks: list[dict],
) -> str:
    """Render the full consistency-check report as one Markdown string.

    Args:
        log_path: Path of the ETL log that was analysed (only its name is shown).
        log_results: Mapping task-name -> parsed log entry; each entry is read
            for a "status" key (SUCC/FAIL/SKIP) and, on failure, an "error" key.
        api_ods_checks: Per-table results of the API↔ODS comparison. Each dict
            is expected to carry "ods_table", "api_records", "ods_rows",
            "ods_distinct_ids", "field_check", "value_diffs", "status" and
            optionally "data_cutoff" / "ods_field_stats" — produced upstream
            by check_api_vs_ods (not visible here; verify against that function).
        ods_dwd_checks: Per-table results of the ODS↔DWD comparison (analogous
            keys, produced by check_ods_vs_dwd).
        dws_checks: Per-table DWS inspection results with "table", "rows",
            "col_count", "status" and "sample_checks".

    Returns:
        The complete Markdown document joined with newlines (no trailing newline).
    """
    lines: list[str] = []

    # Small writer helper: w() with no argument emits a blank line.
    def w(s: str = "") -> None:
        lines.append(s)

    w("# ETL 全链路数据一致性检查报告")
    w()
    # NOW is the module-level timestamp (Asia/Shanghai), hence the "CST" suffix.
    w(f"生成时间: {NOW.strftime('%Y-%m-%d %H:%M:%S')} CST")
    w(f"ETL 日志: `{log_path.name}`")
    w()

    def _fmt_field_stat(fs: dict) -> str:
        """Format one per-field statistics dict as a Markdown table row.

        Which optional keys are present depends on the column's type
        (numeric -> min/max/avg, timestamp -> earliest/latest, text ->
        min_len/max_len, boolean -> true_count/false_count).
        """
        col = fs.get("column", "?")
        typ = fs.get("type", "?")
        null_pct = fs.get("null_pct", "—")
        distinct = fs.get("distinct", "-")
        parts = []
        if "min" in fs and fs["min"] is not None:
            parts.append(f"min={fs['min']}, max={fs['max']}, avg={fs['avg']}")
        if "earliest" in fs and fs["earliest"]:
            parts.append(f"{fs['earliest']} ~ {fs['latest']}")
        if "min_len" in fs and fs["min_len"] is not None:
            parts.append(f"len={fs['min_len']}~{fs['max_len']}")
        if "true_count" in fs:
            parts.append(f"T={fs['true_count']}, F={fs['false_count']}")
        if fs.get("note"):
            parts.append(fs["note"])
        stats = "; ".join(parts) if parts else "—"
        return f"| `{col}` | {typ} | {null_pct} | {distinct} | {stats} |"

    def _write_field_stats_table(w, stats: list[dict], table_label: str) -> None:
        """Emit a collapsible (<details>) Markdown table of field-level stats.

        The `w` parameter deliberately shadows the outer writer so the helper
        stays self-contained; no-op when `stats` is empty.
        """
        if not stats:
            return
        w(f"<details><summary>📊 {table_label} 字段级统计({len(stats)} 列)</summary>")
        w()
        w("| 字段 | 类型 | NULL率 | 唯一值 | 统计 |")
        w("|------|------|--------|--------|------|")
        for fs in stats:
            w(_fmt_field_stat(fs))
        w()
        w("</details>")
        w()

    # ── 1. ETL execution overview ──
    w("## 1. ETL 执行概览")
    w()
    succ = sum(1 for v in log_results.values() if v.get("status") == "SUCC")
    fail = sum(1 for v in log_results.values() if v.get("status") == "FAIL")
    skip = sum(1 for v in log_results.values() if v.get("status") == "SKIP")
    w(f"| 指标 | 值 |")
    w(f"|------|-----|")
    w(f"| 成功任务 | {succ} |")
    w(f"| 失败任务 | {fail} |")
    w(f"| 跳过任务 | {skip} |")
    w(f"| 总计 | {len(log_results)} |")
    w()

    # Failed tasks get their own sub-section with the captured error message.
    if fail > 0:
        w("### 失败任务")
        w()
        for name, info in sorted(log_results.items()):
            if info.get("status") == "FAIL":
                w(f"- **{name}**: {info.get('error', '未知错误')}")
        w()

    # ── 2. API↔ODS consistency ──
    w("## 2. API↔ODS 数据一致性")
    w()
    w("### 2.1 汇总")
    w()
    w("| ODS 表 | API 记录数 | ODS 行数 | ODS 去重ID | 数据截止 | 字段匹配 | API独有 | ODS独有 | 值差异 | 白名单 | 状态 |")
    w("|--------|-----------|---------|-----------|---------|---------|--------|--------|-------|--------|------|")
    for c in api_ods_checks:
        fc = c["field_check"]
        # Whitelisted diffs ("" vs None) are counted separately from real diffs.
        real_diffs = [d for d in c["value_diffs"] if not d.get("whitelist")]
        wl_diffs = [d for d in c["value_diffs"] if d.get("whitelist")]
        cutoff = c.get("data_cutoff") or "—"
        w(f"| `{c['ods_table']}` | {c['api_records']} | {c['ods_rows']} | {c['ods_distinct_ids']} "
          f"| {cutoff} "
          f"| {fc['matched']}/{fc['api_fields']} | {len(fc['api_only'])} | {len(fc['ods_only'])} "
          f"| {len(real_diffs)} | {len(wl_diffs)} | {c['status']} |")
    w()

    # Per-table diff details + field statistics.
    # NOTE(review): every per-table detail heading reuses the literal "2.2"
    # (and "3.2" below) — duplicated heading numbers; confirm intentional.
    for c in api_ods_checks:
        fc = c["field_check"]
        has_diff = fc["api_only"] or fc["ods_only"] or c["value_diffs"]
        has_stats = c.get("ods_field_stats")

        # Fully clean tables with no stats payload are omitted entirely.
        if not has_diff and not has_stats:
            continue

        w(f"### 2.2 {c['ods_table']} 详情")
        w()

        if fc["api_only"]:
            # Field-name lists are truncated to the first 15 entries.
            w(f"**API 独有字段** ({len(fc['api_only'])} 个): `{'`, `'.join(fc['api_only'][:15])}`")
            w()

        if fc["ods_only"]:
            w(f"**ODS 独有字段** ({len(fc['ods_only'])} 个): `{'`, `'.join(fc['ods_only'][:15])}`")
            w()

        if c["value_diffs"]:
            real = [d for d in c["value_diffs"] if not d.get("whitelist")]
            wl = [d for d in c["value_diffs"] if d.get("whitelist")]

            if real:
                w(f"**值差异采样** ({len(real)} 条)")
                w()
                w("| ID | 字段 | API 值 | ODS 值 |")
                w("|----|------|--------|--------|")
                for d in real:
                    w(f"| {d['id']} | `{d['field']}` | {d['api_val']} | {d['ods_val']} |")
                w()

            if wl:
                # Whitelist diffs are considered equivalent; show collapsed.
                w(f"<details><summary>🔕 白名单差异({len(wl)} 条)— API 空字符串 vs DB None,视为等价</summary>")
                w()
                w("| ID | 字段 | API 值 | ODS 值 |")
                w("|----|------|--------|--------|")
                for d in wl:
                    w(f"| {d['id']} | `{d['field']}` | {d['api_val']} | {d['ods_val']} |")
                w()
                w("</details>")
                w()

        # ODS field-level statistics (collapsed table).
        if has_stats:
            _write_field_stats_table(w, c["ods_field_stats"], f"ods.{c['ods_table']}")

    # ── 3. ODS↔DWD consistency ──
    w("## 3. ODS↔DWD 数据一致性")
    w()
    w("### 3.1 汇总")
    w()
    w("| DWD 表 | EX 表 | ODS 表 | 类型 | ODS 行 | ODS 去重ID | DWD 行 | 数据截止 | 共同列 | DWD独有 | 值差异 | 白名单 | 状态 |")
    w("|--------|-------|--------|------|-------|-----------|-------|---------|-------|--------|-------|--------|------|")
    for c in ods_dwd_checks:
        real_diffs = [d for d in c["value_diffs"] if not d.get("whitelist")]
        wl_diffs = [d for d in c["value_diffs"] if d.get("whitelist")]
        ex_label = c.get("ex_table", "—") or "—"
        cutoff = c.get("data_cutoff") or "—"
        w(f"| `{c['dwd_table']}` | `{ex_label}` | `{c['ods_table']}` | {c['type']} "
          f"| {c['ods_rows']} | {c['ods_distinct_ids']} | {c['dwd_rows']} "
          f"| {cutoff} "
          f"| {len(c['common_cols'])} | {len(c['dwd_only_cols'])} "
          f"| {len(real_diffs)} | {len(wl_diffs)} | {c['status']} |")
    w()

    # Per-table diff details + field statistics (mirrors section 2's layout).
    for c in ods_dwd_checks:
        has_diff = c["dwd_only_cols"] or c["value_diffs"]
        has_stats = c.get("dwd_field_stats")

        if not has_diff and not has_stats:
            continue

        w(f"### 3.2 {c['dwd_table']} 详情")
        w()

        if c["dwd_only_cols"]:
            w(f"**DWD 独有列** ({len(c['dwd_only_cols'])} 个): `{'`, `'.join(c['dwd_only_cols'][:15])}`")
            w()

        if c["ods_only_cols"]:
            w(f"**ODS 独有列** ({len(c['ods_only_cols'])} 个): `{'`, `'.join(c['ods_only_cols'][:15])}`")
            w()

        if c["value_diffs"]:
            real = [d for d in c["value_diffs"] if not d.get("whitelist")]
            wl = [d for d in c["value_diffs"] if d.get("whitelist")]

            if real:
                w(f"**值差异采样** ({len(real)} 条)")
                w()
                w("| ID | 字段 | DWD 值 | ODS 值 |")
                w("|----|------|--------|--------|")
                for d in real:
                    w(f"| {d['id']} | `{d['field']}` | {d['dwd_val']} | {d['ods_val']} |")
                w()

            if wl:
                w(f"<details><summary>🔕 白名单差异({len(wl)} 条)— 空字符串 vs None,视为等价</summary>")
                w()
                w("| ID | 字段 | DWD 值 | ODS 值 |")
                w("|----|------|--------|--------|")
                for d in wl:
                    w(f"| {d['id']} | `{d['field']}` | {d['dwd_val']} | {d['ods_val']} |")
                w()
                w("</details>")
                w()

        # DWD field-level statistics (collapsed table).
        if has_stats:
            _write_field_stats_table(w, c["dwd_field_stats"], c["dwd_table"])

    # ── 4. DWD↔DWS consistency ──
    w("## 4. DWD↔DWS 数据一致性")
    w()
    w("### 4.1 DWS 表概览")
    w()
    non_empty = sum(1 for c in dws_checks if c["rows"] > 0)
    w(f"DWS 层共 {len(dws_checks)} 张表,{non_empty} 张有数据,{len(dws_checks) - non_empty} 张为空。")
    w()
    w("| DWS 表 | 行数 | 列数 | 状态 |")
    w("|--------|------|------|------|")
    for c in dws_checks:
        w(f"| `{c['table']}` | {c['rows']} | {c['col_count']} | {c['status']} |")
    w()

    # DWS numeric-column sanity checks — only tables with anomalies are shown.
    has_checks = [c for c in dws_checks if c["sample_checks"]]
    if has_checks:
        w("### 4.2 DWS 数值列健全性检查")
        w()
        for c in has_checks:
            anomalies = [sc for sc in c["sample_checks"] if sc["status"] != "✅"]
            if not anomalies:
                continue
            w(f"#### {c['table']}")
            w()
            w("| 列 | NULL率 | 负值数 | 最小值 | 最大值 | 状态 |")
            w("|----|--------|--------|--------|--------|------|")
            for sc in anomalies:
                w(f"| `{sc['col']}` | {sc['null_pct']} | {sc['neg_count']} | {sc['min']} | {sc['max']} | {sc['status']} |")
            w()

    # ── 5. Issue summary ──
    w("## 5. 异常汇总与建议")
    w()
    # Collect every non-✅ status across all three comparison layers.
    issues = []
    for c in api_ods_checks:
        if c["status"] != "✅":
            issues.append(f"API↔ODS `{c['ods_table']}`: {c['status']}")
    for c in ods_dwd_checks:
        if c["status"] != "✅":
            issues.append(f"ODS↔DWD `{c['dwd_table']}`: {c['status']}")
    for c in dws_checks:
        for sc in c.get("sample_checks", []):
            if sc["status"] != "✅":
                issues.append(f"DWS `{c['table']}.{sc['col']}`: {sc['status']}")

    if issues:
        w(f"共发现 {len(issues)} 项异常:")
        w()
        for i, issue in enumerate(issues, 1):
            w(f"{i}. {issue}")
    else:
        w("✅ 未发现数据一致性异常。")
    w()

    return "\n".join(lines)


# ── 主入口 ──
def main():
    """Run the full API→ODS→DWD→DWS consistency check and write the report.

    Steps: locate the newest ETL log, parse it, open a read-only DB
    connection, run the three layer comparisons, then render the Markdown
    report into REPORT_ROOT. Exits with status 1 when no log is found.
    """
    print(f"[{NOW.strftime('%H:%M:%S')}] ETL 全链路数据一致性检查开始...")

    # Step 1: newest ETL log is the anchor for everything else.
    latest_log = find_latest_log()
    if not latest_log:
        print("❌ 未找到 ETL 日志文件")
        sys.exit(1)
    print(f" 日志: {latest_log.name}")

    # Step 2: parse the log and keep only ODS-layer tasks that succeeded.
    parsed = parse_etl_log(latest_log)
    ok_ods = {
        name: info
        for name, info in parsed.items()
        if info.get("status") == "SUCC" and info.get("layer") == "ODS"
    }
    print(f" 成功 ODS 任务: {len(ok_ods)}")

    # Step 3: database connection (read-only).
    db = get_conn()
    print(f" 数据库连接成功(只读模式)")

    try:
        # Step 4: API vs ODS, one check per successfully-loaded ODS table.
        print(f"\n[API↔ODS] 开始逐表检查...")
        api_ods_results = []
        for task, table in sorted(ODS_TASK_TO_TABLE.items()):
            if task not in ok_ods:
                continue
            print(f" 检查 {task} → ods.{table}...", end=" ")
            outcome = check_api_vs_ods(db, task, table)
            api_ods_results.append(outcome)
            print(outcome["status"])

        # Step 5: ODS vs DWD, driven by the static DWD→ODS mapping.
        print(f"\n[ODS↔DWD] 开始逐表检查...")
        ods_dwd_results = []
        for dwd_name, ods_name in sorted(DWD_TO_ODS.items()):
            print(f" 检查 {dwd_name} ← {ods_name}...", end=" ")
            outcome = check_ods_vs_dwd(db, dwd_name, ods_name)
            ods_dwd_results.append(outcome)
            print(outcome["status"])

        # Step 6: DWD vs DWS layer inspection.
        print(f"\n[DWD↔DWS] 开始检查...")
        dws_results = check_dwd_vs_dws(db)
        populated = sum(1 for entry in dws_results if entry["rows"] > 0)
        print(f" DWS 表: {len(dws_results)} 张,{populated} 张有数据")

        # Step 7: render and persist the Markdown report.
        report_md = generate_report(latest_log, parsed, api_ods_results, ods_dwd_results, dws_results)
        target = REPORT_ROOT / f"consistency_check_{TS}.md"
        target.write_text(report_md, encoding="utf-8")
        print(f"\n✅ 报告已生成: {target}")

    finally:
        db.close()
# Script entry point: run the consistency check only when executed directly.
if __name__ == "__main__":
    main()