在准备环境前提交此次全部更改。

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

View File

@@ -0,0 +1,455 @@
# -*- coding: utf-8 -*-
"""DWS 汇总层批量校验器
校验逻辑:对比 DWD 聚合数据与 DWS 表数据
- 按日期/门店聚合对比
- 对比数值一致性
- 批量重算 UPSERT 补齐
"""
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_verifier import BaseVerifier, VerificationFetchError
class DwsVerifier(BaseVerifier):
    """DWS summary-layer batch verifier.

    Verification strategy: re-aggregate the DWD fact tables on the fly and
    compare the result against the persisted DWS summary rows.

    - keys are compared per date / site (plus finer grains where configured)
    - numeric metric columns are compared for value consistency
    - missing or mismatching rows are repaired by recalculating the
      aggregates and UPSERT-ing them in bulk
    """

    def __init__(
        self,
        db_connection: Any,
        logger: Optional[logging.Logger] = None,
    ):
        """Initialize the DWS verifier.

        Args:
            db_connection: database connection wrapper; must expose ``conn``
                (a DB-API connection with ``cursor()``/``rollback()``) and a
                ``commit()`` method.
            logger: optional logger; the base class supplies a default.
        """
        super().__init__(db_connection, logger)
        # Static per-table verification config (PKs, time column, agg SQL).
        self._table_config = self._load_table_config()

    @property
    def layer_name(self) -> str:
        """Layer identifier used in logs and reports."""
        return "DWS"

    def _load_table_config(self) -> Dict[str, dict]:
        """Load the DWS summary-table verification configuration.

        Each entry maps a DWS table name to:

        - ``pk_columns``: primary-key columns of the DWS table
        - ``time_column``: date column used to window the DWS side
        - ``source_table`` / ``source_time_column``: origin DWD fact table
        - ``agg_sql``: aggregation over DWD that reproduces the DWS rows,
          parameterized with the half-open window ``[start, end)``
        - ``compare_columns``: numeric columns compared between both sides
        """
        # DWS summary tables typically share this structure:
        # - PK: site_id, stat_date (or a similar combination)
        # - value columns: assorted aggregated metrics
        # - source: the corresponding DWD fact table
        return {
            # Daily finance summary - settlements, table fees, goods,
            # assistant revenue.
            # NOTE: the actual DWS table uses gross_amount, table_fee_amount,
            # goods_amount, ... as its column names.
            "dws_finance_daily_summary": {
                "pk_columns": ["site_id", "stat_date"],
                "time_column": "stat_date",
                "source_table": "dwd.dwd_settlement_head",
                "source_time_column": "pay_time",
                "agg_sql": """
                    SELECT
                        site_id,
                        tenant_id,
                        DATE(pay_time) as stat_date,
                        COALESCE(SUM(pay_amount), 0) as cash_pay_amount,
                        COALESCE(SUM(table_charge_money), 0) as table_fee_amount,
                        COALESCE(SUM(goods_money), 0) as goods_amount,
                        COALESCE(SUM(table_charge_money) + SUM(goods_money) + COALESCE(SUM(assistant_pd_money), 0) + COALESCE(SUM(assistant_cx_money), 0), 0) as gross_amount
                    FROM dwd.dwd_settlement_head
                    WHERE pay_time >= %s AND pay_time < %s
                    GROUP BY site_id, tenant_id, DATE(pay_time)
                """,
                "compare_columns": ["cash_pay_amount", "table_fee_amount", "goods_amount", "gross_amount"],
            },
            # Daily assistant detail - service count / duration / amount per
            # assistant and day.
            # NOTE: DWD uses site_assistant_id while DWS uses assistant_id.
            "dws_assistant_daily_detail": {
                "pk_columns": ["site_id", "assistant_id", "stat_date"],
                "time_column": "stat_date",
                "source_table": "dwd.dwd_assistant_service_log",
                "source_time_column": "start_use_time",
                "agg_sql": """
                    SELECT
                        site_id,
                        tenant_id,
                        site_assistant_id as assistant_id,
                        DATE(start_use_time) as stat_date,
                        COUNT(*) as total_service_count,
                        COALESCE(SUM(income_seconds), 0) as total_seconds,
                        COALESCE(SUM(ledger_amount), 0) as total_ledger_amount
                    FROM dwd.dwd_assistant_service_log
                    WHERE start_use_time >= %s AND start_use_time < %s
                        AND is_delete = 0
                    GROUP BY site_id, tenant_id, site_assistant_id, DATE(start_use_time)
                """,
                "compare_columns": ["total_service_count", "total_seconds", "total_ledger_amount"],
            },
            # Member visit detail - one row per member + settlement order.
            # NOTE: the DWD table's PK is order_settle_id, not id.
            "dws_member_visit_detail": {
                "pk_columns": ["site_id", "member_id", "order_settle_id"],
                "time_column": "visit_date",
                "source_table": "dwd.dwd_settlement_head",
                "source_time_column": "pay_time",
                "agg_sql": """
                    SELECT
                        site_id,
                        tenant_id,
                        member_id,
                        order_settle_id,
                        DATE(pay_time) as visit_date,
                        COALESCE(table_charge_money, 0) as table_fee,
                        COALESCE(goods_money, 0) as goods_amount,
                        COALESCE(pay_amount, 0) as actual_pay
                    FROM dwd.dwd_settlement_head
                    WHERE pay_time >= %s AND pay_time < %s
                        AND member_id > 0
                """,
                "compare_columns": ["table_fee", "goods_amount", "actual_pay"],
            },
        }

    def get_tables(self) -> List[str]:
        """Return the list of DWS summary tables to verify.

        Prefers the static config; falls back to discovering ``dws_*`` base
        tables from ``information_schema`` when the config is empty.
        Returns an empty list (after rolling back) if discovery fails.
        """
        if self._table_config:
            return list(self._table_config.keys())
        sql = """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = 'dws'
                AND table_type = 'BASE TABLE'
                AND table_name LIKE 'dws_%'
                AND table_name NOT LIKE 'cfg_%'
            ORDER BY table_name
        """
        try:
            with self.db.conn.cursor() as cur:
                cur.execute(sql)
                return [row[0] for row in cur.fetchall()]
        except Exception as e:
            self.logger.warning("获取 DWS 表列表失败: %s", e)
            # Clear the aborted transaction so later statements can run.
            try:
                self.db.conn.rollback()
            except Exception:
                pass
            return []

    def get_primary_keys(self, table: str) -> List[str]:
        """Return the primary-key columns for *table*.

        Falls back to ``["site_id", "stat_date"]`` for unconfigured tables.
        """
        if table in self._table_config:
            return self._table_config[table].get("pk_columns", ["site_id", "stat_date"])
        return ["site_id", "stat_date"]

    def get_time_column(self, table: str) -> Optional[str]:
        """Return the date column used to window *table* (default: stat_date)."""
        if table in self._table_config:
            return self._table_config[table].get("time_column", "stat_date")
        return "stat_date"

    def fetch_source_keys(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Set[Tuple]:
        """Return the set of PK tuples produced by the DWD aggregation.

        Args:
            table: DWS table name (must be configured; otherwise empty set).
            window_start / window_end: half-open time window ``[start, end)``
                applied to the DWD time column.

        Raises:
            VerificationFetchError: if the aggregation query fails.
        """
        config = self._table_config.get(table, {})
        agg_sql = config.get("agg_sql")
        if not agg_sql:
            return set()
        pk_cols = self.get_primary_keys(table)
        try:
            with self.db.conn.cursor() as cur:
                cur.execute(agg_sql, (window_start, window_end))
                columns = [desc[0] for desc in cur.description]
                # Map configured PK names onto the aggregation's column order.
                pk_indices = [columns.index(c) for c in pk_cols if c in columns]
                return {tuple(row[i] for i in pk_indices) for row in cur.fetchall()}
        except Exception as e:
            self.logger.warning("获取 DWD 聚合主键失败: %s, error=%s", table, e)
            try:
                self.db.conn.rollback()
            except Exception:
                pass
            raise VerificationFetchError(f"获取 DWD 聚合主键失败: {table}") from e

    def fetch_target_keys(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Set[Tuple]:
        """Return the set of PK tuples stored in the DWS table for the window.

        The window is applied on the DWS date column, so the datetimes are
        truncated to dates before binding.

        Raises:
            VerificationFetchError: if the query fails.
        """
        pk_cols = self.get_primary_keys(table)
        time_col = self.get_time_column(table)
        pk_select = ", ".join(pk_cols)
        sql = f"""
            SELECT {pk_select}
            FROM dws.{table}
            WHERE {time_col} >= %s AND {time_col} < %s
        """
        try:
            with self.db.conn.cursor() as cur:
                cur.execute(sql, (window_start.date(), window_end.date()))
                return {tuple(row) for row in cur.fetchall()}
        except Exception as e:
            self.logger.warning("获取 DWS 主键失败: %s, error=%s", table, e)
            try:
                self.db.conn.rollback()
            except Exception:
                pass
            raise VerificationFetchError(f"获取 DWS 主键失败: {table}") from e

    def fetch_source_hashes(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Dict[Tuple, str]:
        """Aggregate DWD and return a ``{pk_tuple: str(value_tuple)}`` map.

        Only the configured ``compare_columns`` contribute to the value
        string.

        NOTE(review): values are compared via ``str()`` of the raw tuples;
        if the two sides yield Decimals of different scale (e.g. 5.0 vs
        5.00) this would read as a mismatch — presumably both sides share
        the same numeric types; confirm against the schemas.

        Raises:
            VerificationFetchError: if the aggregation query fails.
        """
        config = self._table_config.get(table, {})
        agg_sql = config.get("agg_sql")
        compare_cols = config.get("compare_columns", [])
        if not agg_sql:
            return {}
        pk_cols = self.get_primary_keys(table)
        result = {}
        try:
            with self.db.conn.cursor() as cur:
                cur.execute(agg_sql, (window_start, window_end))
                columns = [desc[0] for desc in cur.description]
                pk_indices = [columns.index(c) for c in pk_cols if c in columns]
                value_indices = [columns.index(c) for c in compare_cols if c in columns]
                for row in cur.fetchall():
                    pk = tuple(row[i] for i in pk_indices)
                    values = tuple(row[i] for i in value_indices)
                    result[pk] = str(values)
        except Exception as e:
            self.logger.warning("获取 DWD 聚合数据失败: %s, error=%s", table, e)
            try:
                self.db.conn.rollback()
            except Exception:
                pass
            raise VerificationFetchError(f"获取 DWD 聚合数据失败: {table}") from e
        return result

    def fetch_target_hashes(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Dict[Tuple, str]:
        """Read the DWS table and return a ``{pk_tuple: str(value_tuple)}`` map.

        Selects PK columns first, then the compare columns, so the row can be
        split positionally. The window is applied on the DWS date column.

        Raises:
            VerificationFetchError: if the query fails.
        """
        config = self._table_config.get(table, {})
        compare_cols = config.get("compare_columns", [])
        pk_cols = self.get_primary_keys(table)
        time_col = self.get_time_column(table)
        all_cols = pk_cols + compare_cols
        col_select = ", ".join(all_cols)
        sql = f"""
            SELECT {col_select}
            FROM dws.{table}
            WHERE {time_col} >= %s AND {time_col} < %s
        """
        result = {}
        try:
            with self.db.conn.cursor() as cur:
                cur.execute(sql, (window_start.date(), window_end.date()))
                for row in cur.fetchall():
                    # PK columns were selected first, values after.
                    pk = tuple(row[:len(pk_cols)])
                    values = tuple(row[len(pk_cols):])
                    result[pk] = str(values)
        except Exception as e:
            self.logger.warning("获取 DWS 数据失败: %s, error=%s", table, e)
            try:
                self.db.conn.rollback()
            except Exception:
                pass
            raise VerificationFetchError(f"获取 DWS 数据失败: {table}") from e
        return result

    def backfill_missing(
        self,
        table: str,
        missing_keys: Set[Tuple],
        window_start: datetime,
        window_end: datetime,
    ) -> int:
        """Recalculate and insert the DWS rows missing for *missing_keys*.

        Returns:
            Number of rows successfully upserted.
        """
        if not missing_keys:
            return 0
        self.logger.info(
            "DWS 补齐缺失: 表=%s, 数量=%d",
            table, len(missing_keys)
        )
        # Ensure a clean transaction state before writing.
        try:
            self.db.conn.rollback()
        except Exception:
            pass
        # Recompute the aggregates and upsert only the missing keys.
        return self._recalculate_and_upsert(table, window_start, window_end, missing_keys)

    def backfill_mismatch(
        self,
        table: str,
        mismatch_keys: Set[Tuple],
        window_start: datetime,
        window_end: datetime,
    ) -> int:
        """Recalculate and overwrite the DWS rows whose values mismatch.

        Returns:
            Number of rows successfully upserted.
        """
        if not mismatch_keys:
            return 0
        self.logger.info(
            "DWS 更新不一致: 表=%s, 数量=%d",
            table, len(mismatch_keys)
        )
        # Ensure a clean transaction state before writing.
        try:
            self.db.conn.rollback()
        except Exception:
            pass
        # Recompute the aggregates and upsert only the mismatched keys.
        return self._recalculate_and_upsert(table, window_start, window_end, mismatch_keys)

    def _recalculate_and_upsert(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
        target_keys: Optional[Set[Tuple]] = None,
    ) -> int:
        """Re-run the DWD aggregation and UPSERT the rows into DWS.

        Args:
            table: configured DWS table name (unconfigured -> 0).
            window_start / window_end: aggregation window ``[start, end)``.
            target_keys: when given, only rows whose PK tuple is in this set
                are written; otherwise every aggregated row is written.

        Returns:
            Number of rows successfully upserted (failed rows are logged
            and skipped).
        """
        config = self._table_config.get(table, {})
        agg_sql = config.get("agg_sql")
        if not agg_sql:
            return 0
        pk_cols = self.get_primary_keys(table)
        # Run the aggregation query over DWD.
        try:
            with self.db.conn.cursor() as cur:
                cur.execute(agg_sql, (window_start, window_end))
                columns = [desc[0] for desc in cur.description]
                records = [dict(zip(columns, row)) for row in cur.fetchall()]
        except Exception as e:
            self.logger.error("聚合查询失败: %s", e)
            try:
                self.db.conn.rollback()
            except Exception:
                pass
            return 0
        if not records:
            return 0
        # If target keys were supplied, restrict to those records only.
        if target_keys:
            records = [
                r for r in records
                if tuple(r.get(c) for c in pk_cols) in target_keys
            ]
            if not records:
                return 0
        # Build the UPSERT statement from the aggregation's column list.
        col_list = ", ".join(columns)
        placeholders = ", ".join(["%s"] * len(columns))
        pk_list = ", ".join(pk_cols)
        update_cols = [c for c in columns if c not in pk_cols]
        update_set = ", ".join(f"{c} = EXCLUDED.{c}" for c in update_cols)
        upsert_sql = f"""
            INSERT INTO dws.{table} ({col_list})
            VALUES ({placeholders})
            ON CONFLICT ({pk_list}) DO UPDATE SET {update_set}
        """
        count = 0
        with self.db.conn.cursor() as cur:
            for record in records:
                values = [record.get(c) for c in columns]
                # Wrap each row in a savepoint: in PostgreSQL a failed
                # statement aborts the whole transaction, which would make
                # every subsequent UPSERT fail and the final commit lose the
                # rows already written. Rolling back to the savepoint keeps
                # prior successful rows and lets the loop continue.
                try:
                    cur.execute("SAVEPOINT dws_upsert_sp")
                    cur.execute(upsert_sql, values)
                    cur.execute("RELEASE SAVEPOINT dws_upsert_sp")
                    count += 1
                except Exception as e:
                    self.logger.warning("UPSERT 失败: %s", e)
                    try:
                        cur.execute("ROLLBACK TO SAVEPOINT dws_upsert_sp")
                    except Exception:
                        pass
        self.db.commit()
        return count

    def verify_aggregation(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Dict[str, Any]:
        """Run a detailed source-vs-target comparison for one table/window.

        Returns:
            A report dict with row counts, missing/extra/mismatch counts,
            an ``is_consistent`` flag, and the first 10 missing keys and
            mismatch details (key plus both value strings).

        Raises:
            VerificationFetchError: propagated from the fetch helpers.
        """
        config = self._table_config.get(table, {})
        compare_cols = config.get("compare_columns", [])
        source_hashes = self.fetch_source_hashes(table, window_start, window_end)
        target_hashes = self.fetch_target_hashes(table, window_start, window_end)
        source_keys = set(source_hashes.keys())
        target_keys = set(target_hashes.keys())
        missing = source_keys - target_keys
        extra = target_keys - source_keys
        # Compare the value strings for keys present on both sides.
        mismatch_details = []
        for key in source_keys & target_keys:
            if source_hashes[key] != target_hashes[key]:
                mismatch_details.append({
                    "key": key,
                    "source": source_hashes[key],
                    "target": target_hashes[key],
                })
        return {
            "table": table,
            "window": f"{window_start.date()} ~ {window_end.date()}",
            "source_count": len(source_hashes),
            "target_count": len(target_hashes),
            "missing_count": len(missing),
            "extra_count": len(extra),
            "mismatch_count": len(mismatch_details),
            "is_consistent": len(missing) == 0 and len(mismatch_details) == 0,
            "missing_keys": list(missing)[:10],  # cap the report at 10 keys
            "mismatch_details": mismatch_details[:10],
        }