在准备环境前提交次全部更改。

2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions
--- a/apps/etl/connectors/feiqiu/tasks/verification/models.py
+++ b/apps/etl/connectors/feiqiu/tasks/verification/models.py
@@ -0,0 +1,283 @@
+# -*- coding: utf-8 -*-
+"""校验结果数据模型"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from typing import List, Optional, Dict, Any
+
+
+class VerificationStatus(Enum):
+    """校验状态"""
+    OK = "OK"                     # 数据一致
+    MISSING = "MISSING"           # 有缺失数据
+    MISMATCH = "MISMATCH"         # 有不一致数据
+    BACKFILLED = "BACKFILLED"     # 已补齐
+    ERROR = "ERROR"               # 校验出错
+
+
+@dataclass
+class VerificationResult:
+    """单表校验结果"""
+    layer: str                    # 数据层: "ODS" / "DWD" / "DWS" / "INDEX"
+    table: str                    # 表名
+    window_start: datetime        # 校验窗口开始
+    window_end: datetime          # 校验窗口结束
+    source_count: int = 0         # 源数据量
+    target_count: int = 0         # 目标数据量
+    missing_count: int = 0        # 缺失记录数
+    mismatch_count: int = 0       # 不一致记录数
+    backfilled_count: int = 0     # 已补齐记录数（缺失 + 不一致）
+    backfilled_missing_count: int = 0   # 缺失补齐数
+    backfilled_mismatch_count: int = 0  # 不一致补齐数
+    status: VerificationStatus = VerificationStatus.OK
+    elapsed_seconds: float = 0.0  # 耗时（秒）
+    error_message: Optional[str] = None  # 错误信息
+    details: Dict[str, Any] = field(default_factory=dict)  # 额外详情
+    
+    @property
+    def is_consistent(self) -> bool:
+        """数据是否一致"""
+        return self.status == VerificationStatus.OK
+    
+    @property
+    def needs_backfill(self) -> bool:
+        """是否需要补齐"""
+        return self.missing_count > 0 or self.mismatch_count > 0
+    
+    def to_dict(self) -> dict:
+        """转换为字典"""
+        return {
+            "layer": self.layer,
+            "table": self.table,
+            "window_start": self.window_start.isoformat() if self.window_start else None,
+            "window_end": self.window_end.isoformat() if self.window_end else None,
+            "source_count": self.source_count,
+            "target_count": self.target_count,
+            "missing_count": self.missing_count,
+            "mismatch_count": self.mismatch_count,
+            "backfilled_count": self.backfilled_count,
+            "backfilled_missing_count": self.backfilled_missing_count,
+            "backfilled_mismatch_count": self.backfilled_mismatch_count,
+            "status": self.status.value,
+            "elapsed_seconds": self.elapsed_seconds,
+            "error_message": self.error_message,
+            "details": self.details,
+        }
+    
+    def format_summary(self) -> str:
+        """格式化摘要"""
+        lines = [
+            f"表: {self.table}",
+            f"层: {self.layer}",
+            f"窗口: {self.window_start.strftime('%Y-%m-%d %H:%M')} ~ {self.window_end.strftime('%Y-%m-%d %H:%M')}",
+            f"源数据量: {self.source_count:,}",
+            f"目标数据量: {self.target_count:,}",
+            f"缺失: {self.missing_count:,}",
+            f"不一致: {self.mismatch_count:,}",
+            f"缺失补齐: {self.backfilled_missing_count:,}",
+            f"不一致补齐: {self.backfilled_mismatch_count:,}",
+            f"已补齐: {self.backfilled_count:,}",
+            f"状态: {self.status.value}",
+            f"耗时: {self.elapsed_seconds:.2f}s",
+        ]
+        if self.error_message:
+            lines.append(f"错误: {self.error_message}")
+        return "\n".join(lines)
+
+
+@dataclass
+class VerificationSummary:
+    """校验汇总结果"""
+    layer: str                    # 数据层
+    window_start: datetime        # 校验窗口开始
+    window_end: datetime          # 校验窗口结束
+    total_tables: int = 0         # 总表数
+    consistent_tables: int = 0    # 一致的表数
+    inconsistent_tables: int = 0  # 不一致的表数
+    total_source_count: int = 0   # 总源数据量
+    total_target_count: int = 0   # 总目标数据量
+    total_missing: int = 0        # 总缺失数
+    total_mismatch: int = 0       # 总不一致数
+    total_backfilled: int = 0     # 总补齐数
+    total_backfilled_missing: int = 0  # 总缺失补齐数
+    total_backfilled_mismatch: int = 0  # 总不一致补齐数
+    error_tables: int = 0         # 发生错误的表数
+    elapsed_seconds: float = 0.0  # 总耗时
+    results: List[VerificationResult] = field(default_factory=list)  # 各表结果
+    status: VerificationStatus = VerificationStatus.OK
+    
+    def add_result(self, result: VerificationResult):
+        """添加单表结果"""
+        self.results.append(result)
+        self.total_tables += 1
+        self.total_source_count += result.source_count
+        self.total_target_count += result.target_count
+        self.total_missing += result.missing_count
+        self.total_mismatch += result.mismatch_count
+        self.total_backfilled += result.backfilled_count
+        self.total_backfilled_missing += result.backfilled_missing_count
+        self.total_backfilled_mismatch += result.backfilled_mismatch_count
+        self.elapsed_seconds += result.elapsed_seconds
+        
+        if result.status == VerificationStatus.ERROR:
+            self.error_tables += 1
+            self.inconsistent_tables += 1
+            # 错误优先级最高，直接覆盖汇总状态
+            self.status = VerificationStatus.ERROR
+        elif result.is_consistent:
+            self.consistent_tables += 1
+        else:
+            self.inconsistent_tables += 1
+            if self.status == VerificationStatus.OK:
+                self.status = result.status
+    
+    @property
+    def is_all_consistent(self) -> bool:
+        """是否全部一致"""
+        return self.inconsistent_tables == 0
+    
+    def to_dict(self) -> dict:
+        """转换为字典"""
+        return {
+            "layer": self.layer,
+            "window_start": self.window_start.isoformat() if self.window_start else None,
+            "window_end": self.window_end.isoformat() if self.window_end else None,
+            "total_tables": self.total_tables,
+            "consistent_tables": self.consistent_tables,
+            "inconsistent_tables": self.inconsistent_tables,
+            "total_source_count": self.total_source_count,
+            "total_target_count": self.total_target_count,
+            "total_missing": self.total_missing,
+            "total_mismatch": self.total_mismatch,
+            "total_backfilled": self.total_backfilled,
+            "total_backfilled_missing": self.total_backfilled_missing,
+            "total_backfilled_mismatch": self.total_backfilled_mismatch,
+            "error_tables": self.error_tables,
+            "elapsed_seconds": self.elapsed_seconds,
+            "status": self.status.value,
+            "results": [r.to_dict() for r in self.results],
+        }
+    
+    def format_summary(self) -> str:
+        """格式化汇总摘要"""
+        lines = [
+            f"{'=' * 60}",
+            f"校验汇总 - {self.layer}",
+            f"{'=' * 60}",
+            f"窗口: {self.window_start.strftime('%Y-%m-%d %H:%M')} ~ {self.window_end.strftime('%Y-%m-%d %H:%M')}",
+            f"表数: {self.total_tables} (一致: {self.consistent_tables}, 不一致: {self.inconsistent_tables})",
+            f"源数据量: {self.total_source_count:,}",
+            f"目标数据量: {self.total_target_count:,}",
+            f"总缺失: {self.total_missing:,}",
+            f"总不一致: {self.total_mismatch:,}",
+            f"总补齐: {self.total_backfilled:,} (缺失: {self.total_backfilled_missing:,}, 不一致: {self.total_backfilled_mismatch:,})",
+            f"错误表数: {self.error_tables}",
+            f"总耗时: {self.elapsed_seconds:.2f}s",
+            f"状态: {self.status.value}",
+            f"{'=' * 60}",
+        ]
+        return "\n".join(lines)
+
+
+@dataclass
+class WindowSegment:
+    """时间窗口片段"""
+    start: datetime
+    end: datetime
+    index: int = 0
+    total: int = 1
+    
+    @property
+    def label(self) -> str:
+        """片段标签"""
+        return f"{self.start.strftime('%Y-%m-%d')} ~ {self.end.strftime('%Y-%m-%d')}"
+
+
+def build_window_segments(
+    window_start: datetime,
+    window_end: datetime,
+    split_unit: str = "month",
+) -> List[WindowSegment]:
+    """
+    按指定单位切分时间窗口
+    
+    Args:
+        window_start: 开始时间
+        window_end: 结束时间
+        split_unit: 切分单位 ("none", "day", "week", "month")
+    
+    Returns:
+        时间窗口片段列表
+    """
+    if split_unit == "none" or not split_unit:
+        return [WindowSegment(start=window_start, end=window_end, index=0, total=1)]
+    
+    segments = []
+    current = window_start
+    
+    while current < window_end:
+        if split_unit == "day":
+            # 按天切分
+            next_boundary = current.replace(hour=0, minute=0, second=0, microsecond=0)
+            next_boundary = next_boundary + timedelta(days=1)
+        elif split_unit == "week":
+            # 按周切分（周一为起点）
+            days_until_monday = (7 - current.weekday()) % 7
+            if days_until_monday == 0:
+                days_until_monday = 7
+            next_boundary = current.replace(hour=0, minute=0, second=0, microsecond=0)
+            next_boundary = next_boundary + timedelta(days=days_until_monday)
+        elif split_unit == "month":
+            # 按月切分
+            if current.month == 12:
+                next_boundary = current.replace(year=current.year + 1, month=1, day=1,
+                                                 hour=0, minute=0, second=0, microsecond=0)
+            else:
+                next_boundary = current.replace(month=current.month + 1, day=1,
+                                                 hour=0, minute=0, second=0, microsecond=0)
+        else:
+            # 默认不切分
+            next_boundary = window_end
+        
+        segment_end = min(next_boundary, window_end)
+        segments.append(WindowSegment(start=current, end=segment_end))
+        current = segment_end
+    
+    # 更新索引
+    total = len(segments)
+    for i, seg in enumerate(segments):
+        seg.index = i
+        seg.total = total
+    
+    return segments
+
+def filter_verify_tables(layer: str, tables: list[str] | None) -> list[str] | None:
+    """按层过滤校验表名，避免非目标层全量校验。
+
+    Args:
+        layer: 数据层名称（"ODS" / "DWD" / "DWS" / "INDEX"）
+        tables: 待过滤的表名列表，为 None 或空时直接返回 None
+
+    Returns:
+        过滤后的表名列表，或 None
+    """
+    if not tables:
+        return None
+    layer_upper = layer.upper()
+    normalized = [t.strip().lower() for t in tables if t and t.strip()]
+    if layer_upper == "DWD":
+        return [t for t in normalized if t.startswith(("dwd_", "dim_", "fact_"))]
+    if layer_upper == "DWS":
+        return [t for t in normalized if t.startswith("dws_")]
+    if layer_upper == "INDEX":
+        return [t for t in normalized if t.startswith("v_") or t.endswith("_index")]
+    if layer_upper == "ODS":
+        return [t for t in normalized if t.startswith("ods_")]
+    return normalized
+
+
+
+
+# 需要导入 timedelta
+from datetime import timedelta