init: 项目初始提交 - NeoZQYY Monorepo 完整代码
This commit is contained in:
382
apps/etl/pipelines/feiqiu/tasks/verification/base_verifier.py
Normal file
382
apps/etl/pipelines/feiqiu/tasks/verification/base_verifier.py
Normal file
@@ -0,0 +1,382 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""批量校验基类"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
from .models import (
|
||||
VerificationResult,
|
||||
VerificationSummary,
|
||||
VerificationStatus,
|
||||
WindowSegment,
|
||||
build_window_segments,
|
||||
)
|
||||
|
||||
|
||||
class VerificationFetchError(RuntimeError):
    """Raised when verification data cannot be fetched; used to explicitly mark ERROR."""
|
||||
|
||||
|
||||
class BaseVerifier(ABC):
    """Base class for batch verification.

    Provides a unified verification workflow:
    1. Split the time range into windows
    2. Bulk-read source data
    3. Bulk-read target data
    4. Compare in memory
    5. Bulk backfill
    """
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
db_connection: Any,
|
||||
logger: Optional[logging.Logger] = None,
|
||||
):
|
||||
"""
|
||||
初始化校验器
|
||||
|
||||
Args:
|
||||
db_connection: 数据库连接
|
||||
logger: 日志器
|
||||
"""
|
||||
self.db = db_connection
|
||||
self.logger = logger or logging.getLogger(self.__class__.__name__)
|
||||
|
||||
    @property
    @abstractmethod
    def layer_name(self) -> str:
        """Name of the data layer being verified (used as a prefix in log messages)."""
        pass
|
||||
|
||||
    @abstractmethod
    def get_tables(self) -> List[str]:
        """Return the list of table names that should be verified."""
        pass
|
||||
|
||||
    @abstractmethod
    def get_primary_keys(self, table: str) -> List[str]:
        """Return the primary-key column names of *table*."""
        pass
|
||||
|
||||
    @abstractmethod
    def get_time_column(self, table: str) -> Optional[str]:
        """Return the time column of *table* used for window filtering, or None if it has none."""
        pass
|
||||
|
||||
    @abstractmethod
    def fetch_source_keys(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Set[Tuple]:
        """Bulk-fetch the set of primary-key tuples present in the source within the window."""
        pass
|
||||
|
||||
    @abstractmethod
    def fetch_target_keys(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Set[Tuple]:
        """Bulk-fetch the set of primary-key tuples present in the target within the window."""
        pass
|
||||
|
||||
    @abstractmethod
    def fetch_source_hashes(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Dict[Tuple, str]:
        """Bulk-fetch a primary-key tuple -> content-hash mapping from the source within the window."""
        pass
|
||||
|
||||
    @abstractmethod
    def fetch_target_hashes(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
    ) -> Dict[Tuple, str]:
        """Bulk-fetch a primary-key tuple -> content-hash mapping from the target within the window."""
        pass
|
||||
|
||||
    @abstractmethod
    def backfill_missing(
        self,
        table: str,
        missing_keys: Set[Tuple],
        window_start: datetime,
        window_end: datetime,
    ) -> int:
        """Backfill rows that are missing from the target; return the number of records written."""
        pass
|
||||
|
||||
    @abstractmethod
    def backfill_mismatch(
        self,
        table: str,
        mismatch_keys: Set[Tuple],
        window_start: datetime,
        window_end: datetime,
    ) -> int:
        """Re-write rows whose content differs between source and target; return the number updated."""
        pass
|
||||
|
||||
    def verify_table(
        self,
        table: str,
        window_start: datetime,
        window_end: datetime,
        auto_backfill: bool = False,
        compare_content: bool = True,
    ) -> VerificationResult:
        """
        Verify a single table over one time window.

        Args:
            table: table name
            window_start: window start
            window_end: window end
            auto_backfill: whether to automatically backfill differences
            compare_content: True compares content hashes; False compares primary keys only

        Returns:
            Verification result for this table/window (never raises; failures are
            reported via status ERROR and ``error_message``)
        """
        start_time = time.time()
        result = VerificationResult(
            layer=self.layer_name,
            table=table,
            window_start=window_start,
            window_end=window_end,
        )

        try:
            # Make sure the connection is usable, so a "connection already closed"
            # failure cannot be mistaken for an OK result.
            self._ensure_connection()
            self.logger.info(
                "%s 校验开始: %s [%s ~ %s]",
                self.layer_name, table,
                window_start.strftime("%Y-%m-%d %H:%M"),
                window_end.strftime("%Y-%m-%d %H:%M")
            )

            if compare_content:
                # Compare per-row content hashes on both sides.
                source_hashes = self.fetch_source_hashes(table, window_start, window_end)
                target_hashes = self.fetch_target_hashes(table, window_start, window_end)

                result.source_count = len(source_hashes)
                result.target_count = len(target_hashes)

                source_keys = set(source_hashes.keys())
                target_keys = set(target_hashes.keys())

                # Keys present in the source but absent from the target.
                missing_keys = source_keys - target_keys
                result.missing_count = len(missing_keys)

                # Keys present on both sides but whose hashes differ.
                common_keys = source_keys & target_keys
                mismatch_keys = {
                    k for k in common_keys
                    if source_hashes[k] != target_hashes[k]
                }
                result.mismatch_count = len(mismatch_keys)
            else:
                # Primary-key comparison only (content mismatches cannot be detected).
                source_keys = self.fetch_source_keys(table, window_start, window_end)
                target_keys = self.fetch_target_keys(table, window_start, window_end)

                result.source_count = len(source_keys)
                result.target_count = len(target_keys)

                missing_keys = source_keys - target_keys
                result.missing_count = len(missing_keys)
                mismatch_keys = set()

            # Derive the verification status (missing takes precedence over mismatch).
            if result.missing_count > 0:
                result.status = VerificationStatus.MISSING
            elif result.mismatch_count > 0:
                result.status = VerificationStatus.MISMATCH
            else:
                result.status = VerificationStatus.OK

            # Optional automatic backfill of the detected differences.
            if auto_backfill and (missing_keys or mismatch_keys):
                backfill_missing_count = 0
                backfill_mismatch_count = 0

                if missing_keys:
                    self.logger.info(
                        "%s 补齐缺失: %s, 数量=%d",
                        self.layer_name, table, len(missing_keys)
                    )
                    backfill_missing_count += self.backfill_missing(
                        table, missing_keys, window_start, window_end
                    )

                if mismatch_keys:
                    self.logger.info(
                        "%s 更新不一致: %s, 数量=%d",
                        self.layer_name, table, len(mismatch_keys)
                    )
                    backfill_mismatch_count += self.backfill_mismatch(
                        table, mismatch_keys, window_start, window_end
                    )

                result.backfilled_missing_count = backfill_missing_count
                result.backfilled_mismatch_count = backfill_mismatch_count
                result.backfilled_count = backfill_missing_count + backfill_mismatch_count
                if result.backfilled_count > 0:
                    result.status = VerificationStatus.BACKFILLED

            self.logger.info(
                "%s 校验完成: %s, 源=%d, 目标=%d, 缺失=%d, 不一致=%d, 补齐=%d(缺失=%d, 不一致=%d)",
                self.layer_name, table,
                result.source_count, result.target_count,
                result.missing_count, result.mismatch_count, result.backfilled_count,
                result.backfilled_missing_count, result.backfilled_mismatch_count
            )

        except Exception as e:
            result.status = VerificationStatus.ERROR
            result.error_message = str(e)
            if isinstance(e, VerificationFetchError):
                # Fatal condition (e.g. connection unusable): flag it so the
                # caller can abort the remaining windows/tables.
                result.details["fatal"] = True
            self.logger.exception("%s 校验失败: %s, error=%s", self.layer_name, table, e)
            # Roll back so a PostgreSQL "current transaction is aborted" state
            # does not poison subsequent queries on the same connection.
            try:
                self.db.conn.rollback()
            except Exception:
                pass  # rollback errors are deliberately ignored (best effort)

        result.elapsed_seconds = time.time() - start_time
        return result
|
||||
|
||||
    def verify_and_backfill(
        self,
        window_start: datetime,
        window_end: datetime,
        split_unit: str = "month",
        tables: Optional[List[str]] = None,
        auto_backfill: bool = True,
        compare_content: bool = True,
    ) -> VerificationSummary:
        """
        Run batch verification, splitting the range into time windows.

        Args:
            window_start: start time
            window_end: end time
            split_unit: window split unit ("none", "day", "week", "month")
            tables: tables to verify; None (or empty) means all tables
            auto_backfill: whether to automatically backfill differences
            compare_content: whether to compare row content (hashes)

        Returns:
            Aggregated verification summary across all windows and tables
        """
        summary = VerificationSummary(
            layer=self.layer_name,
            window_start=window_start,
            window_end=window_end,
        )

        # Resolve the set of tables to verify.
        all_tables = tables or self.get_tables()

        # Split the overall range into window segments.
        segments = build_window_segments(window_start, window_end, split_unit)

        self.logger.info(
            "%s 批量校验开始: 表数=%d, 窗口切分=%d段",
            self.layer_name, len(all_tables), len(segments)
        )

        fatal_error = False
        for segment in segments:
            # Check the connection before each segment and abort immediately on
            # failure, avoiding a long run of useless empty iterations.
            # NOTE(review): _ensure_connection raises VerificationFetchError here
            # and it propagates out of this method uncaught — confirm intended.
            self._ensure_connection()
            self.logger.info(
                "%s 处理窗口 [%d/%d]: %s",
                self.layer_name, segment.index + 1, segment.total, segment.label
            )

            for table in all_tables:
                result = self.verify_table(
                    table=table,
                    window_start=segment.start,
                    window_end=segment.end,
                    auto_backfill=auto_backfill,
                    compare_content=compare_content,
                )
                summary.add_result(result)
                # verify_table marks details["fatal"] for connection-level errors.
                if result.details.get("fatal"):
                    fatal_error = True
                    break

            # Commit once per completed segment.
            try:
                self.db.commit()
            except Exception as e:
                self.logger.warning("提交失败: %s", e)
            if fatal_error:
                self.logger.warning("%s 校验中止:连接不可用或发生致命错误", self.layer_name)
                break

        self.logger.info(summary.format_summary())
        return summary
|
||||
|
||||
def _ensure_connection(self):
|
||||
"""确保数据库连接可用,必要时尝试重连。"""
|
||||
if not hasattr(self.db, "conn"):
|
||||
raise VerificationFetchError("校验器未绑定有效数据库连接")
|
||||
if getattr(self.db.conn, "closed", 0):
|
||||
# 优先使用连接对象的重连能力
|
||||
if hasattr(self.db, "ensure_open"):
|
||||
if not self.db.ensure_open():
|
||||
raise VerificationFetchError("数据库连接已关闭,无法继续校验")
|
||||
else:
|
||||
raise VerificationFetchError("数据库连接已关闭,无法继续校验")
|
||||
|
||||
def quick_check(
|
||||
self,
|
||||
window_start: datetime,
|
||||
window_end: datetime,
|
||||
tables: Optional[List[str]] = None,
|
||||
) -> Dict[str, dict]:
|
||||
"""
|
||||
快速检查(仅对比数量,不对比内容)
|
||||
|
||||
Args:
|
||||
window_start: 开始时间
|
||||
window_end: 结束时间
|
||||
tables: 指定表,None 表示全部
|
||||
|
||||
Returns:
|
||||
{表名: {source_count, target_count, diff}}
|
||||
"""
|
||||
all_tables = tables or self.get_tables()
|
||||
results = {}
|
||||
|
||||
for table in all_tables:
|
||||
source_keys = self.fetch_source_keys(table, window_start, window_end)
|
||||
target_keys = self.fetch_target_keys(table, window_start, window_end)
|
||||
|
||||
results[table] = {
|
||||
"source_count": len(source_keys),
|
||||
"target_count": len(target_keys),
|
||||
"diff": len(source_keys) - len(target_keys),
|
||||
}
|
||||
|
||||
return results
|
||||
Reference in New Issue
Block a user