Database: update data validation, write, and related logic.

Neo
2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions


@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
"""Data integrity task that checks API -> ODS -> DWD completeness."""
from __future__ import annotations
@@ -7,16 +7,9 @@ from zoneinfo import ZoneInfo
from dateutil import parser as dtparser
import json
from pathlib import Path
from utils.windowing import build_window_segments, calc_window_minutes
from .base_task import BaseTask
from quality.integrity_checker import (
IntegrityWindow,
compute_last_etl_end,
run_integrity_history,
run_integrity_window,
)
from quality.integrity_service import run_history_flow, run_window_flow, write_report
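Note: this refactor collapses the fine-grained quality.integrity_checker imports into a single quality.integrity_service facade. Judging only from the call sites in this diff, the facade's surface looks roughly like the sketch below; the real signatures live in quality/integrity_service.py and may differ.

# Sketch of the integrity_service surface as implied by the call sites below.
# Signatures are inferred from this diff, not copied from the module.
from datetime import datetime
from typing import Any, Optional

def run_window_flow(*, cfg: Any, windows: list, include_dimensions: bool,
                    task_codes: str, logger: Any, compare_content: bool,
                    content_sample_limit: Optional[int], do_backfill: bool,
                    include_mismatch: bool, recheck_after_backfill: bool,
                    page_size: int, chunk_size: int) -> tuple:
    """Check API -> ODS -> DWD completeness per window; return (report, counts)."""
    ...

def write_report(report: dict, prefix: str, tz: Any) -> str:
    """Persist the report as JSON and return the file path."""
    ...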
class DataIntegrityTask(BaseTask):
@@ -31,15 +24,25 @@ class DataIntegrityTask(BaseTask):
include_dimensions = bool(self.config.get("integrity.include_dimensions", False))
task_codes = str(self.config.get("integrity.ods_task_codes", "") or "").strip()
auto_backfill = bool(self.config.get("integrity.auto_backfill", False))
compare_content = self.config.get("integrity.compare_content")
if compare_content is None:
compare_content = True
content_sample_limit = self.config.get("integrity.content_sample_limit")
backfill_mismatch = self.config.get("integrity.backfill_mismatch")
if backfill_mismatch is None:
backfill_mismatch = True
recheck_after_backfill = self.config.get("integrity.recheck_after_backfill")
if recheck_after_backfill is None:
recheck_after_backfill = True
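For reference, these are the integrity.* keys read above together with the defaults the code falls back to when a key is missing or None. Key names come from the config.get() calls; representing them as a flat dict is illustrative only.

# Effective defaults for the integrity options consumed by DataIntegrityTask.
INTEGRITY_DEFAULTS = {
    "integrity.include_dimensions": False,     # also verify dimension tables
    "integrity.ods_task_codes": "",            # comma-separated codes; "" means all
    "integrity.auto_backfill": False,          # backfill missing rows automatically
    "integrity.compare_content": True,         # None is coerced to True above
    "integrity.content_sample_limit": None,    # row cap for content comparison
    "integrity.backfill_mismatch": True,       # None is coerced to True above
    "integrity.recheck_after_backfill": True,  # None is coerced to True above
}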
# Detect whether a CLI time-window override (window_override) was passed;
# if so, switch to window mode automatically.
# Switch to window mode when CLI override is provided.
window_override_start = self.config.get("run.window_override.start")
window_override_end = self.config.get("run.window_override.end")
if window_override_start or window_override_end:
self.logger.info(
"检测到 CLI 时间窗口参数,自动切换到 window 模式: %s ~ %s",
window_override_start, window_override_end
"Detected CLI window override. Switching to window mode: %s ~ %s",
window_override_start,
window_override_end,
)
mode = "window"
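Either override key being present forces window mode, so the switch can be exercised straight from the command line. A minimal sketch, assuming the CLI maps --start/--end onto the run.window_override.* keys (the flag names are hypothetical):

# Hypothetical CLI wiring for the override keys the task inspects above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--start")
parser.add_argument("--end")
args = parser.parse_args(["--start", "2026-01-31T00:00:00",
                          "--end", "2026-01-31T06:00:00"])

overrides = {
    "run.window_override.start": args.start,  # either key alone is enough
    "run.window_override.end": args.end,
}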
@@ -57,65 +60,28 @@ class DataIntegrityTask(BaseTask):
total_segments = len(segments)
if total_segments > 1:
self.logger.info("数据完整性检查: 分段执行 共%s", total_segments)
self.logger.info("Data integrity check split into %s segments.", total_segments)
window_reports = []
total_missing = 0
total_errors = 0
for idx, (seg_start, seg_end) in enumerate(segments, start=1):
window = IntegrityWindow(
start=seg_start,
end=seg_end,
label=f"segment_{idx}",
granularity="window",
)
payload = run_integrity_window(
cfg=self.config,
window=window,
include_dimensions=include_dimensions,
task_codes=task_codes,
logger=self.logger,
write_report=False,
window_split_unit="none",
window_compensation_hours=0,
)
window_reports.append(payload)
total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)
report, counts = run_window_flow(
cfg=self.config,
windows=segments,
include_dimensions=include_dimensions,
task_codes=task_codes,
logger=self.logger,
compare_content=bool(compare_content),
content_sample_limit=content_sample_limit,
do_backfill=bool(auto_backfill),
include_mismatch=bool(backfill_mismatch),
recheck_after_backfill=bool(recheck_after_backfill),
page_size=int(self.config.get("api.page_size") or 200),
chunk_size=500,
)
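run_window_flow now owns the per-segment loop, content comparison, and the optional backfill-and-recheck cycle that the removed code below handled inline. From the way report and counts are consumed in this file, the return shape is approximately as follows; fields beyond the ones referenced here are guesses.

# Approximate (report, counts) shape, inferred from usage in this file.
report = {
    "mode": "window",
    "window": {"start": "<iso>", "end": "<iso>", "segments": 2},
    "total_missing": 0,
    "total_errors": 0,
    "backfill_result": None,  # set when do_backfill=True and rows were missing
}
counts = {"missing": 0, "errors": 0}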
overall_start = segments[0][0]
overall_end = segments[-1][1]
report = {
"mode": "window",
"window": {
"start": overall_start.isoformat(),
"end": overall_end.isoformat(),
"segments": total_segments,
},
"windows": window_reports,
"api_to_ods": {
"total_missing": total_missing,
"total_errors": total_errors,
},
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
report_path = self._write_report(report, "data_integrity_window")
report_path = write_report(report, prefix="data_integrity_window", tz=tz)
report["report_path"] = report_path
missing_count = int(total_missing or 0)
counts = {
"missing": missing_count,
"errors": int(total_errors or 0),
}
# Auto backfill
backfill_result = None
if auto_backfill and missing_count > 0:
backfill_result = self._run_backfill(base_start, base_end, task_codes)
counts["backfilled"] = backfill_result.get("backfilled", 0)
return {
"status": "SUCCESS",
"counts": counts,
@@ -125,7 +91,7 @@ class DataIntegrityTask(BaseTask):
"minutes": calc_window_minutes(overall_start, overall_end),
},
"report_path": report_path,
"backfill_result": backfill_result,
"backfill_result": report.get("backfill_result"),
}
history_start = str(self.config.get("integrity.history_start", "2025-07-01") or "2025-07-01")
@@ -136,77 +102,52 @@ class DataIntegrityTask(BaseTask):
else:
start_dt = start_dt.astimezone(tz)
end_dt = None
if history_end:
end_dt = dtparser.parse(history_end)
if end_dt.tzinfo is None:
end_dt = end_dt.replace(tzinfo=tz)
else:
end_dt = end_dt.astimezone(tz)
else:
end_dt = compute_last_etl_end(self.config) or datetime.now(tz)
report = run_integrity_history(
report, counts = run_history_flow(
cfg=self.config,
start_dt=start_dt,
end_dt=end_dt,
include_dimensions=include_dimensions,
task_codes=task_codes,
logger=self.logger,
write_report=True,
compare_content=bool(compare_content),
content_sample_limit=content_sample_limit,
do_backfill=bool(auto_backfill),
include_mismatch=bool(backfill_mismatch),
recheck_after_backfill=bool(recheck_after_backfill),
page_size=int(self.config.get("api.page_size") or 200),
chunk_size=500,
)
missing_count = int(report.get("total_missing") or 0)
counts = {
"missing": missing_count,
"errors": int(report.get("total_errors") or 0),
}
# Auto backfill
backfill_result = None
if auto_backfill and missing_count > 0:
backfill_result = self._run_backfill(start_dt, end_dt, task_codes)
counts["backfilled"] = backfill_result.get("backfilled", 0)
report_path = write_report(report, prefix="data_integrity_history", tz=tz)
report["report_path"] = report_path
end_dt_used = end_dt
if end_dt_used is None:
end_str = report.get("end")
if end_str:
parsed = dtparser.parse(end_str)
if parsed.tzinfo is None:
end_dt_used = parsed.replace(tzinfo=tz)
else:
end_dt_used = parsed.astimezone(tz)
if end_dt_used is None:
end_dt_used = start_dt
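The parse-then-pin-to-timezone pattern (attach tz when naive, convert otherwise) now appears three times in this task. A small helper would collapse it; this sketch is not part of the commit.

# Possible consolidation of the repeated tz-normalization pattern (not in this commit).
from datetime import datetime
from zoneinfo import ZoneInfo
from dateutil import parser as dtparser

def parse_in_tz(value: str, tz: ZoneInfo) -> datetime:
    """Parse a timestamp string and pin it to tz: attach if naive, else convert."""
    dt = dtparser.parse(value)
    return dt.replace(tzinfo=tz) if dt.tzinfo is None else dt.astimezone(tz)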
return {
"status": "SUCCESS",
"counts": counts,
"window": {
"start": start_dt,
"end": end_dt,
"minutes": int((end_dt - start_dt).total_seconds() // 60) if end_dt > start_dt else 0,
"end": end_dt_used,
"minutes": int((end_dt_used - start_dt).total_seconds() // 60) if end_dt_used > start_dt else 0,
},
"report_path": report.get("report_path"),
"backfill_result": backfill_result,
"report_path": report_path,
"backfill_result": report.get("backfill_result"),
}
def _write_report(self, report: dict, prefix: str) -> str:
root = Path(__file__).resolve().parents[1]
stamp = datetime.now(self.tz).strftime("%Y%m%d_%H%M%S")
path = root / "reports" / f"{prefix}_{stamp}.json"
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
return str(path)
def _run_backfill(self, start_dt: datetime, end_dt: datetime, task_codes: str) -> dict:
"""运行数据补全"""
self.logger.info("自动补全开始 起始=%s 结束=%s", start_dt, end_dt)
try:
from scripts.backfill_missing_data import run_backfill
result = run_backfill(
cfg=self.config,
start=start_dt,
end=end_dt,
task_codes=task_codes or None,
dry_run=False,
page_size=200,
chunk_size=500,
logger=self.logger,
)
self.logger.info(
"自动补全完成 已补全=%s 错误数=%s",
result.get("backfilled", 0),
result.get("errors", 0),
)
return result
except Exception as exc:
self.logger.exception("自动补全失败")
return {"backfilled": 0, "errors": 1, "error": str(exc)}