Database: updated the data validation and write logic.

Neo
2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions


@@ -3,7 +3,6 @@
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
@@ -12,12 +11,7 @@ from zoneinfo import ZoneInfo
from dateutil import parser as dtparser
from config.settings import AppConfig
from quality.integrity_checker import (
IntegrityWindow,
compute_last_etl_end,
run_integrity_history,
run_integrity_window,
)
from quality.integrity_service import run_history_flow, run_window_flow, write_report
from utils.logging_utils import build_log_path, configure_logging
from utils.windowing import split_window
@@ -38,14 +32,37 @@ def main() -> int:
ap = argparse.ArgumentParser(description="Data integrity checks (API -> ODS -> DWD)")
ap.add_argument("--mode", choices=["history", "window"], default="history")
ap.add_argument(
"--flow",
choices=["verify", "update_and_verify"],
default="verify",
help="verify only or update+verify (auto backfill then optional recheck)",
)
ap.add_argument("--start", default="2025-07-01", help="history start date (default: 2025-07-01)")
ap.add_argument("--end", default="", help="history end datetime (default: last ETL end)")
ap.add_argument("--window-start", default="", help="window start datetime (mode=window)")
ap.add_argument("--window-end", default="", help="window end datetime (mode=window)")
ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
ap.add_argument("--include-dimensions", action="store_true", help="include dimension tables in ODS->DWD checks")
ap.add_argument(
"--include-dimensions",
action="store_true",
default=None,
help="include dimension tables in ODS->DWD checks",
)
ap.add_argument(
"--no-include-dimensions",
action="store_true",
help="exclude dimension tables in ODS->DWD checks",
)
ap.add_argument("--ods-task-codes", default="", help="comma-separated ODS task codes for API checks")
ap.add_argument("--compare-content", action="store_true", help="compare API vs ODS content hash")
ap.add_argument("--no-compare-content", action="store_true", help="disable content comparison even if enabled in config")
ap.add_argument("--include-mismatch", action="store_true", help="backfill mismatch records as well")
ap.add_argument("--no-include-mismatch", action="store_true", help="disable mismatch backfill")
ap.add_argument("--recheck", action="store_true", help="re-run checks after backfill")
ap.add_argument("--no-recheck", action="store_true", help="skip recheck after backfill")
ap.add_argument("--content-sample-limit", type=int, default=None, help="max mismatch samples per table")
ap.add_argument("--out", default="", help="output JSON path")
ap.add_argument("--log-file", default="", help="log file path")
ap.add_argument("--log-dir", default="", help="log directory")
@@ -68,6 +85,39 @@ def main() -> int:
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
report_path = Path(args.out) if args.out else None
if args.recheck and args.no_recheck:
raise SystemExit("cannot set both --recheck and --no-recheck")
if args.include_mismatch and args.no_include_mismatch:
raise SystemExit("cannot set both --include-mismatch and --no-include-mismatch")
if args.include_dimensions and args.no_include_dimensions:
raise SystemExit("cannot set both --include-dimensions and --no-include-dimensions")
compare_content = None
if args.compare_content and args.no_compare_content:
raise SystemExit("cannot set both --compare-content and --no-compare-content")
if args.compare_content:
compare_content = True
elif args.no_compare_content:
compare_content = False
include_mismatch = cfg.get("integrity.backfill_mismatch", True)
if args.include_mismatch:
include_mismatch = True
elif args.no_include_mismatch:
include_mismatch = False
recheck_after_backfill = cfg.get("integrity.recheck_after_backfill", True)
if args.recheck:
recheck_after_backfill = True
elif args.no_recheck:
recheck_after_backfill = False
include_dimensions = cfg.get("integrity.include_dimensions", True)
if args.include_dimensions:
include_dimensions = True
elif args.no_include_dimensions:
include_dimensions = False
if args.mode == "window":
if not args.window_start or not args.window_end:
raise SystemExit("window-start and window-end are required for mode=window")
@@ -88,78 +138,52 @@ def main() -> int:
if not windows:
windows = [(start_dt, end_dt)]
window_reports = []
total_missing = 0
total_errors = 0
for idx, (seg_start, seg_end) in enumerate(windows, start=1):
window = IntegrityWindow(
start=seg_start,
end=seg_end,
label=f"segment_{idx}",
granularity="window",
)
payload = run_integrity_window(
cfg=cfg,
window=window,
include_dimensions=args.include_dimensions,
task_codes=args.ods_task_codes,
logger=logger,
write_report=False,
report_path=None,
window_split_unit="none",
window_compensation_hours=0,
)
window_reports.append(payload)
total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)
overall_start = windows[0][0]
overall_end = windows[-1][1]
report = {
"mode": "window",
"window": {
"start": overall_start.isoformat(),
"end": overall_end.isoformat(),
"segments": len(windows),
},
"windows": window_reports,
"api_to_ods": {
"total_missing": total_missing,
"total_errors": total_errors,
},
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
if report_path is None:
root = Path(__file__).resolve().parents[1]
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
report_path = root / "reports" / f"data_integrity_window_{stamp}.json"
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
report["report_path"] = str(report_path)
report, counts = run_window_flow(
cfg=cfg,
windows=windows,
include_dimensions=bool(include_dimensions),
task_codes=args.ods_task_codes,
logger=logger,
compare_content=compare_content,
content_sample_limit=args.content_sample_limit,
do_backfill=args.flow == "update_and_verify",
include_mismatch=bool(include_mismatch),
recheck_after_backfill=bool(recheck_after_backfill),
page_size=int(cfg.get("api.page_size") or 200),
chunk_size=500,
)
report_path = write_report(report, prefix="data_integrity_window", tz=tz, report_path=report_path)
report["report_path"] = report_path
logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
else:
start_dt = _parse_dt(args.start, tz)
if args.end:
end_dt = _parse_dt(args.end, tz)
else:
end_dt = compute_last_etl_end(cfg) or datetime.now(tz)
report = run_integrity_history(
end_dt = None
report, counts = run_history_flow(
cfg=cfg,
start_dt=start_dt,
end_dt=end_dt,
include_dimensions=args.include_dimensions,
include_dimensions=bool(include_dimensions),
task_codes=args.ods_task_codes,
logger=logger,
write_report=True,
report_path=report_path,
compare_content=compare_content,
content_sample_limit=args.content_sample_limit,
do_backfill=args.flow == "update_and_verify",
include_mismatch=bool(include_mismatch),
recheck_after_backfill=bool(recheck_after_backfill),
page_size=int(cfg.get("api.page_size") or 200),
chunk_size=500,
)
report_path = write_report(report, prefix="data_integrity_history", tz=tz, report_path=report_path)
report["report_path"] = report_path
logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
logger.info(
"SUMMARY missing=%s errors=%s",
report.get("total_missing"),
report.get("total_errors"),
"SUMMARY missing=%s mismatch=%s errors=%s",
counts.get("missing"),
counts.get("mismatch"),
counts.get("errors"),
)
return 0
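Putting the new flags together, a usage sketch; the script path is a guess, since the changed file name is not shown in this excerpt:

# Hypothetical invocations (script path assumed, not confirmed by the diff):
# Verify-only over an explicit window, split per config:
python check_data_integrity.py --mode window \
    --window-start "2025-07-01 00:00:00" --window-end "2025-08-01 00:00:00"

# History mode with auto backfill and recheck, skipping dimension tables:
python check_data_integrity.py --mode history --flow update_and_verify \
    --no-include-dimensions --compare-content --out reports/integrity_history.json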