# Source: feiqiu-ETL/etl_billiards/quality/integrity_service.py
# (257 lines, 8.6 KiB, Python)
# -*- coding: utf-8 -*-
"""Shared integrity flow helpers (window/history + optional backfill)."""
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, Tuple
from zoneinfo import ZoneInfo
import json
from quality.integrity_checker import IntegrityWindow, compute_last_etl_end, run_integrity_history, run_integrity_window
from scripts.backfill_missing_data import run_backfill
from utils.windowing import split_window
def _normalize_windows(cfg, windows: Iterable[Tuple[datetime, datetime]]) -> list[Tuple[datetime, datetime]]:
segments = list(windows)
if not segments:
return segments
force_monthly = bool(cfg.get("integrity.force_monthly_split", True))
if not force_monthly:
return segments
overall_start = segments[0][0]
overall_end = segments[-1][1]
total_days = (overall_end - overall_start).total_seconds() / 86400.0
if total_days <= 31 and len(segments) == 1:
return segments
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
comp_hours = cfg.get("run.window_split.compensation_hours", 0)
monthly = split_window(
overall_start,
overall_end,
tz=tz,
split_unit="month",
compensation_hours=comp_hours,
)
return monthly or segments
def build_window_report(
    *,
    cfg,
    windows: Iterable[Tuple[datetime, datetime]],
    include_dimensions: bool,
    task_codes: str,
    logger,
    compare_content: bool | None,
    content_sample_limit: int | None,
) -> tuple[dict, dict]:
    """Run an integrity check per window segment and aggregate the results.

    Each (start, end) segment is checked independently through
    ``run_integrity_window`` (no further splitting, no report file); the
    per-segment payloads plus the summed missing/mismatch/error counts are
    combined into a single "window"-mode report.

    Returns:
        ``(report, counts)`` where ``counts`` has keys ``missing``,
        ``mismatch`` and ``errors``.

    Raises:
        ValueError: if ``windows`` is empty (previously an opaque
            ``IndexError`` when indexing the first segment).
    """
    segments = list(windows)
    if not segments:
        raise ValueError("build_window_report requires at least one (start, end) window")

    window_reports = []
    total_missing = 0
    total_mismatch = 0
    total_errors = 0
    for idx, (seg_start, seg_end) in enumerate(segments, start=1):
        window = IntegrityWindow(
            start=seg_start,
            end=seg_end,
            label=f"segment_{idx}",
            granularity="window",
        )
        payload = run_integrity_window(
            cfg=cfg,
            window=window,
            include_dimensions=include_dimensions,
            task_codes=task_codes,
            logger=logger,
            write_report=False,
            compare_content=compare_content,
            content_sample_limit=content_sample_limit,
            report_path=None,
            window_split_unit="none",
            window_compensation_hours=0,
        )
        window_reports.append(payload)
        # Hoist the nested lookup; tolerate absent or None sub-counts.
        api_to_ods = payload.get("api_to_ods", {}) or {}
        total_missing += int(api_to_ods.get("total_missing") or 0)
        total_mismatch += int(api_to_ods.get("total_mismatch") or 0)
        total_errors += int(api_to_ods.get("total_errors") or 0)

    overall_start = segments[0][0]
    overall_end = segments[-1][1]
    tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
    report = {
        "mode": "window",
        "window": {
            "start": overall_start.isoformat(),
            "end": overall_end.isoformat(),
            "segments": len(segments),
        },
        "windows": window_reports,
        "api_to_ods": {
            "total_missing": total_missing,
            "total_mismatch": total_mismatch,
            "total_errors": total_errors,
        },
        # Flat duplicates kept for backward compatibility with consumers
        # that read the top-level keys.
        "total_missing": total_missing,
        "total_mismatch": total_mismatch,
        "total_errors": total_errors,
        "generated_at": datetime.now(tz).isoformat(),
    }
    counts = {
        "missing": total_missing,
        "mismatch": total_mismatch,
        "errors": total_errors,
    }
    return report, counts
def run_window_flow(
    *,
    cfg,
    windows: Iterable[Tuple[datetime, datetime]],
    include_dimensions: bool,
    task_codes: str,
    logger,
    compare_content: bool | None,
    content_sample_limit: int | None,
    do_backfill: bool,
    include_mismatch: bool,
    recheck_after_backfill: bool,
    page_size: int | None = None,
    chunk_size: int = 500,
) -> tuple[dict, dict]:
    """Run the window-mode integrity flow, optionally backfilling gaps.

    The windows are first normalized (possible monthly split) and checked.
    When ``do_backfill`` is set and missing rows (or mismatches, if
    ``include_mismatch``) were found, the overall [start, end] range is
    backfilled; with ``recheck_after_backfill`` the check then reruns and its
    counts replace the originals.

    Returns:
        ``(report, counts)`` — the report carries ``backfill_result`` and
        ``post_check`` entries when those steps ran.
    """
    segments = _normalize_windows(cfg, windows)
    report, counts = build_window_report(
        cfg=cfg,
        windows=segments,
        include_dimensions=include_dimensions,
        task_codes=task_codes,
        logger=logger,
        compare_content=compare_content,
        content_sample_limit=content_sample_limit,
    )
    if not do_backfill:
        return report, counts

    has_missing = int(counts.get("missing", 0)) > 0
    has_mismatch = int(counts.get("mismatch", 0)) > 0
    if has_missing or (include_mismatch and has_mismatch):
        overall_start = segments[0][0]
        overall_end = segments[-1][1]
        report["backfill_result"] = run_backfill(
            cfg=cfg,
            start=overall_start,
            end=overall_end,
            task_codes=task_codes or None,
            include_mismatch=bool(include_mismatch),
            dry_run=False,
            page_size=int(page_size or cfg.get("api.page_size") or 200),
            chunk_size=chunk_size,
            logger=logger,
        )
        # NOTE(review): re-check assumed to run only after an actual backfill,
        # mirroring run_history_flow — confirm against the original layout.
        if recheck_after_backfill:
            post_report, post_counts = build_window_report(
                cfg=cfg,
                windows=segments,
                include_dimensions=include_dimensions,
                task_codes=task_codes,
                logger=logger,
                compare_content=compare_content,
                content_sample_limit=content_sample_limit,
            )
            report["post_check"] = post_report
            counts.update(post_counts)
    return report, counts
def run_history_flow(
    *,
    cfg,
    start_dt: datetime,
    end_dt: datetime | None,
    include_dimensions: bool,
    task_codes: str,
    logger,
    compare_content: bool | None,
    content_sample_limit: int | None,
    do_backfill: bool,
    include_mismatch: bool,
    recheck_after_backfill: bool,
    page_size: int | None = None,
    chunk_size: int = 500,
) -> tuple[dict, dict]:
    """Run the history-mode integrity flow, optionally backfilling gaps.

    A ``None`` end falls back to the last recorded ETL end, or "now" in the
    configured timezone. When gaps are found and ``do_backfill`` is set, the
    whole [start_dt, end_dt] range is backfilled; with
    ``recheck_after_backfill`` the history check reruns and its totals replace
    the original counts.

    Returns:
        ``(report, counts)`` with counts keyed ``missing``/``mismatch``/``errors``.
    """
    zone = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
    if end_dt is None:
        end_dt = compute_last_etl_end(cfg) or datetime.now(zone)

    def _summarize(rep: dict) -> dict:
        # Collapse report totals into the counts contract used by callers.
        return {
            "missing": int(rep.get("total_missing") or 0),
            "mismatch": int(rep.get("total_mismatch") or 0),
            "errors": int(rep.get("total_errors") or 0),
        }

    def _check() -> dict:
        # One full history pass; report file writing stays disabled here.
        return run_integrity_history(
            cfg=cfg,
            start_dt=start_dt,
            end_dt=end_dt,
            include_dimensions=include_dimensions,
            task_codes=task_codes,
            logger=logger,
            write_report=False,
            compare_content=compare_content,
            content_sample_limit=content_sample_limit,
        )

    report = _check()
    counts = _summarize(report)

    wants_backfill = counts["missing"] > 0 or (include_mismatch and counts["mismatch"] > 0)
    if do_backfill and wants_backfill:
        report["backfill_result"] = run_backfill(
            cfg=cfg,
            start=start_dt,
            end=end_dt,
            task_codes=task_codes or None,
            include_mismatch=bool(include_mismatch),
            dry_run=False,
            page_size=int(page_size or cfg.get("api.page_size") or 200),
            chunk_size=chunk_size,
            logger=logger,
        )
        if recheck_after_backfill:
            post_report = _check()
            report["post_check"] = post_report
            counts.update(_summarize(post_report))
    return report, counts
def write_report(report: dict, *, prefix: str, tz: ZoneInfo, report_path: Path | None = None) -> str:
    """Serialize *report* as pretty-printed UTF-8 JSON and return the path written.

    When ``report_path`` is omitted, a timestamped ``{prefix}_{stamp}.json``
    file is created under the package-level ``reports/`` directory. Parent
    directories are created as needed.
    """
    if report_path is None:
        package_root = Path(__file__).resolve().parents[1]
        stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
        report_path = package_root / "reports" / f"{prefix}_{stamp}.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    body = json.dumps(report, ensure_ascii=False, indent=2) + "\n"
    report_path.write_text(body, encoding="utf-8")
    return str(report_path)