Database: update data validation and write logic.
256  etl_billiards/quality/integrity_service.py  Normal file
@@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
"""Shared integrity flow helpers (window/history + optional backfill)."""
from __future__ import annotations

import json
from datetime import datetime
from pathlib import Path
from typing import Iterable, Tuple
from zoneinfo import ZoneInfo

from quality.integrity_checker import (
    IntegrityWindow,
    compute_last_etl_end,
    run_integrity_history,
    run_integrity_window,
)
from scripts.backfill_missing_data import run_backfill
from utils.windowing import split_window


def _normalize_windows(cfg, windows: Iterable[Tuple[datetime, datetime]]) -> list[Tuple[datetime, datetime]]:
    segments = list(windows)
    if not segments:
        return segments

    force_monthly = bool(cfg.get("integrity.force_monthly_split", True))
    if not force_monthly:
        return segments

    overall_start = segments[0][0]
    overall_end = segments[-1][1]
    total_days = (overall_end - overall_start).total_seconds() / 86400.0
    # A single segment no longer than a month needs no re-splitting.
    if total_days <= 31 and len(segments) == 1:
        return segments

    tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
    comp_hours = cfg.get("run.window_split.compensation_hours", 0)
    monthly = split_window(
        overall_start,
        overall_end,
        tz=tz,
        split_unit="month",
        compensation_hours=comp_hours,
    )
    return monthly or segments
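
# Illustrative example (assumes split_window splits on calendar-month
# boundaries, as split_unit="month" suggests): a single window covering
# 2024-01-01 .. 2024-03-15 normalizes to three segments: January,
# February, and the partial March slice.
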

def build_window_report(
    *,
    cfg,
    windows: Iterable[Tuple[datetime, datetime]],
    include_dimensions: bool,
    task_codes: str,
    logger,
    compare_content: bool | None,
    content_sample_limit: int | None,
) -> tuple[dict, dict]:
    window_reports = []
    total_missing = 0
    total_mismatch = 0
    total_errors = 0
    segments = list(windows)
    for idx, (seg_start, seg_end) in enumerate(segments, start=1):
        window = IntegrityWindow(
            start=seg_start,
            end=seg_end,
            label=f"segment_{idx}",
            granularity="window",
        )
        payload = run_integrity_window(
            cfg=cfg,
            window=window,
            include_dimensions=include_dimensions,
            task_codes=task_codes,
            logger=logger,
            write_report=False,
            compare_content=compare_content,
            content_sample_limit=content_sample_limit,
            report_path=None,
            window_split_unit="none",
            window_compensation_hours=0,
        )
        window_reports.append(payload)
        total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
        total_mismatch += int(payload.get("api_to_ods", {}).get("total_mismatch") or 0)
        total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)

    overall_start = segments[0][0]
    overall_end = segments[-1][1]
    tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
    report = {
        "mode": "window",
        "window": {
            "start": overall_start.isoformat(),
            "end": overall_end.isoformat(),
            "segments": len(segments),
        },
        "windows": window_reports,
        "api_to_ods": {
            "total_missing": total_missing,
            "total_mismatch": total_mismatch,
            "total_errors": total_errors,
        },
        "total_missing": total_missing,
        "total_mismatch": total_mismatch,
        "total_errors": total_errors,
        "generated_at": datetime.now(tz).isoformat(),
    }
    counts = {
        "missing": int(total_missing or 0),
        "mismatch": int(total_mismatch or 0),
        "errors": int(total_errors or 0),
    }
    return report, counts
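
# Sketch of the returned pair (keys taken from the code above; values are
# illustrative, not from a real run):
#   report: {"mode": "window",
#            "window": {"start": "...", "end": "...", "segments": 3},
#            "windows": [<per-segment payloads>],
#            "api_to_ods": {"total_missing": 0, "total_mismatch": 0,
#                           "total_errors": 0},
#            "total_missing": 0, "total_mismatch": 0, "total_errors": 0,
#            "generated_at": "2024-04-01T00:00:00+08:00"}
#   counts: {"missing": 0, "mismatch": 0, "errors": 0}
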

def run_window_flow(
    *,
    cfg,
    windows: Iterable[Tuple[datetime, datetime]],
    include_dimensions: bool,
    task_codes: str,
    logger,
    compare_content: bool | None,
    content_sample_limit: int | None,
    do_backfill: bool,
    include_mismatch: bool,
    recheck_after_backfill: bool,
    page_size: int | None = None,
    chunk_size: int = 500,
) -> tuple[dict, dict]:
    segments = _normalize_windows(cfg, windows)
    report, counts = build_window_report(
        cfg=cfg,
        windows=segments,
        include_dimensions=include_dimensions,
        task_codes=task_codes,
        logger=logger,
        compare_content=compare_content,
        content_sample_limit=content_sample_limit,
    )
    overall_start = segments[0][0]
    overall_end = segments[-1][1]

    backfill_result = None
    post_report = None
    if do_backfill:
        missing_count = int(counts.get("missing", 0))
        mismatch_count = int(counts.get("mismatch", 0))
        # Backfill only when something is actually missing (or mismatched,
        # if mismatches are included in the backfill scope).
        need_backfill = missing_count > 0 or (include_mismatch and mismatch_count > 0)
        if need_backfill:
            backfill_result = run_backfill(
                cfg=cfg,
                start=overall_start,
                end=overall_end,
                task_codes=task_codes or None,
                include_mismatch=bool(include_mismatch),
                dry_run=False,
                page_size=int(page_size or cfg.get("api.page_size") or 200),
                chunk_size=chunk_size,
                logger=logger,
            )
            report["backfill_result"] = backfill_result
            if recheck_after_backfill:
                # Re-run the check so counts reflect the post-backfill state.
                post_report, post_counts = build_window_report(
                    cfg=cfg,
                    windows=segments,
                    include_dimensions=include_dimensions,
                    task_codes=task_codes,
                    logger=logger,
                    compare_content=compare_content,
                    content_sample_limit=content_sample_limit,
                )
                report["post_check"] = post_report
                counts.update(post_counts)
    return report, counts
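
# Note: counts reflects the most recent pass. It keeps the initial check's
# totals unless a backfill ran with recheck_after_backfill set, in which
# case the post-backfill totals overwrite them via counts.update(post_counts).
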

def run_history_flow(
    *,
    cfg,
    start_dt: datetime,
    end_dt: datetime | None,
    include_dimensions: bool,
    task_codes: str,
    logger,
    compare_content: bool | None,
    content_sample_limit: int | None,
    do_backfill: bool,
    include_mismatch: bool,
    recheck_after_backfill: bool,
    page_size: int | None = None,
    chunk_size: int = 500,
) -> tuple[dict, dict]:
    tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
    # Default the history end to the last recorded ETL end, falling back
    # to "now" in the configured timezone.
    if end_dt is None:
        end_dt = compute_last_etl_end(cfg) or datetime.now(tz)

    report = run_integrity_history(
        cfg=cfg,
        start_dt=start_dt,
        end_dt=end_dt,
        include_dimensions=include_dimensions,
        task_codes=task_codes,
        logger=logger,
        write_report=False,
        compare_content=compare_content,
        content_sample_limit=content_sample_limit,
    )
    counts = {
        "missing": int(report.get("total_missing") or 0),
        "mismatch": int(report.get("total_mismatch") or 0),
        "errors": int(report.get("total_errors") or 0),
    }
    if do_backfill:
        need_backfill = counts.get("missing", 0) > 0 or (include_mismatch and counts.get("mismatch", 0) > 0)
        if need_backfill:
            backfill_result = run_backfill(
                cfg=cfg,
                start=start_dt,
                end=end_dt,
                task_codes=task_codes or None,
                include_mismatch=bool(include_mismatch),
                dry_run=False,
                page_size=int(page_size or cfg.get("api.page_size") or 200),
                chunk_size=chunk_size,
                logger=logger,
            )
            report["backfill_result"] = backfill_result
            if recheck_after_backfill:
                post_report = run_integrity_history(
                    cfg=cfg,
                    start_dt=start_dt,
                    end_dt=end_dt,
                    include_dimensions=include_dimensions,
                    task_codes=task_codes,
                    logger=logger,
                    write_report=False,
                    compare_content=compare_content,
                    content_sample_limit=content_sample_limit,
                )
                report["post_check"] = post_report
                counts.update(
                    {
                        "missing": int(post_report.get("total_missing") or 0),
                        "mismatch": int(post_report.get("total_mismatch") or 0),
                        "errors": int(post_report.get("total_errors") or 0),
                    }
                )
    return report, counts

def write_report(report: dict, *, prefix: str, tz: ZoneInfo, report_path: Path | None = None) -> str:
    if report_path is None:
        root = Path(__file__).resolve().parents[1]
        stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
        report_path = root / "reports" / f"{prefix}_{stamp}.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
    return str(report_path)
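
For context, a minimal driver sketch showing how these helpers compose. The cfg below is a stand-in: the module only ever calls cfg.get(key, default), which a plain dict satisfies; the project's real config object, database, and API connectivity are assumed for the flow to do useful work, and the dates are placeholders.

import logging
from datetime import datetime
from zoneinfo import ZoneInfo

from quality.integrity_service import run_window_flow, write_report

# Stand-in config; a dict's .get(key, default) matches how cfg is used above.
cfg = {
    "app.timezone": "Asia/Taipei",
    "integrity.force_monthly_split": True,
    "run.window_split.compensation_hours": 0,
    "api.page_size": 200,
}
tz = ZoneInfo("Asia/Taipei")
logger = logging.getLogger("integrity")

report, counts = run_window_flow(
    cfg=cfg,
    windows=[(datetime(2024, 1, 1, tzinfo=tz), datetime(2024, 3, 15, tzinfo=tz))],
    include_dimensions=False,
    task_codes="",
    logger=logger,
    compare_content=None,
    content_sample_limit=None,
    do_backfill=True,
    include_mismatch=False,
    recheck_after_backfill=True,
)
path = write_report(report, prefix="integrity_window", tz=tz)
logger.info("integrity report written to %s (counts=%s)", path, counts)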