Database: update data validation, write, and related logic.

Neo
2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions


@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
"""Data integrity task that checks API -> ODS -> DWD completeness."""
from __future__ import annotations
@@ -7,16 +7,9 @@ from zoneinfo import ZoneInfo
from dateutil import parser as dtparser
import json
from pathlib import Path
from utils.windowing import build_window_segments, calc_window_minutes
from .base_task import BaseTask
from quality.integrity_checker import (
IntegrityWindow,
compute_last_etl_end,
run_integrity_history,
run_integrity_window,
)
from quality.integrity_service import run_history_flow, run_window_flow, write_report
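Note: this refactor collapses the fine-grained quality.integrity_checker imports into a single quality.integrity_service facade. Judging only from the call sites in this diff, the facade's surface looks roughly like the sketch below; the real signatures live in quality/integrity_service.py and may differ.

# Sketch of the integrity_service surface as implied by the call sites below.
# Signatures are inferred from this diff, not copied from the module.
from datetime import datetime
from typing import Any, Optional

def run_window_flow(*, cfg: Any, windows: list, include_dimensions: bool,
                    task_codes: str, logger: Any, compare_content: bool,
                    content_sample_limit: Optional[int], do_backfill: bool,
                    include_mismatch: bool, recheck_after_backfill: bool,
                    page_size: int, chunk_size: int) -> tuple:
    """Check API -> ODS -> DWD completeness per window; return (report, counts)."""
    ...

def write_report(report: dict, prefix: str, tz: Any) -> str:
    """Persist the report as JSON and return the file path."""
    ...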
class DataIntegrityTask(BaseTask):
@@ -31,15 +24,25 @@ class DataIntegrityTask(BaseTask):
include_dimensions = bool(self.config.get("integrity.include_dimensions", False))
task_codes = str(self.config.get("integrity.ods_task_codes", "") or "").strip()
auto_backfill = bool(self.config.get("integrity.auto_backfill", False))
compare_content = self.config.get("integrity.compare_content")
if compare_content is None:
compare_content = True
content_sample_limit = self.config.get("integrity.content_sample_limit")
backfill_mismatch = self.config.get("integrity.backfill_mismatch")
if backfill_mismatch is None:
backfill_mismatch = True
recheck_after_backfill = self.config.get("integrity.recheck_after_backfill")
if recheck_after_backfill is None:
recheck_after_backfill = True
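For reference, these are the integrity.* keys read above together with the defaults the code falls back to when a key is missing or None. Key names come from the config.get() calls; representing them as a flat dict is illustrative only.

# Effective defaults for the integrity options consumed by DataIntegrityTask.
INTEGRITY_DEFAULTS = {
    "integrity.include_dimensions": False,     # also verify dimension tables
    "integrity.ods_task_codes": "",            # comma-separated codes; "" means all
    "integrity.auto_backfill": False,          # backfill missing rows automatically
    "integrity.compare_content": True,         # None is coerced to True above
    "integrity.content_sample_limit": None,    # row cap for content comparison
    "integrity.backfill_mismatch": True,       # None is coerced to True above
    "integrity.recheck_after_backfill": True,  # None is coerced to True above
}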
# Detect whether a CLI time-window override (window_override) was passed;
# if so, switch to window mode automatically.
# Switch to window mode when CLI override is provided.
window_override_start = self.config.get("run.window_override.start")
window_override_end = self.config.get("run.window_override.end")
if window_override_start or window_override_end:
self.logger.info(
"检测到 CLI 时间窗口参数,自动切换到 window 模式: %s ~ %s",
window_override_start, window_override_end
"Detected CLI window override. Switching to window mode: %s ~ %s",
window_override_start,
window_override_end,
)
mode = "window"
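Either override key being present forces window mode, so the switch can be exercised straight from the command line. A minimal sketch, assuming the CLI maps --start/--end onto the run.window_override.* keys (the flag names are hypothetical):

# Hypothetical CLI wiring for the override keys the task inspects above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--start")
parser.add_argument("--end")
args = parser.parse_args(["--start", "2026-01-31T00:00:00",
                          "--end", "2026-01-31T06:00:00"])

overrides = {
    "run.window_override.start": args.start,  # either key alone is enough
    "run.window_override.end": args.end,
}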
@@ -57,65 +60,28 @@ class DataIntegrityTask(BaseTask):
total_segments = len(segments)
if total_segments > 1:
self.logger.info("数据完整性检查: 分段执行 共%s", total_segments)
self.logger.info("Data integrity check split into %s segments.", total_segments)
window_reports = []
total_missing = 0
total_errors = 0
for idx, (seg_start, seg_end) in enumerate(segments, start=1):
window = IntegrityWindow(
start=seg_start,
end=seg_end,
label=f"segment_{idx}",
granularity="window",
)
payload = run_integrity_window(
cfg=self.config,
window=window,
include_dimensions=include_dimensions,
task_codes=task_codes,
logger=self.logger,
write_report=False,
window_split_unit="none",
window_compensation_hours=0,
)
window_reports.append(payload)
total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)
report, counts = run_window_flow(
cfg=self.config,
windows=segments,
include_dimensions=include_dimensions,
task_codes=task_codes,
logger=self.logger,
compare_content=bool(compare_content),
content_sample_limit=content_sample_limit,
do_backfill=bool(auto_backfill),
include_mismatch=bool(backfill_mismatch),
recheck_after_backfill=bool(recheck_after_backfill),
page_size=int(self.config.get("api.page_size") or 200),
chunk_size=500,
)
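run_window_flow now owns the per-segment loop, content comparison, and the optional backfill-and-recheck cycle that the removed code below handled inline. From the way report and counts are consumed in this file, the return shape is approximately as follows; fields beyond the ones referenced here are guesses.

# Approximate (report, counts) shape, inferred from usage in this file.
report = {
    "mode": "window",
    "window": {"start": "<iso>", "end": "<iso>", "segments": 2},
    "total_missing": 0,
    "total_errors": 0,
    "backfill_result": None,  # set when do_backfill=True and rows were missing
}
counts = {"missing": 0, "errors": 0}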
overall_start = segments[0][0]
overall_end = segments[-1][1]
report = {
"mode": "window",
"window": {
"start": overall_start.isoformat(),
"end": overall_end.isoformat(),
"segments": total_segments,
},
"windows": window_reports,
"api_to_ods": {
"total_missing": total_missing,
"total_errors": total_errors,
},
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
report_path = self._write_report(report, "data_integrity_window")
report_path = write_report(report, prefix="data_integrity_window", tz=tz)
report["report_path"] = report_path
missing_count = int(total_missing or 0)
counts = {
"missing": missing_count,
"errors": int(total_errors or 0),
}
# Auto backfill
backfill_result = None
if auto_backfill and missing_count > 0:
backfill_result = self._run_backfill(base_start, base_end, task_codes)
counts["backfilled"] = backfill_result.get("backfilled", 0)
return {
"status": "SUCCESS",
"counts": counts,
@@ -125,7 +91,7 @@ class DataIntegrityTask(BaseTask):
"minutes": calc_window_minutes(overall_start, overall_end),
},
"report_path": report_path,
"backfill_result": backfill_result,
"backfill_result": report.get("backfill_result"),
}
history_start = str(self.config.get("integrity.history_start", "2025-07-01") or "2025-07-01")
@@ -136,77 +102,52 @@ class DataIntegrityTask(BaseTask):
else:
start_dt = start_dt.astimezone(tz)
end_dt = None
if history_end:
end_dt = dtparser.parse(history_end)
if end_dt.tzinfo is None:
end_dt = end_dt.replace(tzinfo=tz)
else:
end_dt = end_dt.astimezone(tz)
else:
end_dt = compute_last_etl_end(self.config) or datetime.now(tz)
report = run_integrity_history(
report, counts = run_history_flow(
cfg=self.config,
start_dt=start_dt,
end_dt=end_dt,
include_dimensions=include_dimensions,
task_codes=task_codes,
logger=self.logger,
write_report=True,
compare_content=bool(compare_content),
content_sample_limit=content_sample_limit,
do_backfill=bool(auto_backfill),
include_mismatch=bool(backfill_mismatch),
recheck_after_backfill=bool(recheck_after_backfill),
page_size=int(self.config.get("api.page_size") or 200),
chunk_size=500,
)
missing_count = int(report.get("total_missing") or 0)
counts = {
"missing": missing_count,
"errors": int(report.get("total_errors") or 0),
}
# Auto backfill
backfill_result = None
if auto_backfill and missing_count > 0:
backfill_result = self._run_backfill(start_dt, end_dt, task_codes)
counts["backfilled"] = backfill_result.get("backfilled", 0)
report_path = write_report(report, prefix="data_integrity_history", tz=tz)
report["report_path"] = report_path
end_dt_used = end_dt
if end_dt_used is None:
end_str = report.get("end")
if end_str:
parsed = dtparser.parse(end_str)
if parsed.tzinfo is None:
end_dt_used = parsed.replace(tzinfo=tz)
else:
end_dt_used = parsed.astimezone(tz)
if end_dt_used is None:
end_dt_used = start_dt
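The parse-then-pin-to-timezone pattern (attach tz when naive, convert otherwise) now appears three times in this task. A small helper would collapse it; this sketch is not part of the commit.

# Possible consolidation of the repeated tz-normalization pattern (not in this commit).
from datetime import datetime
from zoneinfo import ZoneInfo
from dateutil import parser as dtparser

def parse_in_tz(value: str, tz: ZoneInfo) -> datetime:
    """Parse a timestamp string and pin it to tz: attach if naive, else convert."""
    dt = dtparser.parse(value)
    return dt.replace(tzinfo=tz) if dt.tzinfo is None else dt.astimezone(tz)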
return {
"status": "SUCCESS",
"counts": counts,
"window": {
"start": start_dt,
"end": end_dt,
"minutes": int((end_dt - start_dt).total_seconds() // 60) if end_dt > start_dt else 0,
"end": end_dt_used,
"minutes": int((end_dt_used - start_dt).total_seconds() // 60) if end_dt_used > start_dt else 0,
},
"report_path": report.get("report_path"),
"backfill_result": backfill_result,
"report_path": report_path,
"backfill_result": report.get("backfill_result"),
}
def _write_report(self, report: dict, prefix: str) -> str:
root = Path(__file__).resolve().parents[1]
stamp = datetime.now(self.tz).strftime("%Y%m%d_%H%M%S")
path = root / "reports" / f"{prefix}_{stamp}.json"
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
return str(path)
def _run_backfill(self, start_dt: datetime, end_dt: datetime, task_codes: str) -> dict:
"""运行数据补全"""
self.logger.info("自动补全开始 起始=%s 结束=%s", start_dt, end_dt)
try:
from scripts.backfill_missing_data import run_backfill
result = run_backfill(
cfg=self.config,
start=start_dt,
end=end_dt,
task_codes=task_codes or None,
dry_run=False,
page_size=200,
chunk_size=500,
logger=self.logger,
)
self.logger.info(
"自动补全完成 已补全=%s 错误数=%s",
result.get("backfilled", 0),
result.get("errors", 0),
)
return result
except Exception as exc:
self.logger.exception("自动补全失败")
return {"backfilled": 0, "errors": 1, "error": str(exc)}