Change to relative paths; complete the client-side scripts

etl_billiards/scripts/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
# Script helpers package marker.
@@ -240,9 +240,8 @@ class MissingDataBackfiller:
headers_extra=cfg["api"].get("headers_extra") or {},
)

# 数据库连接
# 数据库连接(DatabaseConnection 构造时已设置 autocommit=False)
self.db = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
self.db.conn.autocommit = False

def close(self):
"""关闭连接"""
@@ -264,7 +263,7 @@ class MissingDataBackfiller:
Returns:
补全结果统计
"""
self.logger.info("BACKFILL_START start=%s end=%s", start.isoformat(), end.isoformat())
self.logger.info("数据补全开始 起始=%s 结束=%s", start.isoformat(), end.isoformat())

# 计算窗口大小
total_seconds = max(0, int((end - start).total_seconds()))
@@ -276,7 +275,7 @@ class MissingDataBackfiller:
window_hours = max(1, total_seconds // 3600 or 1)

# 运行 gap check
self.logger.info("RUNNING_GAP_CHECK...")
self.logger.info("正在执行缺失检查...")
gap_result = run_gap_check(
cfg=self.cfg,
start=start,
@@ -297,10 +296,10 @@ class MissingDataBackfiller:

total_missing = gap_result.get("total_missing", 0)
if total_missing == 0:
self.logger.info("NO_MISSING_DATA")
self.logger.info("数据完整,无缺失记录")
return {"backfilled": 0, "errors": 0, "details": []}

self.logger.info("GAP_CHECK_DONE total_missing=%s", total_missing)
self.logger.info("缺失检查完成 总缺失=%s", total_missing)

# 补全每个任务的丢失数据
results = []
@@ -316,7 +315,7 @@ class MissingDataBackfiller:
continue

self.logger.info(
"BACKFILL_TASK task=%s missing=%s samples=%s",
"开始补全任务 任务=%s 缺失=%s 样本数=%s",
task_code, missing, len(missing_samples)
)

@@ -339,7 +338,7 @@ class MissingDataBackfiller:
})
total_backfilled += backfilled
except Exception as exc:
self.logger.exception("BACKFILL_ERROR task=%s", task_code)
self.logger.exception("补全失败 任务=%s", task_code)
results.append({
"task_code": task_code,
"missing": missing,
@@ -349,7 +348,7 @@ class MissingDataBackfiller:
total_errors += 1

self.logger.info(
"BACKFILL_DONE total_missing=%s backfilled=%s errors=%s",
"数据补全完成 总缺失=%s 已补全=%s 错误数=%s",
total_missing, total_backfilled, total_errors
)

@@ -375,14 +374,14 @@ class MissingDataBackfiller:
"""补全单个任务的丢失数据"""
spec = _get_spec(task_code)
if not spec:
self.logger.warning("SPEC_NOT_FOUND task=%s", task_code)
self.logger.warning("未找到任务规格 任务=%s", task_code)
return 0

if not pk_columns:
pk_columns = _get_table_pk_columns(self.db.conn, table)

if not pk_columns:
self.logger.warning("NO_PK_COLUMNS task=%s table=%s", task_code, table)
self.logger.warning("未找到主键列 任务=%s 表=%s", task_code, table)
return 0

# 提取丢失的 PK 值
@@ -393,11 +392,11 @@ class MissingDataBackfiller:
missing_pks.add(pk_tuple)

if not missing_pks:
self.logger.info("NO_MISSING_PKS task=%s", task_code)
self.logger.info("无缺失主键 任务=%s", task_code)
return 0

self.logger.info(
"BACKFILL_FETCHING task=%s missing_pks=%s",
"开始获取数据 任务=%s 缺失主键数=%s",
task_code, len(missing_pks)
)

@@ -436,7 +435,7 @@ class MissingDataBackfiller:
if self.dry_run:
backfilled += len(records_to_insert)
self.logger.info(
"DRY_RUN task=%s page=%s would_insert=%s",
"模拟运行 任务=%s 页=%s 将插入=%s",
task_code, page_no, len(records_to_insert)
)
else:
@@ -449,14 +448,14 @@ class MissingDataBackfiller:
)
backfilled += inserted
self.logger.info(
"INSERTED task=%s page=%s count=%s",
"已插入 任务=%s 页=%s 数量=%s",
task_code, page_no, inserted
)

if not self.dry_run:
self.db.conn.commit()

self.logger.info("BACKFILL_TASK_DONE task=%s backfilled=%s", task_code, backfilled)
self.logger.info("任务补全完成 任务=%s 已补全=%s", task_code, backfilled)
return backfilled

except Exception:
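Editor's note: these hunks only touch the backfiller's log messages and the autocommit comment (the DatabaseConnection constructor already sets autocommit=False). For orientation, a condensed sketch of the per-task loop those logs instrument; `fetch_pages`, `pk_of` and `insert_records` are placeholders, not this module's real helpers.

    # Illustrative sketch only; placeholder helpers, not the module's actual implementation.
    def backfill_one_task(db, table, pk_columns, missing_pks, pages, dry_run, logger):
        backfilled = 0
        for page_no, records in enumerate(pages, start=1):
            to_insert = [r for r in records if pk_of(r, pk_columns) in missing_pks]
            if not to_insert:
                continue
            if dry_run:
                backfilled += len(to_insert)      # count only, nothing is written
                logger.info("DRY_RUN page=%s would_insert=%s", page_no, len(to_insert))
            else:
                backfilled += insert_records(db.conn, table, to_insert)
        if not dry_run:
            db.conn.commit()                      # one commit per task
        return backfilled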
etl_billiards/scripts/check_data_integrity.py (new file, 169 lines)
@@ -0,0 +1,169 @@
# -*- coding: utf-8 -*-
"""Run data integrity checks across API -> ODS -> DWD."""
from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo

from dateutil import parser as dtparser

from config.settings import AppConfig
from quality.integrity_checker import (
IntegrityWindow,
compute_last_etl_end,
run_integrity_history,
run_integrity_window,
)
from utils.logging_utils import build_log_path, configure_logging
from utils.windowing import split_window


def _parse_dt(value: str, tz: ZoneInfo) -> datetime:
dt = dtparser.parse(value)
if dt.tzinfo is None:
return dt.replace(tzinfo=tz)
return dt.astimezone(tz)


def main() -> int:
if hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass

ap = argparse.ArgumentParser(description="Data integrity checks (API -> ODS -> DWD)")
ap.add_argument("--mode", choices=["history", "window"], default="history")
ap.add_argument("--start", default="2025-07-01", help="history start date (default: 2025-07-01)")
ap.add_argument("--end", default="", help="history end datetime (default: last ETL end)")
ap.add_argument("--window-start", default="", help="window start datetime (mode=window)")
ap.add_argument("--window-end", default="", help="window end datetime (mode=window)")
ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
ap.add_argument("--include-dimensions", action="store_true", help="include dimension tables in ODS->DWD checks")
ap.add_argument("--ods-task-codes", default="", help="comma-separated ODS task codes for API checks")
ap.add_argument("--out", default="", help="output JSON path")
ap.add_argument("--log-file", default="", help="log file path")
ap.add_argument("--log-dir", default="", help="log directory")
ap.add_argument("--log-level", default="INFO", help="log level")
ap.add_argument("--no-log-console", action="store_true", help="disable console logging")
args = ap.parse_args()

log_dir = Path(args.log_dir) if args.log_dir else (Path(__file__).resolve().parent / "logs")
log_file = Path(args.log_file) if args.log_file else build_log_path(log_dir, "data_integrity")
log_console = not args.no_log_console

with configure_logging(
"data_integrity",
log_file,
level=args.log_level,
console=log_console,
tee_std=True,
) as logger:
cfg = AppConfig.load({})
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
report_path = Path(args.out) if args.out else None

if args.mode == "window":
if not args.window_start or not args.window_end:
raise SystemExit("window-start and window-end are required for mode=window")
start_dt = _parse_dt(args.window_start, tz)
end_dt = _parse_dt(args.window_end, tz)
split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
comp_hours = args.window_compensation_hours
if comp_hours is None:
comp_hours = cfg.get("run.window_split.compensation_hours", 0)

windows = split_window(
start_dt,
end_dt,
tz=tz,
split_unit=split_unit,
compensation_hours=comp_hours,
)
if not windows:
windows = [(start_dt, end_dt)]

window_reports = []
total_missing = 0
total_errors = 0
for idx, (seg_start, seg_end) in enumerate(windows, start=1):
window = IntegrityWindow(
start=seg_start,
end=seg_end,
label=f"segment_{idx}",
granularity="window",
)
payload = run_integrity_window(
cfg=cfg,
window=window,
include_dimensions=args.include_dimensions,
task_codes=args.ods_task_codes,
logger=logger,
write_report=False,
report_path=None,
window_split_unit="none",
window_compensation_hours=0,
)
window_reports.append(payload)
total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)

overall_start = windows[0][0]
overall_end = windows[-1][1]
report = {
"mode": "window",
"window": {
"start": overall_start.isoformat(),
"end": overall_end.isoformat(),
"segments": len(windows),
},
"windows": window_reports,
"api_to_ods": {
"total_missing": total_missing,
"total_errors": total_errors,
},
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
if report_path is None:
root = Path(__file__).resolve().parents[1]
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
report_path = root / "reports" / f"data_integrity_window_{stamp}.json"
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
report["report_path"] = str(report_path)
logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
else:
start_dt = _parse_dt(args.start, tz)
if args.end:
end_dt = _parse_dt(args.end, tz)
else:
end_dt = compute_last_etl_end(cfg) or datetime.now(tz)
report = run_integrity_history(
cfg=cfg,
start_dt=start_dt,
end_dt=end_dt,
include_dimensions=args.include_dimensions,
task_codes=args.ods_task_codes,
logger=logger,
write_report=True,
report_path=report_path,
)
logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
logger.info(
"SUMMARY missing=%s errors=%s",
report.get("total_missing"),
report.get("total_errors"),
)

return 0


if __name__ == "__main__":
raise SystemExit(main())
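Editor's note: in window mode this new script writes a single JSON report that aggregates the per-segment payloads. A hedged sketch of consuming that report; the file name below is illustrative only.

    import json
    from pathlib import Path

    # Read a window-mode report produced by check_data_integrity.py (example file name).
    report = json.loads(Path("reports/data_integrity_window_20250701_120000.json").read_text(encoding="utf-8"))
    print(report["window"]["segments"], report["total_missing"], report["total_errors"])
    for seg in report["windows"]:                      # one payload per split segment
        print(seg.get("api_to_ods", {}).get("total_missing"))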
@@ -34,6 +34,7 @@ from database.connection import DatabaseConnection
from models.parsers import TypeParser
from tasks.ods_tasks import ENABLED_ODS_CODES, ODS_TASK_SPECS
from utils.logging_utils import build_log_path, configure_logging
from utils.windowing import split_window

DEFAULT_START = "2025-07-01"
MIN_COMPLETENESS_WINDOW_DAYS = 30
@@ -214,8 +215,7 @@ def _check_spec(
store_id: int,
start: datetime | None,
end: datetime | None,
window_days: int,
window_hours: int,
windows: list[tuple[datetime, datetime]] | None,
page_size: int,
chunk_size: int,
sample_limit: int,
@@ -249,8 +249,7 @@ def _check_spec(
result["errors"] = 1
result["error_detail"] = "missing start/end for windowed endpoint"
return result
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))
windows = list(windows or [(start, end)])
else:
windows = [(None, None)]

@@ -377,8 +376,7 @@ def _check_settlement_tickets(
store_id: int,
start: datetime | None,
end: datetime | None,
window_days: int,
window_hours: int,
windows: list[tuple[datetime, datetime]] | None,
page_size: int,
chunk_size: int,
sample_limit: int,
@@ -415,8 +413,7 @@ def _check_settlement_tickets(
missing_seen: set[tuple] = set()
pay_endpoint = "/PayLog/GetPayLogListPage"

window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))
windows = list(windows or [(start, end)])
logger.info(
"CHECK_START task=%s table=%s windows=%s start=%s end=%s",
result["task_code"],
@@ -571,6 +568,222 @@ def _resolve_window_from_cutoff(
return start, now, cutoff
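Editor's note: the signature hunks above replace the per-call `window_days`/`window_hours` pair with a precomputed `windows` list, falling back to a single `(start, end)` window when the list is empty. A minimal sketch of how a caller builds that argument, using the `split_window`/`_iter_windows` helpers that appear elsewhere in this diff; the `use_split` flag is illustrative.

    from datetime import timedelta

    if use_split:   # calendar-aligned segments, e.g. one per month
        windows = split_window(start, end, tz=tz, split_unit="month", compensation_hours=comp_hours)
    else:           # fixed-size segments, matching the old window_days/window_hours behaviour
        size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
        windows = list(_iter_windows(start, end, size))
    # _check_spec / _check_settlement_tickets then fall back to [(start, end)] if windows is empty.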
def run_gap_check(
*,
cfg: AppConfig | None,
start: datetime | str | None,
end: datetime | str | None,
window_days: int,
window_hours: int,
page_size: int,
chunk_size: int,
sample_limit: int,
sleep_per_window: float,
sleep_per_page: float,
task_codes: str,
from_cutoff: bool,
cutoff_overlap_hours: int,
allow_small_window: bool,
logger: logging.Logger,
window_split_unit: str | None = None,
window_compensation_hours: int | None = None,
) -> dict:
cfg = cfg or AppConfig.load({})
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
store_id = int(cfg.get("app.store_id") or 0)

if not cfg.get("api.token"):
raise ValueError("missing api.token; please set API_TOKEN in .env")

window_days = int(window_days)
window_hours = int(window_hours)
split_unit = (window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
comp_hours = window_compensation_hours
if comp_hours is None:
comp_hours = cfg.get("run.window_split.compensation_hours", 0)

use_split = split_unit.lower() not in ("", "none", "off", "false", "0")
if not use_split and not from_cutoff and not allow_small_window:
min_hours = MIN_COMPLETENESS_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small for completeness check; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_COMPLETENESS_WINDOW_DAYS:
logger.warning(
"window_days=%s too small for completeness check; adjust to %s",
window_days,
MIN_COMPLETENESS_WINDOW_DAYS,
)
window_days = MIN_COMPLETENESS_WINDOW_DAYS

cutoff = None
if from_cutoff:
db_tmp = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
ods_tables = [s.table_name for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
start, end, cutoff = _resolve_window_from_cutoff(
conn=db_tmp.conn,
ods_tables=ods_tables,
tz=tz,
overlap_hours=cutoff_overlap_hours,
)
db_tmp.close()
else:
if not start:
start = DEFAULT_START
if not end:
end = datetime.now(tz)
if isinstance(start, str):
start = _parse_dt(start, tz, is_end=False)
if isinstance(end, str):
end = _parse_dt(end, tz, is_end=True)


windows = None
if use_split:
windows = split_window(
start,
end,
tz=tz,
split_unit=split_unit,
compensation_hours=comp_hours,
)
else:
adjusted = split_window(
start,
end,
tz=tz,
split_unit="none",
compensation_hours=comp_hours,
)
if adjusted:
start, end = adjusted[0]
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))

if windows:
start, end = windows[0][0], windows[-1][1]

logger.info(
"START range=%s~%s window_days=%s window_hours=%s split_unit=%s comp_hours=%s page_size=%s chunk_size=%s",
start.isoformat() if isinstance(start, datetime) else None,
end.isoformat() if isinstance(end, datetime) else None,
window_days,
window_hours,
split_unit,
comp_hours,
page_size,
chunk_size,
)
if cutoff:
logger.info("CUTOFF=%s overlap_hours=%s", cutoff.isoformat(), cutoff_overlap_hours)

client = APIClient(
base_url=cfg["api"]["base_url"],
token=cfg["api"]["token"],
timeout=int(cfg["api"].get("timeout_sec") or 20),
retry_max=int(cfg["api"].get("retries", {}).get("max_attempts") or 3),
headers_extra=cfg["api"].get("headers_extra") or {},
)

db_conn = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
try:
db_conn.conn.rollback()
except Exception:
pass
db_conn.conn.autocommit = True
try:
task_filter = {t.strip().upper() for t in (task_codes or "").split(",") if t.strip()}
specs = [s for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
if task_filter:
specs = [s for s in specs if s.code in task_filter]

results: list[dict] = []
for spec in specs:
if spec.code == "ODS_SETTLEMENT_TICKET":
continue
result = _check_spec(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
spec=spec,
store_id=store_id,
start=start,
end=end,
windows=windows,
page_size=page_size,
chunk_size=chunk_size,
sample_limit=sample_limit,
sleep_per_window=sleep_per_window,
sleep_per_page=sleep_per_page,
)
results.append(result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
result.get("task_code"),
result.get("missing"),
result.get("records"),
result.get("errors"),
)

if (not task_filter) or ("ODS_SETTLEMENT_TICKET" in task_filter):
ticket_result = _check_settlement_tickets(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
store_id=store_id,
start=start,
end=end,
windows=windows,
page_size=page_size,
chunk_size=chunk_size,
sample_limit=sample_limit,
sleep_per_window=sleep_per_window,
sleep_per_page=sleep_per_page,
)
results.append(ticket_result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
ticket_result.get("task_code"),
ticket_result.get("missing"),
ticket_result.get("records"),
ticket_result.get("errors"),
)

total_missing = sum(int(r.get("missing") or 0) for r in results)
total_errors = sum(int(r.get("errors") or 0) for r in results)

payload = {
"window_split_unit": split_unit,
"window_compensation_hours": comp_hours,
"start": start.isoformat() if isinstance(start, datetime) else None,
"end": end.isoformat() if isinstance(end, datetime) else None,
"cutoff": cutoff.isoformat() if cutoff else None,
"window_days": window_days,
"window_hours": window_hours,
"page_size": page_size,
"chunk_size": chunk_size,
"sample_limit": sample_limit,
"store_id": store_id,
"base_url": cfg.get("api.base_url"),
"results": results,
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
return payload
finally:
db_conn.close()

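Editor's note: with this change the gap check is an importable, keyword-only function, which is how the backfiller above calls it instead of shelling out. A minimal sketch of such a call; the argument values are illustrative, only the parameter names come from the signature above.

    import logging
    from config.settings import AppConfig

    logger = logging.getLogger("gap_check_demo")
    result = run_gap_check(
        cfg=AppConfig.load({}),
        start="2025-07-01",        # str or datetime both accepted
        end=None,                  # None -> now
        window_days=30,
        window_hours=0,
        page_size=200,
        chunk_size=500,
        sample_limit=50,
        sleep_per_window=0.0,
        sleep_per_page=0.0,
        task_codes="",             # empty -> all enabled ODS tasks
        from_cutoff=False,
        cutoff_overlap_hours=24,
        allow_small_window=True,
        logger=logger,
    )
    print(result["total_missing"], result["total_errors"])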
def main() -> int:
_reconfigure_stdout_utf8()
ap = argparse.ArgumentParser(description="Check missing ODS records by comparing API vs ODS PKs.")
@@ -578,6 +791,8 @@ def main() -> int:
ap.add_argument("--end", default="", help="end datetime (default: now)")
ap.add_argument("--window-days", type=int, default=1, help="days per API window (default: 1)")
ap.add_argument("--window-hours", type=int, default=0, help="hours per API window (default: 0)")
ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
ap.add_argument("--page-size", type=int, default=200, help="API page size (default: 200)")
ap.add_argument("--chunk-size", type=int, default=500, help="DB query chunk size (default: 500)")
ap.add_argument("--sample-limit", type=int, default=50, help="max missing PK samples per table")
@@ -593,6 +808,11 @@ def main() -> int:
default=24,
help="overlap hours when using --from-cutoff (default: 24)",
)
ap.add_argument(
"--allow-small-window",
action="store_true",
help="allow windows smaller than default completeness guard",
)
ap.add_argument("--log-file", default="", help="log file path (default: logs/check_ods_gaps_YYYYMMDD_HHMMSS.log)")
ap.add_argument("--log-dir", default="", help="log directory (default: logs)")
ap.add_argument("--log-level", default="INFO", help="log level (default: INFO)")
@@ -611,170 +831,41 @@ def main() -> int:
tee_std=True,
) as logger:
cfg = AppConfig.load({})
payload = run_gap_check(
cfg=cfg,
start=args.start,
end=args.end,
window_days=args.window_days,
window_hours=args.window_hours,
page_size=args.page_size,
chunk_size=args.chunk_size,
sample_limit=args.sample_limit,
sleep_per_window=args.sleep_per_window_seconds,
sleep_per_page=args.sleep_per_page_seconds,
task_codes=args.task_codes,
from_cutoff=args.from_cutoff,
cutoff_overlap_hours=args.cutoff_overlap_hours,
allow_small_window=args.allow_small_window,
logger=logger,
window_split_unit=args.window_split_unit or None,
window_compensation_hours=args.window_compensation_hours,
)

tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
store_id = int(cfg.get("app.store_id"))

if not cfg.get("api.token"):
logger.error("missing api.token; please set API_TOKEN in .env")
raise SystemExit("missing api.token; please set API_TOKEN in .env")

window_days = int(args.window_days)
window_hours = int(args.window_hours)
if not args.from_cutoff:
min_hours = MIN_COMPLETENESS_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small for completeness check; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_COMPLETENESS_WINDOW_DAYS:
logger.warning(
"window_days=%s too small for completeness check; adjust to %s",
window_days,
MIN_COMPLETENESS_WINDOW_DAYS,
)
window_days = MIN_COMPLETENESS_WINDOW_DAYS

end = datetime.now(tz) if not args.end else _parse_dt(args.end, tz, is_end=True)
if args.from_cutoff:
db_tmp = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
ods_tables = [s.table_name for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
start, end, cutoff = _resolve_window_from_cutoff(
conn=db_tmp.conn,
ods_tables=ods_tables,
tz=tz,
overlap_hours=args.cutoff_overlap_hours,
)
db_tmp.close()
if args.out:
out_path = Path(args.out)
else:
start = _parse_dt(args.start, tz, is_end=False)
cutoff = None

tag = f"_{args.tag}" if args.tag else ""
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
out_path = PROJECT_ROOT / "reports" / f"ods_gap_check{tag}_{stamp}.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
logger.info("REPORT_WRITTEN path=%s", out_path)
logger.info(
"START range=%s~%s window_days=%s window_hours=%s page_size=%s chunk_size=%s",
start.isoformat() if start else None,
end.isoformat() if end else None,
window_days,
window_hours,
args.page_size,
args.chunk_size,
"SUMMARY missing=%s errors=%s",
payload.get("total_missing"),
payload.get("total_errors"),
)
if cutoff:
logger.info("CUTOFF=%s overlap_hours=%s", cutoff.isoformat(), args.cutoff_overlap_hours)

client = APIClient(
base_url=cfg["api"]["base_url"],
token=cfg["api"]["token"],
timeout=int(cfg["api"].get("timeout_sec") or 20),
retry_max=int(cfg["api"].get("retries", {}).get("max_attempts") or 3),
headers_extra=cfg["api"].get("headers_extra") or {},
)

db_conn = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
try:
db_conn.conn.rollback()
except Exception:
pass
db_conn.conn.autocommit = True
try:
task_filter = {t.strip().upper() for t in args.task_codes.split(",") if t.strip()}
specs = [s for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
if task_filter:
specs = [s for s in specs if s.code in task_filter]

results: list[dict] = []
for spec in specs:
if spec.code == "ODS_SETTLEMENT_TICKET":
continue
result = _check_spec(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
spec=spec,
store_id=store_id,
start=start,
end=end,
window_days=window_days,
window_hours=window_hours,
page_size=args.page_size,
chunk_size=args.chunk_size,
sample_limit=args.sample_limit,
sleep_per_window=args.sleep_per_window_seconds,
sleep_per_page=args.sleep_per_page_seconds,
)
results.append(result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
result.get("task_code"),
result.get("missing"),
result.get("records"),
result.get("errors"),
)

if (not task_filter) or ("ODS_SETTLEMENT_TICKET" in task_filter):
ticket_result = _check_settlement_tickets(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
store_id=store_id,
start=start,
end=end,
window_days=window_days,
window_hours=window_hours,
page_size=args.page_size,
chunk_size=args.chunk_size,
sample_limit=args.sample_limit,
sleep_per_window=args.sleep_per_window_seconds,
sleep_per_page=args.sleep_per_page_seconds,
)
results.append(ticket_result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
ticket_result.get("task_code"),
ticket_result.get("missing"),
ticket_result.get("records"),
ticket_result.get("errors"),
)

total_missing = sum(int(r.get("missing") or 0) for r in results)
total_errors = sum(int(r.get("errors") or 0) for r in results)

if args.out:
out_path = Path(args.out)
else:
tag = f"_{args.tag}" if args.tag else ""
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
out_path = PROJECT_ROOT / "reports" / f"ods_gap_check{tag}_{stamp}.json"
out_path.parent.mkdir(parents=True, exist_ok=True)

payload = {
"start": start.isoformat(),
"end": end.isoformat(),
"cutoff": cutoff.isoformat() if cutoff else None,
"window_days": window_days,
"window_hours": window_hours,
"page_size": args.page_size,
"chunk_size": args.chunk_size,
"sample_limit": args.sample_limit,
"store_id": store_id,
"base_url": cfg.get("api.base_url"),
"results": results,
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
logger.info("REPORT_WRITTEN path=%s", out_path)
logger.info("SUMMARY missing=%s errors=%s", total_missing, total_errors)
finally:
db_conn.close()

return 0

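Editor's note: main() now delegates the whole check to run_gap_check and only writes the report. A hedged sketch of driving this CLI from another Python tool, mirroring the subprocess pattern used by the reload script later in this commit; the module name is assumed from the default log prefix (check_ods_gaps), and is not confirmed by the diff.

    import subprocess
    import sys

    # Illustrative only; flags correspond to the argparse options shown above,
    # the module path "etl_billiards.scripts.check_ods_gaps" is an assumption.
    cmd = [
        sys.executable, "-m", "etl_billiards.scripts.check_ods_gaps",
        "--from-cutoff",
        "--cutoff-overlap-hours", "24",
        "--window-split-unit", "month",
    ]
    subprocess.run(cmd, check=True)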
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 C:\\dev\\LLTQ\\export\\test-json-doc)
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 export/test-json-doc)
是否包含同名键,并输出每表未命中的字段,便于补充映射或确认确实无源字段。

使用方法:
@@ -69,7 +69,7 @@ def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
def main() -> None:
"""主流程:遍历 FILE_MAPPING 中的 ODS 表,检查 JSON 键覆盖情况并打印报告。"""
dsn = os.environ.get("PG_DSN")
json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", r"C:\dev\LLTQ\export\test-json-doc"))
json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", "export/test-json-doc"))

ods_cols_map = _load_ods_columns(dsn)
@@ -13,7 +13,7 @@
python -m etl_billiards.scripts.rebuild_db_and_run_ods_to_dwd ^
--dsn "postgresql://user:pwd@host:5432/db" ^
--store-id 1 ^
--json-dir "C:\\dev\\LLTQ\\export\\test-json-doc" ^
--json-dir "export/test-json-doc" ^
--drop-schemas

环境变量(可选):
@@ -44,7 +44,7 @@ from etl_billiards.tasks.init_schema_task import InitOdsSchemaTask
from etl_billiards.tasks.manual_ingest_task import ManualIngestTask


DEFAULT_JSON_DIR = r"C:\dev\LLTQ\export\test-json-doc"
DEFAULT_JSON_DIR = "export/test-json-doc"


@dataclass(frozen=True)

@@ -7,7 +7,7 @@
依赖环境变量:
PG_DSN PostgreSQL 连接串(必填)
PG_CONNECT_TIMEOUT 可选,秒,默认 10
JSON_DOC_DIR 可选,JSON 目录,默认 C:\\dev\\LLTQ\\export\\test-json-doc
JSON_DOC_DIR 可选,JSON 目录,默认 export/test-json-doc
ODS_INCLUDE_FILES 可选,逗号分隔文件名(不含 .json)
ODS_DROP_SCHEMA_FIRST 可选,true/false,默认 true
"""
@@ -26,7 +26,7 @@ from psycopg2 import sql
from psycopg2.extras import Json, execute_values


DEFAULT_JSON_DIR = r"C:\dev\LLTQ\export\test-json-doc"
DEFAULT_JSON_DIR = "export/test-json-doc"
SPECIAL_LIST_PATHS: dict[str, tuple[str, ...]] = {
"assistant_accounts_master": ("data", "assistantInfos"),
"assistant_cancellation_records": ("data", "abolitionAssistants"),
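Editor's note: these hunks carry the commit's theme, replacing the absolute Windows path with a repository-relative default. A relative default resolves against the process working directory, so a hedged sketch of anchoring it to the project root instead; the PROJECT_ROOT computation here is illustrative, not taken from these files.

    import os
    from pathlib import Path

    # Hypothetical: resolve the relative default against the repo root rather than the CWD.
    PROJECT_ROOT = Path(__file__).resolve().parents[1]
    json_dir = Path(os.environ.get("JSON_DOC_DIR", "export/test-json-doc"))
    if not json_dir.is_absolute():
        json_dir = PROJECT_ROOT / json_dir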
@@ -20,10 +20,12 @@ if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))

from config.settings import AppConfig
from utils.windowing import split_window
from utils.logging_utils import build_log_path, configure_logging

MIN_RELOAD_WINDOW_DAYS = 30


def _parse_dt(value: str, tz: ZoneInfo, *, is_end: bool) -> datetime:
raw = (value or "").strip()
if not raw:
@@ -56,6 +58,8 @@ def _run_task_window(
api_page_size: int,
api_timeout: int,
logger: logging.Logger,
window_split_unit: str | None = "none",
window_compensation_hours: int | None = 0,
) -> None:
cmd = [
sys.executable,
@@ -70,6 +74,10 @@ def _run_task_window(
"--window-end",
window_end.strftime("%Y-%m-%d %H:%M:%S"),
"--force-window-override",
"--window-split-unit",
str(window_split_unit or "none"),
"--window-compensation-hours",
str(int(window_compensation_hours or 0)),
]
if api_page_size > 0:
cmd += ["--api-page-size", str(api_page_size)]
@@ -92,6 +100,8 @@ def main() -> int:
ap.add_argument("--end", default="", help="end datetime (default: now)")
ap.add_argument("--window-days", type=int, default=1, help="days per window (default: 1)")
ap.add_argument("--window-hours", type=int, default=0, help="hours per window (default: 0)")
ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
ap.add_argument("--sleep-seconds", type=float, default=0, help="sleep seconds after each window")
ap.add_argument("--api-page-size", type=int, default=200, help="API page size override")
ap.add_argument("--api-timeout", type=int, default=20, help="API timeout seconds override")
@@ -119,40 +129,70 @@ def main() -> int:
end = datetime.now(tz) if not args.end else _parse_dt(args.end, tz, is_end=True)
window_days = int(args.window_days)
window_hours = int(args.window_hours)
min_hours = MIN_RELOAD_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_RELOAD_WINDOW_DAYS:
logger.warning(
"window_days=%s too small; adjust to %s",
window_days,
MIN_RELOAD_WINDOW_DAYS,
split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
comp_hours = args.window_compensation_hours
if comp_hours is None:
comp_hours = cfg.get("run.window_split.compensation_hours", 0)

use_split = split_unit.lower() not in ("", "none", "off", "false", "0")
if use_split:
windows = split_window(
start,
end,
tz=tz,
split_unit=split_unit,
compensation_hours=comp_hours,
)
window_days = MIN_RELOAD_WINDOW_DAYS
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
else:
min_hours = MIN_RELOAD_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_RELOAD_WINDOW_DAYS:
logger.warning(
"window_days=%s too small; adjust to %s",
window_days,
MIN_RELOAD_WINDOW_DAYS,
)
window_days = MIN_RELOAD_WINDOW_DAYS
adjusted = split_window(
start,
end,
tz=tz,
split_unit="none",
compensation_hours=comp_hours,
)
if adjusted:
start, end = adjusted[0]
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))

if windows:
start, end = windows[0][0], windows[-1][1]

task_codes = [t.strip().upper() for t in args.tasks.split(",") if t.strip()]
if not task_codes:
raise SystemExit("no tasks specified")

logger.info(
"START range=%s~%s window_days=%s window_hours=%s sleep=%.2f",
"START range=%s~%s window_days=%s window_hours=%s split_unit=%s comp_hours=%s sleep=%.2f",
start.isoformat(),
end.isoformat(),
window_days,
window_hours,
split_unit,
comp_hours,
args.sleep_seconds,
)

for task_code in task_codes:
logger.info("TASK_START task=%s", task_code)
for window_start, window_end in _iter_windows(start, end, window_size):
for window_start, window_end in windows:
start_ts = time_mod.monotonic()
_run_task_window(
task_code=task_code,
@@ -161,6 +201,8 @@ def main() -> int:
api_page_size=args.api_page_size,
api_timeout=args.api_timeout,
logger=logger,
window_split_unit="none",
window_compensation_hours=0,
)
elapsed = time_mod.monotonic() - start_ts
logger.info(
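Editor's note: the truncated hunk above passes the two new split flags through to the per-window subprocess. A condensed sketch of the command _run_task_window assembles; the entry-point module and the "--task" flag are assumptions, only the window flags are confirmed by the hunks above.

    import sys

    # Condensed sketch of the subprocess command; names marked below are assumed.
    cmd = [
        sys.executable, "-m", "etl_billiards.main",        # assumed entry point
        "--task", task_code,                                # assumed flag
        "--window-start", window_start.strftime("%Y-%m-%d %H:%M:%S"),
        "--window-end", window_end.strftime("%Y-%m-%d %H:%M:%S"),
        "--force-window-override",
        "--window-split-unit", str(window_split_unit or "none"),
        "--window-compensation-hours", str(int(window_compensation_hours or 0)),
    ]
    if api_page_size > 0:
        cmd += ["--api-page-size", str(api_page_size)]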