# Files
# feiqiu-ETL/etl_billiards/scripts/reload_ods_windowed.py
# 2026-01-27 22:45:50 +08:00
#
# 225 lines
# 8.2 KiB
# Python

# -*- coding: utf-8 -*-
"""
Reload ODS tasks by fixed time windows with optional sleep between windows.
"""
from __future__ import annotations
import argparse
import logging
import subprocess
import sys
import time as time_mod
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
from dateutil import parser as dtparser
# Directory two levels up from this file's resolved path (the parent of the
# folder that contains this script). It is also used below as the working
# directory for child processes and as the base of the default log directory.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Make the project root importable before the project-local imports that
# follow (`config`, `utils`); inserted at position 0 so it takes precedence.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from config.settings import AppConfig
from utils.windowing import split_window
from utils.logging_utils import build_log_path, configure_logging
# Lower bound, in days, for the fixed window size used by main() when window
# splitting is disabled: smaller --window-days / --window-hours values are
# raised to this floor (hours are compared against this value * 24).
MIN_RELOAD_WINDOW_DAYS = 30
def _parse_dt(value: str, tz: ZoneInfo, *, is_end: bool) -> datetime:
    """Parse a user-supplied date/datetime string into an aware datetime in ``tz``.

    Args:
        value: Text to parse; surrounding whitespace is ignored.
        tz: Zone attached to naive results; aware results are converted to it.
        is_end: Selects how a date-only value is completed (see below).

    Returns:
        The parsed datetime expressed in ``tz``. When the text contains neither
        ``":"`` nor ``"T"`` it is treated as date-only and the time is set to
        23:59:59 if ``is_end`` is true, otherwise 00:00:00 (microseconds zeroed).

    Raises:
        ValueError: If ``value`` is empty or whitespace-only.
    """
    text = (value or "").strip()
    if not text:
        raise ValueError("empty datetime")

    parsed = dtparser.parse(text)
    if parsed.tzinfo is None:
        localized = parsed.replace(tzinfo=tz)
    else:
        localized = parsed.astimezone(tz)

    # A ":" or "T" in the input is taken as evidence of an explicit time part.
    if ":" in text or "T" in text:
        return localized

    if is_end:
        hour, minute, second = 23, 59, 59
    else:
        hour, minute, second = 0, 0, 0
    return localized.replace(hour=hour, minute=minute, second=second, microsecond=0)
def _iter_windows(start: datetime, end: datetime, window_size: timedelta):
    """Return an iterator of consecutive ``(window_start, window_end)`` pairs.

    The pairs tile the half-open range ``[start, end)`` in steps of
    ``window_size``; the final pair is truncated so it never extends past
    ``end``. Nothing is produced when ``start >= end``.

    Args:
        start: Beginning of the range.
        end: End of the range.
        window_size: Length of each window; must be strictly positive.

    Returns:
        A generator yielding ``(window_start, window_end)`` tuples.

    Raises:
        ValueError: If ``window_size`` is zero or negative. The check runs at
            call time rather than on first iteration, so a bad argument is
            reported where the call is made.
    """
    # Validate eagerly: inside a generator body this check would be deferred
    # until the first next() call.
    if window_size.total_seconds() <= 0:
        raise ValueError("window_size must be > 0")

    def _generate():
        cur = start
        while cur < end:
            nxt = min(cur + window_size, end)
            yield cur, nxt
            cur = nxt

    return _generate()
def _run_task_window(
    task_code: str,
    window_start: datetime,
    window_end: datetime,
    api_page_size: int,
    api_timeout: int,
    logger: logging.Logger,
    window_split_unit: str | None = "none",
    window_compensation_hours: int | None = 0,
) -> None:
    """Run one task for one time window in a child ``python -m cli.main`` process.

    The child is started with the current interpreter, ``PROJECT_ROOT`` as its
    working directory, ``--pipeline-flow FULL`` and ``--force-window-override``.

    Args:
        task_code: Value passed to ``--tasks``.
        window_start: Window start, passed as ``YYYY-MM-DD HH:MM:SS`` (the
            timezone offset is not included in the string).
        window_end: Window end, formatted the same way.
        api_page_size: Passed as ``--api-page-size`` only when greater than 0.
        api_timeout: Passed as ``--api-timeout`` only when greater than 0.
        logger: Receives the RUN_TASK info line and the CMD debug line.
        window_split_unit: Passed as ``--window-split-unit``; ``None`` or an
            empty string is sent as ``"none"``.
        window_compensation_hours: Passed as ``--window-compensation-hours``;
            ``None`` is sent as ``0``.

    Raises:
        subprocess.CalledProcessError: If the child exits with a non-zero status.
    """
    # Local import: only needed to render the command line for the debug log.
    import shlex

    cmd = [
        sys.executable,
        "-m",
        "cli.main",
        "--pipeline-flow",
        "FULL",
        "--tasks",
        task_code,
        "--window-start",
        window_start.strftime("%Y-%m-%d %H:%M:%S"),
        "--window-end",
        window_end.strftime("%Y-%m-%d %H:%M:%S"),
        "--force-window-override",
        "--window-split-unit",
        str(window_split_unit or "none"),
        "--window-compensation-hours",
        str(int(window_compensation_hours or 0)),
    ]
    if api_page_size > 0:
        cmd += ["--api-page-size", str(api_page_size)]
    if api_timeout > 0:
        cmd += ["--api-timeout", str(api_timeout)]

    logger.info(
        "RUN_TASK task=%s window_start=%s window_end=%s",
        task_code,
        window_start.isoformat(),
        window_end.isoformat(),
    )
    # Quote each argument so values containing spaces (the window timestamps)
    # stay recognisable as single arguments in the logged command line.
    logger.debug("CMD %s", shlex.join(cmd))
    subprocess.run(cmd, check=True, cwd=str(PROJECT_ROOT))
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the windowed ODS reload script."""
    parser = argparse.ArgumentParser(description="Reload ODS tasks by window slices.")
    add = parser.add_argument
    add("--tasks", required=True, help="comma-separated ODS task codes")
    add("--start", required=True, help="start datetime, e.g. 2025-07-01")
    add("--end", default="", help="end datetime (default: now)")
    add("--window-days", type=int, default=1, help="days per window (default: 1)")
    add("--window-hours", type=int, default=0, help="hours per window (default: 0)")
    add("--window-split-unit", default="", help="split unit (month/none), default from config")
    add("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
    add("--sleep-seconds", type=float, default=0, help="sleep seconds after each window")
    add("--api-page-size", type=int, default=200, help="API page size override")
    add("--api-timeout", type=int, default=20, help="API timeout seconds override")
    add("--log-file", default="", help="log file path (default: logs/reload_ods_windowed_YYYYMMDD_HHMMSS.log)")
    add("--log-dir", default="", help="log directory (default: logs)")
    add("--log-level", default="INFO", help="log level (default: INFO)")
    add("--no-log-console", action="store_true", help="disable console logging")
    return parser


def main() -> int:
    """Reload the requested ODS tasks one time window at a time.

    Steps:
      1. Parse the command line and set up logging.
      2. Resolve the timezone from config and parse ``--start`` / ``--end``
         (``--end`` defaults to the current time).
      3. Build the list of windows: via ``split_window`` when a split unit is
         active, otherwise as fixed-size slices of at least
         ``MIN_RELOAD_WINDOW_DAYS`` days over the range returned by
         ``split_window(..., split_unit="none")``.
      4. For every task code, run each window in a child process, optionally
         sleeping after each window.

    Returns:
        ``0`` once every task/window pair has run.

    Raises:
        SystemExit: If ``--tasks`` contains no non-empty task code.
        subprocess.CalledProcessError: Propagated from a failing child run.
    """
    args = _build_arg_parser().parse_args()

    log_dir = Path(args.log_dir) if args.log_dir else (PROJECT_ROOT / "logs")
    if args.log_file:
        log_file = Path(args.log_file)
    else:
        log_file = build_log_path(log_dir, "reload_ods_windowed")

    with configure_logging(
        "reload_ods_windowed",
        log_file,
        level=args.log_level,
        console=not args.no_log_console,
        tee_std=True,
    ) as logger:
        cfg = AppConfig.load({})
        tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))

        start = _parse_dt(args.start, tz, is_end=False)
        if args.end:
            end = _parse_dt(args.end, tz, is_end=True)
        else:
            end = datetime.now(tz)

        window_days = int(args.window_days)
        window_hours = int(args.window_hours)

        # Command-line values win; config supplies the fallbacks.
        split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
        comp_hours = args.window_compensation_hours
        if comp_hours is None:
            comp_hours = cfg.get("run.window_split.compensation_hours", 0)

        split_disabled = split_unit.lower() in ("", "none", "off", "false", "0")
        if split_disabled:
            # Fixed-size windows: enforce the minimum size first.
            min_hours = MIN_RELOAD_WINDOW_DAYS * 24
            hours_mode = window_hours > 0
            if hours_mode:
                if window_hours < min_hours:
                    logger.warning(
                        "window_hours=%s too small; adjust to %s",
                        window_hours,
                        min_hours,
                    )
                    window_hours = min_hours
            elif window_days < MIN_RELOAD_WINDOW_DAYS:
                logger.warning(
                    "window_days=%s too small; adjust to %s",
                    window_days,
                    MIN_RELOAD_WINDOW_DAYS,
                )
                window_days = MIN_RELOAD_WINDOW_DAYS

            # Use the first (start, end) pair returned for the unsplit range,
            # if any, as the range to slice.
            adjusted = split_window(
                start,
                end,
                tz=tz,
                split_unit="none",
                compensation_hours=comp_hours,
            )
            if adjusted:
                start, end = adjusted[0]

            if hours_mode:
                window_size = timedelta(hours=window_hours)
            else:
                window_size = timedelta(days=window_days)
            windows = list(_iter_windows(start, end, window_size))
        else:
            windows = split_window(
                start,
                end,
                tz=tz,
                split_unit=split_unit,
                compensation_hours=comp_hours,
            )

        # Report the range actually covered by the computed windows.
        if windows:
            first_window, last_window = windows[0], windows[-1]
            start, end = first_window[0], last_window[1]

        task_codes = []
        for piece in args.tasks.split(","):
            code = piece.strip()
            if code:
                task_codes.append(code.upper())
        if not task_codes:
            raise SystemExit("no tasks specified")

        pause = args.sleep_seconds
        logger.info(
            "START range=%s~%s window_days=%s window_hours=%s split_unit=%s comp_hours=%s sleep=%.2f",
            start.isoformat(),
            end.isoformat(),
            window_days,
            window_hours,
            split_unit,
            comp_hours,
            pause,
        )

        for task_code in task_codes:
            logger.info("TASK_START task=%s", task_code)
            for window_start, window_end in windows:
                began = time_mod.monotonic()
                # Windows are already final here, so the child is told not to
                # split or compensate again.
                _run_task_window(
                    task_code=task_code,
                    window_start=window_start,
                    window_end=window_end,
                    api_page_size=args.api_page_size,
                    api_timeout=args.api_timeout,
                    logger=logger,
                    window_split_unit="none",
                    window_compensation_hours=0,
                )
                elapsed = time_mod.monotonic() - began
                logger.info(
                    "WINDOW_DONE task=%s window_start=%s window_end=%s elapsed=%.2fs",
                    task_code,
                    window_start.isoformat(),
                    window_end.isoformat(),
                    elapsed,
                )
                if pause > 0:
                    logger.debug("SLEEP seconds=%.2f", pause)
                    time_mod.sleep(pause)
            logger.info("TASK_DONE task=%s", task_code)

    return 0
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    sys.exit(main())