Switch to relative paths; complete the client

Neo
2026-01-27 22:14:01 +08:00
parent 04c064793a
commit 9f8976e75a
292 changed files with 307062 additions and 678 deletions

View File

@@ -0,0 +1 @@
# Script helpers package marker.

View File

@@ -240,9 +240,8 @@ class MissingDataBackfiller:
headers_extra=cfg["api"].get("headers_extra") or {},
)
# 数据库连接
# 数据库连接(DatabaseConnection 构造时已设置 autocommit=False)
self.db = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
self.db.conn.autocommit = False
def close(self):
"""关闭连接"""
@@ -264,7 +263,7 @@ class MissingDataBackfiller:
Returns:
补全结果统计
"""
self.logger.info("BACKFILL_START start=%s end=%s", start.isoformat(), end.isoformat())
self.logger.info("数据补全开始 起始=%s 结束=%s", start.isoformat(), end.isoformat())
# 计算窗口大小
total_seconds = max(0, int((end - start).total_seconds()))
@@ -276,7 +275,7 @@ class MissingDataBackfiller:
window_hours = max(1, total_seconds // 3600 or 1)
# 运行 gap check
self.logger.info("RUNNING_GAP_CHECK...")
self.logger.info("正在执行缺失检查...")
gap_result = run_gap_check(
cfg=self.cfg,
start=start,
@@ -297,10 +296,10 @@ class MissingDataBackfiller:
total_missing = gap_result.get("total_missing", 0)
if total_missing == 0:
self.logger.info("NO_MISSING_DATA")
self.logger.info("数据完整,无缺失记录")
return {"backfilled": 0, "errors": 0, "details": []}
self.logger.info("GAP_CHECK_DONE total_missing=%s", total_missing)
self.logger.info("缺失检查完成 总缺失=%s", total_missing)
# 补全每个任务的丢失数据
results = []
@@ -316,7 +315,7 @@ class MissingDataBackfiller:
continue
self.logger.info(
"BACKFILL_TASK task=%s missing=%s samples=%s",
"开始补全任务 任务=%s 缺失=%s 样本数=%s",
task_code, missing, len(missing_samples)
)
@@ -339,7 +338,7 @@ class MissingDataBackfiller:
})
total_backfilled += backfilled
except Exception as exc:
self.logger.exception("BACKFILL_ERROR task=%s", task_code)
self.logger.exception("补全失败 任务=%s", task_code)
results.append({
"task_code": task_code,
"missing": missing,
@@ -349,7 +348,7 @@ class MissingDataBackfiller:
total_errors += 1
self.logger.info(
"BACKFILL_DONE total_missing=%s backfilled=%s errors=%s",
"数据补全完成 总缺失=%s 已补全=%s 错误数=%s",
total_missing, total_backfilled, total_errors
)
@@ -375,14 +374,14 @@ class MissingDataBackfiller:
"""补全单个任务的丢失数据"""
spec = _get_spec(task_code)
if not spec:
self.logger.warning("SPEC_NOT_FOUND task=%s", task_code)
self.logger.warning("未找到任务规格 任务=%s", task_code)
return 0
if not pk_columns:
pk_columns = _get_table_pk_columns(self.db.conn, table)
if not pk_columns:
self.logger.warning("NO_PK_COLUMNS task=%s table=%s", task_code, table)
self.logger.warning("未找到主键列 任务=%s =%s", task_code, table)
return 0
# 提取丢失的 PK 值
@@ -393,11 +392,11 @@ class MissingDataBackfiller:
missing_pks.add(pk_tuple)
if not missing_pks:
self.logger.info("NO_MISSING_PKS task=%s", task_code)
self.logger.info("无缺失主键 任务=%s", task_code)
return 0
self.logger.info(
"BACKFILL_FETCHING task=%s missing_pks=%s",
"开始获取数据 任务=%s 缺失主键数=%s",
task_code, len(missing_pks)
)
@@ -436,7 +435,7 @@ class MissingDataBackfiller:
if self.dry_run:
backfilled += len(records_to_insert)
self.logger.info(
"DRY_RUN task=%s page=%s would_insert=%s",
"模拟运行 任务=%s =%s 将插入=%s",
task_code, page_no, len(records_to_insert)
)
else:
@@ -449,14 +448,14 @@ class MissingDataBackfiller:
)
backfilled += inserted
self.logger.info(
"INSERTED task=%s page=%s count=%s",
"已插入 任务=%s =%s 数量=%s",
task_code, page_no, inserted
)
if not self.dry_run:
self.db.conn.commit()
self.logger.info("BACKFILL_TASK_DONE task=%s backfilled=%s", task_code, backfilled)
self.logger.info("任务补全完成 任务=%s 已补全=%s", task_code, backfilled)
return backfilled
except Exception:
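
The hunk above drops the explicit `self.db.conn.autocommit = False` because the `DatabaseConnection` constructor is now documented to set it, while `_backfill_task` still commits once per task and lets exceptions surface for rollback. A minimal sketch of that pattern, with a plain psycopg2 connection standing in for `DatabaseConnection` (DSN and table name are placeholders, not values from this repository):

import psycopg2

# Minimal sketch only: a raw psycopg2 connection stands in for DatabaseConnection,
# and the DSN/table below are placeholders.
conn = psycopg2.connect("postgresql://user:pwd@host:5432/db")
conn.autocommit = False  # mirrors what DatabaseConnection is said to do in its constructor

try:
    with conn.cursor() as cur:
        cur.execute("INSERT INTO ods.example_table (id) VALUES (%s)", (1,))
    conn.commit()      # one commit per task, as _backfill_task does after its inserts
except Exception:
    conn.rollback()    # leave the window untouched if this task's backfill fails
    raise
finally:
    conn.close()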

View File

@@ -0,0 +1,169 @@
# -*- coding: utf-8 -*-
"""Run data integrity checks across API -> ODS -> DWD."""
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
from dateutil import parser as dtparser
from config.settings import AppConfig
from quality.integrity_checker import (
IntegrityWindow,
compute_last_etl_end,
run_integrity_history,
run_integrity_window,
)
from utils.logging_utils import build_log_path, configure_logging
from utils.windowing import split_window
def _parse_dt(value: str, tz: ZoneInfo) -> datetime:
dt = dtparser.parse(value)
if dt.tzinfo is None:
return dt.replace(tzinfo=tz)
return dt.astimezone(tz)
def main() -> int:
if hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass
ap = argparse.ArgumentParser(description="Data integrity checks (API -> ODS -> DWD)")
ap.add_argument("--mode", choices=["history", "window"], default="history")
ap.add_argument("--start", default="2025-07-01", help="history start date (default: 2025-07-01)")
ap.add_argument("--end", default="", help="history end datetime (default: last ETL end)")
ap.add_argument("--window-start", default="", help="window start datetime (mode=window)")
ap.add_argument("--window-end", default="", help="window end datetime (mode=window)")
ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
ap.add_argument("--include-dimensions", action="store_true", help="include dimension tables in ODS->DWD checks")
ap.add_argument("--ods-task-codes", default="", help="comma-separated ODS task codes for API checks")
ap.add_argument("--out", default="", help="output JSON path")
ap.add_argument("--log-file", default="", help="log file path")
ap.add_argument("--log-dir", default="", help="log directory")
ap.add_argument("--log-level", default="INFO", help="log level")
ap.add_argument("--no-log-console", action="store_true", help="disable console logging")
args = ap.parse_args()
log_dir = Path(args.log_dir) if args.log_dir else (Path(__file__).resolve().parent / "logs")
log_file = Path(args.log_file) if args.log_file else build_log_path(log_dir, "data_integrity")
log_console = not args.no_log_console
with configure_logging(
"data_integrity",
log_file,
level=args.log_level,
console=log_console,
tee_std=True,
) as logger:
cfg = AppConfig.load({})
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
report_path = Path(args.out) if args.out else None
if args.mode == "window":
if not args.window_start or not args.window_end:
raise SystemExit("window-start and window-end are required for mode=window")
start_dt = _parse_dt(args.window_start, tz)
end_dt = _parse_dt(args.window_end, tz)
split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
comp_hours = args.window_compensation_hours
if comp_hours is None:
comp_hours = cfg.get("run.window_split.compensation_hours", 0)
windows = split_window(
start_dt,
end_dt,
tz=tz,
split_unit=split_unit,
compensation_hours=comp_hours,
)
if not windows:
windows = [(start_dt, end_dt)]
window_reports = []
total_missing = 0
total_errors = 0
for idx, (seg_start, seg_end) in enumerate(windows, start=1):
window = IntegrityWindow(
start=seg_start,
end=seg_end,
label=f"segment_{idx}",
granularity="window",
)
payload = run_integrity_window(
cfg=cfg,
window=window,
include_dimensions=args.include_dimensions,
task_codes=args.ods_task_codes,
logger=logger,
write_report=False,
report_path=None,
window_split_unit="none",
window_compensation_hours=0,
)
window_reports.append(payload)
total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)
overall_start = windows[0][0]
overall_end = windows[-1][1]
report = {
"mode": "window",
"window": {
"start": overall_start.isoformat(),
"end": overall_end.isoformat(),
"segments": len(windows),
},
"windows": window_reports,
"api_to_ods": {
"total_missing": total_missing,
"total_errors": total_errors,
},
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
if report_path is None:
root = Path(__file__).resolve().parents[1]
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
report_path = root / "reports" / f"data_integrity_window_{stamp}.json"
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
report["report_path"] = str(report_path)
logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
else:
start_dt = _parse_dt(args.start, tz)
if args.end:
end_dt = _parse_dt(args.end, tz)
else:
end_dt = compute_last_etl_end(cfg) or datetime.now(tz)
report = run_integrity_history(
cfg=cfg,
start_dt=start_dt,
end_dt=end_dt,
include_dimensions=args.include_dimensions,
task_codes=args.ods_task_codes,
logger=logger,
write_report=True,
report_path=report_path,
)
logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
logger.info(
"SUMMARY missing=%s errors=%s",
report.get("total_missing"),
report.get("total_errors"),
)
return 0
if __name__ == "__main__":
raise SystemExit(main())
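
For reference, a sketch of one way to drive this runner in window mode. Only the flags come from the argparse definitions above; the script path and the concrete dates are assumptions:

import subprocess
import sys

# Hypothetical invocation of the integrity runner in window mode; the script
# path and dates are assumptions, only the flag names come from the parser above.
subprocess.run(
    [
        sys.executable, "scripts/run_data_integrity.py",
        "--mode", "window",
        "--window-start", "2025-07-01 00:00:00",
        "--window-end", "2025-07-31 23:59:59",
        "--window-split-unit", "month",
        "--out", "reports/data_integrity_window_manual.json",
    ],
    check=True,
)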

View File

@@ -34,6 +34,7 @@ from database.connection import DatabaseConnection
from models.parsers import TypeParser
from tasks.ods_tasks import ENABLED_ODS_CODES, ODS_TASK_SPECS
from utils.logging_utils import build_log_path, configure_logging
from utils.windowing import split_window
DEFAULT_START = "2025-07-01"
MIN_COMPLETENESS_WINDOW_DAYS = 30
@@ -214,8 +215,7 @@ def _check_spec(
store_id: int,
start: datetime | None,
end: datetime | None,
window_days: int,
window_hours: int,
windows: list[tuple[datetime, datetime]] | None,
page_size: int,
chunk_size: int,
sample_limit: int,
@@ -249,8 +249,7 @@ def _check_spec(
result["errors"] = 1
result["error_detail"] = "missing start/end for windowed endpoint"
return result
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))
windows = list(windows or [(start, end)])
else:
windows = [(None, None)]
@@ -377,8 +376,7 @@ def _check_settlement_tickets(
store_id: int,
start: datetime | None,
end: datetime | None,
window_days: int,
window_hours: int,
windows: list[tuple[datetime, datetime]] | None,
page_size: int,
chunk_size: int,
sample_limit: int,
@@ -415,8 +413,7 @@ def _check_settlement_tickets(
missing_seen: set[tuple] = set()
pay_endpoint = "/PayLog/GetPayLogListPage"
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))
windows = list(windows or [(start, end)])
logger.info(
"CHECK_START task=%s table=%s windows=%s start=%s end=%s",
result["task_code"],
@@ -571,6 +568,222 @@ def _resolve_window_from_cutoff(
return start, now, cutoff
def run_gap_check(
*,
cfg: AppConfig | None,
start: datetime | str | None,
end: datetime | str | None,
window_days: int,
window_hours: int,
page_size: int,
chunk_size: int,
sample_limit: int,
sleep_per_window: float,
sleep_per_page: float,
task_codes: str,
from_cutoff: bool,
cutoff_overlap_hours: int,
allow_small_window: bool,
logger: logging.Logger,
window_split_unit: str | None = None,
window_compensation_hours: int | None = None,
) -> dict:
cfg = cfg or AppConfig.load({})
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
store_id = int(cfg.get("app.store_id") or 0)
if not cfg.get("api.token"):
raise ValueError("missing api.token; please set API_TOKEN in .env")
window_days = int(window_days)
window_hours = int(window_hours)
split_unit = (window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
comp_hours = window_compensation_hours
if comp_hours is None:
comp_hours = cfg.get("run.window_split.compensation_hours", 0)
use_split = split_unit.lower() not in ("", "none", "off", "false", "0")
if not use_split and not from_cutoff and not allow_small_window:
min_hours = MIN_COMPLETENESS_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small for completeness check; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_COMPLETENESS_WINDOW_DAYS:
logger.warning(
"window_days=%s too small for completeness check; adjust to %s",
window_days,
MIN_COMPLETENESS_WINDOW_DAYS,
)
window_days = MIN_COMPLETENESS_WINDOW_DAYS
cutoff = None
if from_cutoff:
db_tmp = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
ods_tables = [s.table_name for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
start, end, cutoff = _resolve_window_from_cutoff(
conn=db_tmp.conn,
ods_tables=ods_tables,
tz=tz,
overlap_hours=cutoff_overlap_hours,
)
db_tmp.close()
else:
if not start:
start = DEFAULT_START
if not end:
end = datetime.now(tz)
if isinstance(start, str):
start = _parse_dt(start, tz, is_end=False)
if isinstance(end, str):
end = _parse_dt(end, tz, is_end=True)
windows = None
if use_split:
windows = split_window(
start,
end,
tz=tz,
split_unit=split_unit,
compensation_hours=comp_hours,
)
else:
adjusted = split_window(
start,
end,
tz=tz,
split_unit="none",
compensation_hours=comp_hours,
)
if adjusted:
start, end = adjusted[0]
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))
if windows:
start, end = windows[0][0], windows[-1][1]
logger.info(
"START range=%s~%s window_days=%s window_hours=%s split_unit=%s comp_hours=%s page_size=%s chunk_size=%s",
start.isoformat() if isinstance(start, datetime) else None,
end.isoformat() if isinstance(end, datetime) else None,
window_days,
window_hours,
split_unit,
comp_hours,
page_size,
chunk_size,
)
if cutoff:
logger.info("CUTOFF=%s overlap_hours=%s", cutoff.isoformat(), cutoff_overlap_hours)
client = APIClient(
base_url=cfg["api"]["base_url"],
token=cfg["api"]["token"],
timeout=int(cfg["api"].get("timeout_sec") or 20),
retry_max=int(cfg["api"].get("retries", {}).get("max_attempts") or 3),
headers_extra=cfg["api"].get("headers_extra") or {},
)
db_conn = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
try:
db_conn.conn.rollback()
except Exception:
pass
db_conn.conn.autocommit = True
try:
task_filter = {t.strip().upper() for t in (task_codes or "").split(",") if t.strip()}
specs = [s for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
if task_filter:
specs = [s for s in specs if s.code in task_filter]
results: list[dict] = []
for spec in specs:
if spec.code == "ODS_SETTLEMENT_TICKET":
continue
result = _check_spec(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
spec=spec,
store_id=store_id,
start=start,
end=end,
windows=windows,
page_size=page_size,
chunk_size=chunk_size,
sample_limit=sample_limit,
sleep_per_window=sleep_per_window,
sleep_per_page=sleep_per_page,
)
results.append(result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
result.get("task_code"),
result.get("missing"),
result.get("records"),
result.get("errors"),
)
if (not task_filter) or ("ODS_SETTLEMENT_TICKET" in task_filter):
ticket_result = _check_settlement_tickets(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
store_id=store_id,
start=start,
end=end,
windows=windows,
page_size=page_size,
chunk_size=chunk_size,
sample_limit=sample_limit,
sleep_per_window=sleep_per_window,
sleep_per_page=sleep_per_page,
)
results.append(ticket_result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
ticket_result.get("task_code"),
ticket_result.get("missing"),
ticket_result.get("records"),
ticket_result.get("errors"),
)
total_missing = sum(int(r.get("missing") or 0) for r in results)
total_errors = sum(int(r.get("errors") or 0) for r in results)
payload = {
"window_split_unit": split_unit,
"window_compensation_hours": comp_hours,
"start": start.isoformat() if isinstance(start, datetime) else None,
"end": end.isoformat() if isinstance(end, datetime) else None,
"cutoff": cutoff.isoformat() if cutoff else None,
"window_days": window_days,
"window_hours": window_hours,
"page_size": page_size,
"chunk_size": chunk_size,
"sample_limit": sample_limit,
"store_id": store_id,
"base_url": cfg.get("api.base_url"),
"results": results,
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
return payload
finally:
db_conn.close()
def main() -> int:
_reconfigure_stdout_utf8()
ap = argparse.ArgumentParser(description="Check missing ODS records by comparing API vs ODS PKs.")
@@ -578,6 +791,8 @@ def main() -> int:
ap.add_argument("--end", default="", help="end datetime (default: now)")
ap.add_argument("--window-days", type=int, default=1, help="days per API window (default: 1)")
ap.add_argument("--window-hours", type=int, default=0, help="hours per API window (default: 0)")
ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
ap.add_argument("--page-size", type=int, default=200, help="API page size (default: 200)")
ap.add_argument("--chunk-size", type=int, default=500, help="DB query chunk size (default: 500)")
ap.add_argument("--sample-limit", type=int, default=50, help="max missing PK samples per table")
@@ -593,6 +808,11 @@ def main() -> int:
default=24,
help="overlap hours when using --from-cutoff (default: 24)",
)
ap.add_argument(
"--allow-small-window",
action="store_true",
help="allow windows smaller than default completeness guard",
)
ap.add_argument("--log-file", default="", help="log file path (default: logs/check_ods_gaps_YYYYMMDD_HHMMSS.log)")
ap.add_argument("--log-dir", default="", help="log directory (default: logs)")
ap.add_argument("--log-level", default="INFO", help="log level (default: INFO)")
@@ -611,170 +831,41 @@ def main() -> int:
tee_std=True,
) as logger:
cfg = AppConfig.load({})
payload = run_gap_check(
cfg=cfg,
start=args.start,
end=args.end,
window_days=args.window_days,
window_hours=args.window_hours,
page_size=args.page_size,
chunk_size=args.chunk_size,
sample_limit=args.sample_limit,
sleep_per_window=args.sleep_per_window_seconds,
sleep_per_page=args.sleep_per_page_seconds,
task_codes=args.task_codes,
from_cutoff=args.from_cutoff,
cutoff_overlap_hours=args.cutoff_overlap_hours,
allow_small_window=args.allow_small_window,
logger=logger,
window_split_unit=args.window_split_unit or None,
window_compensation_hours=args.window_compensation_hours,
)
tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
store_id = int(cfg.get("app.store_id"))
if not cfg.get("api.token"):
logger.error("missing api.token; please set API_TOKEN in .env")
raise SystemExit("missing api.token; please set API_TOKEN in .env")
window_days = int(args.window_days)
window_hours = int(args.window_hours)
if not args.from_cutoff:
min_hours = MIN_COMPLETENESS_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small for completeness check; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_COMPLETENESS_WINDOW_DAYS:
logger.warning(
"window_days=%s too small for completeness check; adjust to %s",
window_days,
MIN_COMPLETENESS_WINDOW_DAYS,
)
window_days = MIN_COMPLETENESS_WINDOW_DAYS
end = datetime.now(tz) if not args.end else _parse_dt(args.end, tz, is_end=True)
if args.from_cutoff:
db_tmp = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
ods_tables = [s.table_name for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
start, end, cutoff = _resolve_window_from_cutoff(
conn=db_tmp.conn,
ods_tables=ods_tables,
tz=tz,
overlap_hours=args.cutoff_overlap_hours,
)
db_tmp.close()
if args.out:
out_path = Path(args.out)
else:
start = _parse_dt(args.start, tz, is_end=False)
cutoff = None
tag = f"_{args.tag}" if args.tag else ""
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
out_path = PROJECT_ROOT / "reports" / f"ods_gap_check{tag}_{stamp}.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
logger.info("REPORT_WRITTEN path=%s", out_path)
logger.info(
"START range=%s~%s window_days=%s window_hours=%s page_size=%s chunk_size=%s",
start.isoformat() if start else None,
end.isoformat() if end else None,
window_days,
window_hours,
args.page_size,
args.chunk_size,
"SUMMARY missing=%s errors=%s",
payload.get("total_missing"),
payload.get("total_errors"),
)
if cutoff:
logger.info("CUTOFF=%s overlap_hours=%s", cutoff.isoformat(), args.cutoff_overlap_hours)
client = APIClient(
base_url=cfg["api"]["base_url"],
token=cfg["api"]["token"],
timeout=int(cfg["api"].get("timeout_sec") or 20),
retry_max=int(cfg["api"].get("retries", {}).get("max_attempts") or 3),
headers_extra=cfg["api"].get("headers_extra") or {},
)
db_conn = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
try:
db_conn.conn.rollback()
except Exception:
pass
db_conn.conn.autocommit = True
try:
task_filter = {t.strip().upper() for t in args.task_codes.split(",") if t.strip()}
specs = [s for s in ODS_TASK_SPECS if s.code in ENABLED_ODS_CODES]
if task_filter:
specs = [s for s in specs if s.code in task_filter]
results: list[dict] = []
for spec in specs:
if spec.code == "ODS_SETTLEMENT_TICKET":
continue
result = _check_spec(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
spec=spec,
store_id=store_id,
start=start,
end=end,
window_days=window_days,
window_hours=window_hours,
page_size=args.page_size,
chunk_size=args.chunk_size,
sample_limit=args.sample_limit,
sleep_per_window=args.sleep_per_window_seconds,
sleep_per_page=args.sleep_per_page_seconds,
)
results.append(result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
result.get("task_code"),
result.get("missing"),
result.get("records"),
result.get("errors"),
)
if (not task_filter) or ("ODS_SETTLEMENT_TICKET" in task_filter):
ticket_result = _check_settlement_tickets(
client=client,
db_conn=db_conn.conn,
cfg=cfg,
tz=tz,
logger=logger,
store_id=store_id,
start=start,
end=end,
window_days=window_days,
window_hours=window_hours,
page_size=args.page_size,
chunk_size=args.chunk_size,
sample_limit=args.sample_limit,
sleep_per_window=args.sleep_per_window_seconds,
sleep_per_page=args.sleep_per_page_seconds,
)
results.append(ticket_result)
logger.info(
"CHECK_DONE task=%s missing=%s records=%s errors=%s",
ticket_result.get("task_code"),
ticket_result.get("missing"),
ticket_result.get("records"),
ticket_result.get("errors"),
)
total_missing = sum(int(r.get("missing") or 0) for r in results)
total_errors = sum(int(r.get("errors") or 0) for r in results)
if args.out:
out_path = Path(args.out)
else:
tag = f"_{args.tag}" if args.tag else ""
stamp = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
out_path = PROJECT_ROOT / "reports" / f"ods_gap_check{tag}_{stamp}.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
payload = {
"start": start.isoformat(),
"end": end.isoformat(),
"cutoff": cutoff.isoformat() if cutoff else None,
"window_days": window_days,
"window_hours": window_hours,
"page_size": args.page_size,
"chunk_size": args.chunk_size,
"sample_limit": args.sample_limit,
"store_id": store_id,
"base_url": cfg.get("api.base_url"),
"results": results,
"total_missing": total_missing,
"total_errors": total_errors,
"generated_at": datetime.now(tz).isoformat(),
}
out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
logger.info("REPORT_WRITTEN path=%s", out_path)
logger.info("SUMMARY missing=%s errors=%s", total_missing, total_errors)
finally:
db_conn.close()
return 0
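
A similar sketch for the gap check now that `--window-split-unit`, `--window-compensation-hours` and `--allow-small-window` are registered; the script path is inferred from the default log file name and should be treated as an assumption:

import subprocess
import sys

# Hypothetical gap-check invocation using the newly added flags; the script path
# is inferred from the default log name and is not confirmed by the diff.
subprocess.run(
    [
        sys.executable, "scripts/check_ods_gaps.py",
        "--start", "2025-07-01",
        "--window-split-unit", "month",
        "--window-compensation-hours", "0",
        "--log-level", "INFO",
    ],
    check=True,
)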

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
"""
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 C:\\dev\\LLTQ\\export\\test-json-doc)
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 export/test-json-doc)
是否包含同名键,并输出每表未命中的字段,便于补充映射或确认确实无源字段。
使用方法:
@@ -69,7 +69,7 @@ def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
def main() -> None:
"""主流程:遍历 FILE_MAPPING 中的 ODS 表,检查 JSON 键覆盖情况并打印报告。"""
dsn = os.environ.get("PG_DSN")
json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", r"C:\dev\LLTQ\export\test-json-doc"))
json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", "export/test-json-doc"))
ods_cols_map = _load_ods_columns(dsn)

View File

@@ -13,7 +13,7 @@
python -m etl_billiards.scripts.rebuild_db_and_run_ods_to_dwd ^
--dsn "postgresql://user:pwd@host:5432/db" ^
--store-id 1 ^
--json-dir "C:\\dev\\LLTQ\\export\\test-json-doc" ^
--json-dir "export/test-json-doc" ^
--drop-schemas
环境变量(可选):
@@ -44,7 +44,7 @@ from etl_billiards.tasks.init_schema_task import InitOdsSchemaTask
from etl_billiards.tasks.manual_ingest_task import ManualIngestTask
DEFAULT_JSON_DIR = r"C:\dev\LLTQ\export\test-json-doc"
DEFAULT_JSON_DIR = "export/test-json-doc"
@dataclass(frozen=True)

View File

@@ -7,7 +7,7 @@
依赖环境变量:
PG_DSN PostgreSQL 连接串(必填)
PG_CONNECT_TIMEOUT 可选,秒,默认 10
JSON_DOC_DIR 可选,JSON 目录,默认 C:\\dev\\LLTQ\\export\\test-json-doc
JSON_DOC_DIR 可选,JSON 目录,默认 export/test-json-doc
ODS_INCLUDE_FILES 可选,逗号分隔文件名(不含 .json)
ODS_DROP_SCHEMA_FIRST 可选,true/false,默认 true
"""
@@ -26,7 +26,7 @@ from psycopg2 import sql
from psycopg2.extras import Json, execute_values
DEFAULT_JSON_DIR = r"C:\dev\LLTQ\export\test-json-doc"
DEFAULT_JSON_DIR = "export/test-json-doc"
SPECIAL_LIST_PATHS: dict[str, tuple[str, ...]] = {
"assistant_accounts_master": ("data", "assistantInfos"),
"assistant_cancellation_records": ("data", "abolitionAssistants"),

View File

@@ -20,10 +20,12 @@ if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from config.settings import AppConfig
from utils.windowing import split_window
from utils.logging_utils import build_log_path, configure_logging
MIN_RELOAD_WINDOW_DAYS = 30
def _parse_dt(value: str, tz: ZoneInfo, *, is_end: bool) -> datetime:
raw = (value or "").strip()
if not raw:
@@ -56,6 +58,8 @@ def _run_task_window(
api_page_size: int,
api_timeout: int,
logger: logging.Logger,
window_split_unit: str | None = "none",
window_compensation_hours: int | None = 0,
) -> None:
cmd = [
sys.executable,
@@ -70,6 +74,10 @@ def _run_task_window(
"--window-end",
window_end.strftime("%Y-%m-%d %H:%M:%S"),
"--force-window-override",
"--window-split-unit",
str(window_split_unit or "none"),
"--window-compensation-hours",
str(int(window_compensation_hours or 0)),
]
if api_page_size > 0:
cmd += ["--api-page-size", str(api_page_size)]
@@ -92,6 +100,8 @@ def main() -> int:
ap.add_argument("--end", default="", help="end datetime (default: now)")
ap.add_argument("--window-days", type=int, default=1, help="days per window (default: 1)")
ap.add_argument("--window-hours", type=int, default=0, help="hours per window (default: 0)")
ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
ap.add_argument("--sleep-seconds", type=float, default=0, help="sleep seconds after each window")
ap.add_argument("--api-page-size", type=int, default=200, help="API page size override")
ap.add_argument("--api-timeout", type=int, default=20, help="API timeout seconds override")
@@ -119,40 +129,70 @@ def main() -> int:
end = datetime.now(tz) if not args.end else _parse_dt(args.end, tz, is_end=True)
window_days = int(args.window_days)
window_hours = int(args.window_hours)
min_hours = MIN_RELOAD_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_RELOAD_WINDOW_DAYS:
logger.warning(
"window_days=%s too small; adjust to %s",
window_days,
MIN_RELOAD_WINDOW_DAYS,
split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
comp_hours = args.window_compensation_hours
if comp_hours is None:
comp_hours = cfg.get("run.window_split.compensation_hours", 0)
use_split = split_unit.lower() not in ("", "none", "off", "false", "0")
if use_split:
windows = split_window(
start,
end,
tz=tz,
split_unit=split_unit,
compensation_hours=comp_hours,
)
window_days = MIN_RELOAD_WINDOW_DAYS
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
else:
min_hours = MIN_RELOAD_WINDOW_DAYS * 24
if window_hours > 0:
if window_hours < min_hours:
logger.warning(
"window_hours=%s too small; adjust to %s",
window_hours,
min_hours,
)
window_hours = min_hours
elif window_days < MIN_RELOAD_WINDOW_DAYS:
logger.warning(
"window_days=%s too small; adjust to %s",
window_days,
MIN_RELOAD_WINDOW_DAYS,
)
window_days = MIN_RELOAD_WINDOW_DAYS
adjusted = split_window(
start,
end,
tz=tz,
split_unit="none",
compensation_hours=comp_hours,
)
if adjusted:
start, end = adjusted[0]
window_size = timedelta(hours=window_hours) if window_hours > 0 else timedelta(days=window_days)
windows = list(_iter_windows(start, end, window_size))
if windows:
start, end = windows[0][0], windows[-1][1]
task_codes = [t.strip().upper() for t in args.tasks.split(",") if t.strip()]
if not task_codes:
raise SystemExit("no tasks specified")
logger.info(
"START range=%s~%s window_days=%s window_hours=%s sleep=%.2f",
"START range=%s~%s window_days=%s window_hours=%s split_unit=%s comp_hours=%s sleep=%.2f",
start.isoformat(),
end.isoformat(),
window_days,
window_hours,
split_unit,
comp_hours,
args.sleep_seconds,
)
for task_code in task_codes:
logger.info("TASK_START task=%s", task_code)
for window_start, window_end in _iter_windows(start, end, window_size):
for window_start, window_end in windows:
start_ts = time_mod.monotonic()
_run_task_window(
task_code=task_code,
@@ -161,6 +201,8 @@ def main() -> int:
api_page_size=args.api_page_size,
api_timeout=args.api_timeout,
logger=logger,
window_split_unit="none",
window_compensation_hours=0,
)
elapsed = time_mod.monotonic() - start_ts
logger.info(