Merge
@@ -39,9 +39,18 @@ class RunTracker:
         self.db.commit()
         return run_id

-    def update_run(self, run_id: int, counts: dict, status: str,
-                   ended_at: datetime = None, manifest: dict = None,
-                   error_message: str = None):
+    def update_run(
+        self,
+        run_id: int,
+        counts: dict,
+        status: str,
+        ended_at: datetime = None,
+        manifest: dict = None,
+        error_message: str = None,
+        window: dict | None = None,
+        request_params: dict | None = None,
+        overlap_seconds: int | None = None,
+    ):
         """Update a run record."""
         sql = """
             UPDATE etl_admin.etl_run
@@ -54,17 +63,65 @@ class RunTracker:
                 status = %s,
                 ended_at = %s,
                 manifest = %s,
-                error_message = %s
+                error_message = %s,
+                window_start = COALESCE(%s, window_start),
+                window_end = COALESCE(%s, window_end),
+                window_minutes = COALESCE(%s, window_minutes),
+                overlap_seconds = COALESCE(%s, overlap_seconds),
+                request_params = CASE WHEN %s IS NULL THEN request_params ELSE %s::jsonb END
             WHERE run_id = %s
         """

+        def _count(v, default: int = 0) -> int:
+            if v is None:
+                return default
+            if isinstance(v, bool):
+                return int(v)
+            if isinstance(v, int):
+                return int(v)
+            if isinstance(v, str):
+                try:
+                    return int(v)
+                except Exception:
+                    return default
+            if isinstance(v, (list, tuple, set, dict)):
+                try:
+                    return len(v)
+                except Exception:
+                    return default
+            return default
+
+        safe_counts = counts or {}
+
+        window_start = None
+        window_end = None
+        window_minutes = None
+        if isinstance(window, dict):
+            window_start = window.get("start") or window.get("window_start")
+            window_end = window.get("end") or window.get("window_end")
+            window_minutes = window.get("minutes") or window.get("window_minutes")
+
+        request_json = None if request_params is None else json.dumps(request_params or {}, ensure_ascii=False)
         self.db.execute(
             sql,
-            (counts.get("fetched", 0), counts.get("inserted", 0),
-             counts.get("updated", 0), counts.get("skipped", 0),
-             counts.get("errors", 0), counts.get("unknown_fields", 0),
-             status, ended_at,
-             json.dumps(manifest or {}, ensure_ascii=False),
-             error_message, run_id)
+            (
+                _count(safe_counts.get("fetched", 0)),
+                _count(safe_counts.get("inserted", 0)),
+                _count(safe_counts.get("updated", 0)),
+                _count(safe_counts.get("skipped", 0)),
+                _count(safe_counts.get("errors", 0)),
+                _count(safe_counts.get("unknown_fields", 0)),
+                status,
+                ended_at,
+                json.dumps(manifest or {}, ensure_ascii=False),
+                error_message,
+                window_start,
+                window_end,
+                window_minutes,
+                overlap_seconds,
+                request_json,
+                request_json,
+                run_id,
+            ),
         )
         self.db.commit()
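The UPDATE above leans on two SQL idioms: COALESCE(%s, col) keeps the stored value whenever the parameter is NULL, and the doubled %s in the CASE expression binds request_json twice, once for the NULL test and once for the ::jsonb cast. A runnable sketch of both idioms, using sqlite3 as a stand-in engine (the COALESCE/CASE SQL is the same in PostgreSQL, minus the jsonb cast; the table here is hypothetical):

# Partial-update sketch: None keeps the stored value, a real value overwrites it.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE demo (id INTEGER PRIMARY KEY, note TEXT, params TEXT)")
conn.execute("INSERT INTO demo VALUES (1, 'original', '{}')")

def update_demo(note, params):
    # The same `params` value is bound twice, matching the doubled %s in the diff.
    conn.execute(
        """UPDATE demo
           SET note = COALESCE(?, note),
               params = CASE WHEN ? IS NULL THEN params ELSE ? END
           WHERE id = 1""",
        (note, params, params),
    )

update_demo(None, None)
assert conn.execute("SELECT note, params FROM demo").fetchone() == ("original", "{}")
update_demo("patched", '{"a": 1}')
assert conn.execute("SELECT note, params FROM demo").fetchone() == ("patched", '{"a": 1}')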
@@ -1,7 +1,17 @@
 # -*- coding: utf-8 -*-
-"""ETL scheduling: supports online fetch, offline clean-and-load, and full-pipeline modes."""
+"""ETL scheduling: supports online fetch, offline clean-and-load, and full-pipeline modes.
+
+Notes:
+    To ease troubleshooting and auditing, the scheduler by default writes each run's
+    log to a file: `io.log_root/<run_uuid>.log`.
+
+    - The file path is also written to the `etl_admin.etl_run.log_path` column (recorded by RunTracker).
+    - File logging works by dynamically attaching a FileHandler to the root logger, so even
+      submodules that use `logging.getLogger(__name__)` write to the same log file.
+"""
 from __future__ import annotations

+import logging
 import uuid
 from datetime import datetime
 from pathlib import Path
@@ -50,6 +60,40 @@ class ETLScheduler:
         self.run_tracker = RunTracker(self.db_conn)
         self.task_registry = default_registry

+    def _attach_run_file_logger(self, run_uuid: str) -> logging.Handler | None:
+        """
+        Dynamically attach a file log handler for this run_uuid.
+
+        Returns:
+            - Success: the FileHandler (caller is responsible for removeHandler/close)
+            - Failure: None (the main flow is not interrupted)
+        """
+        log_root = Path(self.config["io"]["log_root"])
+        try:
+            log_root.mkdir(parents=True, exist_ok=True)
+        except Exception as exc:  # noqa: BLE001
+            self.logger.warning("Failed to create log directory: %s (%s)", log_root, exc)
+            return None
+
+        log_path = log_root / f"{run_uuid}.log"
+        try:
+            handler: logging.Handler = logging.FileHandler(log_path, encoding="utf-8")
+        except Exception as exc:  # noqa: BLE001
+            self.logger.warning("Failed to create log file: %s (%s)", log_path, exc)
+            return None
+
+        fmt = logging.Formatter(
+            fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+        handler.setFormatter(fmt)
+        handler.setLevel(logging.INFO)
+
+        # Attach to the root logger so every module's logger writes to the same file.
+        root_logger = logging.getLogger()
+        root_logger.addHandler(handler)
+        return handler
+
     # ------------------------------------------------------------------ public
     def run_tasks(self, task_codes: list | None = None):
         """Run tasks from config or from the given list."""
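The handler returned here follows the standard attach/detach pattern for teeing one run's logs into its own file: module loggers propagate to the root logger, so a single root-level handler captures them all. A self-contained sketch of the same pattern (paths and logger names are illustrative):

import logging
import os
import tempfile

def with_run_log(run_uuid: str) -> None:
    # Attach a per-run FileHandler to the root logger, then always detach it.
    log_path = os.path.join(tempfile.gettempdir(), f"{run_uuid}.log")
    handler = logging.FileHandler(log_path, encoding="utf-8")
    handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
    root = logging.getLogger()
    root.addHandler(handler)  # module loggers propagate here, so they hit the file too
    try:
        logging.getLogger("tasks.demo").warning("hello from run %s", run_uuid)
    finally:
        root.removeHandler(handler)  # detach even if the run raised
        handler.close()

with_run_log("0000-demo")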
@@ -59,20 +103,39 @@ class ETLScheduler:
         if not task_codes:
             task_codes = self.config.get("run.tasks", [])

-        self.logger.info("Starting tasks: %s, run_uuid=%s", task_codes, run_uuid)
+        file_handler = self._attach_run_file_logger(run_uuid)
+        try:
+            self.logger.info("Starting tasks: %s, run_uuid=%s", task_codes, run_uuid)

-        for task_code in task_codes:
-            try:
-                self._run_single_task(task_code, run_uuid, store_id)
-            except Exception as exc:  # noqa: BLE001
-                self.logger.error("Task %s failed: %s", task_code, exc, exc_info=True)
-                continue
+            for task_code in task_codes:
+                try:
+                    self._run_single_task(task_code, run_uuid, store_id)
+                except Exception as exc:  # noqa: BLE001
+                    self.logger.error("Task %s failed: %s", task_code, exc, exc_info=True)
+                    continue

-        self.logger.info("All tasks finished")
+            self.logger.info("All tasks finished")
+        finally:
+            if file_handler is not None:
+                try:
+                    logging.getLogger().removeHandler(file_handler)
+                except Exception:
+                    pass
+                try:
+                    file_handler.close()
+                except Exception:
+                    pass

     # ------------------------------------------------------------------ internals
     def _run_single_task(self, task_code: str, run_uuid: str, store_id: int):
         """Fetch/ingest orchestration for a single task."""
         task_code_upper = task_code.upper()

         # Utility tasks: execute directly; no cursor/run recording
         if task_code_upper in self.NO_DB_CONFIG_TASKS:
             self._run_utility_task(task_code_upper, store_id)
             return

         task_cfg = self._load_task_config(task_code, store_id)
         if not task_cfg:
             self.logger.warning("Task %s is disabled or does not exist", task_code)
@@ -98,6 +161,38 @@ class ETLScheduler:
         fetch_stats = None

         try:
+            # ODS_* tasks (except ODS_JSON_ARCHIVE) don't implement extract/transform/load stages in this repo
+            # version, so we execute them as a single step with the appropriate API client.
+            if self._is_ods_task(task_code):
+                if self.pipeline_flow in {"FULL", "FETCH_ONLY"}:
+                    result, _ = self._execute_ods_record_and_load(task_code, cursor_data, fetch_dir, run_id)
+                else:
+                    source_dir = self._resolve_ingest_source(fetch_dir, None)
+                    result = self._execute_ingest(task_code, cursor_data, source_dir)
+
+                self.run_tracker.update_run(
+                    run_id=run_id,
+                    counts=result.get("counts") or {},
+                    status=self._map_run_status(result.get("status")),
+                    ended_at=datetime.now(self.tz),
+                    window=result.get("window"),
+                    request_params=result.get("request_params"),
+                    overlap_seconds=self.config.get("run.overlap_seconds"),
+                )
+
+                if (result.get("status") or "").upper() == "SUCCESS":
+                    window = result.get("window")
+                    if isinstance(window, dict):
+                        self.cursor_mgr.advance(
+                            task_id=task_id,
+                            store_id=store_id,
+                            window_start=window.get("start"),
+                            window_end=window.get("end"),
+                            run_id=run_id,
+                        )
+                        self._maybe_run_integrity_check(task_code, window)
+                return
+
             if self._flow_includes_fetch():
                 fetch_stats = self._execute_fetch(task_code, cursor_data, fetch_dir, run_id)
                 if self.pipeline_flow == "FETCH_ONLY":
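Advancing the cursor only on SUCCESS is what makes the pipeline safely restartable: a failed run leaves the cursor where it was, and run.overlap_seconds re-reads a little history on the next attempt to tolerate late-arriving records. A toy model of that contract (class and field names here are assumptions, not the repo's CursorManager API):

from datetime import datetime, timedelta

class Cursor:
    def __init__(self, start: datetime, overlap: timedelta):
        self.position = start
        self.overlap = overlap

    def next_window(self, now: datetime) -> tuple[datetime, datetime]:
        # Start slightly before the saved position so boundary records are re-fetched.
        return self.position - self.overlap, now

    def advance(self, window_end: datetime, status: str) -> None:
        if status.upper() == "SUCCESS":  # failed runs keep the old position and retry
            self.position = window_end

cur = Cursor(datetime(2024, 1, 1), timedelta(seconds=120))
start, end = cur.next_window(datetime(2024, 1, 2))
cur.advance(end, "SUCCESS")
assert cur.position == datetime(2024, 1, 2)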
@@ -119,6 +214,9 @@ class ETLScheduler:
                 counts=result["counts"],
                 status=self._map_run_status(result["status"]),
                 ended_at=datetime.now(self.tz),
+                window=result.get("window"),
+                request_params=result.get("request_params"),
+                overlap_seconds=self.config.get("run.overlap_seconds"),
             )

             if (result.get("status") or "").upper() == "SUCCESS":
@@ -131,6 +229,7 @@ class ETLScheduler:
                         window_end=window.get("end"),
                         run_id=run_id,
                     )
+                    self._maybe_run_integrity_check(task_code, window)

         except Exception as exc:  # noqa: BLE001
             self.run_tracker.update_run(
@@ -158,7 +257,10 @@ class ETLScheduler:
         extracted = task.extract(context)
         # Fetch finished; transform/load are not executed
         stats = recording_client.last_dump or {}
-        fetched_count = stats.get("records") or len(extracted.get("records", [])) if isinstance(extracted, dict) else 0
+        extracted_count = 0
+        if isinstance(extracted, dict):
+            extracted_count = int(extracted.get("fetched") or 0) or len(extracted.get("records", []))
+        fetched_count = stats.get("records") or extracted_count or 0
         self.logger.info(
             "%s: fetch complete, file=%s, records=%s",
             task_code,
@@ -167,6 +269,34 @@ class ETLScheduler:
         )
         return {"file": stats.get("file"), "records": fetched_count, "pages": stats.get("pages")}

+    @staticmethod
+    def _is_ods_task(task_code: str) -> bool:
+        tc = str(task_code or "").upper()
+        return tc.startswith("ODS_") and tc != "ODS_JSON_ARCHIVE"
+
+    def _execute_ods_record_and_load(
+        self,
+        task_code: str,
+        cursor_data: dict | None,
+        fetch_dir: Path,
+        run_id: int,
+    ) -> tuple[dict, dict]:
+        """
+        Execute an ODS task with RecordingAPIClient so it fetches online and writes JSON dumps.
+        (ODS tasks in this repo perform DB upsert inside execute(); there is no staged extract/load.)
+        """
+        recording_client = RecordingAPIClient(
+            base_client=self.api_client,
+            output_dir=fetch_dir,
+            task_code=task_code,
+            run_id=run_id,
+            write_pretty=self.write_pretty_json,
+        )
+        task = self.task_registry.create_task(task_code, self.config, self.db_ops, recording_client, self.logger)
+        self.logger.info("%s: ODS fetch+load start, dir=%s", task_code, fetch_dir)
+        result = task.execute(cursor_data)
+        return result, (recording_client.last_dump or {})
+
     def _execute_ingest(self, task_code: str, cursor_data: dict | None, source_dir: Path):
         """Local clean-and-load: replay JSON via LocalJsonClient through the task's existing ETL."""
         local_client = LocalJsonClient(source_dir)
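RecordingAPIClient and LocalJsonClient form a record-and-replay seam: online runs wrap the real client and dump each response to JSON, and offline runs feed those dumps back through the same task code. A minimal sketch of such a seam (these classes are illustrative stand-ins, not the repo's implementations):

import json
import tempfile
from pathlib import Path

class RecordingClient:
    """Wraps a real client and dumps each response to a JSON file."""
    def __init__(self, base_client, output_dir: Path):
        self.base_client = base_client
        self.output_dir = output_dir
        self.last_dump: dict = {}

    def get(self, endpoint: str) -> dict:
        data = self.base_client.get(endpoint)
        path = self.output_dir / (endpoint.replace("/", "_") + ".json")
        path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
        self.last_dump = {"file": str(path), "records": len(data.get("records", []))}
        return data

class ReplayClient:
    """Serves previously dumped JSON instead of hitting the network."""
    def __init__(self, source_dir: Path):
        self.source_dir = source_dir

    def get(self, endpoint: str) -> dict:
        path = self.source_dir / (endpoint.replace("/", "_") + ".json")
        return json.loads(path.read_text(encoding="utf-8"))

class FakeAPI:
    def get(self, endpoint: str) -> dict:
        return {"records": [1, 2, 3]}

dump_dir = Path(tempfile.mkdtemp())
RecordingClient(FakeAPI(), dump_dir).get("orders/list")
assert ReplayClient(dump_dir).get("orders/list")["records"] == [1, 2, 3]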
@@ -201,6 +331,53 @@ class ETLScheduler:
     def _flow_includes_ingest(self) -> bool:
         return self.pipeline_flow in {"INGEST_ONLY", "FULL"}

+    # Tasks that run without DB config (utility / initialization tasks)
+    NO_DB_CONFIG_TASKS = {
+        # Schema initialization tasks
+        "INIT_ODS_SCHEMA",
+        "INIT_DWD_SCHEMA",
+        "INIT_DWS_SCHEMA",
+        # Quality check tasks
+        "DATA_INTEGRITY_CHECK",
+        "DWD_QUALITY_CHECK",
+        # Utility tasks
+        "CHECK_CUTOFF",
+        "MANUAL_INGEST",
+        "ODS_JSON_ARCHIVE",
+        # DWS aggregation tasks
+        "DWS_BUILD_ORDER_SUMMARY",
+    }
+
+    def _run_utility_task(self, task_code: str, store_id: int):
+        """
+        Execute a utility task (run directly; no cursor/run recording).
+        These tasks need neither cursor management nor run tracking.
+        """
+        self.logger.info("%s: starting utility task", task_code)
+
+        try:
+            # Create the task instance (no API client needed; pass None)
+            task = self.task_registry.create_task(
+                task_code, self.config, self.db_ops, None, self.logger
+            )
+
+            # Execute the task (utility tasks usually need no cursor_data)
+            result = task.execute(None)
+
+            status = (result.get("status") or "").upper() if isinstance(result, dict) else "SUCCESS"
+            if status == "SUCCESS":
+                self.logger.info("%s: utility task succeeded", task_code)
+                if isinstance(result, dict):
+                    counts = result.get("counts", {})
+                    if counts:
+                        self.logger.info("%s: result counts: %s", task_code, counts)
+            else:
+                self.logger.warning("%s: utility task result: %s", task_code, status)
+
+        except Exception as exc:
+            self.logger.error("%s: utility task failed: %s", task_code, exc, exc_info=True)
+            raise
+
     def _load_task_config(self, task_code: str, store_id: int) -> dict | None:
         """Load the task config from the database."""
         sql = """
@@ -213,6 +390,45 @@ class ETLScheduler:
         rows = self.db_conn.query(sql, (store_id, task_code))
         return rows[0] if rows else None

+    def _maybe_run_integrity_check(self, task_code: str, window: dict | None) -> None:
+        if not self.config.get("integrity.auto_check", False):
+            return
+        if str(task_code or "").upper() != "DWD_LOAD_FROM_ODS":
+            return
+        if not isinstance(window, dict):
+            return
+        window_start = window.get("start")
+        window_end = window.get("end")
+        if not window_start or not window_end:
+            return
+
+        try:
+            from quality.integrity_checker import IntegrityWindow, run_integrity_window
+
+            include_dimensions = bool(self.config.get("integrity.include_dimensions", False))
+            task_codes = str(self.config.get("integrity.ods_task_codes", "") or "").strip()
+            report = run_integrity_window(
+                cfg=self.config,
+                window=IntegrityWindow(
+                    start=window_start,
+                    end=window_end,
+                    label="etl_window",
+                    granularity="window",
+                ),
+                include_dimensions=include_dimensions,
+                task_codes=task_codes,
+                logger=self.logger,
+                write_report=True,
+            )
+            self.logger.info(
+                "Integrity check done: report=%s missing=%s errors=%s",
+                report.get("report_path"),
+                report.get("api_to_ods", {}).get("total_missing"),
+                report.get("api_to_ods", {}).get("total_errors"),
+            )
+        except Exception as exc:  # noqa: BLE001
+            self.logger.warning("Integrity check failed: %s", exc, exc_info=True)
+
     def close(self):
         """Close connections."""
         self.db_conn.close()
@@ -15,10 +15,19 @@ from tasks.table_discount_task import TableDiscountTask
 from tasks.assistant_abolish_task import AssistantAbolishTask
 from tasks.ledger_task import LedgerTask
 from tasks.ods_tasks import ODS_TASK_CLASSES
-from tasks.ticket_dwd_task import TicketDwdTask
 from tasks.manual_ingest_task import ManualIngestTask
 from tasks.payments_dwd_task import PaymentsDwdTask
 from tasks.members_dwd_task import MembersDwdTask
+from tasks.init_schema_task import InitOdsSchemaTask
+from tasks.init_dwd_schema_task import InitDwdSchemaTask
+from tasks.dwd_load_task import DwdLoadTask
+from tasks.ticket_dwd_task import TicketDwdTask
+from tasks.dwd_quality_task import DwdQualityTask
+from tasks.ods_json_archive_task import OdsJsonArchiveTask
+from tasks.check_cutoff_task import CheckCutoffTask
+from tasks.init_dws_schema_task import InitDwsSchemaTask
+from tasks.dws_build_order_summary_task import DwsBuildOrderSummaryTask
+from tasks.data_integrity_task import DataIntegrityTask

 class TaskRegistry:
     """Task registration and factory"""
@@ -64,5 +73,14 @@ default_registry.register("TICKET_DWD", TicketDwdTask)
 default_registry.register("MANUAL_INGEST", ManualIngestTask)
 default_registry.register("PAYMENTS_DWD", PaymentsDwdTask)
 default_registry.register("MEMBERS_DWD", MembersDwdTask)
+default_registry.register("INIT_ODS_SCHEMA", InitOdsSchemaTask)
+default_registry.register("INIT_DWD_SCHEMA", InitDwdSchemaTask)
+default_registry.register("DWD_LOAD_FROM_ODS", DwdLoadTask)
+default_registry.register("DWD_QUALITY_CHECK", DwdQualityTask)
+default_registry.register("ODS_JSON_ARCHIVE", OdsJsonArchiveTask)
+default_registry.register("CHECK_CUTOFF", CheckCutoffTask)
+default_registry.register("DATA_INTEGRITY_CHECK", DataIntegrityTask)
+default_registry.register("INIT_DWS_SCHEMA", InitDwsSchemaTask)
+default_registry.register("DWS_BUILD_ORDER_SUMMARY", DwsBuildOrderSummaryTask)
 for code, task_cls in ODS_TASK_CLASSES.items():
     default_registry.register(code, task_cls)
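For reference, the registry above is a plain code-to-class map with a factory method; the constructor signature below is inferred from the create_task calls in the scheduler, not copied from the repo. A minimal sketch with hypothetical names:

import logging

class MiniRegistry:
    """Code -> task-class mapping with a factory, mirroring default_registry."""
    def __init__(self):
        self._classes: dict[str, type] = {}

    def register(self, code: str, task_cls: type) -> None:
        self._classes[code.upper()] = task_cls

    def create_task(self, code: str, config, db_ops, api_client, logger):
        try:
            task_cls = self._classes[code.upper()]
        except KeyError:
            raise KeyError(f"unknown task code: {code}") from None
        return task_cls(config, db_ops, api_client, logger)

class EchoTask:
    def __init__(self, config, db_ops, api_client, logger):
        self.logger = logger
    def execute(self, cursor_data):
        return {"status": "SUCCESS", "counts": {"fetched": 0}}

reg = MiniRegistry()
reg.register("ECHO", EchoTask)
result = reg.create_task("ECHO", {}, None, None, logging.getLogger("demo")).execute(None)
assert result["status"] == "SUCCESS"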