ETL complete

This commit is contained in:
Neo
2026-01-18 22:37:38 +08:00
parent 8da6cb6563
commit 7ca19a4a2c
159 changed files with 31225 additions and 467 deletions

View File

@@ -39,9 +39,18 @@ class RunTracker:
self.db.commit()
return run_id
- def update_run(self, run_id: int, counts: dict, status: str,
- ended_at: datetime = None, manifest: dict = None,
- error_message: str = None):
+ def update_run(
+ self,
+ run_id: int,
+ counts: dict,
+ status: str,
+ ended_at: datetime = None,
+ manifest: dict = None,
+ error_message: str = None,
+ window: dict | None = None,
+ request_params: dict | None = None,
+ overlap_seconds: int | None = None,
+ ):
"""更新运行记录"""
sql = """
UPDATE etl_admin.etl_run
@@ -54,17 +63,65 @@ class RunTracker:
status = %s,
ended_at = %s,
manifest = %s,
- error_message = %s
+ error_message = %s,
+ window_start = COALESCE(%s, window_start),
+ window_end = COALESCE(%s, window_end),
+ window_minutes = COALESCE(%s, window_minutes),
+ overlap_seconds = COALESCE(%s, overlap_seconds),
+ request_params = CASE WHEN %s IS NULL THEN request_params ELSE %s::jsonb END
WHERE run_id = %s
"""
def _count(v, default: int = 0) -> int:
if v is None:
return default
if isinstance(v, bool):
return int(v)
if isinstance(v, int):
return int(v)
if isinstance(v, str):
try:
return int(v)
except Exception:
return default
if isinstance(v, (list, tuple, set, dict)):
try:
return len(v)
except Exception:
return default
return default
safe_counts = counts or {}
window_start = None
window_end = None
window_minutes = None
if isinstance(window, dict):
window_start = window.get("start") or window.get("window_start")
window_end = window.get("end") or window.get("window_end")
window_minutes = window.get("minutes") or window.get("window_minutes")
request_json = None if request_params is None else json.dumps(request_params or {}, ensure_ascii=False)
self.db.execute(
sql,
- (counts.get("fetched", 0), counts.get("inserted", 0),
- counts.get("updated", 0), counts.get("skipped", 0),
- counts.get("errors", 0), counts.get("unknown_fields", 0),
- status, ended_at,
- json.dumps(manifest or {}, ensure_ascii=False),
- error_message, run_id)
+ (
+ _count(safe_counts.get("fetched", 0)),
+ _count(safe_counts.get("inserted", 0)),
+ _count(safe_counts.get("updated", 0)),
+ _count(safe_counts.get("skipped", 0)),
+ _count(safe_counts.get("errors", 0)),
+ _count(safe_counts.get("unknown_fields", 0)),
+ status,
+ ended_at,
+ json.dumps(manifest or {}, ensure_ascii=False),
+ error_message,
+ window_start,
+ window_end,
+ window_minutes,
+ overlap_seconds,
+ request_json,
+ request_json,
+ run_id,
+ ),
)
self.db.commit()
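For reference, a minimal caller sketch for the extended update_run signature (tracker, run_id and tz below are placeholders, and the literal values are illustrative): mixed-type counts are normalized by _count, the window dict may use either start/end or window_start/window_end keys, and request_params is serialized to JSON only when it is not None, so the COALESCE/CASE clauses preserve existing column values on partial updates.

from datetime import datetime

tracker.update_run(                      # "tracker" stands in for a RunTracker instance
    run_id=run_id,
    counts={"fetched": "120", "inserted": 118, "errors": []},  # strings and containers are coerced by _count
    status="SUCCESS",
    ended_at=datetime.now(tz),           # tz is whatever timezone the scheduler uses
    window={"start": "2026-01-18 22:00:00", "end": "2026-01-18 22:15:00", "minutes": 15},
    request_params={"page_size": 200},   # stored as jsonb; passing None keeps the existing value
    overlap_seconds=60,
)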

View File

@@ -1,7 +1,17 @@
# -*- coding: utf-8 -*-
"""ETL 调度:支持在线抓取、离线清洗入库、全流程三种模式。"""
"""ETL 调度:支持在线抓取、离线清洗入库、全流程三种模式。
说明:
为了便于排障与审计,调度器默认会在每次运行时将日志写入文件:
`io.log_root/<run_uuid>.log`。
- 该文件路径会同步写入 `etl_admin.etl_run.log_path` 字段(由 RunTracker 记录)。
- 文件日志通过给 root logger 动态挂载 FileHandler 实现,保证即便子模块使用
`logging.getLogger(__name__)` 也能写入同一份日志文件。
"""
from __future__ import annotations
import logging
import uuid
from datetime import datetime
from pathlib import Path
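A note on the mechanism described in the module docstring above: handlers attached to the root logger also receive records emitted through logging.getLogger(__name__) in any module, because records propagate up the logger hierarchy by default. A minimal standalone sketch (the file name and logger name are illustrative only):

import logging

root = logging.getLogger()
root.setLevel(logging.INFO)                   # let INFO records through to the handlers
handler = logging.FileHandler("example-run.log", encoding="utf-8")  # illustrative path, not io.log_root
handler.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
root.addHandler(handler)

logging.getLogger("tasks.example").info("captured by the root FileHandler")  # any module logger works

root.removeHandler(handler)                   # detach and close when done, as run_tasks does in its finally block
handler.close()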
@@ -50,6 +60,40 @@ class ETLScheduler:
self.run_tracker = RunTracker(self.db_conn)
self.task_registry = default_registry
def _attach_run_file_logger(self, run_uuid: str) -> logging.Handler | None:
"""
Dynamically attach a file log handler for this run_uuid.
Returns:
- On success: the FileHandler; the caller is responsible for removeHandler/close.
- On failure: None; the main flow is not interrupted.
"""
log_root = Path(self.config["io"]["log_root"])
try:
log_root.mkdir(parents=True, exist_ok=True)
except Exception as exc: # noqa: BLE001
self.logger.warning("创建日志目录失败:%s%s", log_root, exc)
return None
log_path = log_root / f"{run_uuid}.log"
try:
handler: logging.Handler = logging.FileHandler(log_path, encoding="utf-8")
except Exception as exc: # noqa: BLE001
self.logger.warning("创建文件日志失败:%s%s", log_path, exc)
return None
fmt = logging.Formatter(
fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(fmt)
handler.setLevel(logging.INFO)
# Attach to the root logger so that loggers in every module write to the same file.
root_logger = logging.getLogger()
root_logger.addHandler(handler)
return handler
# ------------------------------------------------------------------ public
def run_tasks(self, task_codes: list | None = None):
"""按配置或传入列表执行任务。"""
@@ -59,16 +103,28 @@ class ETLScheduler:
if not task_codes:
task_codes = self.config.get("run.tasks", [])
self.logger.info("开始运行任务: %s, run_uuid=%s", task_codes, run_uuid)
file_handler = self._attach_run_file_logger(run_uuid)
try:
self.logger.info("开始运行任务: %s, run_uuid=%s", task_codes, run_uuid)
for task_code in task_codes:
try:
self._run_single_task(task_code, run_uuid, store_id)
except Exception as exc: # noqa: BLE001
self.logger.error("任务 %s 失败: %s", task_code, exc, exc_info=True)
continue
for task_code in task_codes:
try:
self._run_single_task(task_code, run_uuid, store_id)
except Exception as exc: # noqa: BLE001
self.logger.error("任务 %s 失败: %s", task_code, exc, exc_info=True)
continue
self.logger.info("所有任务执行完成")
self.logger.info("所有任务执行完成")
finally:
if file_handler is not None:
try:
logging.getLogger().removeHandler(file_handler)
except Exception:
pass
try:
file_handler.close()
except Exception:
pass
# ------------------------------------------------------------------ internals
def _run_single_task(self, task_code: str, run_uuid: str, store_id: int):
@@ -98,6 +154,37 @@ class ETLScheduler:
fetch_stats = None
try:
# ODS_* tasks (except ODS_JSON_ARCHIVE) don't implement extract/transform/load stages in this repo
# version, so we execute them as a single step with the appropriate API client.
if self._is_ods_task(task_code):
if self.pipeline_flow in {"FULL", "FETCH_ONLY"}:
result, _ = self._execute_ods_record_and_load(task_code, cursor_data, fetch_dir, run_id)
else:
source_dir = self._resolve_ingest_source(fetch_dir, None)
result = self._execute_ingest(task_code, cursor_data, source_dir)
self.run_tracker.update_run(
run_id=run_id,
counts=result.get("counts") or {},
status=self._map_run_status(result.get("status")),
ended_at=datetime.now(self.tz),
window=result.get("window"),
request_params=result.get("request_params"),
overlap_seconds=self.config.get("run.overlap_seconds"),
)
if (result.get("status") or "").upper() == "SUCCESS":
window = result.get("window")
if isinstance(window, dict):
self.cursor_mgr.advance(
task_id=task_id,
store_id=store_id,
window_start=window.get("start"),
window_end=window.get("end"),
run_id=run_id,
)
return
if self._flow_includes_fetch():
fetch_stats = self._execute_fetch(task_code, cursor_data, fetch_dir, run_id)
if self.pipeline_flow == "FETCH_ONLY":
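Judging from how the scheduler reads the result above, an ODS task's execute() is expected to return a dict roughly shaped like the sketch below; this is inferred from this diff, not an authoritative schema, and all values are illustrative.

result = {
    "status": "SUCCESS",   # mapped via _map_run_status; the cursor only advances on SUCCESS
    "counts": {"fetched": 120, "inserted": 118, "updated": 0, "skipped": 2, "errors": 0},
    "window": {"start": "2026-01-18 22:00:00", "end": "2026-01-18 22:15:00", "minutes": 15},
    "request_params": {"page_size": 200},   # persisted to etl_run.request_params as jsonb
}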
@@ -119,6 +206,9 @@ class ETLScheduler:
counts=result["counts"],
status=self._map_run_status(result["status"]),
ended_at=datetime.now(self.tz),
window=result.get("window"),
request_params=result.get("request_params"),
overlap_seconds=self.config.get("run.overlap_seconds"),
)
if (result.get("status") or "").upper() == "SUCCESS":
@@ -158,7 +248,10 @@ class ETLScheduler:
extracted = task.extract(context)
# Fetch stage done; transform/load are not executed.
stats = recording_client.last_dump or {}
- fetched_count = stats.get("records") or len(extracted.get("records", [])) if isinstance(extracted, dict) else 0
+ extracted_count = 0
+ if isinstance(extracted, dict):
+ extracted_count = int(extracted.get("fetched") or 0) or len(extracted.get("records", []))
+ fetched_count = stats.get("records") or extracted_count or 0
self.logger.info(
"%s: 抓取完成,文件=%s,记录数=%s",
task_code,
@@ -167,6 +260,34 @@ class ETLScheduler:
)
return {"file": stats.get("file"), "records": fetched_count, "pages": stats.get("pages")}
@staticmethod
def _is_ods_task(task_code: str) -> bool:
tc = str(task_code or "").upper()
return tc.startswith("ODS_") and tc != "ODS_JSON_ARCHIVE"
def _execute_ods_record_and_load(
self,
task_code: str,
cursor_data: dict | None,
fetch_dir: Path,
run_id: int,
) -> tuple[dict, dict]:
"""
Execute an ODS task with RecordingAPIClient so it fetches online and writes JSON dumps.
(ODS tasks in this repo perform DB upsert inside execute(); there is no staged extract/load.)
"""
recording_client = RecordingAPIClient(
base_client=self.api_client,
output_dir=fetch_dir,
task_code=task_code,
run_id=run_id,
write_pretty=self.write_pretty_json,
)
task = self.task_registry.create_task(task_code, self.config, self.db_ops, recording_client, self.logger)
self.logger.info("%s: ODS fetch+load start, dir=%s", task_code, fetch_dir)
result = task.execute(cursor_data)
return result, (recording_client.last_dump or {})
def _execute_ingest(self, task_code: str, cursor_data: dict | None, source_dir: Path):
"""本地清洗入库:使用 LocalJsonClient 回放 JSON走原有任务 ETL。"""
local_client = LocalJsonClient(source_dir)
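The record count reported by _execute_fetch prefers the recording client's dump statistics, then the task's own counters. A self-contained restatement of that fallback chain with illustrative inputs:

def resolve_fetched_count(stats: dict, extracted) -> int:
    # Mirrors _execute_fetch: dump stats win, then the task's "fetched" counter,
    # then the length of its "records" list, otherwise 0.
    extracted_count = 0
    if isinstance(extracted, dict):
        extracted_count = int(extracted.get("fetched") or 0) or len(extracted.get("records", []))
    return stats.get("records") or extracted_count or 0

assert resolve_fetched_count({"records": 37}, {"fetched": 10}) == 37
assert resolve_fetched_count({}, {"records": [1, 2, 3]}) == 3
assert resolve_fetched_count({}, None) == 0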

View File

@@ -23,6 +23,10 @@ from tasks.init_dwd_schema_task import InitDwdSchemaTask
from tasks.dwd_load_task import DwdLoadTask
from tasks.ticket_dwd_task import TicketDwdTask
from tasks.dwd_quality_task import DwdQualityTask
from tasks.ods_json_archive_task import OdsJsonArchiveTask
from tasks.check_cutoff_task import CheckCutoffTask
from tasks.init_dws_schema_task import InitDwsSchemaTask
from tasks.dws_build_order_summary_task import DwsBuildOrderSummaryTask
class TaskRegistry:
"""任务注册和工厂"""
@@ -72,5 +76,9 @@ default_registry.register("INIT_ODS_SCHEMA", InitOdsSchemaTask)
default_registry.register("INIT_DWD_SCHEMA", InitDwdSchemaTask)
default_registry.register("DWD_LOAD_FROM_ODS", DwdLoadTask)
default_registry.register("DWD_QUALITY_CHECK", DwdQualityTask)
default_registry.register("ODS_JSON_ARCHIVE", OdsJsonArchiveTask)
default_registry.register("CHECK_CUTOFF", CheckCutoffTask)
default_registry.register("INIT_DWS_SCHEMA", InitDwsSchemaTask)
default_registry.register("DWS_BUILD_ORDER_SUMMARY", DwsBuildOrderSummaryTask)
for code, task_cls in ODS_TASK_CLASSES.items():
default_registry.register(code, task_cls)
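The registry is used with register() plus create_task(task_code, config, db_ops, api_client, logger), as seen in the scheduler. A minimal sketch of wiring in a custom task; the task class, its constructor signature and the collaborator objects are hypothetical, assuming create_task forwards its arguments to the task constructor:

class MyCustomTask:
    # Hypothetical task; the real base-class contract is not shown in this diff.
    def __init__(self, config, db_ops, api_client, logger):
        self.config, self.db_ops, self.api_client, self.logger = config, db_ops, api_client, logger

default_registry.register("MY_CUSTOM_TASK", MyCustomTask)
task = default_registry.create_task("MY_CUSTOM_TASK", config, db_ops, api_client, logger)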