初始提交:飞球 ETL 系统全量代码
This commit is contained in:
379
orchestration/pipeline_runner.py
Normal file
379
orchestration/pipeline_runner.py
Normal file
@@ -0,0 +1,379 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""管道运行器:管道定义、层→任务映射、校验编排。
|
||||
|
||||
从原 ETLScheduler 中提取管道编排逻辑,委托 TaskExecutor 执行具体任务。
|
||||
所有依赖通过构造函数注入,不自行创建资源。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from tasks.verification import filter_verify_tables
|
||||
|
||||
|
||||
class PipelineRunner:
|
||||
"""管道编排器:根据管道定义执行多层 ETL 任务并可选地运行后置校验。"""
|
||||
|
||||
# 管道定义:每个管道包含的层(从 scheduler.py 模块级常量迁移至此)
|
||||
PIPELINE_LAYERS: dict[str, list[str]] = {
|
||||
"api_ods": ["ODS"],
|
||||
"api_ods_dwd": ["ODS", "DWD"],
|
||||
"api_full": ["ODS", "DWD", "DWS", "INDEX"],
|
||||
"ods_dwd": ["DWD"],
|
||||
"dwd_dws": ["DWS"],
|
||||
"dwd_dws_index": ["DWS", "INDEX"],
|
||||
"dwd_index": ["INDEX"],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config,
|
||||
task_executor,
|
||||
task_registry,
|
||||
db_conn,
|
||||
api_client,
|
||||
logger: logging.Logger,
|
||||
):
|
||||
self.config = config
|
||||
self.task_executor = task_executor
|
||||
self.task_registry = task_registry
|
||||
self.db_conn = db_conn
|
||||
self.api_client = api_client
|
||||
self.logger = logger
|
||||
self.tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
|
||||
|
||||
def run(
|
||||
self,
|
||||
pipeline: str,
|
||||
processing_mode: str = "increment_only",
|
||||
data_source: str = "hybrid",
|
||||
window_start: datetime | None = None,
|
||||
window_end: datetime | None = None,
|
||||
window_split: str | None = None,
|
||||
task_codes: list[str] | None = None,
|
||||
fetch_before_verify: bool = False,
|
||||
verify_tables: list[str] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""执行管道,返回汇总结果。
|
||||
|
||||
Args:
|
||||
pipeline: 管道类型 (api_ods, api_ods_dwd, api_full, ods_dwd, dwd_dws, dwd_dws_index, dwd_index)
|
||||
processing_mode: 处理模式 (increment_only / verify_only / increment_verify)
|
||||
data_source: 数据源模式 (online / offline / hybrid)
|
||||
window_start: 时间窗口开始
|
||||
window_end: 时间窗口结束
|
||||
window_split: 时间窗口切分 (none / day / week / month)
|
||||
task_codes: 要执行的任务代码列表(作为管道内的任务过滤器)
|
||||
fetch_before_verify: 校验前是否先从 API 获取数据(仅在 verify_only 模式下有效)
|
||||
verify_tables: 指定校验的表名列表(可用于单表验证)
|
||||
|
||||
Returns:
|
||||
执行结果字典,包含 status / pipeline / layers / results / verification_summary
|
||||
"""
|
||||
from utils.task_logger import TaskLogger
|
||||
|
||||
if pipeline not in self.PIPELINE_LAYERS:
|
||||
raise ValueError(f"无效的管道名称: {pipeline}")
|
||||
|
||||
run_uuid = uuid.uuid4().hex
|
||||
pipeline_logger = TaskLogger(f"PIPELINE_{pipeline.upper()}", self.logger)
|
||||
pipeline_logger.start(f"开始执行管道: {pipeline}")
|
||||
|
||||
layers = self.PIPELINE_LAYERS[pipeline]
|
||||
results: list[dict[str, Any]] = []
|
||||
verification_summary: dict[str, Any] | None = None
|
||||
ods_dump_dirs: dict[str, str] = {}
|
||||
use_local_json = bool(self.config.get("verification.ods_use_local_json", False))
|
||||
|
||||
# 设置默认时间窗口
|
||||
if window_end is None:
|
||||
window_end = datetime.now(self.tz)
|
||||
if window_start is None:
|
||||
window_start = window_end - timedelta(hours=24)
|
||||
|
||||
try:
|
||||
if processing_mode == "verify_only":
|
||||
# 仅校验模式
|
||||
if fetch_before_verify:
|
||||
self.logger.info("管道 %s: 校验模式(先获取 API 数据)", pipeline)
|
||||
|
||||
if task_codes:
|
||||
ods_tasks = [t for t in task_codes if t.startswith("ODS_")]
|
||||
if ods_tasks:
|
||||
self.logger.info("从 API 获取数据: %s", ods_tasks)
|
||||
results = self.task_executor.run_tasks(ods_tasks, data_source=data_source)
|
||||
else:
|
||||
auto_tasks = self._resolve_tasks(["ODS"])
|
||||
if auto_tasks:
|
||||
self.logger.info("从 API 获取数据: %s", auto_tasks)
|
||||
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
|
||||
|
||||
ods_dump_dirs = {
|
||||
r.get("task_code"): r.get("dump_dir")
|
||||
for r in results
|
||||
if r.get("task_code") and r.get("dump_dir")
|
||||
}
|
||||
self.logger.info("API 数据获取完成,开始校验并修复")
|
||||
else:
|
||||
self.logger.info("管道 %s: 仅校验模式,跳过增量 ETL,直接执行校验并修复", pipeline)
|
||||
|
||||
verification_summary = self._run_verification(
|
||||
layers=layers,
|
||||
window_start=window_start,
|
||||
window_end=window_end,
|
||||
window_split=window_split,
|
||||
fetch_from_api=fetch_before_verify,
|
||||
ods_dump_dirs=ods_dump_dirs,
|
||||
use_local_json=use_local_json,
|
||||
verify_tables=verify_tables,
|
||||
)
|
||||
pipeline_logger.set_verification_result(verification_summary)
|
||||
else:
|
||||
# 增量 ETL(increment_only 或 increment_verify)
|
||||
self.logger.info("管道 %s: 执行增量 ETL,层=%s", pipeline, layers)
|
||||
|
||||
if task_codes:
|
||||
results = self.task_executor.run_tasks(task_codes, data_source=data_source)
|
||||
else:
|
||||
auto_tasks = self._resolve_tasks(layers)
|
||||
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
|
||||
|
||||
# increment_verify 模式:增量后执行校验
|
||||
if processing_mode == "increment_verify":
|
||||
self.logger.info("管道 %s: 开始校验并修复", pipeline)
|
||||
verification_summary = self._run_verification(
|
||||
layers=layers,
|
||||
window_start=window_start,
|
||||
window_end=window_end,
|
||||
window_split=window_split,
|
||||
ods_dump_dirs=ods_dump_dirs,
|
||||
use_local_json=use_local_json,
|
||||
verify_tables=verify_tables,
|
||||
)
|
||||
pipeline_logger.set_verification_result(verification_summary)
|
||||
|
||||
# 汇总计数
|
||||
pipeline_logger.set_counts(
|
||||
fetched=sum(r.get("counts", {}).get("fetched", 0) for r in results),
|
||||
inserted=sum(r.get("counts", {}).get("inserted", 0) for r in results),
|
||||
updated=sum(r.get("counts", {}).get("updated", 0) for r in results),
|
||||
errors=sum(r.get("counts", {}).get("errors", 0) for r in results),
|
||||
)
|
||||
|
||||
summary_text = pipeline_logger.end(status="成功")
|
||||
self.logger.info("\n%s", summary_text)
|
||||
|
||||
return {
|
||||
"status": "SUCCESS",
|
||||
"pipeline": pipeline,
|
||||
"layers": layers,
|
||||
"results": results,
|
||||
"verification_summary": verification_summary,
|
||||
}
|
||||
|
||||
except Exception as exc:
|
||||
summary_text = pipeline_logger.end(status="失败", error_message=str(exc))
|
||||
self.logger.error("\n%s", summary_text)
|
||||
raise
|
||||
|
||||
def _resolve_tasks(self, layers: list[str]) -> list[str]:
|
||||
"""根据层列表解析任务代码。
|
||||
|
||||
优先使用配置中的任务列表,回退到 task_registry.get_tasks_by_layer()。
|
||||
DWD 层保持原有逻辑(默认 DWD_LOAD_FROM_ODS)。
|
||||
"""
|
||||
tasks: list[str] = []
|
||||
|
||||
for layer in layers:
|
||||
layer_upper = layer.upper()
|
||||
|
||||
if layer_upper == "ODS":
|
||||
ods_tasks = self.config.get("run.ods_tasks", [])
|
||||
if ods_tasks:
|
||||
tasks.extend(ods_tasks)
|
||||
else:
|
||||
registry_tasks = self.task_registry.get_tasks_by_layer("ODS")
|
||||
if registry_tasks:
|
||||
tasks.extend(registry_tasks)
|
||||
else:
|
||||
# 硬编码回退(与原 _get_tasks_for_layers 一致)
|
||||
tasks.extend([
|
||||
"ODS_MEMBER", "ODS_ASSISTANT", "ODS_TABLE",
|
||||
"ODS_ORDER", "ODS_PAYMENT", "ODS_GOODS",
|
||||
])
|
||||
|
||||
elif layer_upper == "DWD":
|
||||
# DWD 层保持原有逻辑
|
||||
tasks.append("DWD_LOAD_FROM_ODS")
|
||||
|
||||
elif layer_upper == "DWS":
|
||||
dws_tasks = self.config.get("run.dws_tasks", [])
|
||||
if dws_tasks:
|
||||
tasks.extend(dws_tasks)
|
||||
else:
|
||||
registry_tasks = self.task_registry.get_tasks_by_layer("DWS")
|
||||
if registry_tasks:
|
||||
tasks.extend(registry_tasks)
|
||||
else:
|
||||
tasks.extend([
|
||||
"DWS_BUILD_ORDER_SUMMARY",
|
||||
"DWS_BUILD_MEMBER_SUMMARY",
|
||||
])
|
||||
|
||||
elif layer_upper == "INDEX":
|
||||
index_tasks = self.config.get("run.index_tasks", [])
|
||||
if index_tasks:
|
||||
tasks.extend(index_tasks)
|
||||
else:
|
||||
registry_tasks = self.task_registry.get_tasks_by_layer("INDEX")
|
||||
if registry_tasks:
|
||||
tasks.extend(registry_tasks)
|
||||
else:
|
||||
tasks.extend([
|
||||
"DWS_WINBACK_INDEX",
|
||||
"DWS_NEWCONV_INDEX",
|
||||
"DWS_RELATION_INDEX",
|
||||
])
|
||||
|
||||
return tasks
|
||||
|
||||
def _run_verification(
|
||||
self,
|
||||
layers: list[str],
|
||||
window_start: datetime,
|
||||
window_end: datetime,
|
||||
window_split: str | None = None,
|
||||
fetch_from_api: bool = False,
|
||||
ods_dump_dirs: dict[str, str] | None = None,
|
||||
use_local_json: bool = False,
|
||||
verify_tables: list[str] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""对指定层执行后置校验(从原 _run_layer_verification 迁移)。"""
|
||||
try:
|
||||
from tasks.verification import get_verifier_for_layer, build_window_segments
|
||||
except ImportError:
|
||||
self.logger.warning("校验框架未安装,跳过后置校验")
|
||||
return {"status": "SKIPPED", "message": "校验框架未安装"}
|
||||
|
||||
total_tables = 0
|
||||
consistent_tables = 0
|
||||
total_backfilled = 0
|
||||
total_error_tables = 0
|
||||
layer_results: dict[str, Any] = {}
|
||||
skip_ods_on_fetch = bool(self.config.get("verification.skip_ods_when_fetch_before_verify", True))
|
||||
ods_dump_dirs = ods_dump_dirs or {}
|
||||
|
||||
segments = build_window_segments(window_start, window_end, window_split)
|
||||
|
||||
for layer in layers:
|
||||
try:
|
||||
if layer.upper() == "ODS" and fetch_from_api and skip_ods_on_fetch:
|
||||
self.logger.info("ODS 层在 fetch_before_verify 下已完成入库,跳过二次校验")
|
||||
layer_results[layer] = {
|
||||
"status": "SKIPPED",
|
||||
"reason": "fetch_before_verify",
|
||||
}
|
||||
continue
|
||||
|
||||
if layer.upper() == "ODS" and fetch_from_api:
|
||||
if use_local_json:
|
||||
if not ods_dump_dirs:
|
||||
self.logger.warning("ODS 校验配置为使用本地 JSON,但未找到 dump 目录,跳过 ODS 校验")
|
||||
layer_results[layer] = {
|
||||
"status": "SKIPPED",
|
||||
"reason": "local_json_missing",
|
||||
}
|
||||
continue
|
||||
verifier = get_verifier_for_layer(
|
||||
layer,
|
||||
self.db_conn,
|
||||
self.logger,
|
||||
api_client=self.api_client,
|
||||
fetch_from_api=True,
|
||||
local_dump_dirs=ods_dump_dirs,
|
||||
use_local_json=True,
|
||||
)
|
||||
self.logger.info("ODS 层使用本地 JSON 校验(不请求 API)")
|
||||
else:
|
||||
verifier = get_verifier_for_layer(
|
||||
layer,
|
||||
self.db_conn,
|
||||
self.logger,
|
||||
api_client=self.api_client,
|
||||
fetch_from_api=True,
|
||||
)
|
||||
self.logger.info("ODS 层启用 API 数据校验")
|
||||
else:
|
||||
verifier_kwargs: dict[str, Any] = {}
|
||||
if layer.upper() == "INDEX":
|
||||
try:
|
||||
lookback_days = int(self.config.get("run.index_lookback_days", 60))
|
||||
except (TypeError, ValueError):
|
||||
lookback_days = 60
|
||||
verifier_kwargs = {
|
||||
"lookback_days": lookback_days,
|
||||
"config": self.config,
|
||||
}
|
||||
self.logger.info("INDEX 层校验使用回溯天数: %s", lookback_days)
|
||||
if layer.upper() == "DWD":
|
||||
verifier_kwargs["config"] = self.config
|
||||
verifier = get_verifier_for_layer(
|
||||
layer,
|
||||
self.db_conn,
|
||||
self.logger,
|
||||
**verifier_kwargs,
|
||||
)
|
||||
|
||||
# 使用 filter_verify_tables 替代原内联静态方法
|
||||
layer_tables = filter_verify_tables(layer, verify_tables)
|
||||
if verify_tables and not layer_tables:
|
||||
self.logger.info("层 %s 无匹配表,跳过校验", layer)
|
||||
layer_results[layer] = {
|
||||
"status": "SKIPPED",
|
||||
"reason": "table_filter",
|
||||
}
|
||||
continue
|
||||
|
||||
self.logger.info("开始校验层: %s,时间窗口: %s ~ %s", layer, window_start, window_end)
|
||||
|
||||
layer_summary = verifier.verify_and_backfill(
|
||||
window_start=window_start,
|
||||
window_end=window_end,
|
||||
auto_backfill=True,
|
||||
split_unit=window_split or "month",
|
||||
tables=layer_tables,
|
||||
)
|
||||
|
||||
layer_results[layer] = layer_summary.to_dict() if hasattr(layer_summary, 'to_dict') else {}
|
||||
|
||||
if hasattr(layer_summary, 'total_tables'):
|
||||
total_tables += layer_summary.total_tables
|
||||
consistent_tables += layer_summary.consistent_tables
|
||||
total_backfilled += layer_summary.total_backfilled
|
||||
total_error_tables += getattr(layer_summary, 'error_tables', 0)
|
||||
|
||||
self.logger.info(
|
||||
"层 %s 校验完成: 表数=%d, 一致=%d, 错误=%d, 补齐=%d",
|
||||
layer,
|
||||
getattr(layer_summary, 'total_tables', 0),
|
||||
getattr(layer_summary, 'consistent_tables', 0),
|
||||
getattr(layer_summary, 'error_tables', 0),
|
||||
getattr(layer_summary, 'total_backfilled', 0),
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
self.logger.error("层 %s 校验失败: %s", layer, exc, exc_info=True)
|
||||
layer_results[layer] = {"status": "ERROR", "error": str(exc)}
|
||||
|
||||
return {
|
||||
"status": "COMPLETED",
|
||||
"total_tables": total_tables,
|
||||
"consistent_tables": consistent_tables,
|
||||
"total_backfilled": total_backfilled,
|
||||
"error_tables": total_error_tables,
|
||||
"layers": layer_results,
|
||||
}
|
||||
Reference in New Issue
Block a user