在前后端开发联调前的提交 20260223
This commit is contained in:
@@ -4,18 +4,21 @@
|
||||
从原 ETLScheduler 中提取 Flow 编排逻辑,委托 TaskExecutor 执行具体任务。
|
||||
所有依赖通过构造函数注入,不自行创建资源。
|
||||
|
||||
术语说明:代码中保留 pipeline 参数名以兼容调用方,概念上统一使用 Flow。
|
||||
术语说明:统一使用 Flow 概念,pipeline 参数已移除。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from tasks.verification import filter_verify_tables
|
||||
from orchestration.topological_sort import topological_sort
|
||||
from utils.timer import EtlTimer
|
||||
|
||||
|
||||
class FlowRunner:
|
||||
@@ -49,9 +52,10 @@ class FlowRunner:
|
||||
self.logger = logger
|
||||
self.tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
|
||||
|
||||
# CHANGE [2026-02-20] intent: 移除 pipeline 参数,统一使用 flow(消除历史别名)
|
||||
def run(
|
||||
self,
|
||||
pipeline: str | None = None,
|
||||
flow: str | None = None,
|
||||
layers: list[str] | None = None,
|
||||
processing_mode: str = "increment_only",
|
||||
data_source: str = "hybrid",
|
||||
@@ -65,9 +69,9 @@ class FlowRunner:
|
||||
"""执行 Flow,返回汇总结果。
|
||||
|
||||
Args:
|
||||
pipeline: Flow 名称 (api_ods, api_ods_dwd, api_full, ...),与 layers 二选一(参数名保留以兼容调用方)
|
||||
flow: Flow 名称 (api_ods, api_ods_dwd, api_full, ...),与 layers 二选一
|
||||
layers: 直接指定层列表 (["ODS", "DWD"] 等),与 flow 名称二选一
|
||||
processing_mode: 处理模式 (increment_only / verify_only / increment_verify)
|
||||
processing_mode: 处理模式 (increment_only / verify_only / increment_verify / full_window)
|
||||
data_source: 数据源模式 (online / offline / hybrid)
|
||||
window_start: 时间窗口开始
|
||||
window_end: 时间窗口结束
|
||||
@@ -77,16 +81,16 @@ class FlowRunner:
|
||||
verify_tables: 指定校验的表名列表(可用于单表验证)
|
||||
|
||||
Returns:
|
||||
执行结果字典,包含 status / pipeline / layers / results / verification_summary
|
||||
执行结果字典,包含 status / flow / layers / results / verification_summary
|
||||
"""
|
||||
from utils.task_logger import TaskLogger
|
||||
|
||||
# 解析层列表:Flow 名称查找 或 直接使用 layers 参数
|
||||
if pipeline is not None:
|
||||
if pipeline not in self.FLOW_LAYERS:
|
||||
raise ValueError(f"无效的 Flow 名称: {pipeline}")
|
||||
resolved_layers = self.FLOW_LAYERS[pipeline]
|
||||
run_label = pipeline
|
||||
if flow is not None:
|
||||
if flow not in self.FLOW_LAYERS:
|
||||
raise ValueError(f"无效的 Flow 名称: {flow}")
|
||||
resolved_layers = self.FLOW_LAYERS[flow]
|
||||
run_label = flow
|
||||
elif layers is not None:
|
||||
resolved_layers = layers
|
||||
run_label = f"layers({','.join(layers)})"
|
||||
@@ -97,6 +101,12 @@ class FlowRunner:
|
||||
flow_logger = TaskLogger(f"FLOW_{run_label.upper()}", self.logger)
|
||||
flow_logger.start(f"开始执行 Flow: {run_label}")
|
||||
|
||||
# CHANGE [2026-02-20] intent: 集成 EtlTimer 到 Flow 执行流程,记录每个任务的耗时(需求 15.1, 15.2, 15.3)
|
||||
# assumptions: EtlTimer 已通过单元测试,输出路径由 ETL_REPORT_ROOT 环境变量控制
|
||||
# prompt: "将 EtlTimer 集成到 FlowRunner.run()"
|
||||
timer = EtlTimer(tz=self.tz)
|
||||
timer.start()
|
||||
|
||||
layers = resolved_layers
|
||||
results: list[dict[str, Any]] = []
|
||||
verification_summary: dict[str, Any] | None = None
|
||||
@@ -119,12 +129,16 @@ class FlowRunner:
|
||||
ods_tasks = [t for t in task_codes if t.startswith("ODS_")]
|
||||
if ods_tasks:
|
||||
self.logger.info("从 API 获取数据: %s", ods_tasks)
|
||||
timer.start_step("FETCH_BEFORE_VERIFY")
|
||||
results = self.task_executor.run_tasks(ods_tasks, data_source=data_source)
|
||||
timer.stop_step("FETCH_BEFORE_VERIFY")
|
||||
else:
|
||||
auto_tasks = self._resolve_tasks(["ODS"])
|
||||
if auto_tasks:
|
||||
self.logger.info("从 API 获取数据: %s", auto_tasks)
|
||||
timer.start_step("FETCH_BEFORE_VERIFY")
|
||||
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
|
||||
timer.stop_step("FETCH_BEFORE_VERIFY")
|
||||
|
||||
ods_dump_dirs = {
|
||||
r.get("task_code"): r.get("dump_dir")
|
||||
@@ -150,15 +164,18 @@ class FlowRunner:
|
||||
# 增量 ETL(increment_only 或 increment_verify)
|
||||
self.logger.info("Flow %s: 执行增量 ETL,层=%s", run_label, layers)
|
||||
|
||||
timer.start_step("INCREMENT_ETL")
|
||||
if task_codes:
|
||||
results = self.task_executor.run_tasks(task_codes, data_source=data_source)
|
||||
else:
|
||||
auto_tasks = self._resolve_tasks(layers)
|
||||
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
|
||||
timer.stop_step("INCREMENT_ETL")
|
||||
|
||||
# increment_verify 模式:增量后执行校验
|
||||
if processing_mode == "increment_verify":
|
||||
self.logger.info("Flow %s: 开始校验并修复", run_label)
|
||||
timer.start_step("VERIFICATION")
|
||||
verification_summary = self._run_verification(
|
||||
layers=layers,
|
||||
window_start=window_start,
|
||||
@@ -168,14 +185,36 @@ class FlowRunner:
|
||||
use_local_json=use_local_json,
|
||||
verify_tables=verify_tables,
|
||||
)
|
||||
timer.stop_step("VERIFICATION")
|
||||
flow_logger.set_verification_result(verification_summary)
|
||||
|
||||
# 汇总计数
|
||||
# CHANGE [2026-02-20] intent: 集成 ConsistencyChecker 到 Flow 执行流程,ETL 完成后自动运行一致性检查(需求 16.1, 16.4)
|
||||
# assumptions: ConsistencyChecker 已通过单元测试,报告输出到 ETL_REPORT_ROOT
|
||||
# prompt: "将 ConsistencyChecker 集成到 FlowRunner.run()"
|
||||
consistency_report_path = self._run_post_consistency_check(timer)
|
||||
|
||||
# 输出计时报告
|
||||
try:
|
||||
timing_report = timer.finish(write_report=True)
|
||||
self.logger.info("计时报告已生成")
|
||||
except KeyError as ke:
|
||||
self.logger.warning("计时报告输出跳过(环境变量缺失): %s", ke)
|
||||
timing_report = timer.finish(write_report=False)
|
||||
|
||||
# 汇总计数 — CHANGE 2026-02-21: BUG 11 fix — errors 可能是 list,需类型安全处理
|
||||
def _safe_int(val) -> int:
|
||||
"""将 int/list/None 统一转为 int 计数。"""
|
||||
if isinstance(val, int):
|
||||
return val
|
||||
if isinstance(val, list):
|
||||
return len(val)
|
||||
return 0
|
||||
|
||||
flow_logger.set_counts(
|
||||
fetched=sum(r.get("counts", {}).get("fetched", 0) for r in results),
|
||||
inserted=sum(r.get("counts", {}).get("inserted", 0) for r in results),
|
||||
updated=sum(r.get("counts", {}).get("updated", 0) for r in results),
|
||||
errors=sum(r.get("counts", {}).get("errors", 0) for r in results),
|
||||
fetched=sum(_safe_int(r.get("counts", {}).get("fetched", 0)) for r in results),
|
||||
inserted=sum(_safe_int(r.get("counts", {}).get("inserted", 0)) for r in results),
|
||||
updated=sum(_safe_int(r.get("counts", {}).get("updated", 0)) for r in results),
|
||||
errors=sum(_safe_int(r.get("counts", {}).get("errors", 0)) for r in results),
|
||||
)
|
||||
|
||||
summary_text = flow_logger.end(status="成功")
|
||||
@@ -183,17 +222,66 @@ class FlowRunner:
|
||||
|
||||
return {
|
||||
"status": "SUCCESS",
|
||||
"pipeline": run_label,
|
||||
"flow": run_label,
|
||||
"layers": layers,
|
||||
"results": results,
|
||||
"verification_summary": verification_summary,
|
||||
"consistency_report_path": consistency_report_path,
|
||||
}
|
||||
|
||||
except Exception as exc:
|
||||
# 异常时也尝试输出计时报告(便于排查耗时瓶颈)
|
||||
try:
|
||||
timer.finish(write_report=True)
|
||||
except Exception:
|
||||
pass
|
||||
summary_text = flow_logger.end(status="失败", error_message=str(exc))
|
||||
self.logger.error("\n%s", summary_text)
|
||||
raise
|
||||
|
||||
def _run_post_consistency_check(self, timer: EtlTimer) -> str | None:
|
||||
"""ETL 完成后运行数据一致性检查,输出黑盒测试报告。
|
||||
|
||||
返回报告文件路径,失败时返回 None(不阻断主流程)。
|
||||
"""
|
||||
try:
|
||||
from quality.consistency_checker import (
|
||||
run_consistency_check,
|
||||
write_consistency_report,
|
||||
)
|
||||
except ImportError:
|
||||
self.logger.warning("一致性检查模块未安装,跳过")
|
||||
return None
|
||||
|
||||
timer.start_step("CONSISTENCY_CHECK")
|
||||
try:
|
||||
api_sample_dir_str = os.environ.get("API_SAMPLE_CACHE_ROOT")
|
||||
api_sample_dir = Path(api_sample_dir_str) if api_sample_dir_str else None
|
||||
|
||||
report = run_consistency_check(
|
||||
self.db_conn,
|
||||
api_sample_dir=api_sample_dir,
|
||||
include_api_vs_ods=bool(api_sample_dir),
|
||||
include_ods_vs_dwd=True,
|
||||
tz=self.tz,
|
||||
)
|
||||
|
||||
report_path = write_consistency_report(report)
|
||||
self.logger.info("一致性检查报告已生成: %s", report_path)
|
||||
return report_path
|
||||
|
||||
except KeyError as ke:
|
||||
self.logger.warning("一致性检查报告输出跳过(环境变量缺失): %s", ke)
|
||||
return None
|
||||
except Exception as exc:
|
||||
self.logger.warning("一致性检查失败(不阻断主流程): %s", exc, exc_info=True)
|
||||
return None
|
||||
finally:
|
||||
try:
|
||||
timer.stop_step("CONSISTENCY_CHECK")
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
def _resolve_tasks(self, layers: list[str]) -> list[str]:
|
||||
"""根据层列表解析任务代码。
|
||||
|
||||
|
||||
Reference in New Issue
Block a user