在前后端开发联调前的提交(2026-02-23)

This commit is contained in:
Neo
2026-02-23 23:02:20 +08:00
parent 254ccb1e77
commit fafc95e64c
1142 changed files with 10366960 additions and 36957 deletions

View File

@@ -4,18 +4,21 @@
从原 ETLScheduler 中提取 Flow 编排逻辑,委托 TaskExecutor 执行具体任务。
所有依赖通过构造函数注入,不自行创建资源。
术语说明:代码中保留 pipeline 参数名以兼容调用方,概念上统一使用 Flow。
术语说明:统一使用 Flow 概念,pipeline 参数已移除。
"""
from __future__ import annotations
import logging
import os
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional
from zoneinfo import ZoneInfo
from tasks.verification import filter_verify_tables
from orchestration.topological_sort import topological_sort
from utils.timer import EtlTimer
class FlowRunner:
@@ -49,9 +52,10 @@ class FlowRunner:
self.logger = logger
self.tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
# CHANGE [2026-02-20] intent: 移除 pipeline 参数,统一使用 flow消除历史别名
def run(
self,
pipeline: str | None = None,
flow: str | None = None,
layers: list[str] | None = None,
processing_mode: str = "increment_only",
data_source: str = "hybrid",
@@ -65,9 +69,9 @@ class FlowRunner:
"""执行 Flow返回汇总结果。
Args:
pipeline: Flow 名称 (api_ods, api_ods_dwd, api_full, ...),与 layers 二选一(参数名保留以兼容调用方)
flow: Flow 名称 (api_ods, api_ods_dwd, api_full, ...),与 layers 二选一
layers: 直接指定层列表 (["ODS", "DWD"] 等),与 flow 名称二选一
processing_mode: 处理模式 (increment_only / verify_only / increment_verify)
processing_mode: 处理模式 (increment_only / verify_only / increment_verify / full_window)
data_source: 数据源模式 (online / offline / hybrid)
window_start: 时间窗口开始
window_end: 时间窗口结束
@@ -77,16 +81,16 @@ class FlowRunner:
verify_tables: 指定校验的表名列表(可用于单表验证)
Returns:
执行结果字典,包含 status / pipeline / layers / results / verification_summary
执行结果字典,包含 status / flow / layers / results / verification_summary
"""
from utils.task_logger import TaskLogger
# 解析层列表Flow 名称查找 或 直接使用 layers 参数
if pipeline is not None:
if pipeline not in self.FLOW_LAYERS:
raise ValueError(f"无效的 Flow 名称: {pipeline}")
resolved_layers = self.FLOW_LAYERS[pipeline]
run_label = pipeline
if flow is not None:
if flow not in self.FLOW_LAYERS:
raise ValueError(f"无效的 Flow 名称: {flow}")
resolved_layers = self.FLOW_LAYERS[flow]
run_label = flow
elif layers is not None:
resolved_layers = layers
run_label = f"layers({','.join(layers)})"
@@ -97,6 +101,12 @@ class FlowRunner:
flow_logger = TaskLogger(f"FLOW_{run_label.upper()}", self.logger)
flow_logger.start(f"开始执行 Flow: {run_label}")
# CHANGE [2026-02-20] intent: 集成 EtlTimer 到 Flow 执行流程,记录每个任务的耗时(需求 15.1, 15.2, 15.3)
# assumptions: EtlTimer 已通过单元测试,输出路径由 ETL_REPORT_ROOT 环境变量控制
# prompt: "将 EtlTimer 集成到 FlowRunner.run()"
timer = EtlTimer(tz=self.tz)
timer.start()
layers = resolved_layers
results: list[dict[str, Any]] = []
verification_summary: dict[str, Any] | None = None
@@ -119,12 +129,16 @@ class FlowRunner:
ods_tasks = [t for t in task_codes if t.startswith("ODS_")]
if ods_tasks:
self.logger.info("从 API 获取数据: %s", ods_tasks)
timer.start_step("FETCH_BEFORE_VERIFY")
results = self.task_executor.run_tasks(ods_tasks, data_source=data_source)
timer.stop_step("FETCH_BEFORE_VERIFY")
else:
auto_tasks = self._resolve_tasks(["ODS"])
if auto_tasks:
self.logger.info("从 API 获取数据: %s", auto_tasks)
timer.start_step("FETCH_BEFORE_VERIFY")
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
timer.stop_step("FETCH_BEFORE_VERIFY")
ods_dump_dirs = {
r.get("task_code"): r.get("dump_dir")
@@ -150,15 +164,18 @@ class FlowRunner:
# 增量 ETLincrement_only 或 increment_verify
self.logger.info("Flow %s: 执行增量 ETL层=%s", run_label, layers)
timer.start_step("INCREMENT_ETL")
if task_codes:
results = self.task_executor.run_tasks(task_codes, data_source=data_source)
else:
auto_tasks = self._resolve_tasks(layers)
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
timer.stop_step("INCREMENT_ETL")
# increment_verify 模式:增量后执行校验
if processing_mode == "increment_verify":
self.logger.info("Flow %s: 开始校验并修复", run_label)
timer.start_step("VERIFICATION")
verification_summary = self._run_verification(
layers=layers,
window_start=window_start,
@@ -168,14 +185,36 @@ class FlowRunner:
use_local_json=use_local_json,
verify_tables=verify_tables,
)
timer.stop_step("VERIFICATION")
flow_logger.set_verification_result(verification_summary)
# 汇总计数
# CHANGE [2026-02-20] intent: 集成 ConsistencyChecker 到 Flow 执行流程,ETL 完成后自动运行一致性检查(需求 16.1, 16.4)
# assumptions: ConsistencyChecker 已通过单元测试,报告输出到 ETL_REPORT_ROOT
# prompt: "将 ConsistencyChecker 集成到 FlowRunner.run()"
consistency_report_path = self._run_post_consistency_check(timer)
# 输出计时报告
try:
timing_report = timer.finish(write_report=True)
self.logger.info("计时报告已生成")
except KeyError as ke:
self.logger.warning("计时报告输出跳过(环境变量缺失): %s", ke)
timing_report = timer.finish(write_report=False)
# 汇总计数 — CHANGE 2026-02-21: BUG 11 fix — errors 可能是 list需类型安全处理
def _safe_int(val) -> int:
"""将 int/list/None 统一转为 int 计数。"""
if isinstance(val, int):
return val
if isinstance(val, list):
return len(val)
return 0
flow_logger.set_counts(
fetched=sum(r.get("counts", {}).get("fetched", 0) for r in results),
inserted=sum(r.get("counts", {}).get("inserted", 0) for r in results),
updated=sum(r.get("counts", {}).get("updated", 0) for r in results),
errors=sum(r.get("counts", {}).get("errors", 0) for r in results),
fetched=sum(_safe_int(r.get("counts", {}).get("fetched", 0)) for r in results),
inserted=sum(_safe_int(r.get("counts", {}).get("inserted", 0)) for r in results),
updated=sum(_safe_int(r.get("counts", {}).get("updated", 0)) for r in results),
errors=sum(_safe_int(r.get("counts", {}).get("errors", 0)) for r in results),
)
summary_text = flow_logger.end(status="成功")
@@ -183,17 +222,66 @@ class FlowRunner:
return {
"status": "SUCCESS",
"pipeline": run_label,
"flow": run_label,
"layers": layers,
"results": results,
"verification_summary": verification_summary,
"consistency_report_path": consistency_report_path,
}
except Exception as exc:
# 异常时也尝试输出计时报告(便于排查耗时瓶颈)
try:
timer.finish(write_report=True)
except Exception:
pass
summary_text = flow_logger.end(status="失败", error_message=str(exc))
self.logger.error("\n%s", summary_text)
raise
def _run_post_consistency_check(self, timer: EtlTimer) -> str | None:
"""ETL 完成后运行数据一致性检查,输出黑盒测试报告。
返回报告文件路径,失败时返回 None不阻断主流程
"""
try:
from quality.consistency_checker import (
run_consistency_check,
write_consistency_report,
)
except ImportError:
self.logger.warning("一致性检查模块未安装,跳过")
return None
timer.start_step("CONSISTENCY_CHECK")
try:
api_sample_dir_str = os.environ.get("API_SAMPLE_CACHE_ROOT")
api_sample_dir = Path(api_sample_dir_str) if api_sample_dir_str else None
report = run_consistency_check(
self.db_conn,
api_sample_dir=api_sample_dir,
include_api_vs_ods=bool(api_sample_dir),
include_ods_vs_dwd=True,
tz=self.tz,
)
report_path = write_consistency_report(report)
self.logger.info("一致性检查报告已生成: %s", report_path)
return report_path
except KeyError as ke:
self.logger.warning("一致性检查报告输出跳过(环境变量缺失): %s", ke)
return None
except Exception as exc:
self.logger.warning("一致性检查失败(不阻断主流程): %s", exc, exc_info=True)
return None
finally:
try:
timer.stop_step("CONSISTENCY_CHECK")
except KeyError:
pass
def _resolve_tasks(self, layers: list[str]) -> list[str]:
"""根据层列表解析任务代码。