在前后端开发联调前的提交(2026-02-23)

This commit is contained in:
Neo
2026-02-23 23:02:20 +08:00
parent 254ccb1e77
commit fafc95e64c
1142 changed files with 10366960 additions and 36957 deletions

View File

@@ -4,18 +4,21 @@
从原 ETLScheduler 中提取 Flow 编排逻辑,委托 TaskExecutor 执行具体任务。
所有依赖通过构造函数注入,不自行创建资源。
术语说明:代码中保留 pipeline 参数名以兼容调用方,概念上统一使用 Flow。
术语说明:统一使用 Flow 概念,pipeline 参数已移除。
"""
from __future__ import annotations
import logging
import os
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional
from zoneinfo import ZoneInfo
from tasks.verification import filter_verify_tables
from orchestration.topological_sort import topological_sort
from utils.timer import EtlTimer
class FlowRunner:
@@ -49,9 +52,10 @@ class FlowRunner:
self.logger = logger
self.tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
# CHANGE [2026-02-20] intent: 移除 pipeline 参数,统一使用 flow消除历史别名
def run(
self,
pipeline: str | None = None,
flow: str | None = None,
layers: list[str] | None = None,
processing_mode: str = "increment_only",
data_source: str = "hybrid",
@@ -65,9 +69,9 @@ class FlowRunner:
"""执行 Flow返回汇总结果。
Args:
pipeline: Flow 名称 (api_ods, api_ods_dwd, api_full, ...),与 layers 二选一(参数名保留以兼容调用方)
flow: Flow 名称 (api_ods, api_ods_dwd, api_full, ...),与 layers 二选一
layers: 直接指定层列表 (["ODS", "DWD"] 等),与 flow 名称二选一
processing_mode: 处理模式 (increment_only / verify_only / increment_verify)
processing_mode: 处理模式 (increment_only / verify_only / increment_verify / full_window)
data_source: 数据源模式 (online / offline / hybrid)
window_start: 时间窗口开始
window_end: 时间窗口结束
@@ -77,16 +81,16 @@ class FlowRunner:
verify_tables: 指定校验的表名列表(可用于单表验证)
Returns:
执行结果字典,包含 status / pipeline / layers / results / verification_summary
执行结果字典,包含 status / flow / layers / results / verification_summary
"""
from utils.task_logger import TaskLogger
# 解析层列表Flow 名称查找 或 直接使用 layers 参数
if pipeline is not None:
if pipeline not in self.FLOW_LAYERS:
raise ValueError(f"无效的 Flow 名称: {pipeline}")
resolved_layers = self.FLOW_LAYERS[pipeline]
run_label = pipeline
if flow is not None:
if flow not in self.FLOW_LAYERS:
raise ValueError(f"无效的 Flow 名称: {flow}")
resolved_layers = self.FLOW_LAYERS[flow]
run_label = flow
elif layers is not None:
resolved_layers = layers
run_label = f"layers({','.join(layers)})"
@@ -97,6 +101,12 @@ class FlowRunner:
flow_logger = TaskLogger(f"FLOW_{run_label.upper()}", self.logger)
flow_logger.start(f"开始执行 Flow: {run_label}")
# CHANGE [2026-02-20] intent: 集成 EtlTimer 到 Flow 执行流程,记录每个任务的耗时(需求 15.1, 15.2, 15.3)
# assumptions: EtlTimer 已通过单元测试,输出路径由 ETL_REPORT_ROOT 环境变量控制
# prompt: "将 EtlTimer 集成到 FlowRunner.run()"
timer = EtlTimer(tz=self.tz)
timer.start()
layers = resolved_layers
results: list[dict[str, Any]] = []
verification_summary: dict[str, Any] | None = None
@@ -119,12 +129,16 @@ class FlowRunner:
ods_tasks = [t for t in task_codes if t.startswith("ODS_")]
if ods_tasks:
self.logger.info("从 API 获取数据: %s", ods_tasks)
timer.start_step("FETCH_BEFORE_VERIFY")
results = self.task_executor.run_tasks(ods_tasks, data_source=data_source)
timer.stop_step("FETCH_BEFORE_VERIFY")
else:
auto_tasks = self._resolve_tasks(["ODS"])
if auto_tasks:
self.logger.info("从 API 获取数据: %s", auto_tasks)
timer.start_step("FETCH_BEFORE_VERIFY")
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
timer.stop_step("FETCH_BEFORE_VERIFY")
ods_dump_dirs = {
r.get("task_code"): r.get("dump_dir")
@@ -150,15 +164,18 @@ class FlowRunner:
# 增量 ETLincrement_only 或 increment_verify
self.logger.info("Flow %s: 执行增量 ETL层=%s", run_label, layers)
timer.start_step("INCREMENT_ETL")
if task_codes:
results = self.task_executor.run_tasks(task_codes, data_source=data_source)
else:
auto_tasks = self._resolve_tasks(layers)
results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)
timer.stop_step("INCREMENT_ETL")
# increment_verify 模式:增量后执行校验
if processing_mode == "increment_verify":
self.logger.info("Flow %s: 开始校验并修复", run_label)
timer.start_step("VERIFICATION")
verification_summary = self._run_verification(
layers=layers,
window_start=window_start,
@@ -168,14 +185,36 @@ class FlowRunner:
use_local_json=use_local_json,
verify_tables=verify_tables,
)
timer.stop_step("VERIFICATION")
flow_logger.set_verification_result(verification_summary)
# 汇总计数
# CHANGE [2026-02-20] intent: 集成 ConsistencyChecker 到 Flow 执行流程,ETL 完成后自动运行一致性检查(需求 16.1, 16.4)
# assumptions: ConsistencyChecker 已通过单元测试,报告输出到 ETL_REPORT_ROOT
# prompt: "将 ConsistencyChecker 集成到 FlowRunner.run()"
consistency_report_path = self._run_post_consistency_check(timer)
# 输出计时报告
try:
timing_report = timer.finish(write_report=True)
self.logger.info("计时报告已生成")
except KeyError as ke:
self.logger.warning("计时报告输出跳过(环境变量缺失): %s", ke)
timing_report = timer.finish(write_report=False)
# 汇总计数 — CHANGE 2026-02-21: BUG 11 fix — errors 可能是 list需类型安全处理
def _safe_int(val) -> int:
"""将 int/list/None 统一转为 int 计数。"""
if isinstance(val, int):
return val
if isinstance(val, list):
return len(val)
return 0
flow_logger.set_counts(
fetched=sum(r.get("counts", {}).get("fetched", 0) for r in results),
inserted=sum(r.get("counts", {}).get("inserted", 0) for r in results),
updated=sum(r.get("counts", {}).get("updated", 0) for r in results),
errors=sum(r.get("counts", {}).get("errors", 0) for r in results),
fetched=sum(_safe_int(r.get("counts", {}).get("fetched", 0)) for r in results),
inserted=sum(_safe_int(r.get("counts", {}).get("inserted", 0)) for r in results),
updated=sum(_safe_int(r.get("counts", {}).get("updated", 0)) for r in results),
errors=sum(_safe_int(r.get("counts", {}).get("errors", 0)) for r in results),
)
summary_text = flow_logger.end(status="成功")
@@ -183,17 +222,66 @@ class FlowRunner:
return {
"status": "SUCCESS",
"pipeline": run_label,
"flow": run_label,
"layers": layers,
"results": results,
"verification_summary": verification_summary,
"consistency_report_path": consistency_report_path,
}
except Exception as exc:
# 异常时也尝试输出计时报告(便于排查耗时瓶颈)
try:
timer.finish(write_report=True)
except Exception:
pass
summary_text = flow_logger.end(status="失败", error_message=str(exc))
self.logger.error("\n%s", summary_text)
raise
def _run_post_consistency_check(self, timer: EtlTimer) -> str | None:
"""ETL 完成后运行数据一致性检查,输出黑盒测试报告。
返回报告文件路径,失败时返回 None不阻断主流程
"""
try:
from quality.consistency_checker import (
run_consistency_check,
write_consistency_report,
)
except ImportError:
self.logger.warning("一致性检查模块未安装,跳过")
return None
timer.start_step("CONSISTENCY_CHECK")
try:
api_sample_dir_str = os.environ.get("API_SAMPLE_CACHE_ROOT")
api_sample_dir = Path(api_sample_dir_str) if api_sample_dir_str else None
report = run_consistency_check(
self.db_conn,
api_sample_dir=api_sample_dir,
include_api_vs_ods=bool(api_sample_dir),
include_ods_vs_dwd=True,
tz=self.tz,
)
report_path = write_consistency_report(report)
self.logger.info("一致性检查报告已生成: %s", report_path)
return report_path
except KeyError as ke:
self.logger.warning("一致性检查报告输出跳过(环境变量缺失): %s", ke)
return None
except Exception as exc:
self.logger.warning("一致性检查失败(不阻断主流程): %s", exc, exc_info=True)
return None
finally:
try:
timer.stop_step("CONSISTENCY_CHECK")
except KeyError:
pass
def _resolve_tasks(self, layers: list[str]) -> list[str]:
"""根据层列表解析任务代码。