在准备环境前提交此次全部更改。
This commit is contained in:
62
apps/etl/connectors/feiqiu/orchestration/cursor_manager.py
Normal file
62
apps/etl/connectors/feiqiu/orchestration/cursor_manager.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""游标管理器"""
|
||||
from datetime import datetime
|
||||
|
||||
class CursorManager:
    """Manage ETL cursors (watermarks) persisted in ``meta.etl_cursor``."""

    def __init__(self, db_connection):
        # Injected connection exposing query() / execute() / commit().
        self.db = db_connection

    def get_or_create(self, task_id: int, store_id: int) -> dict:
        """Return the cursor row for (task_id, store_id), creating an empty one if absent."""
        select_sql = "SELECT * FROM meta.etl_cursor WHERE task_id=%s AND store_id=%s"
        key = (task_id, store_id)

        existing = self.db.query(select_sql, key)
        if existing:
            return existing[0]

        # No cursor yet: insert a blank watermark row, then re-read it.
        self.db.execute(
            """
            INSERT INTO meta.etl_cursor(task_id, store_id, last_start, last_end, last_id, extra)
            VALUES(%s, %s, NULL, NULL, NULL, '{}'::jsonb)
            """,
            key,
        )
        self.db.commit()

        created = self.db.query(select_sql, key)
        return created[0] if created else None

    def advance(self, task_id: int, store_id: int, window_start: datetime,
                window_end: datetime, run_id: int, last_id: int = None):
        """Move the cursor watermark forward after a successful run.

        When ``last_id`` is given, the stored id only ever grows
        (GREATEST over the existing value, treating NULL as 0).
        """
        if last_id is None:
            sql = """
                UPDATE meta.etl_cursor
                SET last_start = %s,
                    last_end = %s,
                    last_run_id = %s,
                    updated_at = now()
                WHERE task_id = %s AND store_id = %s
            """
            params = (window_start, window_end, run_id, task_id, store_id)
        else:
            sql = """
                UPDATE meta.etl_cursor
                SET last_start = %s,
                    last_end = %s,
                    last_id = GREATEST(COALESCE(last_id, 0), %s),
                    last_run_id = %s,
                    updated_at = now()
                WHERE task_id = %s AND store_id = %s
            """
            params = (window_start, window_end, last_id, run_id, task_id, store_id)

        self.db.execute(sql, params)
        self.db.commit()
|
||||
391
apps/etl/connectors/feiqiu/orchestration/flow_runner.py
Normal file
391
apps/etl/connectors/feiqiu/orchestration/flow_runner.py
Normal file
@@ -0,0 +1,391 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Flow 运行器:Flow 定义、层→任务映射、校验编排。
|
||||
|
||||
从原 ETLScheduler 中提取 Flow 编排逻辑,委托 TaskExecutor 执行具体任务。
|
||||
所有依赖通过构造函数注入,不自行创建资源。
|
||||
|
||||
术语说明:代码中保留 pipeline 参数名以兼容调用方,概念上统一使用 Flow。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List, Optional
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from tasks.verification import filter_verify_tables
|
||||
from orchestration.topological_sort import topological_sort
|
||||
|
||||
|
||||
class FlowRunner:
    """Flow orchestrator: runs the ETL tasks of each layer in a Flow and
    optionally performs post-run verification/backfill.

    All collaborators (config, task executor, registry, DB connection, API
    client, logger) are injected via the constructor; no resources are
    created here.
    """

    # Flow definitions: which layers each Flow comprises
    # (migrated from the module-level constant in scheduler.py).
    FLOW_LAYERS: dict[str, list[str]] = {
        "api_ods": ["ODS"],
        "api_ods_dwd": ["ODS", "DWD"],
        "api_full": ["ODS", "DWD", "DWS", "INDEX"],
        "ods_dwd": ["DWD"],
        "dwd_dws": ["DWS"],
        "dwd_dws_index": ["DWS", "INDEX"],
        "dwd_index": ["INDEX"],
    }

    def __init__(
        self,
        config,
        task_executor,
        task_registry,
        db_conn,
        api_client,
        logger: logging.Logger,
    ):
        self.config = config
        self.task_executor = task_executor
        self.task_registry = task_registry
        self.db_conn = db_conn
        self.api_client = api_client
        self.logger = logger
        # Timezone for default time windows (defaults to Asia/Shanghai).
        self.tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))

    def run(
        self,
        pipeline: str | None = None,
        layers: list[str] | None = None,
        processing_mode: str = "increment_only",
        data_source: str = "hybrid",
        window_start: datetime | None = None,
        window_end: datetime | None = None,
        window_split: str | None = None,
        task_codes: list[str] | None = None,
        fetch_before_verify: bool = False,
        verify_tables: list[str] | None = None,
    ) -> dict[str, Any]:
        """Execute a Flow and return an aggregated result.

        Args:
            pipeline: Flow name (api_ods, api_ods_dwd, api_full, ...); mutually
                exclusive with ``layers`` (parameter name kept for caller
                compatibility).
            layers: Explicit layer list (e.g. ["ODS", "DWD"]); mutually
                exclusive with the flow name.
            processing_mode: increment_only / verify_only / increment_verify.
            data_source: online / offline / hybrid.
            window_start: Start of the time window (defaults to end - 24h).
            window_end: End of the time window (defaults to now).
            window_split: Window split unit (none / day / week / month).
            task_codes: Task codes to run (acts as a filter within the Flow).
            fetch_before_verify: Fetch API data before verifying (only
                meaningful in verify_only mode).
            verify_tables: Restrict verification to these table names.

        Returns:
            Result dict with status / pipeline / layers / results /
            verification_summary.

        Raises:
            ValueError: If neither/both of ``pipeline`` and ``layers`` resolve,
                or ``pipeline`` is unknown.
        """
        from utils.task_logger import TaskLogger

        # Resolve the layer list: Flow-name lookup or the explicit layers arg.
        if pipeline is not None:
            if pipeline not in self.FLOW_LAYERS:
                raise ValueError(f"无效的 Flow 名称: {pipeline}")
            resolved_layers = self.FLOW_LAYERS[pipeline]
            run_label = pipeline
        elif layers is not None:
            resolved_layers = layers
            run_label = f"layers({','.join(layers)})"
        else:
            raise ValueError("必须指定 flow 名称或 layers 参数之一")

        run_uuid = uuid.uuid4().hex
        flow_logger = TaskLogger(f"FLOW_{run_label.upper()}", self.logger)
        flow_logger.start(f"开始执行 Flow: {run_label}")

        layers = resolved_layers
        results: list[dict[str, Any]] = []
        verification_summary: dict[str, Any] | None = None
        # task_code -> dump dir, collected so ODS verification can read local JSON.
        ods_dump_dirs: dict[str, str] = {}
        use_local_json = bool(self.config.get("verification.ods_use_local_json", False))

        # Default time window: the last 24 hours ending now.
        if window_end is None:
            window_end = datetime.now(self.tz)
        if window_start is None:
            window_start = window_end - timedelta(hours=24)

        try:
            if processing_mode == "verify_only":
                # Verification-only mode.
                if fetch_before_verify:
                    self.logger.info("Flow %s: 校验模式(先获取 API 数据)", run_label)

                    if task_codes:
                        # Only ODS tasks actually fetch from the API.
                        ods_tasks = [t for t in task_codes if t.startswith("ODS_")]
                        if ods_tasks:
                            self.logger.info("从 API 获取数据: %s", ods_tasks)
                            results = self.task_executor.run_tasks(ods_tasks, data_source=data_source)
                    else:
                        auto_tasks = self._resolve_tasks(["ODS"])
                        if auto_tasks:
                            self.logger.info("从 API 获取数据: %s", auto_tasks)
                            results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)

                    # Remember where each task dumped its JSON for local verification.
                    ods_dump_dirs = {
                        r.get("task_code"): r.get("dump_dir")
                        for r in results
                        if r.get("task_code") and r.get("dump_dir")
                    }
                    self.logger.info("API 数据获取完成,开始校验并修复")
                else:
                    self.logger.info("Flow %s: 仅校验模式,跳过增量 ETL,直接执行校验并修复", run_label)

                verification_summary = self._run_verification(
                    layers=layers,
                    window_start=window_start,
                    window_end=window_end,
                    window_split=window_split,
                    fetch_from_api=fetch_before_verify,
                    ods_dump_dirs=ods_dump_dirs,
                    use_local_json=use_local_json,
                    verify_tables=verify_tables,
                )
                flow_logger.set_verification_result(verification_summary)
            else:
                # Incremental ETL (increment_only or increment_verify).
                self.logger.info("Flow %s: 执行增量 ETL,层=%s", run_label, layers)

                if task_codes:
                    results = self.task_executor.run_tasks(task_codes, data_source=data_source)
                else:
                    auto_tasks = self._resolve_tasks(layers)
                    results = self.task_executor.run_tasks(auto_tasks, data_source=data_source)

                # increment_verify: run verification after the incremental load.
                if processing_mode == "increment_verify":
                    self.logger.info("Flow %s: 开始校验并修复", run_label)
                    verification_summary = self._run_verification(
                        layers=layers,
                        window_start=window_start,
                        window_end=window_end,
                        window_split=window_split,
                        ods_dump_dirs=ods_dump_dirs,
                        use_local_json=use_local_json,
                        verify_tables=verify_tables,
                    )
                    flow_logger.set_verification_result(verification_summary)

            # Aggregate counters across all task results.
            flow_logger.set_counts(
                fetched=sum(r.get("counts", {}).get("fetched", 0) for r in results),
                inserted=sum(r.get("counts", {}).get("inserted", 0) for r in results),
                updated=sum(r.get("counts", {}).get("updated", 0) for r in results),
                errors=sum(r.get("counts", {}).get("errors", 0) for r in results),
            )

            summary_text = flow_logger.end(status="成功")
            self.logger.info("\n%s", summary_text)

            return {
                "status": "SUCCESS",
                "pipeline": run_label,
                "layers": layers,
                "results": results,
                "verification_summary": verification_summary,
            }

        except Exception as exc:
            summary_text = flow_logger.end(status="失败", error_message=str(exc))
            self.logger.error("\n%s", summary_text)
            raise

    def _resolve_tasks(self, layers: list[str]) -> list[str]:
        """Resolve task codes for the given layers.

        Priority: config value > TaskRegistry.get_tasks_by_layer() > empty
        list (with a warning). The collected tasks are topologically sorted
        so dependents run after their dependencies.
        """
        # CHANGE [2026-07-17] intent: drop all hard-coded fallback lists; go
        # through the Registry uniformly (requirements 7.1, 7.2, 7.3)
        # assumptions: TaskRegistry has every layer's tasks registered; DWD
        # gains a new run.dwd_tasks config key
        # prompt: "统一 _resolve_tasks() 去掉硬编码回退"
        # Config key per layer name.
        _LAYER_CONFIG_KEY = {
            "ODS": "run.ods_tasks",
            "DWD": "run.dwd_tasks",
            "DWS": "run.dws_tasks",
            "INDEX": "run.index_tasks",
        }

        tasks: list[str] = []

        for layer in layers:
            layer_upper = layer.upper()

            # 1. Config override wins.
            config_key = _LAYER_CONFIG_KEY.get(layer_upper)
            if config_key:
                config_tasks = self.config.get(config_key, [])
                if config_tasks:
                    tasks.extend(config_tasks)
                    continue

            # 2. Fall back to the Registry.
            registry_tasks = self.task_registry.get_tasks_by_layer(layer_upper)
            if registry_tasks:
                tasks.extend(registry_tasks)
            else:
                # 3. Registry empty too: warn and skip the layer.
                self.logger.warning(
                    "层 %s 在 Registry 中无已注册任务且无配置覆盖,跳过", layer_upper
                )

        # CHANGE [2026-07-18] intent: topologically sort the collected tasks so
        # dependents come after their dependencies (requirements 8.3, 8.4, 8.5)
        if tasks:
            tasks = topological_sort(tasks, self.task_registry)

        return tasks

    def _run_verification(
        self,
        layers: list[str],
        window_start: datetime,
        window_end: datetime,
        window_split: str | None = None,
        fetch_from_api: bool = False,
        ods_dump_dirs: dict[str, str] | None = None,
        use_local_json: bool = False,
        verify_tables: list[str] | None = None,
    ) -> dict[str, Any]:
        """Run post-ETL verification for the given layers
        (migrated from the former _run_layer_verification).

        Per-layer errors are caught and recorded; the method always returns a
        summary dict rather than raising.
        """
        try:
            from tasks.verification import get_verifier_for_layer, build_window_segments
        except ImportError:
            self.logger.warning("校验框架未安装,跳过后置校验")
            return {"status": "SKIPPED", "message": "校验框架未安装"}

        total_tables = 0
        consistent_tables = 0
        total_backfilled = 0
        total_error_tables = 0
        layer_results: dict[str, Any] = {}
        skip_ods_on_fetch = bool(self.config.get("verification.skip_ods_when_fetch_before_verify", True))
        ods_dump_dirs = ods_dump_dirs or {}

        # NOTE(review): `segments` is not referenced below — verify_and_backfill
        # re-splits via split_unit. Confirm whether this call is still needed.
        segments = build_window_segments(window_start, window_end, window_split)

        for layer in layers:
            try:
                # ODS was already loaded during fetch_before_verify; skip re-check.
                if layer.upper() == "ODS" and fetch_from_api and skip_ods_on_fetch:
                    self.logger.info("ODS 层在 fetch_before_verify 下已完成入库,跳过二次校验")
                    layer_results[layer] = {
                        "status": "SKIPPED",
                        "reason": "fetch_before_verify",
                    }
                    continue

                # CHANGE [2025-07-18] intent: DWS/INDEX layers skip the full
                # integrity check and only log (requirement 6.5)
                # assumptions: DWS/INDEX have no lightweight verifier; skipping
                # is the safest option
                # prompt: "实现 DWS/INDEX 层轻量级校验"
                if layer.upper() in ("DWS", "INDEX"):
                    self.logger.info(
                        "DWS/INDEX 层使用轻量级校验,跳过完整性检查: %s", layer
                    )
                    layer_results[layer] = {
                        "status": "SKIPPED",
                        "reason": "lightweight_dws_index",
                    }
                    continue

                if layer.upper() == "ODS" and fetch_from_api:
                    if use_local_json:
                        if not ods_dump_dirs:
                            self.logger.warning("ODS 校验配置为使用本地 JSON,但未找到 dump 目录,跳过 ODS 校验")
                            layer_results[layer] = {
                                "status": "SKIPPED",
                                "reason": "local_json_missing",
                            }
                            continue
                        # Verify against the locally recorded JSON (no API calls).
                        verifier = get_verifier_for_layer(
                            layer,
                            self.db_conn,
                            self.logger,
                            api_client=self.api_client,
                            fetch_from_api=True,
                            local_dump_dirs=ods_dump_dirs,
                            use_local_json=True,
                        )
                        self.logger.info("ODS 层使用本地 JSON 校验(不请求 API)")
                    else:
                        # Verify against fresh API data.
                        verifier = get_verifier_for_layer(
                            layer,
                            self.db_conn,
                            self.logger,
                            api_client=self.api_client,
                            fetch_from_api=True,
                        )
                        self.logger.info("ODS 层启用 API 数据校验")
                else:
                    verifier_kwargs: dict[str, Any] = {}
                    # NOTE(review): the DWS/INDEX early-skip above makes this
                    # INDEX branch unreachable — confirm intent.
                    if layer.upper() == "INDEX":
                        try:
                            lookback_days = int(self.config.get("run.index_lookback_days", 60))
                        except (TypeError, ValueError):
                            lookback_days = 60
                        verifier_kwargs = {
                            "lookback_days": lookback_days,
                            "config": self.config,
                        }
                        self.logger.info("INDEX 层校验使用回溯天数: %s", lookback_days)
                    if layer.upper() == "DWD":
                        verifier_kwargs["config"] = self.config
                    verifier = get_verifier_for_layer(
                        layer,
                        self.db_conn,
                        self.logger,
                        **verifier_kwargs,
                    )

                # Use filter_verify_tables instead of the old inline static method.
                layer_tables = filter_verify_tables(layer, verify_tables)
                if verify_tables and not layer_tables:
                    self.logger.info("层 %s 无匹配表,跳过校验", layer)
                    layer_results[layer] = {
                        "status": "SKIPPED",
                        "reason": "table_filter",
                    }
                    continue

                self.logger.info("开始校验层: %s,时间窗口: %s ~ %s", layer, window_start, window_end)

                layer_summary = verifier.verify_and_backfill(
                    window_start=window_start,
                    window_end=window_end,
                    auto_backfill=True,
                    split_unit=window_split or "month",
                    tables=layer_tables,
                )

                layer_results[layer] = layer_summary.to_dict() if hasattr(layer_summary, 'to_dict') else {}

                # Aggregate counters only when the summary object exposes them.
                if hasattr(layer_summary, 'total_tables'):
                    total_tables += layer_summary.total_tables
                    consistent_tables += layer_summary.consistent_tables
                    total_backfilled += layer_summary.total_backfilled
                    total_error_tables += getattr(layer_summary, 'error_tables', 0)

                self.logger.info(
                    "层 %s 校验完成: 表数=%d, 一致=%d, 错误=%d, 补齐=%d",
                    layer,
                    getattr(layer_summary, 'total_tables', 0),
                    getattr(layer_summary, 'consistent_tables', 0),
                    getattr(layer_summary, 'error_tables', 0),
                    getattr(layer_summary, 'total_backfilled', 0),
                )

            except Exception as exc:
                self.logger.error("层 %s 校验失败: %s", layer, exc, exc_info=True)
                layer_results[layer] = {"status": "ERROR", "error": str(exc)}

        return {
            "status": "COMPLETED",
            "total_tables": total_tables,
            "consistent_tables": consistent_tables,
            "total_backfilled": total_backfilled,
            "error_tables": total_error_tables,
            "layers": layer_results,
        }
|
||||
144
apps/etl/connectors/feiqiu/orchestration/run_tracker.py
Normal file
144
apps/etl/connectors/feiqiu/orchestration/run_tracker.py
Normal file
@@ -0,0 +1,144 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""运行记录追踪器"""
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
class RunTracker:
|
||||
"""ETL运行记录管理"""
|
||||
|
||||
def __init__(self, db_connection):
|
||||
self.db = db_connection
|
||||
|
||||
def create_run(self, task_id: int, store_id: int, run_uuid: str,
|
||||
export_dir: str, log_path: str, status: str,
|
||||
window_start: datetime = None, window_end: datetime = None,
|
||||
window_minutes: int = None, overlap_seconds: int = None,
|
||||
request_params: dict = None) -> int:
|
||||
"""创建运行记录"""
|
||||
sql = """
|
||||
INSERT INTO meta.etl_run(
|
||||
run_uuid, task_id, store_id, status, started_at, window_start, window_end,
|
||||
window_minutes, overlap_seconds, fetched_count, loaded_count, updated_count,
|
||||
skipped_count, error_count, unknown_fields, export_dir, log_path,
|
||||
request_params, manifest, error_message, extra
|
||||
) VALUES (
|
||||
%s, %s, %s, %s, now(), %s, %s, %s, %s, 0, 0, 0, 0, 0, 0, %s, %s, %s,
|
||||
'{}'::jsonb, NULL, '{}'::jsonb
|
||||
)
|
||||
RETURNING run_id
|
||||
"""
|
||||
|
||||
result = self.db.query(
|
||||
sql,
|
||||
(run_uuid, task_id, store_id, status, window_start, window_end,
|
||||
window_minutes, overlap_seconds, export_dir, log_path,
|
||||
json.dumps(request_params or {}, ensure_ascii=False))
|
||||
)
|
||||
|
||||
run_id = result[0]["run_id"]
|
||||
self.db.commit()
|
||||
return run_id
|
||||
|
||||
def update_run(
|
||||
self,
|
||||
run_id: int,
|
||||
counts: dict,
|
||||
status: str,
|
||||
ended_at: datetime = None,
|
||||
manifest: dict = None,
|
||||
error_message: str = None,
|
||||
window: dict | None = None,
|
||||
request_params: dict | None = None,
|
||||
overlap_seconds: int | None = None,
|
||||
):
|
||||
"""更新运行记录"""
|
||||
sql = """
|
||||
UPDATE meta.etl_run
|
||||
SET fetched_count = %s,
|
||||
loaded_count = %s,
|
||||
updated_count = %s,
|
||||
skipped_count = %s,
|
||||
error_count = %s,
|
||||
unknown_fields = %s,
|
||||
status = %s,
|
||||
ended_at = %s,
|
||||
manifest = %s,
|
||||
error_message = %s,
|
||||
window_start = COALESCE(%s, window_start),
|
||||
window_end = COALESCE(%s, window_end),
|
||||
window_minutes = COALESCE(%s, window_minutes),
|
||||
overlap_seconds = COALESCE(%s, overlap_seconds),
|
||||
request_params = CASE WHEN %s IS NULL THEN request_params ELSE %s::jsonb END
|
||||
WHERE run_id = %s
|
||||
"""
|
||||
|
||||
def _count(v, default: int = 0) -> int:
|
||||
if v is None:
|
||||
return default
|
||||
if isinstance(v, bool):
|
||||
return int(v)
|
||||
if isinstance(v, int):
|
||||
return int(v)
|
||||
if isinstance(v, str):
|
||||
try:
|
||||
return int(v)
|
||||
except Exception:
|
||||
return default
|
||||
if isinstance(v, (list, tuple, set, dict)):
|
||||
try:
|
||||
return len(v)
|
||||
except Exception:
|
||||
return default
|
||||
return default
|
||||
|
||||
safe_counts = counts or {}
|
||||
|
||||
window_start = None
|
||||
window_end = None
|
||||
window_minutes = None
|
||||
if isinstance(window, dict):
|
||||
window_start = window.get("start") or window.get("window_start")
|
||||
window_end = window.get("end") or window.get("window_end")
|
||||
window_minutes = window.get("minutes") or window.get("window_minutes")
|
||||
|
||||
request_json = None if request_params is None else json.dumps(request_params or {}, ensure_ascii=False)
|
||||
self.db.execute(
|
||||
sql,
|
||||
(
|
||||
_count(safe_counts.get("fetched", 0)),
|
||||
_count(safe_counts.get("inserted", 0)),
|
||||
_count(safe_counts.get("updated", 0)),
|
||||
_count(safe_counts.get("skipped", 0)),
|
||||
_count(safe_counts.get("errors", 0)),
|
||||
_count(safe_counts.get("unknown_fields", 0)),
|
||||
status,
|
||||
ended_at,
|
||||
json.dumps(manifest or {}, ensure_ascii=False),
|
||||
error_message,
|
||||
window_start,
|
||||
window_end,
|
||||
window_minutes,
|
||||
overlap_seconds,
|
||||
request_json,
|
||||
request_json,
|
||||
run_id,
|
||||
),
|
||||
)
|
||||
self.db.commit()
|
||||
|
||||
@staticmethod
|
||||
def map_run_status(status: str) -> str:
|
||||
"""
|
||||
将任务返回的状态转换为 meta.run_status_enum
|
||||
(SUCC / FAIL / PARTIAL)
|
||||
"""
|
||||
normalized = (status or "").upper()
|
||||
if normalized in {"SUCCESS", "SUCC"}:
|
||||
return "SUCC"
|
||||
if normalized in {"FAIL", "FAILED", "ERROR"}:
|
||||
return "FAIL"
|
||||
if normalized in {"RUNNING", "PARTIAL", "PENDING", "IN_PROGRESS"}:
|
||||
return "PARTIAL"
|
||||
# 未知状态默认标记为 FAIL,便于排查
|
||||
return "FAIL"
|
||||
|
||||
90
apps/etl/connectors/feiqiu/orchestration/scheduler.py
Normal file
90
apps/etl/connectors/feiqiu/orchestration/scheduler.py
Normal file
@@ -0,0 +1,90 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""ETL 调度器(薄包装层)
|
||||
|
||||
已弃用:请直接使用 TaskExecutor 和 FlowRunner。
|
||||
保留此类以兼容 GUI 层、run_update.py 等现有调用方。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import warnings
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from api.client import APIClient
|
||||
from database.connection import DatabaseConnection
|
||||
from database.operations import DatabaseOperations
|
||||
from orchestration.cursor_manager import CursorManager
|
||||
from orchestration.run_tracker import RunTracker
|
||||
from orchestration.task_registry import default_registry
|
||||
from orchestration.task_executor import TaskExecutor
|
||||
from orchestration.flow_runner import FlowRunner
|
||||
|
||||
|
||||
# Module-level alias kept so existing external imports keep working.
PIPELINE_LAYERS = FlowRunner.FLOW_LAYERS
|
||||
|
||||
|
||||
class ETLScheduler:
    """Deprecated thin wrapper around TaskExecutor / FlowRunner.

    Kept only so existing callers (run_update.py, the GUI layer, ...) keep
    working; all real work is delegated to the two internal components.
    """

    def __init__(self, config, logger):
        warnings.warn(
            "ETLScheduler 已弃用,请直接使用 TaskExecutor 和 FlowRunner",
            DeprecationWarning,
            stacklevel=2,
        )
        self.config = config
        self.logger = logger

        # Build the shared resources exactly as the legacy implementation did.
        db_cfg = config["db"]
        self.db_conn = DatabaseConnection(
            dsn=db_cfg["dsn"],
            session=db_cfg.get("session"),
            connect_timeout=db_cfg.get("connect_timeout_sec"),
        )
        self.db_ops = DatabaseOperations(self.db_conn)

        api_cfg = config["api"]
        self.api_client = APIClient(
            base_url=api_cfg["base_url"],
            token=api_cfg["token"],
            timeout=api_cfg["timeout_sec"],
            retry_max=api_cfg["retries"]["max_attempts"],
            headers_extra=api_cfg.get("headers_extra"),
        )

        self.task_registry = default_registry

        # Internal components this wrapper delegates to.
        self.task_executor = TaskExecutor(
            config,
            self.db_ops,
            self.api_client,
            CursorManager(self.db_conn),
            RunTracker(self.db_conn),
            self.task_registry,
            logger,
        )
        self.flow_runner = FlowRunner(
            config,
            self.task_executor,
            self.task_registry,
            self.db_conn,
            self.api_client,
            logger,
        )

    def run_tasks(self, task_codes=None) -> list:
        """Run a list of task codes (delegates to TaskExecutor).

        Falls back to the ``run.tasks`` config key when no codes are given.
        """
        codes = task_codes or self.config.get("run.tasks", [])
        source = str(self.config.get("run.data_source", "hybrid") or "hybrid")
        return self.task_executor.run_tasks(codes, data_source=source)

    def run_flow_with_verification(self, **kwargs) -> dict:
        """Run a Flow (delegates to FlowRunner).

        Fills ``data_source`` from config only when the caller omitted it.
        """
        if "data_source" not in kwargs:
            kwargs["data_source"] = str(
                self.config.get("run.data_source", "hybrid") or "hybrid"
            )
        return self.flow_runner.run(**kwargs)

    def close(self):
        """Release the database connection owned by this wrapper."""
        self.db_conn.close()
|
||||
497
apps/etl/connectors/feiqiu/orchestration/task_executor.py
Normal file
497
apps/etl/connectors/feiqiu/orchestration/task_executor.py
Normal file
@@ -0,0 +1,497 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""任务执行器:封装单个 ETL 任务的完整执行生命周期。
|
||||
|
||||
从原 ETLScheduler 中提取的执行层,负责:
|
||||
- 单任务执行(抓取/入库/ODS 录制+加载)
|
||||
- 游标管理(成功后推进水位)
|
||||
- 运行记录(创建/更新 meta.etl_run)
|
||||
|
||||
设计原则:
|
||||
- data_source 作为显式参数传入,不依赖全局状态
|
||||
- 工具类任务判断通过 TaskRegistry 元数据查询
|
||||
- 所有依赖通过构造函数注入,不自行创建资源
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from api.recording_client import RecordingAPIClient
|
||||
from api.local_json_client import LocalJsonClient
|
||||
from orchestration.cursor_manager import CursorManager
|
||||
from orchestration.run_tracker import RunTracker
|
||||
from orchestration.task_registry import TaskRegistry
|
||||
|
||||
|
||||
class DataSource(str, Enum):
    """Data-source mode; supersedes the legacy ``pipeline.flow`` config key
    (the old key remains accepted for backward compatibility)."""

    ONLINE = "online"    # fetch from the API only (formerly FETCH_ONLY)
    OFFLINE = "offline"  # ingest local files only (formerly INGEST_ONLY)
    HYBRID = "hybrid"    # fetch then ingest (formerly FULL)
|
||||
|
||||
|
||||
class TaskExecutor:
|
||||
"""任务执行器:封装单个 ETL 任务的完整执行生命周期。
|
||||
|
||||
通过构造函数注入所有依赖,不自行创建 DatabaseConnection 或 APIClient。
|
||||
data_source 作为方法参数传入,替代原配置状态。
|
||||
"""
|
||||
|
||||
    def __init__(
        self,
        config,
        db_ops,
        api_client,
        cursor_mgr: CursorManager,
        run_tracker: RunTracker,
        task_registry: TaskRegistry,
        logger: logging.Logger,
    ):
        """Wire up the executor from injected collaborators; creates no resources.

        Args:
            config: Config object exposing dotted ``get(key, default)`` plus
                mapping access (e.g. ``config["io"]["export_root"]``).
            db_ops: Database operations facade handed to tasks.
            api_client: Base API client (wrapped per run by RecordingAPIClient).
            cursor_mgr: Cursor (watermark) manager.
            run_tracker: meta.etl_run record tracker.
            task_registry: Registry used to look up and instantiate tasks.
            logger: Logger for all executor output.
        """
        self.config = config
        self.db_ops = db_ops
        self.api_client = api_client
        self.cursor_mgr = cursor_mgr
        self.run_tracker = run_tracker
        self.task_registry = task_registry
        self.logger = logger

        # Timezone for window/export timestamps (defaults to Asia/Shanghai).
        self.tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
        # Root for fetch dumps; falls back through legacy config keys:
        # io.fetch_root -> pipeline.fetch_root -> io.export_root.
        self.fetch_root = Path(
            config.get("io.fetch_root")
            or config.get("pipeline.fetch_root")
            or config["io"]["export_root"]
        )
        # Optional fixed source dir for offline ingestion ("" means auto-resolve).
        self.ingest_source_dir = (
            config.get("io.ingest_source_dir")
            or config.get("pipeline.ingest_source_dir")
            or ""
        )
        # Pretty-print recorded JSON dumps when truthy.
        self.write_pretty_json = bool(config.get("io.write_pretty_json", False))
|
||||
|
||||
# ------------------------------------------------------------------ 公共接口
|
||||
|
||||
def run_tasks(
|
||||
self,
|
||||
task_codes: list[str],
|
||||
data_source: str = "hybrid",
|
||||
) -> list[dict[str, Any]]:
|
||||
"""批量执行任务列表,返回每个任务的结果。"""
|
||||
run_uuid = uuid.uuid4().hex
|
||||
store_id = self.config.get("app.store_id")
|
||||
|
||||
results: list[dict[str, Any]] = []
|
||||
file_handler = self._attach_run_file_logger(run_uuid)
|
||||
try:
|
||||
self.logger.info("开始运行任务: %s, run_uuid=%s", task_codes, run_uuid)
|
||||
|
||||
for task_code in task_codes:
|
||||
try:
|
||||
task_result = self.run_single_task(
|
||||
task_code, run_uuid, store_id, data_source=data_source,
|
||||
)
|
||||
result_entry: dict[str, Any] = {
|
||||
"task_code": task_code,
|
||||
"status": "成功" if task_result else "完成",
|
||||
"counts": task_result.get("counts", {}) if isinstance(task_result, dict) else {},
|
||||
}
|
||||
if isinstance(task_result, dict):
|
||||
if task_result.get("dump_dir"):
|
||||
result_entry["dump_dir"] = task_result["dump_dir"]
|
||||
if task_result.get("last_dump"):
|
||||
result_entry["last_dump"] = task_result["last_dump"]
|
||||
results.append(result_entry)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
self.logger.error("任务 %s 失败: %s", task_code, exc, exc_info=True)
|
||||
results.append({
|
||||
"task_code": task_code,
|
||||
"status": "失败",
|
||||
"error": str(exc),
|
||||
"counts": {},
|
||||
})
|
||||
continue
|
||||
|
||||
self.logger.info("所有任务执行完成")
|
||||
return results
|
||||
finally:
|
||||
if file_handler is not None:
|
||||
try:
|
||||
logging.getLogger().removeHandler(file_handler)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
file_handler.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
    def run_single_task(
        self,
        task_code: str,
        run_uuid: str,
        store_id: int,
        data_source: str = "hybrid",
    ) -> dict[str, Any]:
        """Execute one task's full lifecycle: run record, fetch/ingest, cursor.

        Args:
            task_code: Task code to run.
            run_uuid: Unique id for this batch run.
            store_id: Store id.
            data_source: Data-source mode (online/offline/hybrid).

        Returns:
            The task's result dict (status/counts/...), or a SKIP/COMPLETE
            placeholder dict.
        """
        task_code_upper = task_code.upper()

        # Utility tasks (per TaskRegistry metadata) bypass cursors and run records.
        if self.task_registry.is_utility_task(task_code_upper):
            return self._run_utility_task(task_code_upper, store_id)

        task_cfg = self._load_task_config(task_code, store_id)
        if not task_cfg:
            self.logger.warning("任务 %s 未启用或不存在", task_code)
            return {"status": "SKIP", "counts": {}}

        task_id = task_cfg["task_id"]
        cursor_data = self.cursor_mgr.get_or_create(task_id, store_id)

        # Create the run record (status RUNNING -> mapped enum value).
        export_dir = Path(self.config["io"]["export_root"]) / datetime.now(self.tz).strftime("%Y%m%d")
        log_path = str(Path(self.config["io"]["log_root"]) / f"{run_uuid}.log")
        run_id = self.run_tracker.create_run(
            task_id=task_id,
            store_id=store_id,
            run_uuid=run_uuid,
            export_dir=str(export_dir),
            log_path=log_path,
            status=RunTracker.map_run_status("RUNNING"),
        )

        fetch_dir = self._build_fetch_dir(task_code, run_id)
        fetch_stats = None

        try:
            # ODS tasks (except ODS_JSON_ARCHIVE) take a dedicated path:
            # they record API responses AND upsert into the DB in one pass.
            if self._is_ods_task(task_code):
                if self._flow_includes_fetch(data_source):
                    result, last_dump = self._execute_ods_record_and_load(
                        task_code, cursor_data, fetch_dir, run_id,
                    )
                    if isinstance(result, dict):
                        result.setdefault("dump_dir", str(fetch_dir))
                        if last_dump:
                            result.setdefault("last_dump", last_dump)
                else:
                    # Offline mode: ingest previously recorded files instead.
                    source_dir = self._resolve_ingest_source(fetch_dir, None)
                    result = self._execute_ingest(task_code, cursor_data, source_dir)

                # NOTE(review): this path uses result.get("counts") or {} while
                # the non-ODS ingest path indexes result["counts"] — confirm
                # ingest results always carry "counts".
                self.run_tracker.update_run(
                    run_id=run_id,
                    counts=result.get("counts") or {},
                    status=RunTracker.map_run_status(result.get("status")),
                    ended_at=datetime.now(self.tz),
                    window=result.get("window"),
                    request_params=result.get("request_params"),
                    overlap_seconds=self.config.get("run.overlap_seconds"),
                )

                # Advance the watermark only on success with a concrete window.
                if (result.get("status") or "").upper() == "SUCCESS":
                    window = result.get("window")
                    if isinstance(window, dict):
                        self.cursor_mgr.advance(
                            task_id=task_id,
                            store_id=store_id,
                            window_start=window.get("start"),
                            window_end=window.get("end"),
                            run_id=run_id,
                        )
                        self._maybe_run_integrity_check(task_code, window)
                return result

            # Non-ODS tasks: fetch and/or ingest phases per data_source.
            if self._flow_includes_fetch(data_source):
                fetch_stats = self._execute_fetch(task_code, cursor_data, fetch_dir, run_id)
                if data_source == DataSource.ONLINE or data_source == "online":
                    # Online-only: record the fetch stats and stop here
                    # (no ingest, so the cursor is NOT advanced).
                    counts = self._counts_from_fetch(fetch_stats)
                    self.run_tracker.update_run(
                        run_id=run_id,
                        counts=counts,
                        status=RunTracker.map_run_status("SUCCESS"),
                        ended_at=datetime.now(self.tz),
                    )
                    return {"status": "SUCCESS", "counts": counts}

            if self._flow_includes_ingest(data_source):
                source_dir = self._resolve_ingest_source(fetch_dir, fetch_stats)
                result = self._execute_ingest(task_code, cursor_data, source_dir)

                self.run_tracker.update_run(
                    run_id=run_id,
                    counts=result["counts"],
                    status=RunTracker.map_run_status(result["status"]),
                    ended_at=datetime.now(self.tz),
                    window=result.get("window"),
                    request_params=result.get("request_params"),
                    overlap_seconds=self.config.get("run.overlap_seconds"),
                )

                if (result.get("status") or "").upper() == "SUCCESS":
                    window = result.get("window")
                    if window:
                        self.cursor_mgr.advance(
                            task_id=task_id,
                            store_id=store_id,
                            window_start=window.get("start"),
                            window_end=window.get("end"),
                            run_id=run_id,
                        )
                        self._maybe_run_integrity_check(task_code, window)

                return result

        except Exception as exc:
            # Mark the run FAILed with the error message, then re-raise.
            self.run_tracker.update_run(
                run_id=run_id,
                counts={},
                status=RunTracker.map_run_status("FAIL"),
                ended_at=datetime.now(self.tz),
                error_message=str(exc),
            )
            raise

        # Fallback when data_source enabled neither phase for this task.
        return {"status": "COMPLETE", "counts": {}}
|
||||
|
||||
# ------------------------------------------------------------------ 内部方法
|
||||
|
||||
def _execute_fetch(
    self,
    task_code: str,
    cursor_data: dict | None,
    fetch_dir: Path,
    run_id: int,
):
    """Online fetch stage: pull data via a RecordingAPIClient and persist it to disk.

    No Transform/Load is performed here; the recorded JSON is ingested later.

    Args:
        task_code: code of the task whose extract phase is run.
        cursor_data: cursor row used to build the task context (may be None).
        fetch_dir: directory the recording client writes dump files into.
        run_id: current run identifier, embedded in dump file names.

    Returns:
        dict with "file" (dump path), "records" (fetched count) and "pages".
    """
    recorder = RecordingAPIClient(
        base_client=self.api_client,
        output_dir=fetch_dir,
        task_code=task_code,
        run_id=run_id,
        write_pretty=self.write_pretty_json,
    )
    task = self.task_registry.create_task(
        task_code, self.config, self.db_ops, recorder, self.logger,
    )
    context = task._build_context(cursor_data)  # type: ignore[attr-defined]
    self.logger.info("%s: 抓取阶段开始,目录=%s", task_code, fetch_dir)

    payload = task.extract(context)
    dump = recorder.last_dump or {}

    # Prefer the task's explicit "fetched" figure; otherwise fall back to the
    # length of the returned record list, then to the dump statistics.
    payload_count = 0
    if isinstance(payload, dict):
        payload_count = int(payload.get("fetched") or 0) or len(payload.get("records", []))
    record_count = dump.get("records") or payload_count or 0

    self.logger.info(
        "%s: 抓取完成,文件=%s,记录数=%s",
        task_code,
        dump.get("file"),
        record_count,
    )
    return {"file": dump.get("file"), "records": record_count, "pages": dump.get("pages")}
||||
|
||||
@staticmethod
|
||||
def _is_ods_task(task_code: str) -> bool:
|
||||
"""判断是否为 ODS 任务(ODS_JSON_ARCHIVE 除外)。"""
|
||||
tc = str(task_code or "").upper()
|
||||
return tc.startswith("ODS_") and tc != "ODS_JSON_ARCHIVE"
|
||||
|
||||
def _execute_ods_record_and_load(
    self,
    task_code: str,
    cursor_data: dict | None,
    fetch_dir: Path,
    run_id: int,
) -> tuple[dict, dict]:
    """ODS task flow: online fetch plus direct DB load in one pass.

    ODS tasks perform their upsert inside execute(); the recording client is
    only attached so the raw JSON is archived alongside the load.

    Returns:
        (task execute() result, recording-client dump stats — {} when nothing
        was recorded).
    """
    recorder = RecordingAPIClient(
        base_client=self.api_client,
        output_dir=fetch_dir,
        task_code=task_code,
        run_id=run_id,
        write_pretty=self.write_pretty_json,
    )
    ods_task = self.task_registry.create_task(
        task_code, self.config, self.db_ops, recorder, self.logger,
    )
    self.logger.info("%s: ODS fetch+load start, dir=%s", task_code, fetch_dir)
    outcome = ods_task.execute(cursor_data)
    dump_stats = recorder.last_dump or {}
    return outcome, dump_stats
|
||||
|
||||
def _execute_ingest(
    self,
    task_code: str,
    cursor_data: dict | None,
    source_dir: Path,
):
    """Local cleanse-and-load stage: replay persisted JSON through the task's ETL.

    A LocalJsonClient substitutes for the live API so the task's normal
    Extract/Transform/Load path runs against files in *source_dir*.
    """
    replay_client = LocalJsonClient(source_dir)
    ingest_task = self.task_registry.create_task(
        task_code, self.config, self.db_ops, replay_client, self.logger,
    )
    self.logger.info("%s: 本地清洗入库开始,源目录=%s", task_code, source_dir)
    return ingest_task.execute(cursor_data)
|
||||
|
||||
def _build_fetch_dir(self, task_code: str, run_id: int) -> Path:
|
||||
"""构建抓取输出目录路径。"""
|
||||
ts = datetime.now(self.tz).strftime("%Y%m%d-%H%M%S")
|
||||
task_code = str(task_code or "").upper()
|
||||
return Path(self.fetch_root) / task_code / f"{task_code}-{run_id}-{ts}"
|
||||
|
||||
def _resolve_ingest_source(self, fetch_dir: Path, fetch_stats: dict | None) -> Path:
|
||||
"""确定本地清洗入库的 JSON 源目录。"""
|
||||
if fetch_stats and fetch_dir.exists():
|
||||
return fetch_dir
|
||||
if self.ingest_source_dir:
|
||||
return Path(self.ingest_source_dir)
|
||||
raise FileNotFoundError("未提供本地清洗入库所需的 JSON 目录")
|
||||
|
||||
def _counts_from_fetch(self, stats: dict | None) -> dict:
|
||||
"""从抓取统计中构建计数字典。"""
|
||||
fetched = (stats or {}).get("records") or 0
|
||||
return {
|
||||
"fetched": fetched,
|
||||
"inserted": 0,
|
||||
"updated": 0,
|
||||
"skipped": 0,
|
||||
"errors": 0,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _flow_includes_fetch(data_source: str) -> bool:
|
||||
"""判断当前 data_source 是否包含抓取阶段。"""
|
||||
ds = str(data_source).lower()
|
||||
return ds in {"online", "hybrid"}
|
||||
|
||||
@staticmethod
|
||||
def _flow_includes_ingest(data_source: str) -> bool:
|
||||
"""判断当前 data_source 是否包含入库阶段。"""
|
||||
ds = str(data_source).lower()
|
||||
return ds in {"offline", "hybrid"}
|
||||
|
||||
def _run_utility_task(self, task_code: str, store_id: int) -> Dict[str, Any]:
    """Run a utility task directly, without cursor/run bookkeeping.

    Args:
        task_code: code of the utility task to run.
        store_id: kept for interface symmetry with ETL tasks; unused here.

    Returns:
        dict with "status" (upper-cased) and "counts".

    Raises:
        Re-raises any exception from task execution after logging it.
    """
    self.logger.info("%s: 开始执行工具类任务", task_code)

    try:
        client = None
        if task_code == "ODS_JSON_ARCHIVE":
            # The archive task records raw JSON, so it gets a recording
            # client; run_id is derived from the current timestamp since no
            # run row is created for utility tasks.
            archive_run_id = int(datetime.now(self.tz).timestamp())
            target_dir = self._build_fetch_dir(task_code, archive_run_id)
            client = RecordingAPIClient(
                base_client=self.api_client,
                output_dir=target_dir,
                task_code=task_code,
                run_id=archive_run_id,
                write_pretty=self.write_pretty_json,
            )

        utility = self.task_registry.create_task(
            task_code, self.config, self.db_ops, client, self.logger,
        )
        outcome = utility.execute(None)

        # Non-dict results are treated as a bare success with no counts.
        if isinstance(outcome, dict):
            status = (outcome.get("status") or "").upper()
            counts = outcome.get("counts", {})
        else:
            status, counts = "SUCCESS", {}

        if status == "SUCCESS":
            self.logger.info("%s: 工具类任务执行成功", task_code)
            if counts:
                self.logger.info("%s: 结果统计: %s", task_code, counts)
        else:
            self.logger.warning("%s: 工具类任务执行结果: %s", task_code, status)

        return {"status": status, "counts": counts}

    except Exception as exc:
        self.logger.error("%s: 工具类任务执行失败: %s", task_code, exc, exc_info=True)
        raise
|
||||
|
||||
def _load_task_config(self, task_code: str, store_id: int) -> dict | None:
|
||||
"""从数据库加载任务配置。"""
|
||||
sql = """
|
||||
SELECT task_id, task_code, store_id, enabled, cursor_field,
|
||||
window_minutes_default, overlap_seconds, page_size, retry_max, params
|
||||
FROM meta.etl_task
|
||||
WHERE store_id = %s AND task_code = %s AND enabled = TRUE
|
||||
"""
|
||||
rows = self.db_ops.query(sql, (store_id, task_code))
|
||||
return rows[0] if rows else None
|
||||
|
||||
def _maybe_run_integrity_check(self, task_code: str, window: dict | None) -> None:
    """Optionally run an integrity check after a successful DWD_LOAD_FROM_ODS run.

    No-op unless every precondition holds: the `integrity.auto_check` config
    flag is enabled, the task is DWD_LOAD_FROM_ODS, and *window* is a dict
    carrying truthy "start" and "end" values. Any failure inside the check is
    logged and swallowed so integrity checking never fails the ETL run itself.
    """
    if not self.config.get("integrity.auto_check", False):
        return
    if str(task_code or "").upper() != "DWD_LOAD_FROM_ODS":
        return
    if not isinstance(window, dict):
        return
    window_start = window.get("start")
    window_end = window.get("end")
    if not window_start or not window_end:
        return

    try:
        # Imported lazily so the quality package is only needed when the
        # auto-check feature is actually turned on.
        from quality.integrity_checker import IntegrityWindow, run_integrity_window

        include_dimensions = bool(self.config.get("integrity.include_dimensions", False))
        # ODS task-code filter string passed straight through to
        # run_integrity_window; its interpretation is defined there.
        task_codes = str(self.config.get("integrity.ods_task_codes", "") or "").strip()
        report = run_integrity_window(
            cfg=self.config,
            window=IntegrityWindow(
                start=window_start,
                end=window_end,
                label="etl_window",
                granularity="window",
            ),
            include_dimensions=include_dimensions,
            task_codes=task_codes,
            logger=self.logger,
            write_report=True,
        )
        self.logger.info(
            "Integrity check done: report=%s missing=%s errors=%s",
            report.get("report_path"),
            report.get("api_to_ods", {}).get("total_missing"),
            report.get("api_to_ods", {}).get("total_errors"),
        )
    except Exception as exc:  # noqa: BLE001
        # Best-effort: the ETL result is already recorded, so only warn.
        self.logger.warning("Integrity check failed: %s", exc, exc_info=True)
|
||||
|
||||
def _attach_run_file_logger(self, run_uuid: str) -> logging.Handler | None:
    """Attach a per-run file handler (named after *run_uuid*) to the root logger.

    Returns:
        The attached handler so the caller can detach it later, or None when
        the log directory or file could not be created (logged as warnings —
        logging must never block the run).
    """
    log_root = Path(self.config["io"]["log_root"])
    try:
        log_root.mkdir(parents=True, exist_ok=True)
    except Exception as exc:  # noqa: BLE001
        self.logger.warning("创建日志目录失败:%s(%s)", log_root, exc)
        return None

    log_path = log_root / f"{run_uuid}.log"
    try:
        handler: logging.Handler = logging.FileHandler(log_path, encoding="utf-8")
    except Exception as exc:  # noqa: BLE001
        self.logger.warning("创建文件日志失败:%s(%s)", log_path, exc)
        return None

    handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    handler.setLevel(logging.INFO)
    logging.getLogger().addHandler(handler)
    return handler
|
||||
173
apps/etl/connectors/feiqiu/orchestration/task_registry.py
Normal file
173
apps/etl/connectors/feiqiu/orchestration/task_registry.py
Normal file
@@ -0,0 +1,173 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# AI_CHANGELOG
|
||||
# - 2026-02-14 | 删除废弃代码残留:移除重复的 ODS_TASK_CLASSES 注册循环(底部),保留唯一一处(顶部)
|
||||
# 直接原因: 原文件有两处 `for code, task_cls in ODS_TASK_CLASSES.items()` 循环,导致重复注册
|
||||
# 验证: `python -c "from orchestration.task_registry import default_registry; print(len(default_registry.get_all_task_codes()))"` → 52
|
||||
# CHANGE [2026-02-14] intent: 移除 14 个废弃独立 ODS 任务 + 3 个废弃独立 DWD 任务的导入与注册
|
||||
# assumptions: 这些任务写入不存在的 billiards.* schema,已被通用 ODS 任务(ods.*)替代
|
||||
# prompt: "删除废弃的独立 ODS/DWD 任务及其 loader"
|
||||
# 验证: pytest tests/unit -x 确认无 import 错误
|
||||
"""任务注册表"""
|
||||
from dataclasses import dataclass, field
|
||||
# ODS 层任务(仅保留通用 ODS 任务工厂 + JSON 归档)
|
||||
from tasks.ods.ods_tasks import ODS_TASK_CLASSES
|
||||
from tasks.ods.ods_json_archive_task import OdsJsonArchiveTask
|
||||
|
||||
# DWD 层任务(仅保留核心装载 + 质量检查)
|
||||
from tasks.dwd.dwd_load_task import DwdLoadTask
|
||||
from tasks.dwd.dwd_quality_task import DwdQualityTask
|
||||
|
||||
# 工具类任务
|
||||
from tasks.utility.manual_ingest_task import ManualIngestTask
|
||||
from tasks.utility.init_schema_task import InitOdsSchemaTask
|
||||
from tasks.utility.init_dwd_schema_task import InitDwdSchemaTask
|
||||
from tasks.utility.init_dws_schema_task import InitDwsSchemaTask
|
||||
from tasks.utility.check_cutoff_task import CheckCutoffTask
|
||||
from tasks.utility.dws_build_order_summary_task import DwsBuildOrderSummaryTask
|
||||
from tasks.utility.data_integrity_task import DataIntegrityTask
|
||||
from tasks.utility.seed_dws_config_task import SeedDwsConfigTask
|
||||
|
||||
# DWS 层任务导入
|
||||
from tasks.dws import (
|
||||
AssistantDailyTask,
|
||||
AssistantMonthlyTask,
|
||||
AssistantCustomerTask,
|
||||
AssistantSalaryTask,
|
||||
AssistantFinanceTask,
|
||||
MemberConsumptionTask,
|
||||
MemberVisitTask,
|
||||
FinanceDailyTask,
|
||||
FinanceRechargeTask,
|
||||
FinanceIncomeStructureTask,
|
||||
FinanceDiscountDetailTask,
|
||||
# 指数算法任务
|
||||
WinbackIndexTask,
|
||||
NewconvIndexTask,
|
||||
MlManualImportTask,
|
||||
RelationIndexTask,
|
||||
)
|
||||
# CHANGE [2026-07-14] intent: 合并 MV 刷新 + 数据清理为 DWS_MAINTENANCE
|
||||
from tasks.dws.maintenance_task import DwsMaintenanceTask
|
||||
|
||||
|
||||
@dataclass
class TaskMeta:
    """Metadata describing a registered task."""

    # Concrete task class instantiated by TaskRegistry.create_task().
    task_class: type
    # Whether the task needs a meta.etl_task row (cursor / run tracking).
    requires_db_config: bool = True
    layer: str | None = None  # "ODS" / "DWD" / "DWS" / "INDEX" / None
    task_type: str = "etl"  # "etl" / "utility" / "verification"
    depends_on: list[str] = field(default_factory=list)  # task codes this task depends on
|
||||
|
||||
|
||||
class TaskRegistry:
    """Registry and factory for ETL task classes.

    Maps upper-cased task codes to TaskMeta records and instantiates the
    matching task class on demand.
    """

    def __init__(self):
        # task_code (upper-cased) -> TaskMeta
        self._tasks: dict[str, TaskMeta] = {}

    def register(
        self,
        task_code: str,
        task_class: type,
        requires_db_config: bool = True,
        layer: str | None = None,
        task_type: str = "etl",
        depends_on: list[str] | None = None,
    ):
        """Register *task_class* under *task_code* with its metadata.

        Backward compatible: calling with only task_code and task_class
        uses the defaults above.
        """
        meta = TaskMeta(
            task_class=task_class,
            requires_db_config=requires_db_config,
            layer=layer,
            task_type=task_type,
            depends_on=depends_on or [],
        )
        self._tasks[task_code.upper()] = meta

    def create_task(self, task_code: str, config, db_connection, api_client, logger):
        """Instantiate the task registered under *task_code*.

        Raises:
            ValueError: when the code has not been registered.
        """
        key = task_code.upper()
        meta = self._tasks.get(key)
        if meta is None:
            raise ValueError(f"未知的任务类型: {key}")
        return meta.task_class(config, db_connection, api_client, logger)

    def get_metadata(self, task_code: str) -> TaskMeta | None:
        """Return the TaskMeta for *task_code*, or None if unknown."""
        return self._tasks.get(task_code.upper())

    def get_tasks_by_layer(self, layer: str) -> list[str]:
        """Return all registered task codes whose layer matches *layer* (case-insensitive)."""
        wanted = layer.upper()
        codes = []
        for code, meta in self._tasks.items():
            if meta.layer and meta.layer.upper() == wanted:
                codes.append(code)
        return codes

    def is_utility_task(self, task_code: str) -> bool:
        """True for registered tasks that run without cursor/run bookkeeping."""
        meta = self.get_metadata(task_code)
        if meta is None:
            return False
        return not meta.requires_db_config

    def get_all_task_codes(self) -> list[str]:
        """Return every registered task code."""
        return [*self._tasks]
|
||||
|
||||
|
||||
|
||||
|
||||
# Default registry instance shared by the application.
default_registry = TaskRegistry()

# ── ODS layer: generic ODS tasks (generated dynamically from ODS_TASK_CLASSES) ─────
for code, task_cls in ODS_TASK_CLASSES.items():
    default_registry.register(code, task_cls, layer="ODS")

# ── DWD layer tasks ────────────────────────────────────────────────
default_registry.register("DWD_LOAD_FROM_ODS", DwdLoadTask, layer="DWD")
default_registry.register("DWD_QUALITY_CHECK", DwdQualityTask, requires_db_config=False, layer="DWD", task_type="verification")

# ── Utility tasks ────────────────────────────────────────────────
default_registry.register("MANUAL_INGEST", ManualIngestTask, requires_db_config=False, task_type="utility")
default_registry.register("INIT_ODS_SCHEMA", InitOdsSchemaTask, requires_db_config=False, task_type="utility")
default_registry.register("INIT_DWD_SCHEMA", InitDwdSchemaTask, requires_db_config=False, task_type="utility")
default_registry.register("INIT_DWS_SCHEMA", InitDwsSchemaTask, requires_db_config=False, task_type="utility")
default_registry.register("ODS_JSON_ARCHIVE", OdsJsonArchiveTask, requires_db_config=False, task_type="utility")
default_registry.register("CHECK_CUTOFF", CheckCutoffTask, requires_db_config=False, task_type="utility")
# NOTE(review): unlike the other utility tasks this one keeps the default
# requires_db_config=True — confirm that is intentional.
default_registry.register("SEED_DWS_CONFIG", SeedDwsConfigTask, task_type="utility")

# ── Verification tasks ────────────────────────────────────────────────
default_registry.register("DATA_INTEGRITY_CHECK", DataIntegrityTask, requires_db_config=False, task_type="verification")

# ── DWS layer business tasks ────────────────────────────────────────────
default_registry.register("DWS_BUILD_ORDER_SUMMARY", DwsBuildOrderSummaryTask, requires_db_config=False, layer="DWS")
default_registry.register("DWS_ASSISTANT_DAILY", AssistantDailyTask, layer="DWS")
# CHANGE [2026-07-17] intent: declare depends_on for known dependencies (requirements 8.1, 8.2)
default_registry.register("DWS_ASSISTANT_MONTHLY", AssistantMonthlyTask, layer="DWS", depends_on=["DWS_ASSISTANT_DAILY"])
default_registry.register("DWS_ASSISTANT_CUSTOMER", AssistantCustomerTask, layer="DWS")
default_registry.register("DWS_ASSISTANT_SALARY", AssistantSalaryTask, layer="DWS")
default_registry.register("DWS_ASSISTANT_FINANCE", AssistantFinanceTask, layer="DWS", depends_on=["DWS_ASSISTANT_SALARY"])
default_registry.register("DWS_MEMBER_CONSUMPTION", MemberConsumptionTask, layer="DWS")
default_registry.register("DWS_MEMBER_VISIT", MemberVisitTask, layer="DWS")
default_registry.register("DWS_FINANCE_DAILY", FinanceDailyTask, layer="DWS")
default_registry.register("DWS_FINANCE_RECHARGE", FinanceRechargeTask, layer="DWS")
default_registry.register("DWS_FINANCE_INCOME_STRUCTURE", FinanceIncomeStructureTask, layer="DWS")
default_registry.register("DWS_FINANCE_DISCOUNT_DETAIL", FinanceDiscountDetailTask, layer="DWS")
# CHANGE [2026-07-14] intent: removed DWS_RETENTION_CLEANUP / DWS_MV_REFRESH_FINANCE_DAILY /
# DWS_MV_REFRESH_ASSISTANT_DAILY, replaced by the unified maintenance task DWS_MAINTENANCE (requirement 4.5)
# depends_on: every other DWS task — MV refresh and cleanup must run after data is written
default_registry.register("DWS_MAINTENANCE", DwsMaintenanceTask, layer="DWS", depends_on=[
    "DWS_ASSISTANT_DAILY", "DWS_ASSISTANT_MONTHLY", "DWS_ASSISTANT_CUSTOMER",
    "DWS_ASSISTANT_SALARY", "DWS_ASSISTANT_FINANCE",
    "DWS_MEMBER_CONSUMPTION", "DWS_MEMBER_VISIT",
    "DWS_FINANCE_DAILY", "DWS_FINANCE_RECHARGE",
    "DWS_FINANCE_INCOME_STRUCTURE", "DWS_FINANCE_DISCOUNT_DETAIL",
    "DWS_BUILD_ORDER_SUMMARY",
])

# ── INDEX layer: index-algorithm tasks ────────────────────────────────────
# CHANGE [2026-07-17] intent: declare depends_on for index tasks (requirements 8.1, 8.2)
default_registry.register("DWS_WINBACK_INDEX", WinbackIndexTask, requires_db_config=False, layer="INDEX", depends_on=["DWS_MEMBER_VISIT", "DWS_MEMBER_CONSUMPTION"])
default_registry.register("DWS_NEWCONV_INDEX", NewconvIndexTask, requires_db_config=False, layer="INDEX", depends_on=["DWS_MEMBER_VISIT", "DWS_MEMBER_CONSUMPTION"])
default_registry.register("DWS_ML_MANUAL_IMPORT", MlManualImportTask, requires_db_config=False, layer="INDEX")
default_registry.register("DWS_RELATION_INDEX", RelationIndexTask, requires_db_config=False, layer="INDEX", depends_on=["DWS_ASSISTANT_DAILY"])
|
||||
61
apps/etl/connectors/feiqiu/orchestration/topological_sort.py
Normal file
61
apps/etl/connectors/feiqiu/orchestration/topological_sort.py
Normal file
@@ -0,0 +1,61 @@
|
||||
# -*- coding: utf-8 -*-
"""Topological sorting module — Kahn's algorithm.

Orders a task list by its declared dependencies:
- only tasks present in the current execution list are ordered
- a warning is logged when a depends_on entry is not in the list
- dependency cycles are detected and raise ValueError
"""
from collections import deque
import logging

# Module-level logger used by topological_sort() for dependency warnings.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def topological_sort(task_codes: list[str], registry) -> list[str]:
    """Topologically order *task_codes* by declared dependencies (Kahn's algorithm).

    Args:
        task_codes: task codes to order.
        registry: TaskRegistry instance exposing get_metadata() for dependency lookup.

    Returns:
        The task codes reordered so that dependencies come before dependents.

    Raises:
        ValueError: when a dependency cycle is detected; the message lists
            the task codes involved.
    """
    if not task_codes:
        return []

    known = set(task_codes)
    dependents = {code: [] for code in task_codes}  # dep -> tasks waiting on it
    pending = {code: 0 for code in task_codes}      # count of unmet dependencies

    for code in task_codes:
        meta = registry.get_metadata(code)
        deps = meta.depends_on if meta else None
        if not deps:
            continue
        for dep in deps:
            if dep not in known:
                # Dependency outside the execution list: warn, but do not block.
                logger.warning(
                    "任务 %s 依赖 %s,但后者不在当前执行列表中", code, dep
                )
                continue
            dependents[dep].append(code)
            pending[code] += 1

    ready = deque(code for code in task_codes if pending[code] == 0)
    ordered: list[str] = []
    while ready:
        current = ready.popleft()
        ordered.append(current)
        for follower in dependents[current]:
            pending[follower] -= 1
            if pending[follower] == 0:
                ready.append(follower)

    if len(ordered) != len(task_codes):
        # Anything never reaching in-degree 0 participates in (or hangs off) a cycle.
        stuck = [c for c in task_codes if c not in ordered]
        raise ValueError(f"检测到循环依赖: {stuck}")

    return ordered
|
||||
Reference in New Issue
Block a user