微信小程序页面迁移校验之前 P5任务处理之前
This commit is contained in:
1
apps/etl/connectors/feiqiu/pipeline/__init__.py
Normal file
1
apps/etl/connectors/feiqiu/pipeline/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
57
apps/etl/connectors/feiqiu/pipeline/models.py
Normal file
57
apps/etl/connectors/feiqiu/pipeline/models.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""管道数据类:请求描述、执行结果、写入结果。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass
class PipelineRequest:
    """Describes a single pipeline API request."""

    endpoint: str
    params: dict
    page_size: int | None = 200
    data_path: tuple[str, ...] = ("data",)
    list_key: str | None = None
    segment_index: int = 0  # index of the time-window segment this request belongs to
    is_detail: bool = False  # whether this is a detail (per-record) request
    detail_id: Any = None  # ID of the record targeted by a detail request
    # Pre-fetched API response (for BaseOdsTask integration: iter_paginated has
    # already fetched the data, so _request_loop skips api.post() and uses this directly).
    _prefetched_response: Any = None
|
||||
|
||||
|
||||
@dataclass
class PipelineResult:
    """Aggregated result of one pipeline execution."""

    status: str = "SUCCESS"  # final status: SUCCESS / PARTIAL / FAILED / CANCELLED
    total_requests: int = 0  # requests attempted
    completed_requests: int = 0  # requests that completed successfully
    total_fetched: int = 0  # records produced by the processing stage
    total_inserted: int = 0
    total_updated: int = 0
    total_skipped: int = 0
    total_deleted: int = 0
    request_failures: int = 0  # failures in the request phase
    processing_failures: int = 0  # failures in the processing phase
    write_failures: int = 0  # failed batch writes
    cancelled: bool = False  # whether a cancellation signal was observed
    errors: list[dict] = field(default_factory=list)  # per-error detail dicts (phase, error, ...)
    timing: dict[str, float] = field(default_factory=dict)  # per-phase elapsed seconds
    # Detail_Mode statistics
    detail_success: int = 0
    detail_failure: int = 0
    detail_skipped: int = 0
|
||||
|
||||
|
||||
@dataclass
class WriteResult:
    """Result of a single batch write."""

    inserted: int = 0
    updated: int = 0
    skipped: int = 0
    errors: int = 0
|
||||
473
apps/etl/connectors/feiqiu/pipeline/unified_pipeline.py
Normal file
473
apps/etl/connectors/feiqiu/pipeline/unified_pipeline.py
Normal file
@@ -0,0 +1,473 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""统一管道引擎:串行请求 + 异步处理 + 单线程写库。
|
||||
|
||||
核心执行流程:
|
||||
主线程(_request_loop)串行发送 API 请求 → processing_queue
|
||||
→ N 个 worker 线程(_process_worker)并行处理 → write_queue
|
||||
→ 1 个 writer 线程(_write_worker)批量写入数据库
|
||||
|
||||
线程安全保证:
|
||||
- PipelineResult 的计数更新通过 threading.Lock 保护
|
||||
- 队列通信使用 queue.Queue(内置线程安全)
|
||||
- SENTINEL(None)用于通知线程退出
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from typing import Any, Callable, Iterable
|
||||
|
||||
from api.rate_limiter import RateLimiter
|
||||
from config.pipeline_config import PipelineConfig
|
||||
from utils.cancellation import CancellationToken
|
||||
from pipeline.models import PipelineRequest, PipelineResult, WriteResult
|
||||
|
||||
# 运行时指标日志间隔(每 N 个请求记录一次队列深度等指标)
|
||||
_METRICS_LOG_INTERVAL = 10
|
||||
|
||||
|
||||
class UnifiedPipeline:
    """Unified pipeline engine: serial requests + async processing + single-threaded DB writes.

    Args:
        api_client: API client (duck typing; must expose a ``post`` method)
        db_connection: database connection (duck typing)
        logger: logger instance
        config: pipeline configuration
        cancel_token: cancellation token; when None, a token that never cancels is created
        etl_timer: optional EtlTimer instance used to record phase durations in the
            FlowRunner timing report
        task_code: task code, used together with etl_timer as the step-name prefix
    """

    def __init__(
        self,
        api_client,  # duck typing: has post(endpoint, params)
        db_connection,  # duck typing
        logger: logging.Logger,
        config: PipelineConfig,
        cancel_token: CancellationToken | None = None,
        etl_timer=None,  # optional EtlTimer, duck typing
        task_code: str | None = None,
    ) -> None:
        self.api = api_client
        self.db = db_connection
        self.logger = logger
        self.config = config
        self.cancel_token = cancel_token or CancellationToken()
        self._rate_limiter = RateLimiter(config.rate_min, config.rate_max)
        self._etl_timer = etl_timer
        self._task_code = task_code
        # Lock protecting concurrent updates to the PipelineResult counters.
        self._lock = threading.Lock()
        # References to the processing threads; used by runtime-metrics logging
        # to count how many workers are still alive.
        self._workers: list[threading.Thread] = []
|
||||
|
||||
def run(
    self,
    requests: Iterable[PipelineRequest],
    process_fn: Callable[[Any], list[dict]],
    write_fn: Callable[[list[dict]], WriteResult],
) -> PipelineResult:
    """Execute the pipeline.

    Args:
        requests: request iterator (produced by BaseOdsTask)
        process_fn: processing function turning an API response into records to write
        write_fn: write function persisting a batch of records to the database

    Returns:
        PipelineResult carrying per-phase statistics and the final status
    """
    # Pre-cancellation check: return an empty CANCELLED result immediately.
    if self.cancel_token.is_cancelled:
        return PipelineResult(status="CANCELLED", cancelled=True)

    processing_queue: queue.Queue = queue.Queue(
        maxsize=self.config.queue_size,
    )
    write_queue: queue.Queue = queue.Queue(
        maxsize=self.config.queue_size * 2,
    )
    result = PipelineResult()

    # Keep queue references for runtime-metrics logging in _request_loop.
    self._processing_queue = processing_queue
    self._write_queue = write_queue

    start_time = time.monotonic()

    # EtlTimer integration: record sub-steps of each phase.
    timer = self._etl_timer
    step_name = self._task_code

    # Start N processing worker threads.
    self._workers = []
    for i in range(self.config.workers):
        t = threading.Thread(
            target=self._process_worker,
            args=(processing_queue, write_queue, process_fn, result),
            name=f"pipeline-worker-{i}",
            daemon=True,
        )
        t.start()
        self._workers.append(t)

    # Start the single writer thread.
    writer = threading.Thread(
        target=self._write_worker,
        args=(write_queue, write_fn, result),
        name="pipeline-writer",
        daemon=True,
    )
    writer.start()

    # Main thread: issue requests serially.
    if timer and step_name:
        try:
            timer.start_sub_step(step_name, "request")
        except KeyError:
            pass  # silently skip when the parent step does not exist
    request_start = time.monotonic()
    self._request_loop(requests, processing_queue, result)
    request_elapsed = time.monotonic() - request_start
    if timer and step_name:
        try:
            timer.stop_sub_step(step_name, "request")
        except KeyError:
            pass

    # Send one SENTINEL per worker to the processing queue so all workers exit.
    if timer and step_name:
        try:
            timer.start_sub_step(step_name, "process")
        except KeyError:
            pass
    process_start = time.monotonic()
    for _ in self._workers:
        processing_queue.put(None)
    for w in self._workers:
        w.join()
    process_elapsed = time.monotonic() - process_start
    if timer and step_name:
        try:
            timer.stop_sub_step(step_name, "process")
        except KeyError:
            pass

    # Send the SENTINEL to the write queue so the writer exits.
    if timer and step_name:
        try:
            timer.start_sub_step(step_name, "write")
        except KeyError:
            pass
    write_start = time.monotonic()
    write_queue.put(None)
    writer.join()
    write_elapsed = time.monotonic() - write_start
    if timer and step_name:
        try:
            timer.stop_sub_step(step_name, "write")
        except KeyError:
            pass

    total_elapsed = time.monotonic() - start_time
    result.timing["total"] = round(total_elapsed, 3)
    result.timing["request"] = round(request_elapsed, 3)
    result.timing["process"] = round(process_elapsed, 3)
    result.timing["write"] = round(write_elapsed, 3)

    # Determine the final status.
    if result.cancelled:
        result.status = "CANCELLED"
    elif result.status == "FAILED":
        pass  # FAILED was already set by the consecutive-failure guard; keep it
    elif (
        result.request_failures
        + result.processing_failures
        + result.write_failures
        > 0
    ):
        result.status = "PARTIAL"
    else:
        result.status = "SUCCESS"

    # Execution summary log (requirement 8.2).
    self.logger.info(
        "管道执行摘要: status=%s, 总耗时=%.1fs "
        "[请求=%.1fs, 处理=%.1fs, 写入=%.1fs], "
        "请求=%d/%d, 获取=%d, "
        "写入(inserted=%d, updated=%d, skipped=%d), "
        "失败(request=%d, process=%d, write=%d)",
        result.status,
        total_elapsed,
        request_elapsed,
        process_elapsed,
        write_elapsed,
        result.completed_requests,
        result.total_requests,
        result.total_fetched,
        result.total_inserted,
        result.total_updated,
        result.total_skipped,
        result.request_failures,
        result.processing_failures,
        result.write_failures,
    )

    # Clear the queue/thread references.
    self._processing_queue = None
    self._write_queue = None
    self._workers = []

    return result
|
||||
|
||||
def _request_loop(
    self,
    requests: Iterable[PipelineRequest],
    processing_queue: queue.Queue,
    result: PipelineResult,
) -> None:
    """Main thread: send API requests serially, with rate limiting and backpressure.

    Flow:
        1. Iterate over the requests iterator
        2. Check the cancellation signal
        3. Call api.post() to send the request
        4. Put the response onto processing_queue (blocks when full = backpressure)
        5. Call rate_limiter.wait(); break if cancelled during the wait
        6. Abort when consecutive failures exceed the threshold (status=FAILED)
    """
    consecutive_failures = 0

    for req in requests:
        # Cancellation check
        if self.cancel_token.is_cancelled:
            with self._lock:
                result.cancelled = True
            self.logger.info("收到取消信号,停止发送新请求")
            break

        with self._lock:
            result.total_requests += 1

        req_start = time.monotonic()
        try:
            # Prefetch mode: iter_paginated already fetched the data; use it directly.
            if req._prefetched_response is not None:
                response = req._prefetched_response
            else:
                response = self.api.post(req.endpoint, req.params)
            elapsed = time.monotonic() - req_start

            self.logger.debug(
                "请求完成: endpoint=%s, 耗时=%.2fs",
                req.endpoint,
                elapsed,
            )

            # Put the response onto the processing queue (blocks when full = backpressure).
            processing_queue.put((req, response))

            with self._lock:
                result.completed_requests += 1
                completed = result.completed_requests
                total = result.total_requests

            # Reset the consecutive-failure counter on success.
            consecutive_failures = 0

            # Runtime metrics log (requirement 8.1): queue depth and progress every N requests.
            if completed % _METRICS_LOG_INTERVAL == 0:
                self._log_runtime_metrics(result, completed, total)

        except Exception as exc:
            elapsed = time.monotonic() - req_start
            consecutive_failures += 1

            self.logger.error(
                "请求失败: endpoint=%s, 耗时=%.2fs, 错误=%s",
                req.endpoint,
                elapsed,
                exc,
            )

            with self._lock:
                result.request_failures += 1
                result.errors.append({
                    "phase": "request",
                    "endpoint": req.endpoint,
                    "error": str(exc),
                })

            # Abort when consecutive failures reach the threshold.
            if consecutive_failures >= self.config.max_consecutive_failures:
                self.logger.error(
                    "连续失败 %d 次,超过阈值 %d,中断管道",
                    consecutive_failures,
                    self.config.max_consecutive_failures,
                )
                with self._lock:
                    result.status = "FAILED"
                break

        # Rate-limit wait (also after the last request, to keep the upstream spacing consistent).
        if not self._rate_limiter.wait(self.cancel_token.event):
            with self._lock:
                result.cancelled = True
            self.logger.info("限流等待期间收到取消信号,停止发送新请求")
            break
|
||||
|
||||
def _process_worker(
    self,
    processing_queue: queue.Queue,
    write_queue: queue.Queue,
    process_fn: Callable[[Any], list[dict]],
    result: PipelineResult,
) -> None:
    """Worker thread: consume from processing_queue, run process_fn, feed write_queue.

    Exits on the SENTINEL (None).
    An exception while processing a single item is caught and recorded, and the
    worker continues with the next item.
    """
    while True:
        item = processing_queue.get()

        # SENTINEL: exit signal
        if item is None:
            processing_queue.task_done()
            break

        req, response = item
        try:
            records = process_fn(response)

            if records:
                # Hand the processed records to the write queue.
                write_queue.put(records)

                with self._lock:
                    result.total_fetched += len(records)

        except Exception as exc:
            self.logger.error(
                "处理失败: endpoint=%s, 错误=%s",
                req.endpoint,
                exc,
            )
            with self._lock:
                result.processing_failures += 1
                result.errors.append({
                    "phase": "processing",
                    "endpoint": req.endpoint,
                    "error": str(exc),
                })

        processing_queue.task_done()
|
||||
|
||||
def _write_worker(
    self,
    write_queue: queue.Queue,
    write_fn: Callable[[list[dict]], WriteResult],
    result: PipelineResult,
) -> None:
    """Writer thread: consume from write_queue, accumulate, and flush in batches.

    - Flushes as soon as batch_size records have accumulated
    - Flushes whatever has accumulated after batch_timeout seconds (even if < batch_size)
    - On write failure, records the error and continues with later batches
    - On SENTINEL (None), flushes the remaining records and exits
    """
    batch: list[dict] = []
    batch_size = self.config.batch_size
    batch_timeout = self.config.batch_timeout

    while True:
        try:
            item = write_queue.get(timeout=batch_timeout)
        except queue.Empty:
            # Timeout: flush whatever has accumulated so far.
            if batch:
                self._flush_batch(batch, write_fn, result)
                batch = []
            continue

        # SENTINEL: exit signal
        if item is None:
            write_queue.task_done()
            break

        # item is a list[dict] (the output of one process_fn call)
        batch.extend(item)
        write_queue.task_done()

        # Queue backlog warning
        qsize = write_queue.qsize()
        if qsize >= self.config.queue_size * 2:
            self.logger.warning(
                "写入队列积压: qsize=%d, 阈值=%d",
                qsize,
                self.config.queue_size * 2,
            )

        # Flush whenever batch_size records have accumulated.
        while len(batch) >= batch_size:
            chunk = batch[:batch_size]
            batch = batch[batch_size:]
            self._flush_batch(chunk, write_fn, result)

    # Flush leftovers before exiting.
    if batch:
        self._flush_batch(batch, write_fn, result)
|
||||
|
||||
def _flush_batch(
|
||||
self,
|
||||
batch: list[dict],
|
||||
write_fn: Callable[[list[dict]], WriteResult],
|
||||
result: PipelineResult,
|
||||
) -> None:
|
||||
"""执行一次批量写入,更新结果计数。"""
|
||||
if not batch:
|
||||
return
|
||||
|
||||
try:
|
||||
wr = write_fn(batch)
|
||||
with self._lock:
|
||||
result.total_inserted += wr.inserted
|
||||
result.total_updated += wr.updated
|
||||
result.total_skipped += wr.skipped
|
||||
except Exception as exc:
|
||||
self.logger.error(
|
||||
"批量写入失败: batch_size=%d, 错误=%s",
|
||||
len(batch),
|
||||
exc,
|
||||
)
|
||||
with self._lock:
|
||||
result.write_failures += 1
|
||||
result.errors.append({
|
||||
"phase": "write",
|
||||
"batch_size": len(batch),
|
||||
"error": str(exc),
|
||||
})
|
||||
|
||||
def _log_runtime_metrics(
|
||||
self,
|
||||
result: PipelineResult,
|
||||
completed: int,
|
||||
total: int,
|
||||
) -> None:
|
||||
"""记录运行时指标:队列深度、活跃线程数、进度(需求 8.1)。"""
|
||||
pq_depth = self._processing_queue.qsize() if self._processing_queue else 0
|
||||
wq_depth = self._write_queue.qsize() if self._write_queue else 0
|
||||
active_workers = sum(1 for w in self._workers if w.is_alive())
|
||||
|
||||
self.logger.debug(
|
||||
"运行时指标: 进度=%d/%d, 处理队列=%d, 活跃线程=%d, 写入队列=%d",
|
||||
completed,
|
||||
total,
|
||||
pq_depth,
|
||||
active_workers,
|
||||
wq_depth,
|
||||
)
|
||||
Reference in New Issue
Block a user