Before WeChat Mini Program page migration validation; before P5 task processing

Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions


@@ -15,6 +15,11 @@ from psycopg2.extras import Json, execute_values
from models.parsers import TypeParser
from tasks.base_task import BaseTask
from utils.windowing import build_window_segments, calc_window_minutes, calc_window_days, format_window_days
from config.pipeline_config import PipelineConfig
from pipeline.models import PipelineRequest, PipelineResult, WriteResult
from pipeline.unified_pipeline import UnifiedPipeline
from utils.cancellation import CancellationToken
from utils.task_log_buffer import TaskLogBuffer
ColumnTransform = Callable[[Any], Any]
@@ -67,6 +72,15 @@ class OdsTaskSpec:
# WINDOW 模式的时间列名
snapshot_time_column: str | None = None
# ── Detail_Mode 可选配置(二级详情拉取)──
detail_endpoint: str | None = None # 详情接口 endpoint
detail_param_builder: Callable[[dict], dict] | None = None # 详情请求参数构造函数
detail_target_table: str | None = None # 详情数据目标表名
detail_data_path: Tuple[str, ...] | None = None # 详情数据的 data_path
detail_list_key: str | None = None # 详情数据的 list_key
detail_id_column: str | None = None # 从列表数据中提取 ID 的列名
detail_process_fn: Callable[[Any], list[dict]] | None = None # 自定义详情处理函数
def __post_init__(self) -> None:
if self.snapshot_mode == SnapshotMode.WINDOW and not self.snapshot_time_column:
raise ValueError(
@@ -88,7 +102,10 @@ class BaseOdsTask(BaseTask):
def execute(self, cursor_data: dict | None = None) -> dict:
spec = self.SPEC
# 创建任务级日志缓冲区,任务完成后一次性输出,避免多任务日志交叉
self._log_buf = TaskLogBuffer(spec.code, self.logger)
self.logger.info("开始执行%s (ODS)", spec.code)
self._log_buf.info("开始执行%s (ODS)", spec.code)
window_start, window_end, window_minutes = self._resolve_window(cursor_data)
segments = build_window_segments(
@@ -111,6 +128,11 @@ class BaseOdsTask(BaseTask):
total_segments,
format_window_days(total_days),
)
self._log_buf.info(
"窗口拆分为 %s 段(共 %s 天)",
total_segments,
format_window_days(total_days),
)
store_id = TypeParser.parse_int(self.config.get("app.store_id"))
if not store_id:
@@ -141,6 +163,10 @@ class BaseOdsTask(BaseTask):
]
has_is_delete = self._table_has_column(spec.table_name, "is_delete")
# 构建 PipelineConfig(支持任务级覆盖)
pipeline_config = PipelineConfig.from_app_config(self.config, spec.code)
cancel_token = getattr(self, '_cancel_token', None) or CancellationToken()
try:
for idx, (seg_start, seg_end) in enumerate(segments, start=1):
params = self._build_params(
@@ -158,11 +184,12 @@ class BaseOdsTask(BaseTask):
"errors": 0,
"deleted": 0,
}
# 快照软删除需要的共享状态(process_fn 闭包写入)
segment_keys: set[tuple] = set()
# CHANGE 2026-02-18 | 收集 WINDOW 模式下 API 返回数据的实际最早时间戳
segment_earliest_time: datetime | None = None
segment_earliest_time: list[datetime | None] = [None]
# CHANGE [2026-02-24] 收集 API 返回数据的实际最晚时间戳,用于 late-cutoff 保护
segment_latest_time: datetime | None = None
segment_latest_time: list[datetime | None] = [None]
self.logger.info(
"%s: 开始执行(%s/%s),窗口[%s ~ %s]",
@@ -172,52 +199,51 @@ class BaseOdsTask(BaseTask):
seg_start,
seg_end,
)
self._log_buf.info(
"开始执行(%s/%s),窗口[%s ~ %s]",
idx, total_segments, seg_start, seg_end,
)
for _, page_records, _, response_payload in self.api.iter_paginated(
endpoint=spec.endpoint,
params=params,
page_size=page_size,
data_path=spec.data_path,
list_key=spec.list_key,
):
if (
snapshot_missing_delete
and has_is_delete
and business_pk_cols
and snapshot_mode != SnapshotMode.NONE
):
segment_keys.update(self._collect_business_keys(page_records, business_pk_cols))
# CHANGE 2026-02-18 | 收集实际最早时间戳,用于 early-cutoff 保护
if (
snapshot_protect_early_cutoff
and snapshot_mode == SnapshotMode.WINDOW
and snapshot_time_column
):
page_earliest = self._collect_earliest_time(
page_records, snapshot_time_column
)
if page_earliest is not None:
if segment_earliest_time is None or page_earliest < segment_earliest_time:
segment_earliest_time = page_earliest
# CHANGE [2026-02-24] 收集实际最晚时间戳,用于 late-cutoff 保护
page_latest = self._collect_latest_time(
page_records, snapshot_time_column
)
if page_latest is not None:
if segment_latest_time is None or page_latest > segment_latest_time:
segment_latest_time = page_latest
inserted, updated, skipped = self._insert_records_schema_aware(
table=spec.table_name,
records=page_records,
response_payload=response_payload,
source_file=source_file,
source_endpoint=spec.endpoint if spec.include_source_endpoint else None,
)
segment_counts["fetched"] += len(page_records)
segment_counts["inserted"] += inserted
segment_counts["updated"] += updated
segment_counts["skipped"] += skipped
# 构建 UnifiedPipeline 并执行当前 segment
pipeline = UnifiedPipeline(
api_client=self.api,
db_connection=self.db,
logger=self.logger,
config=pipeline_config,
cancel_token=cancel_token,
)
requests = self._build_requests(
spec, params, page_size, idx - 1,
)
process_fn = self._build_process_fn(
spec,
snapshot_missing_delete=snapshot_missing_delete,
has_is_delete=has_is_delete,
business_pk_cols=business_pk_cols,
snapshot_mode=snapshot_mode,
snapshot_protect_early_cutoff=snapshot_protect_early_cutoff,
snapshot_time_column=snapshot_time_column,
segment_keys=segment_keys,
segment_earliest_time=segment_earliest_time,
segment_latest_time=segment_latest_time,
)
write_fn = self._build_write_fn(spec, source_file)
pipe_result = pipeline.run(requests, process_fn, write_fn)
# 将 PipelineResult 映射到 segment_counts
segment_counts["fetched"] = pipe_result.total_fetched
segment_counts["inserted"] = pipe_result.total_inserted
segment_counts["updated"] = pipe_result.total_updated
segment_counts["skipped"] = pipe_result.total_skipped
segment_counts["errors"] = (
pipe_result.request_failures
+ pipe_result.processing_failures
+ pipe_result.write_failures
)
# 快照软删除(pipeline 完成后执行,保留原有逻辑)
if (
snapshot_missing_delete
and has_is_delete
@@ -230,28 +256,36 @@ class BaseOdsTask(BaseTask):
if (
snapshot_protect_early_cutoff
and snapshot_mode == SnapshotMode.WINDOW
and segment_earliest_time is not None
and segment_earliest_time > seg_start
and segment_earliest_time[0] is not None
and segment_earliest_time[0] > seg_start
):
self.logger.info(
"%s: early-cutoff 保护生效,软删除窗口起点从 %s 收窄至 %s",
spec.code, seg_start, segment_earliest_time,
spec.code, seg_start, segment_earliest_time[0],
)
effective_window_start = segment_earliest_time
self._log_buf.info(
"early-cutoff 保护生效,软删除窗口起点从 %s 收窄至 %s",
seg_start, segment_earliest_time[0],
)
effective_window_start = segment_earliest_time[0]
# CHANGE [2026-02-24] late-cutoff 保护:用 API 实际最晚时间戳收窄软删除范围
# 防止 recent endpoint 数据保留期滚动导致窗口尾部数据消失时误标删除
effective_window_end = seg_end
if (
snapshot_protect_early_cutoff
and snapshot_mode == SnapshotMode.WINDOW
and segment_latest_time is not None
and segment_latest_time < seg_end
and segment_latest_time[0] is not None
and segment_latest_time[0] < seg_end
):
self.logger.info(
"%s: late-cutoff 保护生效,软删除窗口终点从 %s 收窄至 %s",
spec.code, seg_end, segment_latest_time,
spec.code, seg_end, segment_latest_time[0],
)
effective_window_end = segment_latest_time
self._log_buf.info(
"late-cutoff 保护生效,软删除窗口终点从 %s 收窄至 %s",
seg_end, segment_latest_time[0],
)
effective_window_end = segment_latest_time[0]
deleted = self._mark_missing_as_deleted(
table=spec.table_name,
business_pk_cols=business_pk_cols,
@@ -279,6 +313,12 @@ class BaseOdsTask(BaseTask):
format_window_days(processed_days),
format_window_days(total_days),
)
self._log_buf.info(
"完成(%s/%s),已处理 %s/%s",
idx, total_segments,
format_window_days(processed_days),
format_window_days(total_days),
)
if total_segments > 1:
segment_results.append(
{
@@ -291,13 +331,76 @@ class BaseOdsTask(BaseTask):
}
)
# ── Detail_Mode:列表拉取全部完成后执行二级详情拉取 ──
detail_counts = {
"detail_success": 0,
"detail_failure": 0,
"detail_skipped": 0,
}
if spec.detail_endpoint:
self.logger.info("%s: 列表阶段完成,进入详情拉取阶段", spec.code)
self._log_buf.info("列表阶段完成,进入详情拉取阶段")
detail_pipeline = UnifiedPipeline(
api_client=self.api,
db_connection=self.db,
logger=self.logger,
config=pipeline_config,
cancel_token=cancel_token,
)
detail_requests = self._build_detail_requests(spec)
detail_process_fn = self._build_detail_process_fn(spec)
detail_write_fn = self._build_detail_write_fn(spec, source_file)
detail_result = detail_pipeline.run(
detail_requests, detail_process_fn, detail_write_fn,
)
self.db.commit()
# 填充详情统计:成功 = 完成的请求数,失败 = 请求失败数,跳过 = 0(无跳过逻辑)
detail_counts["detail_success"] = detail_result.completed_requests
detail_counts["detail_failure"] = (
detail_result.request_failures
+ detail_result.processing_failures
+ detail_result.write_failures
)
# 记录详情阶段每个失败项的错误日志
for err in detail_result.errors:
self.logger.error(
"%s: 详情请求失败, detail_id=%s, error=%s",
spec.code,
err.get("detail_id", err.get("endpoint", "unknown")),
err.get("error", "unknown"),
)
self._log_buf.error(
"详情请求失败, detail_id=%s, error=%s",
err.get("detail_id", err.get("endpoint", "unknown")),
err.get("error", "unknown"),
)
self.logger.info(
"%s: 详情拉取完成, success=%d, failure=%d, skipped=%d",
spec.code,
detail_counts["detail_success"],
detail_counts["detail_failure"],
detail_counts["detail_skipped"],
)
self._log_buf.info(
"详情拉取完成, success=%d, failure=%d, skipped=%d",
detail_counts["detail_success"],
detail_counts["detail_failure"],
detail_counts["detail_skipped"],
)
self.logger.info("%s ODS 任务完成: %s", spec.code, total_counts)
self._log_buf.info("ODS 任务完成: %s", total_counts)
allow_empty_advance = bool(self.config.get("run.allow_empty_result_advance", False))
status = "SUCCESS"
if total_counts["fetched"] == 0 and not allow_empty_advance:
status = "PARTIAL"
result = self._build_result(status, total_counts)
# 附加详情统计到结果
result["detail"] = detail_counts
overall_start = segments[0][0]
overall_end = segments[-1][1]
result["window"] = {
@@ -311,14 +414,223 @@ class BaseOdsTask(BaseTask):
result["request_params"] = params_list[0]
else:
result["request_params"] = params_list
# 任务完成,将缓冲日志一次性输出到父 logger
self._log_buf.flush()
return result
except Exception:
self.db.rollback()
total_counts["errors"] += 1
self.logger.error("%s ODS 任务失败", spec.code, exc_info=True)
self._log_buf.error("ODS 任务失败")
# 异常时也 flush,确保已收集的日志不丢失
self._log_buf.flush()
raise
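TaskLogBuffer is imported from utils.task_log_buffer at the top of this diff but its implementation is not part of the change. Judging from how execute() uses it (info/error calls while the task runs, one flush() on completion or failure), a minimal sketch of such a buffer could look like the following; the internals and the flushed log format are assumptions, not the project's actual code.

import logging

class TaskLogBuffer:
    """Collect one task's log lines and emit them as a single contiguous block
    on flush(), so concurrently running tasks do not interleave their output."""

    def __init__(self, task_code: str, logger: logging.Logger) -> None:
        self.task_code = task_code
        self.logger = logger
        self._lines: list[tuple[int, str]] = []

    def info(self, msg: str, *args) -> None:
        self._lines.append((logging.INFO, msg % args if args else msg))

    def error(self, msg: str, *args) -> None:
        self._lines.append((logging.ERROR, msg % args if args else msg))

    def flush(self) -> None:
        for level, line in self._lines:
            self.logger.log(level, "[%s] %s", self.task_code, line)
        self._lines.clear()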
# ── Pipeline 集成方法 ──
def _build_requests(
self,
spec: OdsTaskSpec,
params: dict,
page_size: int,
segment_index: int,
) -> Iterable[PipelineRequest]:
"""生成 PipelineRequest 序列,内部使用 iter_paginated 处理分页。
每一页的数据通过 _prefetched_response 预取,UnifiedPipeline 的
_request_loop 跳过 api.post(),直接使用预取数据。
"""
for page_num, page_records, total, response_payload in self.api.iter_paginated(
endpoint=spec.endpoint,
params=params,
page_size=page_size,
data_path=spec.data_path,
list_key=spec.list_key,
):
yield PipelineRequest(
endpoint=spec.endpoint,
params=params,
page_size=page_size,
data_path=spec.data_path,
list_key=spec.list_key,
segment_index=segment_index,
_prefetched_response={
"records": page_records,
"response_payload": response_payload,
},
)
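The _prefetched_response field referenced in the docstring is consumed inside UnifiedPipeline, which this diff does not show. A rough sketch of how its request loop might honor the prefetched page instead of re-fetching it (the method name and the api_client.post signature are assumptions):

def _fetch(self, request: PipelineRequest) -> dict:
    # If iter_paginated already pulled this page, reuse it rather than
    # issuing a second api.post() for the same endpoint/params.
    if request._prefetched_response is not None:
        return request._prefetched_response
    return self.api_client.post(request.endpoint, request.params)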
def _build_process_fn(
self,
spec: OdsTaskSpec,
*,
snapshot_missing_delete: bool,
has_is_delete: bool,
business_pk_cols: list[str],
snapshot_mode: SnapshotMode,
snapshot_protect_early_cutoff: bool,
snapshot_time_column: str | None,
segment_keys: set[tuple],
segment_earliest_time: list[datetime | None],
segment_latest_time: list[datetime | None],
) -> Callable[[Any], list[dict]]:
"""构建处理函数:从预取响应中提取记录,收集快照软删除所需的共享状态。"""
def process_fn(response: Any) -> list[dict]:
# response 是 _prefetched_response 字典
records = response.get("records", [])
if not records:
return []
# 收集业务主键(快照软删除用)
if (
snapshot_missing_delete
and has_is_delete
and business_pk_cols
and snapshot_mode != SnapshotMode.NONE
):
segment_keys.update(
self._collect_business_keys(records, business_pk_cols)
)
# CHANGE 2026-02-18 | 收集实际最早时间戳,用于 early-cutoff 保护
if (
snapshot_protect_early_cutoff
and snapshot_mode == SnapshotMode.WINDOW
and snapshot_time_column
):
page_earliest = self._collect_earliest_time(
records, snapshot_time_column
)
if page_earliest is not None:
if segment_earliest_time[0] is None or page_earliest < segment_earliest_time[0]:
segment_earliest_time[0] = page_earliest
# CHANGE [2026-02-24] 收集实际最晚时间戳,用于 late-cutoff 保护
page_latest = self._collect_latest_time(
records, snapshot_time_column
)
if page_latest is not None:
if segment_latest_time[0] is None or page_latest > segment_latest_time[0]:
segment_latest_time[0] = page_latest
return records
return process_fn
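The reason segment_earliest_time and segment_latest_time changed from plain datetimes to one-element lists is visible here: process_fn is a closure, and a closure cannot rebind the caller's local variable, but it can mutate a container both sides share. A standalone illustration of the pattern:

from datetime import datetime
from typing import Callable

def make_earliest_tracker() -> tuple[list[datetime | None], Callable[[datetime], None]]:
    earliest: list[datetime | None] = [None]

    def observe(ts: datetime) -> None:
        # Mutating the shared list is visible to the caller; a plain
        # "earliest = ts" would only rebind a name local to the closure.
        if earliest[0] is None or ts < earliest[0]:
            earliest[0] = ts

    return earliest, observe

After the pipeline run, the caller reads earliest[0], exactly as the early-cutoff check in execute() reads segment_earliest_time[0].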
def _build_write_fn(
self,
spec: OdsTaskSpec,
source_file: str | None,
) -> Callable[[list[dict]], WriteResult]:
"""构建写入函数:调用 _insert_records_schema_aware返回 WriteResult。"""
def write_fn(records: list[dict]) -> WriteResult:
inserted, updated, skipped = self._insert_records_schema_aware(
table=spec.table_name,
records=records,
response_payload=None,
source_file=source_file,
source_endpoint=spec.endpoint if spec.include_source_endpoint else None,
)
return WriteResult(inserted=inserted, updated=updated, skipped=skipped)
return write_fn
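WriteResult itself comes from pipeline.models (imported at the top of this diff) and is not defined here. Judging only from the three keyword arguments used above, it is presumably a small counter container roughly like this (an assumption, not the actual model):

from dataclasses import dataclass

@dataclass
class WriteResult:
    inserted: int = 0
    updated: int = 0
    skipped: int = 0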
# ── Detail_Mode 方法 ──
def _build_detail_requests(
self,
spec: OdsTaskSpec,
) -> Iterable[PipelineRequest]:
"""从已写入 ODS 的记录中提取 ID 列表,生成详情请求序列。
仅在 spec.detail_endpoint 已配置时调用。查询 ODS 目标表获取
detail_id_column 列的值,为每个 ID 生成一个 is_detail=True 的
PipelineRequest。
"""
if not spec.detail_endpoint or not spec.detail_id_column:
return
# 从 ODS 目标表查询刚写入的 ID 列表
id_col = spec.detail_id_column
table = spec.table_name
query = f"SELECT DISTINCT {id_col} FROM {table} WHERE {id_col} IS NOT NULL"
try:
cursor = self.db.cursor()
cursor.execute(query)
rows = cursor.fetchall()
cursor.close()
except Exception:
self.logger.error(
"%s: 查询详情 ID 列表失败, table=%s, column=%s",
spec.code, table, id_col, exc_info=True,
)
return
if not rows:
self.logger.info("%s: 无需拉取详情ID 列表为空", spec.code)
return
self.logger.info(
"%s: 开始详情拉取,共 %d 个 ID", spec.code, len(rows),
)
for (record_id,) in rows:
# 使用 detail_param_builder 构造请求参数,或默认 {"id": record_id}
if spec.detail_param_builder:
params = spec.detail_param_builder({"id": record_id})
else:
params = {"id": record_id}
yield PipelineRequest(
endpoint=spec.detail_endpoint,
params=params,
data_path=spec.detail_data_path or ("data",),
list_key=spec.detail_list_key,
is_detail=True,
detail_id=record_id,
)
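As a worked example: with the group-buy spec near the bottom of this diff (detail_endpoint="/PackageCoupon/QueryPackageCouponInfo", detail_param_builder=lambda rec: {"couponId": rec["id"]}, detail_data_path=("data",)), a hypothetical ID of 42 read from the ODS table turns into:

params = spec.detail_param_builder({"id": 42})   # -> {"couponId": 42}
request = PipelineRequest(
    endpoint="/PackageCoupon/QueryPackageCouponInfo",
    params=params,
    data_path=("data",),
    list_key=None,          # detail_list_key is unset for this spec
    is_detail=True,
    detail_id=42,
)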
def _build_detail_process_fn(
self,
spec: OdsTaskSpec,
) -> Callable[[Any], list[dict]]:
"""构建详情阶段的处理函数:从预取响应中提取记录。
优先使用 spec.detail_process_fn(自定义处理函数),
否则回退到默认的 response.get("records") 提取。
"""
if spec.detail_process_fn is not None:
return spec.detail_process_fn
def detail_process_fn(response: Any) -> list[dict]:
records = response.get("records", [])
return records
return detail_process_fn
def _build_detail_write_fn(
self,
spec: OdsTaskSpec,
source_file: str | None,
) -> Callable[[list[dict]], WriteResult]:
"""构建详情阶段的写入函数:写入 detail_target_table。"""
target_table = spec.detail_target_table or spec.table_name
def detail_write_fn(records: list[dict]) -> WriteResult:
inserted, updated, skipped = self._insert_records_schema_aware(
table=target_table,
records=records,
response_payload=None,
source_file=source_file,
source_endpoint=spec.detail_endpoint if spec.include_source_endpoint else None,
)
return WriteResult(inserted=inserted, updated=updated, skipped=skipped)
return detail_write_fn
def _resolve_window(self, cursor_data: dict | None) -> tuple[datetime, datetime, int]:
base_start, base_end, base_minutes = self._get_time_window(cursor_data)
@@ -909,6 +1221,18 @@ class BaseOdsTask(BaseTask):
_fill_missing("siteid", [site_profile.get("siteId"), site_profile.get("id")])
_fill_missing("sitename", [site_profile.get("shop_name"), site_profile.get("siteName")])
# 通用 siteid 注入:ODS 表有 siteid 列但 API 记录不含时,从 app.store_id 填充
# 场景:goods_stock_summary 等按门店请求但返回记录不含 siteId 的接口
ods_has_siteid = any(c[0].lower() == "siteid" for c in cols_info)
if ods_has_siteid:
store_id = TypeParser.parse_int(self.config.get("app.store_id"))
if store_id:
for item in merged_records:
merged = item["merged"]
existing = self._get_value_case_insensitive(merged, "siteid")
if existing in (None, "", 0):
merged["siteid"] = store_id
business_keys = [c for c in pk_cols if str(c).lower() != "content_hash"]
# P2(A): 使用 spec 上的显式开关控制去重,不再隐式依赖 has_fetched_at
# CHANGE 2026-02-19 | force_full_update 时仍查最新 hash用于判断是否回退到历史版本
@@ -1240,6 +1564,56 @@ def _bool_col(name: str, *sources: str) -> ColumnSpec:
return ColumnSpec(column=name, sources=sources, transform=_to_bool)
# ── 团购详情接口自定义 process_fn ──
# API 原始响应结构:{"data": {"groupPurchasePackage": {...}, "packageCouponAssistants": [...], ...}, "code": 0}
# detail_mode 下 process_fn 收到的是 api.post() 的原始 JSON 响应
def _group_package_detail_process_fn(response: Any) -> list[dict]:
"""从 QueryPackageCouponInfo 响应中提取字段,组装为一条扁平记录。
匹配 ods.group_buy_package_details 表结构。
"""
data = response.get("data")
if not data:
return []
pkg = data.get("groupPurchasePackage")
if not pkg:
return []
# 结构化字段(来自 data.groupPurchasePackage)
record: dict[str, Any] = {
"coupon_id": pkg.get("id"),
"package_name": pkg.get("package_name"),
"duration": pkg.get("duration"),
"start_time": pkg.get("start_time"),
"end_time": pkg.get("end_time"),
"add_start_clock": pkg.get("add_start_clock"),
"add_end_clock": pkg.get("add_end_clock"),
"is_enabled": pkg.get("is_enabled"),
"is_delete": pkg.get("is_delete"),
"site_id": pkg.get("site_id"),
"tenant_id": pkg.get("tenant_id"),
"create_time": pkg.get("create_time"),
"creator_name": pkg.get("creator_name"),
}
# JSONB 数组字段
record["table_area_ids"] = pkg.get("tableAreaId")
record["table_area_names"] = pkg.get("tableAreaNameList")
record["assistant_services"] = data.get("packageCouponAssistants")
record["groupon_site_infos"] = data.get("grouponSiteInfos")
record["package_services"] = data.get("packagePackageService")
record["coupon_details_list"] = data.get("packageCouponDetailsList")
# content_hash:对业务字段(不含 content_hash、payload、fetched_at)计算 SHA256
hash_input = json.dumps(record, sort_keys=True, ensure_ascii=False, default=str)
record["content_hash"] = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
# payload:完整的 data 对象
record["payload"] = data
return [record]
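For orientation, running the function above on a minimal made-up response produces one flat record (field values invented for illustration):

sample_response = {
    "code": 0,
    "data": {
        "groupPurchasePackage": {"id": 7, "package_name": "双人套餐", "site_id": 3},
        "packageCouponAssistants": [],
        "grouponSiteInfos": [],
    },
}
rows = _group_package_detail_process_fn(sample_response)
# rows[0]["coupon_id"] == 7, rows[0]["payload"] is the whole "data" object, and
# rows[0]["content_hash"] is the SHA-256 of the business fields only: payload is
# attached after the hash is computed, so it never influences the hash.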
ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
@@ -1251,9 +1625,18 @@ ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
data_path=("data",),
list_key="assistantInfos",
pk_columns=(_int_col("id", "id", required=True),),
extra_params={
"workStatusEnum": 0,
"dingTalkSynced": 0,
"leaveId": 0,
"criticismStatus": 0,
"signStatus": -1,
},
include_source_endpoint=False,
include_fetched_at=False,
include_record_index=True,
requires_window=False,
time_fields=None,
snapshot_mode=SnapshotMode.FULL_TABLE,
description="助教账号档案 ODSSearchAssistantInfo -> assistantInfos 原始 JSON",
),
@@ -1314,7 +1697,8 @@ ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
include_source_endpoint=False,
include_fetched_at=False,
include_record_index=True,
requires_window=False,
requires_window=True,
time_fields=("startTime", "endTime"),
snapshot_mode=SnapshotMode.WINDOW,
snapshot_time_column="create_time",
description="门店商品销售流水 ODSGetGoodsSalesList -> orderGoodsLedgers 原始 JSON",
@@ -1499,6 +1883,13 @@ ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
include_record_index=True,
requires_window=False,
snapshot_mode=SnapshotMode.FULL_TABLE,
# ── Detail_Mode 配置:团购详情接口 ──
detail_endpoint="/PackageCoupon/QueryPackageCouponInfo",
detail_param_builder=lambda rec: {"couponId": rec["id"]},
detail_target_table="ods.group_buy_package_details",
detail_data_path=("data",),
detail_id_column="id",
detail_process_fn=_group_package_detail_process_fn,
description="团购套餐定义 ODSQueryPackageCouponList -> packageCouponList 原始 JSON",
),
OdsTaskSpec(