微信小程序页面迁移校验之前 P5任务处理之前
This commit is contained in:
@@ -15,6 +15,11 @@ from psycopg2.extras import Json, execute_values
|
||||
from models.parsers import TypeParser
|
||||
from tasks.base_task import BaseTask
|
||||
from utils.windowing import build_window_segments, calc_window_minutes, calc_window_days, format_window_days
|
||||
from config.pipeline_config import PipelineConfig
|
||||
from pipeline.models import PipelineRequest, PipelineResult, WriteResult
|
||||
from pipeline.unified_pipeline import UnifiedPipeline
|
||||
from utils.cancellation import CancellationToken
|
||||
from utils.task_log_buffer import TaskLogBuffer
|
||||
|
||||
|
||||
ColumnTransform = Callable[[Any], Any]
|
||||
@@ -67,6 +72,15 @@ class OdsTaskSpec:
|
||||
# WINDOW 模式的时间列名
|
||||
snapshot_time_column: str | None = None
|
||||
|
||||
# ── Detail_Mode 可选配置(二级详情拉取)──
|
||||
detail_endpoint: str | None = None # 详情接口 endpoint
|
||||
detail_param_builder: Callable[[dict], dict] | None = None # 详情请求参数构造函数
|
||||
detail_target_table: str | None = None # 详情数据目标表名
|
||||
detail_data_path: Tuple[str, ...] | None = None # 详情数据的 data_path
|
||||
detail_list_key: str | None = None # 详情数据的 list_key
|
||||
detail_id_column: str | None = None # 从列表数据中提取 ID 的列名
|
||||
detail_process_fn: Callable[[Any], list[dict]] | None = None # 自定义详情处理函数
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.snapshot_mode == SnapshotMode.WINDOW and not self.snapshot_time_column:
|
||||
raise ValueError(
|
||||
@@ -88,7 +102,10 @@ class BaseOdsTask(BaseTask):
|
||||
|
||||
def execute(self, cursor_data: dict | None = None) -> dict:
|
||||
spec = self.SPEC
|
||||
# 创建任务级日志缓冲区,任务完成后一次性输出,避免多任务日志交叉
|
||||
self._log_buf = TaskLogBuffer(spec.code, self.logger)
|
||||
self.logger.info("开始执行%s (ODS)", spec.code)
|
||||
self._log_buf.info("开始执行%s (ODS)", spec.code)
|
||||
|
||||
window_start, window_end, window_minutes = self._resolve_window(cursor_data)
|
||||
segments = build_window_segments(
|
||||
@@ -111,6 +128,11 @@ class BaseOdsTask(BaseTask):
|
||||
total_segments,
|
||||
format_window_days(total_days),
|
||||
)
|
||||
self._log_buf.info(
|
||||
"窗口拆分为 %s 段(共 %s 天)",
|
||||
total_segments,
|
||||
format_window_days(total_days),
|
||||
)
|
||||
|
||||
store_id = TypeParser.parse_int(self.config.get("app.store_id"))
|
||||
if not store_id:
|
||||
@@ -141,6 +163,10 @@ class BaseOdsTask(BaseTask):
|
||||
]
|
||||
has_is_delete = self._table_has_column(spec.table_name, "is_delete")
|
||||
|
||||
# 构建 PipelineConfig(支持任务级覆盖)
|
||||
pipeline_config = PipelineConfig.from_app_config(self.config, spec.code)
|
||||
cancel_token = getattr(self, '_cancel_token', None) or CancellationToken()
|
||||
|
||||
try:
|
||||
for idx, (seg_start, seg_end) in enumerate(segments, start=1):
|
||||
params = self._build_params(
|
||||
@@ -158,11 +184,12 @@ class BaseOdsTask(BaseTask):
|
||||
"errors": 0,
|
||||
"deleted": 0,
|
||||
}
|
||||
# 快照软删除需要的共享状态(process_fn 闭包写入)
|
||||
segment_keys: set[tuple] = set()
|
||||
# CHANGE 2026-02-18 | 收集 WINDOW 模式下 API 返回数据的实际最早时间戳
|
||||
segment_earliest_time: datetime | None = None
|
||||
segment_earliest_time: list[datetime | None] = [None]
|
||||
# CHANGE [2026-02-24] 收集 API 返回数据的实际最晚时间戳,用于 late-cutoff 保护
|
||||
segment_latest_time: datetime | None = None
|
||||
segment_latest_time: list[datetime | None] = [None]
|
||||
|
||||
self.logger.info(
|
||||
"%s: 开始执行(%s/%s),窗口[%s ~ %s]",
|
||||
@@ -172,52 +199,51 @@ class BaseOdsTask(BaseTask):
|
||||
seg_start,
|
||||
seg_end,
|
||||
)
|
||||
self._log_buf.info(
|
||||
"开始执行(%s/%s),窗口[%s ~ %s]",
|
||||
idx, total_segments, seg_start, seg_end,
|
||||
)
|
||||
|
||||
for _, page_records, _, response_payload in self.api.iter_paginated(
|
||||
endpoint=spec.endpoint,
|
||||
params=params,
|
||||
page_size=page_size,
|
||||
data_path=spec.data_path,
|
||||
list_key=spec.list_key,
|
||||
):
|
||||
if (
|
||||
snapshot_missing_delete
|
||||
and has_is_delete
|
||||
and business_pk_cols
|
||||
and snapshot_mode != SnapshotMode.NONE
|
||||
):
|
||||
segment_keys.update(self._collect_business_keys(page_records, business_pk_cols))
|
||||
# CHANGE 2026-02-18 | 收集实际最早时间戳,用于 early-cutoff 保护
|
||||
if (
|
||||
snapshot_protect_early_cutoff
|
||||
and snapshot_mode == SnapshotMode.WINDOW
|
||||
and snapshot_time_column
|
||||
):
|
||||
page_earliest = self._collect_earliest_time(
|
||||
page_records, snapshot_time_column
|
||||
)
|
||||
if page_earliest is not None:
|
||||
if segment_earliest_time is None or page_earliest < segment_earliest_time:
|
||||
segment_earliest_time = page_earliest
|
||||
# CHANGE [2026-02-24] 收集实际最晚时间戳,用于 late-cutoff 保护
|
||||
page_latest = self._collect_latest_time(
|
||||
page_records, snapshot_time_column
|
||||
)
|
||||
if page_latest is not None:
|
||||
if segment_latest_time is None or page_latest > segment_latest_time:
|
||||
segment_latest_time = page_latest
|
||||
inserted, updated, skipped = self._insert_records_schema_aware(
|
||||
table=spec.table_name,
|
||||
records=page_records,
|
||||
response_payload=response_payload,
|
||||
source_file=source_file,
|
||||
source_endpoint=spec.endpoint if spec.include_source_endpoint else None,
|
||||
)
|
||||
segment_counts["fetched"] += len(page_records)
|
||||
segment_counts["inserted"] += inserted
|
||||
segment_counts["updated"] += updated
|
||||
segment_counts["skipped"] += skipped
|
||||
# 构建 UnifiedPipeline 并执行当前 segment
|
||||
pipeline = UnifiedPipeline(
|
||||
api_client=self.api,
|
||||
db_connection=self.db,
|
||||
logger=self.logger,
|
||||
config=pipeline_config,
|
||||
cancel_token=cancel_token,
|
||||
)
|
||||
|
||||
requests = self._build_requests(
|
||||
spec, params, page_size, idx - 1,
|
||||
)
|
||||
process_fn = self._build_process_fn(
|
||||
spec,
|
||||
snapshot_missing_delete=snapshot_missing_delete,
|
||||
has_is_delete=has_is_delete,
|
||||
business_pk_cols=business_pk_cols,
|
||||
snapshot_mode=snapshot_mode,
|
||||
snapshot_protect_early_cutoff=snapshot_protect_early_cutoff,
|
||||
snapshot_time_column=snapshot_time_column,
|
||||
segment_keys=segment_keys,
|
||||
segment_earliest_time=segment_earliest_time,
|
||||
segment_latest_time=segment_latest_time,
|
||||
)
|
||||
write_fn = self._build_write_fn(spec, source_file)
|
||||
|
||||
pipe_result = pipeline.run(requests, process_fn, write_fn)
|
||||
|
||||
# 将 PipelineResult 映射到 segment_counts
|
||||
segment_counts["fetched"] = pipe_result.total_fetched
|
||||
segment_counts["inserted"] = pipe_result.total_inserted
|
||||
segment_counts["updated"] = pipe_result.total_updated
|
||||
segment_counts["skipped"] = pipe_result.total_skipped
|
||||
segment_counts["errors"] = (
|
||||
pipe_result.request_failures
|
||||
+ pipe_result.processing_failures
|
||||
+ pipe_result.write_failures
|
||||
)
|
||||
|
||||
# 快照软删除(pipeline 完成后执行,保留原有逻辑)
|
||||
if (
|
||||
snapshot_missing_delete
|
||||
and has_is_delete
|
||||
@@ -230,28 +256,36 @@ class BaseOdsTask(BaseTask):
|
||||
if (
|
||||
snapshot_protect_early_cutoff
|
||||
and snapshot_mode == SnapshotMode.WINDOW
|
||||
and segment_earliest_time is not None
|
||||
and segment_earliest_time > seg_start
|
||||
and segment_earliest_time[0] is not None
|
||||
and segment_earliest_time[0] > seg_start
|
||||
):
|
||||
self.logger.info(
|
||||
"%s: early-cutoff 保护生效,软删除窗口起点从 %s 收窄至 %s",
|
||||
spec.code, seg_start, segment_earliest_time,
|
||||
spec.code, seg_start, segment_earliest_time[0],
|
||||
)
|
||||
effective_window_start = segment_earliest_time
|
||||
self._log_buf.info(
|
||||
"early-cutoff 保护生效,软删除窗口起点从 %s 收窄至 %s",
|
||||
seg_start, segment_earliest_time[0],
|
||||
)
|
||||
effective_window_start = segment_earliest_time[0]
|
||||
# CHANGE [2026-02-24] late-cutoff 保护:用 API 实际最晚时间戳收窄软删除范围
|
||||
# 防止 recent endpoint 数据保留期滚动导致窗口尾部数据消失时误标删除
|
||||
effective_window_end = seg_end
|
||||
if (
|
||||
snapshot_protect_early_cutoff
|
||||
and snapshot_mode == SnapshotMode.WINDOW
|
||||
and segment_latest_time is not None
|
||||
and segment_latest_time < seg_end
|
||||
and segment_latest_time[0] is not None
|
||||
and segment_latest_time[0] < seg_end
|
||||
):
|
||||
self.logger.info(
|
||||
"%s: late-cutoff 保护生效,软删除窗口终点从 %s 收窄至 %s",
|
||||
spec.code, seg_end, segment_latest_time,
|
||||
spec.code, seg_end, segment_latest_time[0],
|
||||
)
|
||||
effective_window_end = segment_latest_time
|
||||
self._log_buf.info(
|
||||
"late-cutoff 保护生效,软删除窗口终点从 %s 收窄至 %s",
|
||||
seg_end, segment_latest_time[0],
|
||||
)
|
||||
effective_window_end = segment_latest_time[0]
|
||||
deleted = self._mark_missing_as_deleted(
|
||||
table=spec.table_name,
|
||||
business_pk_cols=business_pk_cols,
|
||||
@@ -279,6 +313,12 @@ class BaseOdsTask(BaseTask):
|
||||
format_window_days(processed_days),
|
||||
format_window_days(total_days),
|
||||
)
|
||||
self._log_buf.info(
|
||||
"完成(%s/%s),已处理 %s/%s 天",
|
||||
idx, total_segments,
|
||||
format_window_days(processed_days),
|
||||
format_window_days(total_days),
|
||||
)
|
||||
if total_segments > 1:
|
||||
segment_results.append(
|
||||
{
|
||||
@@ -291,13 +331,76 @@ class BaseOdsTask(BaseTask):
|
||||
}
|
||||
)
|
||||
|
||||
# ── Detail_Mode:列表拉取全部完成后,执行二级详情拉取 ──
|
||||
detail_counts = {
|
||||
"detail_success": 0,
|
||||
"detail_failure": 0,
|
||||
"detail_skipped": 0,
|
||||
}
|
||||
if spec.detail_endpoint:
|
||||
self.logger.info("%s: 列表阶段完成,进入详情拉取阶段", spec.code)
|
||||
self._log_buf.info("列表阶段完成,进入详情拉取阶段")
|
||||
detail_pipeline = UnifiedPipeline(
|
||||
api_client=self.api,
|
||||
db_connection=self.db,
|
||||
logger=self.logger,
|
||||
config=pipeline_config,
|
||||
cancel_token=cancel_token,
|
||||
)
|
||||
detail_requests = self._build_detail_requests(spec)
|
||||
detail_process_fn = self._build_detail_process_fn(spec)
|
||||
detail_write_fn = self._build_detail_write_fn(spec, source_file)
|
||||
|
||||
detail_result = detail_pipeline.run(
|
||||
detail_requests, detail_process_fn, detail_write_fn,
|
||||
)
|
||||
self.db.commit()
|
||||
|
||||
# 填充详情统计:成功 = 完成的请求数,失败 = 请求失败数,跳过 = 0(无跳过逻辑)
|
||||
detail_counts["detail_success"] = detail_result.completed_requests
|
||||
detail_counts["detail_failure"] = (
|
||||
detail_result.request_failures
|
||||
+ detail_result.processing_failures
|
||||
+ detail_result.write_failures
|
||||
)
|
||||
# 记录详情阶段每个失败项的错误日志
|
||||
for err in detail_result.errors:
|
||||
self.logger.error(
|
||||
"%s: 详情请求失败, detail_id=%s, error=%s",
|
||||
spec.code,
|
||||
err.get("detail_id", err.get("endpoint", "unknown")),
|
||||
err.get("error", "unknown"),
|
||||
)
|
||||
self._log_buf.error(
|
||||
"详情请求失败, detail_id=%s, error=%s",
|
||||
err.get("detail_id", err.get("endpoint", "unknown")),
|
||||
err.get("error", "unknown"),
|
||||
)
|
||||
|
||||
self.logger.info(
|
||||
"%s: 详情拉取完成, success=%d, failure=%d, skipped=%d",
|
||||
spec.code,
|
||||
detail_counts["detail_success"],
|
||||
detail_counts["detail_failure"],
|
||||
detail_counts["detail_skipped"],
|
||||
)
|
||||
self._log_buf.info(
|
||||
"详情拉取完成, success=%d, failure=%d, skipped=%d",
|
||||
detail_counts["detail_success"],
|
||||
detail_counts["detail_failure"],
|
||||
detail_counts["detail_skipped"],
|
||||
)
|
||||
|
||||
self.logger.info("%s ODS 任务完成: %s", spec.code, total_counts)
|
||||
self._log_buf.info("ODS 任务完成: %s", total_counts)
|
||||
allow_empty_advance = bool(self.config.get("run.allow_empty_result_advance", False))
|
||||
status = "SUCCESS"
|
||||
if total_counts["fetched"] == 0 and not allow_empty_advance:
|
||||
status = "PARTIAL"
|
||||
|
||||
result = self._build_result(status, total_counts)
|
||||
# 附加详情统计到结果
|
||||
result["detail"] = detail_counts
|
||||
overall_start = segments[0][0]
|
||||
overall_end = segments[-1][1]
|
||||
result["window"] = {
|
||||
@@ -311,14 +414,223 @@ class BaseOdsTask(BaseTask):
|
||||
result["request_params"] = params_list[0]
|
||||
else:
|
||||
result["request_params"] = params_list
|
||||
# 任务完成,将缓冲日志一次性输出到父 logger
|
||||
self._log_buf.flush()
|
||||
return result
|
||||
|
||||
except Exception:
|
||||
self.db.rollback()
|
||||
total_counts["errors"] += 1
|
||||
self.logger.error("%s ODS 任务失败", spec.code, exc_info=True)
|
||||
self._log_buf.error("ODS 任务失败")
|
||||
# 异常时也 flush,确保已收集的日志不丢失
|
||||
self._log_buf.flush()
|
||||
raise
|
||||
|
||||
# ── Pipeline 集成方法 ──
|
||||
|
||||
def _build_requests(
    self,
    spec: OdsTaskSpec,
    params: dict,
    page_size: int,
    segment_index: int,
) -> Iterable[PipelineRequest]:
    """Yield one ``PipelineRequest`` per page of the list endpoint.

    Pagination is driven by ``api.iter_paginated``; each page's records are
    attached via ``_prefetched_response`` so that ``UnifiedPipeline``'s
    request loop can skip ``api.post()`` and consume the pre-fetched data
    directly.

    Args:
        spec: Task specification providing endpoint, data_path and list_key.
        params: Request parameters shared by every page.
        page_size: Page size forwarded to the paginator.
        segment_index: Zero-based index of the current window segment.
    """
    # The page number and the reported total are not needed downstream;
    # underscore names make the intentional discard explicit.
    for _page_num, page_records, _total, response_payload in self.api.iter_paginated(
        endpoint=spec.endpoint,
        params=params,
        page_size=page_size,
        data_path=spec.data_path,
        list_key=spec.list_key,
    ):
        yield PipelineRequest(
            endpoint=spec.endpoint,
            params=params,
            page_size=page_size,
            data_path=spec.data_path,
            list_key=spec.list_key,
            segment_index=segment_index,
            # Pre-fetched page: the pipeline uses this instead of re-requesting.
            _prefetched_response={
                "records": page_records,
                "response_payload": response_payload,
            },
        )
|
||||
|
||||
def _build_process_fn(
    self,
    spec: OdsTaskSpec,
    *,
    snapshot_missing_delete: bool,
    has_is_delete: bool,
    business_pk_cols: list[str],
    snapshot_mode: SnapshotMode,
    snapshot_protect_early_cutoff: bool,
    snapshot_time_column: str | None,
    segment_keys: set[tuple],
    segment_earliest_time: list[datetime | None],
    segment_latest_time: list[datetime | None],
) -> Callable[[Any], list[dict]]:
    """Build the per-page processing callback for one window segment.

    The returned callable extracts the record list from a pre-fetched
    response and, as a side effect, feeds the shared snapshot state:
    business keys for soft-delete detection, and the earliest/latest
    observed timestamps (held in single-element lists so the closure can
    mutate them) for early-/late-cutoff protection.
    """
    # Decide once, outside the per-page callback, which bookkeeping applies —
    # every input to these conditions is fixed for the segment's lifetime.
    collect_keys = (
        snapshot_missing_delete
        and has_is_delete
        and bool(business_pk_cols)
        and snapshot_mode != SnapshotMode.NONE
    )
    track_times = (
        snapshot_protect_early_cutoff
        and snapshot_mode == SnapshotMode.WINDOW
        and bool(snapshot_time_column)
    )

    def process_fn(response: Any) -> list[dict]:
        # ``response`` is the _prefetched_response dict produced upstream.
        records = response.get("records", [])
        if not records:
            return []

        if collect_keys:
            # Business keys seen on this page (snapshot soft-delete input).
            segment_keys.update(
                self._collect_business_keys(records, business_pk_cols)
            )

        if track_times:
            # CHANGE 2026-02-18 | track the earliest observed timestamp
            # for early-cutoff protection.
            earliest = self._collect_earliest_time(records, snapshot_time_column)
            if earliest is not None and (
                segment_earliest_time[0] is None
                or earliest < segment_earliest_time[0]
            ):
                segment_earliest_time[0] = earliest

            # CHANGE [2026-02-24] | track the latest observed timestamp
            # for late-cutoff protection.
            latest = self._collect_latest_time(records, snapshot_time_column)
            if latest is not None and (
                segment_latest_time[0] is None
                or latest > segment_latest_time[0]
            ):
                segment_latest_time[0] = latest

        return records

    return process_fn
|
||||
|
||||
def _build_write_fn(
    self,
    spec: OdsTaskSpec,
    source_file: str | None,
) -> Callable[[list[dict]], WriteResult]:
    """Build the write callback: persist records through
    ``_insert_records_schema_aware`` and report counts as a ``WriteResult``."""
    # Resolve the destination once; these values never change between pages.
    target_table = spec.table_name
    endpoint_tag = spec.endpoint if spec.include_source_endpoint else None

    def write_fn(records: list[dict]) -> WriteResult:
        counts = self._insert_records_schema_aware(
            table=target_table,
            records=records,
            response_payload=None,
            source_file=source_file,
            source_endpoint=endpoint_tag,
        )
        inserted, updated, skipped = counts
        return WriteResult(inserted=inserted, updated=updated, skipped=skipped)

    return write_fn
|
||||
|
||||
# ── Detail_Mode 方法 ──
|
||||
|
||||
def _build_detail_requests(
    self,
    spec: OdsTaskSpec,
) -> Iterable[PipelineRequest]:
    """Yield detail-stage requests for every ID already written to ODS.

    Only meaningful when ``spec.detail_endpoint`` is configured: queries the
    ODS target table for distinct values of ``detail_id_column`` and emits
    one ``is_detail=True`` ``PipelineRequest`` per ID.
    """
    if not spec.detail_endpoint or not spec.detail_id_column:
        return

    # Query the freshly-written ID list from the ODS target table.
    # NOTE: id_col/table are identifiers from the task spec (trusted code),
    # not user input, so f-string interpolation is acceptable here —
    # identifiers cannot be bound as SQL parameters anyway.
    id_col = spec.detail_id_column
    table = spec.table_name
    query = f"SELECT DISTINCT {id_col} FROM {table} WHERE {id_col} IS NOT NULL"
    cursor = None
    try:
        cursor = self.db.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
    except Exception:
        # Best-effort stage: log and skip detail fetching on query failure.
        self.logger.error(
            "%s: 查询详情 ID 列表失败, table=%s, column=%s",
            spec.code, table, id_col, exc_info=True,
        )
        return
    finally:
        # BUGFIX: the original closed the cursor only on the success path,
        # leaking it whenever execute()/fetchall() raised.
        if cursor is not None:
            cursor.close()

    if not rows:
        self.logger.info("%s: 无需拉取详情,ID 列表为空", spec.code)
        return

    self.logger.info(
        "%s: 开始详情拉取,共 %d 个 ID", spec.code, len(rows),
    )

    for (record_id,) in rows:
        # Build request params via detail_param_builder when configured,
        # otherwise default to {"id": record_id}.
        if spec.detail_param_builder:
            params = spec.detail_param_builder({"id": record_id})
        else:
            params = {"id": record_id}

        yield PipelineRequest(
            endpoint=spec.detail_endpoint,
            params=params,
            data_path=spec.detail_data_path or ("data",),
            list_key=spec.detail_list_key,
            is_detail=True,
            detail_id=record_id,
        )
|
||||
|
||||
def _build_detail_process_fn(
|
||||
self,
|
||||
spec: OdsTaskSpec,
|
||||
) -> Callable[[Any], list[dict]]:
|
||||
"""构建详情阶段的处理函数:从预取响应中提取记录。
|
||||
|
||||
优先使用 spec.detail_process_fn(自定义处理函数),
|
||||
否则回退到默认的 response.get("records") 提取。
|
||||
"""
|
||||
if spec.detail_process_fn is not None:
|
||||
return spec.detail_process_fn
|
||||
|
||||
def detail_process_fn(response: Any) -> list[dict]:
|
||||
records = response.get("records", [])
|
||||
return records
|
||||
|
||||
return detail_process_fn
|
||||
|
||||
def _build_detail_write_fn(
    self,
    spec: OdsTaskSpec,
    source_file: str | None,
) -> Callable[[list[dict]], WriteResult]:
    """Build the detail-stage write callback targeting ``detail_target_table``."""
    # Fall back to the list table when no dedicated detail table is configured.
    destination = spec.detail_target_table or spec.table_name
    endpoint_tag = spec.detail_endpoint if spec.include_source_endpoint else None

    def detail_write_fn(records: list[dict]) -> WriteResult:
        inserted, updated, skipped = self._insert_records_schema_aware(
            table=destination,
            records=records,
            response_payload=None,
            source_file=source_file,
            source_endpoint=endpoint_tag,
        )
        return WriteResult(inserted=inserted, updated=updated, skipped=skipped)

    return detail_write_fn
|
||||
|
||||
|
||||
def _resolve_window(self, cursor_data: dict | None) -> tuple[datetime, datetime, int]:
|
||||
base_start, base_end, base_minutes = self._get_time_window(cursor_data)
|
||||
|
||||
@@ -909,6 +1221,18 @@ class BaseOdsTask(BaseTask):
|
||||
_fill_missing("siteid", [site_profile.get("siteId"), site_profile.get("id")])
|
||||
_fill_missing("sitename", [site_profile.get("shop_name"), site_profile.get("siteName")])
|
||||
|
||||
# 通用 siteid 注入:ODS 表有 siteid 列但 API 记录不含时,从 app.store_id 填充
|
||||
# 场景:goods_stock_summary 等按门店请求但返回记录不含 siteId 的接口
|
||||
ods_has_siteid = any(c[0].lower() == "siteid" for c in cols_info)
|
||||
if ods_has_siteid:
|
||||
store_id = TypeParser.parse_int(self.config.get("app.store_id"))
|
||||
if store_id:
|
||||
for item in merged_records:
|
||||
merged = item["merged"]
|
||||
existing = self._get_value_case_insensitive(merged, "siteid")
|
||||
if existing in (None, "", 0):
|
||||
merged["siteid"] = store_id
|
||||
|
||||
business_keys = [c for c in pk_cols if str(c).lower() != "content_hash"]
|
||||
# P2(A): 使用 spec 上的显式开关控制去重,不再隐式依赖 has_fetched_at
|
||||
# CHANGE 2026-02-19 | force_full_update 时仍查最新 hash(用于判断是否回退到历史版本),
|
||||
@@ -1240,6 +1564,56 @@ def _bool_col(name: str, *sources: str) -> ColumnSpec:
|
||||
return ColumnSpec(column=name, sources=sources, transform=_to_bool)
|
||||
|
||||
|
||||
# ── 团购详情接口自定义 process_fn ──
|
||||
# API 原始响应结构:{"data": {"groupPurchasePackage": {...}, "packageCouponAssistants": [...], ...}, "code": 0}
|
||||
# detail_mode 下 process_fn 收到的是 api.post() 的原始 JSON 响应
|
||||
|
||||
def _group_package_detail_process_fn(response: Any) -> list[dict]:
|
||||
"""从 QueryPackageCouponInfo 响应中提取字段,组装为一条扁平记录。
|
||||
|
||||
匹配 ods.group_buy_package_details 表结构。
|
||||
"""
|
||||
data = response.get("data")
|
||||
if not data:
|
||||
return []
|
||||
|
||||
pkg = data.get("groupPurchasePackage")
|
||||
if not pkg:
|
||||
return []
|
||||
|
||||
# 结构化字段(来自 data.groupPurchasePackage)
|
||||
record: dict[str, Any] = {
|
||||
"coupon_id": pkg.get("id"),
|
||||
"package_name": pkg.get("package_name"),
|
||||
"duration": pkg.get("duration"),
|
||||
"start_time": pkg.get("start_time"),
|
||||
"end_time": pkg.get("end_time"),
|
||||
"add_start_clock": pkg.get("add_start_clock"),
|
||||
"add_end_clock": pkg.get("add_end_clock"),
|
||||
"is_enabled": pkg.get("is_enabled"),
|
||||
"is_delete": pkg.get("is_delete"),
|
||||
"site_id": pkg.get("site_id"),
|
||||
"tenant_id": pkg.get("tenant_id"),
|
||||
"create_time": pkg.get("create_time"),
|
||||
"creator_name": pkg.get("creator_name"),
|
||||
}
|
||||
|
||||
# JSONB 数组字段
|
||||
record["table_area_ids"] = pkg.get("tableAreaId")
|
||||
record["table_area_names"] = pkg.get("tableAreaNameList")
|
||||
record["assistant_services"] = data.get("packageCouponAssistants")
|
||||
record["groupon_site_infos"] = data.get("grouponSiteInfos")
|
||||
record["package_services"] = data.get("packagePackageService")
|
||||
record["coupon_details_list"] = data.get("packageCouponDetailsList")
|
||||
|
||||
# content_hash:对业务字段(不含 content_hash、payload、fetched_at)计算 SHA256
|
||||
hash_input = json.dumps(record, sort_keys=True, ensure_ascii=False, default=str)
|
||||
record["content_hash"] = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
|
||||
|
||||
# payload:完整的 data 对象
|
||||
record["payload"] = data
|
||||
|
||||
return [record]
|
||||
|
||||
|
||||
ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
|
||||
@@ -1251,9 +1625,18 @@ ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
|
||||
data_path=("data",),
|
||||
list_key="assistantInfos",
|
||||
pk_columns=(_int_col("id", "id", required=True),),
|
||||
extra_params={
|
||||
"workStatusEnum": 0,
|
||||
"dingTalkSynced": 0,
|
||||
"leaveId": 0,
|
||||
"criticismStatus": 0,
|
||||
"signStatus": -1,
|
||||
},
|
||||
include_source_endpoint=False,
|
||||
include_fetched_at=False,
|
||||
include_record_index=True,
|
||||
requires_window=False,
|
||||
time_fields=None,
|
||||
snapshot_mode=SnapshotMode.FULL_TABLE,
|
||||
description="助教账号档案 ODS:SearchAssistantInfo -> assistantInfos 原始 JSON",
|
||||
),
|
||||
@@ -1314,7 +1697,8 @@ ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
|
||||
include_source_endpoint=False,
|
||||
include_fetched_at=False,
|
||||
include_record_index=True,
|
||||
requires_window=False,
|
||||
requires_window=True,
|
||||
time_fields=("startTime", "endTime"),
|
||||
snapshot_mode=SnapshotMode.WINDOW,
|
||||
snapshot_time_column="create_time",
|
||||
description="门店商品销售流水 ODS:GetGoodsSalesList -> orderGoodsLedgers 原始 JSON",
|
||||
@@ -1499,6 +1883,13 @@ ODS_TASK_SPECS: Tuple[OdsTaskSpec, ...] = (
|
||||
include_record_index=True,
|
||||
requires_window=False,
|
||||
snapshot_mode=SnapshotMode.FULL_TABLE,
|
||||
# ── Detail_Mode 配置:团购详情接口 ──
|
||||
detail_endpoint="/PackageCoupon/QueryPackageCouponInfo",
|
||||
detail_param_builder=lambda rec: {"couponId": rec["id"]},
|
||||
detail_target_table="ods.group_buy_package_details",
|
||||
detail_data_path=("data",),
|
||||
detail_id_column="id",
|
||||
detail_process_fn=_group_package_detail_process_fn,
|
||||
description="团购套餐定义 ODS:QueryPackageCouponList -> packageCouponList 原始 JSON",
|
||||
),
|
||||
OdsTaskSpec(
|
||||
|
||||
Reference in New Issue
Block a user