微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

View File

@@ -5,12 +5,14 @@ from __future__ import annotations
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Any, Dict, Iterable, List, Sequence
from psycopg2.extras import RealDictCursor, execute_batch, execute_values
from psycopg2.extras import Json, RealDictCursor, execute_batch, execute_values
from database.connection import DatabaseConnection
from tasks.base_task import BaseTask, TaskContext
@@ -70,6 +72,16 @@ class DwdLoadTask(BaseTask):
_NUMERIC_RE = re.compile(r"^[+-]?\d+(?:\.\d+)?$")
_BOOL_STRINGS = {"true", "false", "1", "0", "yes", "no", "y", "n", "t", "f"}
# Detail-table LEFT JOIN configuration: used when a DWD table needs fields
# that live in an additional ODS detail table.
# Keyed by the DWD table name; each entry provides:
#   - "detail_table":   the ODS detail table that is LEFT JOINed as alias `detail`
#   - "join_condition": the ON clause; the main ODS table is aliased `ods_main`
#   - "detail_columns": columns taken from the detail table; in FACT_MAPPINGS
#                       they are referenced in the form detail."col"
# NOTE(review): "detail_columns" looks informational only — confirm whether any
# code path actually reads it.
DETAIL_JOIN_CONFIG: dict[str, dict] = {
"dwd.dim_groupbuy_package_ex": {
"detail_table": "ods.group_buy_package_details",
"join_condition": 'ods_main."id" = detail."coupon_id"',
"detail_columns": ["table_area_ids", "table_area_names", "assistant_services", "groupon_site_infos"],
},
}
def _strip_scd2_keys(self, pk_cols: Sequence[str]) -> list[str]:
return [c for c in pk_cols if c.lower() not in self.SCD_COLS]
@@ -113,7 +125,10 @@ class DwdLoadTask(BaseTask):
) -> str:
if key_exprs and order_col:
distinct_on = ", ".join(key_exprs)
order_by = ", ".join([*key_exprs, f'"{order_col}" DESC NULLS LAST'])
# order_col 可能是预格式化的表达式(如 ods_main."fetched_at"),此时直接使用;
# 否则包裹双引号
order_col_expr = order_col if '"' in order_col else f'"{order_col}"'
order_by = ", ".join([*key_exprs, f'{order_col_expr} DESC NULLS LAST'])
return (
f"SELECT DISTINCT ON ({distinct_on}) {select_cols_sql} "
f"FROM {ods_table_sql} {where_sql} ORDER BY {order_by}"
@@ -303,6 +318,11 @@ class DwdLoadTask(BaseTask):
("table_area_id_list", "table_area_id_list", None),
("package_type", "type", None),
("tenant_coupon_sale_order_item_id", "tenantcouponsaleorderitemid", None),
# CHANGE 2026-03-05: 团购详情字段(来自 ods.group_buy_package_details通过 LEFT JOIN 关联)
("table_area_ids", 'detail."table_area_ids"', None),
("table_area_names", 'detail."table_area_names"', None),
("assistant_services", 'detail."assistant_services"', None),
("groupon_site_infos", 'detail."groupon_site_infos"', None),
],
"dwd.dim_staff": [
("staff_id", "id", None),
@@ -311,16 +331,16 @@ class DwdLoadTask(BaseTask):
],
"dwd.dim_staff_ex": [
("staff_id", "id", None),
("rank_name", "rankname", None),
("cashier_point_id", "cashierpointid", "bigint"),
("cashier_point_name", "cashierpointname", None),
("group_id", "groupid", "bigint"),
("group_name", "groupname", None),
("system_user_id", "systemuserid", "bigint"),
("tenant_org_id", "tenantorgid", "bigint"),
("rank_name", "rank_name", None),
("cashier_point_id", "cashier_point_id", "bigint"),
("cashier_point_name", "cashier_point_name", None),
("group_id", "group_id", "bigint"),
("group_name", "group_name", None),
("system_user_id", "system_user_id", "bigint"),
("tenant_org_id", "tenant_org_id", "bigint"),
("auth_code_create", "auth_code_create", "timestamptz"),
("create_time", "create_time", "timestamptz"),
("user_roles", "userroles", "jsonb"),
("user_roles", "user_roles", "jsonb"),
],
# 事实表主键及关键差异列
"dwd.dwd_table_fee_log": [
@@ -602,6 +622,7 @@ class DwdLoadTask(BaseTask):
],
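# Inventory summary (goods_stock_summary; ODS column names are all lowercase)
# CHANGE 2026-02-21: BUG 10 fix — ODS column names are lowercase (sitegoodsid), not camelCase
# CHANGE 2026-03-01: add the site_id mapping (siteid is injected from app.store_id during ODS ingestion)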
"dwd.dwd_goods_stock_summary": [
("site_goods_id", '"sitegoodsid"', "bigint"), # 门店商品 IDPK
("goods_name", '"goodsname"', None), # 商品名称
@@ -617,6 +638,7 @@ class DwdLoadTask(BaseTask):
("range_sale_money", '"rangesalemoney"', "numeric"), # 销售金额
("range_inventory", '"rangeinventory"', "numeric"), # 盘点调整量
("current_stock", '"currentstock"', "numeric"), # 当前库存
("site_id", '"siteid"', "bigint"), # 门店 IDODS 入库时注入)
],
# Inventory movement log (goods_stock_movements; ODS column names are all lowercase)
# CHANGE 2026-02-21: BUG 10 fix — ODS column names are lowercase, not camelCase
@@ -653,11 +675,12 @@ class DwdLoadTask(BaseTask):
def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
"""
遍历映射关系,维度执行 SCD2 合并,事实表按时间增量插入。
并行遍历映射关系,维度执行 SCD2 合并,事实表按时间增量插入。
说明:
- 为避免长事务导致锁堆积/中断后遗留 idle-in-tx本任务按“每张表一次事务”提交
- 单表失败会回滚该表并继续后续表,最终在结果中汇总错误信息
- 使用 ThreadPoolExecutor 并行处理多张表,每张表使用独立数据库连接和事务
- 单表失败会回滚该表并继续后续表,最终在结果中汇总错误信息
- 并行线程数通过 AppConfig 的 dwd.parallel_workers 配置(默认 4
"""
now = extracted["now"]
summary: List[Dict[str, Any]] = []
@@ -668,54 +691,109 @@ class DwdLoadTask(BaseTask):
if env_only and not only_tables_cfg:
only_tables_cfg = [t.strip() for t in env_only.split(",") if t.strip()]
only_tables = {str(t).strip().lower() for t in only_tables_cfg if str(t).strip()} if only_tables_cfg else set()
with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
for dwd_table, ods_table in self.TABLE_MAP.items():
if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
continue
started = time.monotonic()
self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table)
parallel_workers = int(self.config.get("dwd.parallel_workers", 4))
# 筛选需要处理的表
tables_to_process: list[tuple[str, str]] = []
for dwd_table, ods_table in self.TABLE_MAP.items():
if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
continue
tables_to_process.append((dwd_table, ods_table))
if not tables_to_process:
return {"tables": summary, "errors": 0, "error_details": errors}
# 并行调度:每张表在独立线程中执行,使用独立数据库连接
with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
futures = {}
for dwd_table, ods_table in tables_to_process:
future = executor.submit(
self._process_single_table,
dwd_table, ods_table, now, context,
)
futures[future] = dwd_table
for future in as_completed(futures):
dwd_table = futures[future]
try:
dwd_cols = self._get_columns(cur, dwd_table)
ods_cols = self._get_columns(cur, ods_table)
if not dwd_cols:
self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table)
continue
if self._table_base(dwd_table).startswith("dim_"):
dim_counts = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
self.db.conn.commit()
summary.append({"table": dwd_table, "mode": "SCD2", **dim_counts})
else:
dwd_types = self._get_column_types(cur, dwd_table, "dwd")
ods_types = self._get_column_types(cur, ods_table, "ods")
fact_counts = self._merge_fact_increment(
cur,
dwd_table,
ods_table,
dwd_cols,
ods_cols,
dwd_types,
ods_types,
window_start=context.window_start,
window_end=context.window_end,
)
self.db.conn.commit()
summary.append({"table": dwd_table, "mode": "INCREMENT", **fact_counts})
elapsed = time.monotonic() - started
self.logger.info("DWD 装载完成:%s,用时 %.2fs", dwd_table, elapsed)
table_result = future.result()
summary.append(table_result)
except Exception as exc: # noqa: BLE001
try:
self.db.conn.rollback()
except Exception:
pass
elapsed = time.monotonic() - started
self.logger.exception("DWD 装载失败:%s,用时 %.2fserr=%s", dwd_table, elapsed, exc)
self.logger.error(
"DWD 并行装载失败:%serr=%s", dwd_table, exc,
)
errors.append({"table": dwd_table, "error": str(exc)})
continue
return {"tables": summary, "errors": len(errors), "error_details": errors}
def _process_single_table(
    self,
    dwd_table: str,
    ods_table: str,
    now: datetime,
    context: TaskContext,
) -> Dict[str, Any]:
    """Load one DWD table using a dedicated database connection and transaction.

    A fresh ``DatabaseConnection`` is created for the call (built from the
    settings of ``self.db``) and is always closed before returning, so the
    work done for this table is committed or rolled back independently of
    any other table.

    Args:
        dwd_table: Target DWD table name.
        ods_table: Source ODS table name.
        now: Timestamp forwarded to the SCD2 merge for dimension tables.
        context: Task context; its ``window_start`` / ``window_end`` bound
            the incremental load of fact tables.

    Returns:
        A dict with ``"table"`` and ``"mode"`` (``"SCD2"``, ``"INCREMENT"``
        or ``"SKIPPED"``) plus the counters returned by the merge helper.

    Raises:
        Exception: Any error raised while processing the table is logged,
            the transaction is rolled back (best effort), and the original
            error is re-raised to the caller.
    """
    started = time.monotonic()
    self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table)

    # Dedicated connection for this call; never share self.db across workers.
    thread_db = DatabaseConnection(
        dsn=self.db._dsn,
        session=self.db._session,
        connect_timeout=self.db._connect_timeout,
    )
    try:
        with thread_db.conn.cursor(cursor_factory=RealDictCursor) as cur:
            dwd_cols = self._get_columns(cur, dwd_table)
            ods_cols = self._get_columns(cur, ods_table)
            if not dwd_cols:
                self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table)
                return {"table": dwd_table, "mode": "SKIPPED", "inserted": 0, "updated": 0}

            if self._table_base(dwd_table).startswith("dim_"):
                # Dimension tables: SCD2 merge.
                dim_counts = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
                thread_db.conn.commit()
                result = {"table": dwd_table, "mode": "SCD2", **dim_counts}
            else:
                # Fact tables: incremental load bounded by the context window.
                dwd_types = self._get_column_types(cur, dwd_table, "dwd")
                ods_types = self._get_column_types(cur, ods_table, "ods")
                fact_counts = self._merge_fact_increment(
                    cur,
                    dwd_table,
                    ods_table,
                    dwd_cols,
                    ods_cols,
                    dwd_types,
                    ods_types,
                    window_start=context.window_start,
                    window_end=context.window_end,
                )
                thread_db.conn.commit()
                result = {"table": dwd_table, "mode": "INCREMENT", **fact_counts}

        elapsed = time.monotonic() - started
        self.logger.info("DWD 装载完成:%s,用时 %.2fs", dwd_table, elapsed)
        return result
    except Exception as exc:
        try:
            thread_db.conn.rollback()
        except Exception:  # noqa: BLE001
            # Rollback stays best-effort, but a failure here is recorded
            # instead of being silently discarded; the original error below
            # is still the one that propagates.
            self.logger.warning("DWD 回滚失败:%s", dwd_table, exc_info=True)
        elapsed = time.monotonic() - started
        self.logger.exception(
            "DWD 装载失败:%s,用时 %.2fserr=%s", dwd_table, elapsed, exc,
        )
        # Re-raise so the caller observes the failure for this table.
        raise
    finally:
        thread_db.close()
# ---------------------- 辅助方法 ----------------------
def _get_columns(self, cur, table: str) -> List[str]:
"""获取指定表的列名(小写)。"""
@@ -872,6 +950,17 @@ class DwdLoadTask(BaseTask):
ods_types = self._get_column_types(cur, ods_table, "ods")
ts_types = {"timestamp without time zone", "timestamp with time zone"}
table_sql = self._format_table(ods_table, "ods")
# CHANGE 2026-03-05: 详情表 LEFT JOIN 支持 — 当 DWD 表配置了 DETAIL_JOIN_CONFIG 时,
# 给 ODS 主表加别名 ods_mainLEFT JOIN 详情表为 detail
# 非 detail 列引用加 ods_main. 前缀避免歧义
detail_join = self.DETAIL_JOIN_CONFIG.get(dwd_table)
ods_alias = "ods_main" if detail_join else ""
if detail_join:
detail_table_sql = self._format_table(detail_join["detail_table"], "ods")
table_sql = (
f"{table_sql} AS ods_main "
f'LEFT JOIN {detail_table_sql} AS detail ON {detail_join["join_condition"]}'
)
# 构造 SELECT 表达式,支持 JSON/expression 映射
select_exprs: list[str] = []
added: set[str] = set()
@@ -881,21 +970,26 @@ class DwdLoadTask(BaseTask):
continue
if lc in mapping:
src, cast_type = mapping[lc]
# detail. 前缀的列直接使用(来自详情表),其他列加 ods_main. 前缀
if ods_alias and not src.startswith("detail."):
src = self._qualify_column_ref(src, ods_alias)
select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
added.add(lc)
elif lc in ods_set:
col_ref = f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"'
# CHANGE 2026-02-22: BUG 12 — 同名列如果是时间类型,加哨兵值过滤
if dwd_types.get(lc) in ts_types and ods_types.get(lc) in ts_types:
select_exprs.append(
f'CASE WHEN "{lc}" >= \'{self._SENTINEL_DATE_THRESHOLD}\'::timestamp '
f'THEN "{lc}" ELSE NULL END AS "{lc}"'
f"CASE WHEN {col_ref} >= '{self._SENTINEL_DATE_THRESHOLD}'::timestamp "
f'THEN {col_ref} ELSE NULL END AS "{lc}"'
)
else:
select_exprs.append(f'"{lc}" AS "{lc}"')
select_exprs.append(f'{col_ref} AS "{lc}"')
added.add(lc)
# 分类维度需要额外读取 categoryboxes 以展开子类
if dwd_table == "dwd.dim_goods_category" and "categoryboxes" not in added and "categoryboxes" in ods_set:
select_exprs.append('"categoryboxes" AS "categoryboxes"')
col_ref = f'{ods_alias}."categoryboxes"' if ods_alias else '"categoryboxes"'
select_exprs.append(f'{col_ref} AS "categoryboxes"')
added.add("categoryboxes")
# 主键兜底确保被选出
for pk in business_keys:
@@ -903,9 +997,12 @@ class DwdLoadTask(BaseTask):
if lc not in added:
if lc in mapping:
src, cast_type = mapping[lc]
if ods_alias and not src.startswith("detail."):
src = self._qualify_column_ref(src, ods_alias)
select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
elif lc in ods_set:
select_exprs.append(f'"{lc}" AS "{lc}"')
col_ref = f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"'
select_exprs.append(f'{col_ref} AS "{lc}"')
added.add(lc)
if not select_exprs:
@@ -917,14 +1014,19 @@ class DwdLoadTask(BaseTask):
lc = key.lower()
if lc in mapping:
src, cast_type = mapping[lc]
if ods_alias and not src.startswith("detail."):
src = self._qualify_column_ref(src, ods_alias)
key_exprs.append(self._cast_expr(src, cast_type))
elif lc in ods_set:
key_exprs.append(f'"{lc}"')
key_exprs.append(f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"')
select_cols_sql = ", ".join(select_exprs)
where_sql = self._append_where_condition("", '"fetched_at" IS NOT NULL')
fetched_at_ref = f'{ods_alias}."fetched_at"' if ods_alias else '"fetched_at"'
where_sql = self._append_where_condition("", f'{fetched_at_ref} IS NOT NULL')
# CHANGE 2026-03-05: order_col 也需要加别名前缀
qualified_order_col = f'{ods_alias}."{order_col}"' if ods_alias and order_col else (f'"{order_col}"' if order_col else None)
sql = self._latest_snapshot_select_sql(
select_cols_sql, table_sql, key_exprs, order_col, where_sql
select_cols_sql, table_sql, key_exprs, qualified_order_col, where_sql
)
cur.execute(sql)
rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()]
@@ -1006,7 +1108,7 @@ class DwdLoadTask(BaseTask):
# 批量插入新版本
if to_insert:
self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now)
self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now, dwd_types=dwd_types)
processed = len(src_rows_by_pk)
updated = len(to_close)
@@ -1050,11 +1152,16 @@ class DwdLoadTask(BaseTask):
dwd_cols: Sequence[str],
rows_with_version: Sequence[tuple[Dict[str, Any], int]],
now: datetime,
dwd_types: Dict[str, str] | None = None,
) -> None:
"""批量插入新的 SCD2 版本行。"""
sorted_cols = [c.lower() for c in sorted(dwd_cols)]
insert_cols_sql = ", ".join(f'"{c}"' for c in sorted_cols)
table_sql = self._format_table(table, "dwd")
# 预计算数组类型列集合,避免 list 值被误包装为 Json
_array_cols: set[str] = set()
if dwd_types:
_array_cols = {c for c, t in dwd_types.items() if "ARRAY" in t.upper() or "[]" in t}
def build_row(src_row: Dict[str, Any], version: int) -> list[Any]:
values: list[Any] = []
@@ -1068,7 +1175,15 @@ class DwdLoadTask(BaseTask):
elif c == "scd2_version":
values.append(version)
else:
values.append(src_row.get(c))
val = src_row.get(c)
# CHANGE 2026-03-07: 区分数组列和 JSONB 列
# 数组列TEXT[] 等)的 list 值直接传递psycopg2 自动转为 PG 数组格式
# JSONB 列的 dict/list 值需要 Json() 包装
if isinstance(val, list) and c not in _array_cols:
val = Json(val)
elif isinstance(val, dict):
val = Json(val)
values.append(val)
return values
values_rows = [build_row(r, ver) for r, ver in rows_with_version]
@@ -1395,6 +1510,23 @@ class DwdLoadTask(BaseTask):
# CHANGE 2026-02-22: BUG 12 fix — 哨兵日期阈值,上游 API 用 0001-01-01 表示"未设置"
_SENTINEL_DATE_THRESHOLD = "0002-01-01"
@staticmethod
def _qualify_column_ref(src: str, alias: str) -> str:
"""为裸列引用添加表别名前缀。
已包含 detail.、别名前缀、JSON 操作符、表达式CASE/COALESCE 等)的源不做修改。
仅对简单列名(如 "col" 或 col添加 alias."col" 前缀。
"""
# 已有 detail. 或其他表前缀(含 .)→ 不修改
if "." in src:
return src
# JSON 操作符、SQL 表达式 → 不修改
if any(tok in src for tok in ("->", "#>>", "::", "CASE ", "COALESCE", "NULLIF", "(")):
return src
# 裸列名(可能带引号)→ 加别名前缀
bare = src.strip('"')
return f'{alias}."{bare}"'
def _cast_expr(self, col: str, cast_type: str | None) -> str:
"""构造带可选 CAST 的列表达式。