微信小程序页面迁移校验之前 P5任务处理之前
This commit is contained in:
@@ -5,12 +5,14 @@ from __future__ import annotations
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import Any, Dict, Iterable, List, Sequence
|
||||
|
||||
from psycopg2.extras import RealDictCursor, execute_batch, execute_values
|
||||
from psycopg2.extras import Json, RealDictCursor, execute_batch, execute_values
|
||||
|
||||
from database.connection import DatabaseConnection
|
||||
from tasks.base_task import BaseTask, TaskContext
|
||||
|
||||
|
||||
@@ -70,6 +72,16 @@ class DwdLoadTask(BaseTask):
|
||||
_NUMERIC_RE = re.compile(r"^[+-]?\d+(?:\.\d+)?$")
|
||||
_BOOL_STRINGS = {"true", "false", "1", "0", "yes", "no", "y", "n", "t", "f"}
|
||||
|
||||
# Detail-table LEFT JOIN configuration, keyed by DWD table name. Used when a DWD
# table needs columns that live in an additional ODS "detail" table.
# Each entry provides:
#   - "detail_table":   the ODS detail table to join,
#   - "join_condition": the ON clause; it refers to the main ODS table as
#                       ods_main and to the detail table as detail,
#   - "detail_columns": the columns supplied by the detail table.
|
||||
# Columns listed in detail_columns are referenced in FACT_MAPPINGS as detail."col".
|
||||
DETAIL_JOIN_CONFIG: dict[str, dict] = {
|
||||
"dwd.dim_groupbuy_package_ex": {
|
||||
"detail_table": "ods.group_buy_package_details",
|
||||
"join_condition": 'ods_main."id" = detail."coupon_id"',
|
||||
"detail_columns": ["table_area_ids", "table_area_names", "assistant_services", "groupon_site_infos"],
|
||||
},
|
||||
}
|
||||
|
||||
def _strip_scd2_keys(self, pk_cols: Sequence[str]) -> list[str]:
|
||||
return [c for c in pk_cols if c.lower() not in self.SCD_COLS]
|
||||
|
||||
@@ -113,7 +125,10 @@ class DwdLoadTask(BaseTask):
|
||||
) -> str:
|
||||
if key_exprs and order_col:
|
||||
distinct_on = ", ".join(key_exprs)
|
||||
order_by = ", ".join([*key_exprs, f'"{order_col}" DESC NULLS LAST'])
|
||||
# order_col 可能是预格式化的表达式(如 ods_main."fetched_at"),此时直接使用;
|
||||
# 否则包裹双引号
|
||||
order_col_expr = order_col if '"' in order_col else f'"{order_col}"'
|
||||
order_by = ", ".join([*key_exprs, f'{order_col_expr} DESC NULLS LAST'])
|
||||
return (
|
||||
f"SELECT DISTINCT ON ({distinct_on}) {select_cols_sql} "
|
||||
f"FROM {ods_table_sql} {where_sql} ORDER BY {order_by}"
|
||||
@@ -303,6 +318,11 @@ class DwdLoadTask(BaseTask):
|
||||
("table_area_id_list", "table_area_id_list", None),
|
||||
("package_type", "type", None),
|
||||
("tenant_coupon_sale_order_item_id", "tenantcouponsaleorderitemid", None),
|
||||
# CHANGE 2026-03-05: 团购详情字段(来自 ods.group_buy_package_details,通过 LEFT JOIN 关联)
|
||||
("table_area_ids", 'detail."table_area_ids"', None),
|
||||
("table_area_names", 'detail."table_area_names"', None),
|
||||
("assistant_services", 'detail."assistant_services"', None),
|
||||
("groupon_site_infos", 'detail."groupon_site_infos"', None),
|
||||
],
|
||||
"dwd.dim_staff": [
|
||||
("staff_id", "id", None),
|
||||
@@ -311,16 +331,16 @@ class DwdLoadTask(BaseTask):
|
||||
],
|
||||
"dwd.dim_staff_ex": [
|
||||
("staff_id", "id", None),
|
||||
("rank_name", "rankname", None),
|
||||
("cashier_point_id", "cashierpointid", "bigint"),
|
||||
("cashier_point_name", "cashierpointname", None),
|
||||
("group_id", "groupid", "bigint"),
|
||||
("group_name", "groupname", None),
|
||||
("system_user_id", "systemuserid", "bigint"),
|
||||
("tenant_org_id", "tenantorgid", "bigint"),
|
||||
("rank_name", "rank_name", None),
|
||||
("cashier_point_id", "cashier_point_id", "bigint"),
|
||||
("cashier_point_name", "cashier_point_name", None),
|
||||
("group_id", "group_id", "bigint"),
|
||||
("group_name", "group_name", None),
|
||||
("system_user_id", "system_user_id", "bigint"),
|
||||
("tenant_org_id", "tenant_org_id", "bigint"),
|
||||
("auth_code_create", "auth_code_create", "timestamptz"),
|
||||
("create_time", "create_time", "timestamptz"),
|
||||
("user_roles", "userroles", "jsonb"),
|
||||
("user_roles", "user_roles", "jsonb"),
|
||||
],
|
||||
# 事实表主键及关键差异列
|
||||
"dwd.dwd_table_fee_log": [
|
||||
@@ -602,6 +622,7 @@ class DwdLoadTask(BaseTask):
|
||||
],
|
||||
# 库存汇总:goods_stock_summary(ODS 列名全小写)
|
||||
# CHANGE 2026-02-21: BUG 10 fix — ODS 列名是小写(sitegoodsid),不是驼峰
|
||||
# CHANGE 2026-03-01: 补 site_id 映射(ODS 入库时从 app.store_id 注入 siteid)
|
||||
"dwd.dwd_goods_stock_summary": [
|
||||
("site_goods_id", '"sitegoodsid"', "bigint"), # 门店商品 ID(PK)
|
||||
("goods_name", '"goodsname"', None), # 商品名称
|
||||
@@ -617,6 +638,7 @@ class DwdLoadTask(BaseTask):
|
||||
("range_sale_money", '"rangesalemoney"', "numeric"), # 销售金额
|
||||
("range_inventory", '"rangeinventory"', "numeric"), # 盘点调整量
|
||||
("current_stock", '"currentstock"', "numeric"), # 当前库存
|
||||
("site_id", '"siteid"', "bigint"), # 门店 ID(ODS 入库时注入)
|
||||
],
|
||||
# 库存变动流水:goods_stock_movements(ODS 列名全小写)
|
||||
# CHANGE 2026-02-21: BUG 10 fix — ODS 列名是小写,不是驼峰
|
||||
@@ -653,11 +675,12 @@ class DwdLoadTask(BaseTask):
|
||||
|
||||
def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
|
||||
"""
|
||||
遍历映射关系,维度执行 SCD2 合并,事实表按时间增量插入。
|
||||
并行遍历映射关系,维度执行 SCD2 合并,事实表按时间增量插入。
|
||||
|
||||
说明:
|
||||
- 为避免长事务导致锁堆积/中断后遗留 idle-in-tx,本任务按“每张表一次事务”提交;
|
||||
- 单表失败会回滚该表并继续后续表,最终在结果中汇总错误信息。
|
||||
- 使用 ThreadPoolExecutor 并行处理多张表,每张表使用独立数据库连接和事务;
|
||||
- 单表失败会回滚该表并继续后续表,最终在结果中汇总错误信息;
|
||||
- 并行线程数通过 AppConfig 的 dwd.parallel_workers 配置(默认 4)。
|
||||
"""
|
||||
now = extracted["now"]
|
||||
summary: List[Dict[str, Any]] = []
|
||||
@@ -668,54 +691,109 @@ class DwdLoadTask(BaseTask):
|
||||
if env_only and not only_tables_cfg:
|
||||
only_tables_cfg = [t.strip() for t in env_only.split(",") if t.strip()]
|
||||
only_tables = {str(t).strip().lower() for t in only_tables_cfg if str(t).strip()} if only_tables_cfg else set()
|
||||
with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||
for dwd_table, ods_table in self.TABLE_MAP.items():
|
||||
if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
|
||||
continue
|
||||
started = time.monotonic()
|
||||
self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table)
|
||||
|
||||
parallel_workers = int(self.config.get("dwd.parallel_workers", 4))
|
||||
|
||||
# 筛选需要处理的表
|
||||
tables_to_process: list[tuple[str, str]] = []
|
||||
for dwd_table, ods_table in self.TABLE_MAP.items():
|
||||
if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
|
||||
continue
|
||||
tables_to_process.append((dwd_table, ods_table))
|
||||
|
||||
if not tables_to_process:
|
||||
return {"tables": summary, "errors": 0, "error_details": errors}
|
||||
|
||||
# 并行调度:每张表在独立线程中执行,使用独立数据库连接
|
||||
with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
|
||||
futures = {}
|
||||
for dwd_table, ods_table in tables_to_process:
|
||||
future = executor.submit(
|
||||
self._process_single_table,
|
||||
dwd_table, ods_table, now, context,
|
||||
)
|
||||
futures[future] = dwd_table
|
||||
|
||||
for future in as_completed(futures):
|
||||
dwd_table = futures[future]
|
||||
try:
|
||||
dwd_cols = self._get_columns(cur, dwd_table)
|
||||
ods_cols = self._get_columns(cur, ods_table)
|
||||
if not dwd_cols:
|
||||
self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table)
|
||||
continue
|
||||
|
||||
if self._table_base(dwd_table).startswith("dim_"):
|
||||
dim_counts = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
|
||||
self.db.conn.commit()
|
||||
summary.append({"table": dwd_table, "mode": "SCD2", **dim_counts})
|
||||
else:
|
||||
dwd_types = self._get_column_types(cur, dwd_table, "dwd")
|
||||
ods_types = self._get_column_types(cur, ods_table, "ods")
|
||||
fact_counts = self._merge_fact_increment(
|
||||
cur,
|
||||
dwd_table,
|
||||
ods_table,
|
||||
dwd_cols,
|
||||
ods_cols,
|
||||
dwd_types,
|
||||
ods_types,
|
||||
window_start=context.window_start,
|
||||
window_end=context.window_end,
|
||||
)
|
||||
self.db.conn.commit()
|
||||
summary.append({"table": dwd_table, "mode": "INCREMENT", **fact_counts})
|
||||
|
||||
elapsed = time.monotonic() - started
|
||||
self.logger.info("DWD 装载完成:%s,用时 %.2fs", dwd_table, elapsed)
|
||||
table_result = future.result()
|
||||
summary.append(table_result)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
try:
|
||||
self.db.conn.rollback()
|
||||
except Exception:
|
||||
pass
|
||||
elapsed = time.monotonic() - started
|
||||
self.logger.exception("DWD 装载失败:%s,用时 %.2fs,err=%s", dwd_table, elapsed, exc)
|
||||
self.logger.error(
|
||||
"DWD 并行装载失败:%s,err=%s", dwd_table, exc,
|
||||
)
|
||||
errors.append({"table": dwd_table, "error": str(exc)})
|
||||
continue
|
||||
|
||||
return {"tables": summary, "errors": len(errors), "error_details": errors}
|
||||
|
||||
def _process_single_table(
    self,
    dwd_table: str,
    ods_table: str,
    now: datetime,
    context: TaskContext,
) -> Dict[str, Any]:
    """Load one DWD table on its own database connection and transaction.

    ``load`` submits this method to a thread pool, one call per table. A
    fresh ``DatabaseConnection`` is built from the settings of ``self.db``
    so that the work of one table is committed or rolled back independently
    of every other table, and the connection is always closed on exit.

    Tables whose base name starts with ``dim_`` go through ``_merge_dim``
    (mode ``"SCD2"``); all other tables go through
    ``_merge_fact_increment`` using the window of ``context`` (mode
    ``"INCREMENT"``).

    Args:
        dwd_table: Target DWD table name.
        ods_table: Source ODS table name.
        now: Timestamp handed to the dimension merge.
        context: Task context supplying ``window_start`` / ``window_end``.

    Returns:
        ``{"table": ..., "mode": ..., **counts}``; when no DWD column
        metadata is found the mode is ``"SKIPPED"`` with zero counts.

    Raises:
        Exception: Any failure is logged, the transaction is rolled back
            (best effort) and the exception is re-raised so the caller
            sees it through ``future.result()``.
    """
    t0 = time.monotonic()
    self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table)

    # Dedicated connection for this call; never shared with other tables.
    thread_db = DatabaseConnection(
        dsn=self.db._dsn,
        session=self.db._session,
        connect_timeout=self.db._connect_timeout,
    )
    try:
        with thread_db.conn.cursor(cursor_factory=RealDictCursor) as cur:
            dwd_cols = self._get_columns(cur, dwd_table)
            ods_cols = self._get_columns(cur, ods_table)
            if not dwd_cols:
                self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table)
                return {"table": dwd_table, "mode": "SKIPPED", "inserted": 0, "updated": 0}

            is_dimension = self._table_base(dwd_table).startswith("dim_")
            if is_dimension:
                mode = "SCD2"
                counts = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
            else:
                mode = "INCREMENT"
                dwd_types = self._get_column_types(cur, dwd_table, "dwd")
                ods_types = self._get_column_types(cur, ods_table, "ods")
                counts = self._merge_fact_increment(
                    cur,
                    dwd_table,
                    ods_table,
                    dwd_cols,
                    ods_cols,
                    dwd_types,
                    ods_types,
                    window_start=context.window_start,
                    window_end=context.window_end,
                )

            # One commit per table, issued right after its merge succeeds.
            thread_db.conn.commit()
            outcome = {"table": dwd_table, "mode": mode, **counts}

            self.logger.info(
                "DWD 装载完成:%s,用时 %.2fs", dwd_table, time.monotonic() - t0,
            )
            return outcome
    except Exception as exc:
        # Best-effort rollback: the connection itself may already be broken.
        try:
            thread_db.conn.rollback()
        except Exception:
            pass
        self.logger.exception(
            "DWD 装载失败:%s,用时 %.2fs,err=%s", dwd_table, time.monotonic() - t0, exc,
        )
        # Re-raise so the caller catches it via future.result().
        raise
    finally:
        thread_db.close()
|
||||
|
||||
# ---------------------- 辅助方法 ----------------------
|
||||
def _get_columns(self, cur, table: str) -> List[str]:
|
||||
"""获取指定表的列名(小写)。"""
|
||||
@@ -872,6 +950,17 @@ class DwdLoadTask(BaseTask):
|
||||
ods_types = self._get_column_types(cur, ods_table, "ods")
|
||||
ts_types = {"timestamp without time zone", "timestamp with time zone"}
|
||||
table_sql = self._format_table(ods_table, "ods")
|
||||
# CHANGE 2026-03-05: 详情表 LEFT JOIN 支持 — 当 DWD 表配置了 DETAIL_JOIN_CONFIG 时,
|
||||
# 给 ODS 主表加别名 ods_main,LEFT JOIN 详情表为 detail,
|
||||
# 非 detail 列引用加 ods_main. 前缀避免歧义
|
||||
detail_join = self.DETAIL_JOIN_CONFIG.get(dwd_table)
|
||||
ods_alias = "ods_main" if detail_join else ""
|
||||
if detail_join:
|
||||
detail_table_sql = self._format_table(detail_join["detail_table"], "ods")
|
||||
table_sql = (
|
||||
f"{table_sql} AS ods_main "
|
||||
f'LEFT JOIN {detail_table_sql} AS detail ON {detail_join["join_condition"]}'
|
||||
)
|
||||
# 构造 SELECT 表达式,支持 JSON/expression 映射
|
||||
select_exprs: list[str] = []
|
||||
added: set[str] = set()
|
||||
@@ -881,21 +970,26 @@ class DwdLoadTask(BaseTask):
|
||||
continue
|
||||
if lc in mapping:
|
||||
src, cast_type = mapping[lc]
|
||||
# detail. 前缀的列直接使用(来自详情表),其他列加 ods_main. 前缀
|
||||
if ods_alias and not src.startswith("detail."):
|
||||
src = self._qualify_column_ref(src, ods_alias)
|
||||
select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
|
||||
added.add(lc)
|
||||
elif lc in ods_set:
|
||||
col_ref = f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"'
|
||||
# CHANGE 2026-02-22: BUG 12 — 同名列如果是时间类型,加哨兵值过滤
|
||||
if dwd_types.get(lc) in ts_types and ods_types.get(lc) in ts_types:
|
||||
select_exprs.append(
|
||||
f'CASE WHEN "{lc}" >= \'{self._SENTINEL_DATE_THRESHOLD}\'::timestamp '
|
||||
f'THEN "{lc}" ELSE NULL END AS "{lc}"'
|
||||
f"CASE WHEN {col_ref} >= '{self._SENTINEL_DATE_THRESHOLD}'::timestamp "
|
||||
f'THEN {col_ref} ELSE NULL END AS "{lc}"'
|
||||
)
|
||||
else:
|
||||
select_exprs.append(f'"{lc}" AS "{lc}"')
|
||||
select_exprs.append(f'{col_ref} AS "{lc}"')
|
||||
added.add(lc)
|
||||
# 分类维度需要额外读取 categoryboxes 以展开子类
|
||||
if dwd_table == "dwd.dim_goods_category" and "categoryboxes" not in added and "categoryboxes" in ods_set:
|
||||
select_exprs.append('"categoryboxes" AS "categoryboxes"')
|
||||
col_ref = f'{ods_alias}."categoryboxes"' if ods_alias else '"categoryboxes"'
|
||||
select_exprs.append(f'{col_ref} AS "categoryboxes"')
|
||||
added.add("categoryboxes")
|
||||
# 主键兜底确保被选出
|
||||
for pk in business_keys:
|
||||
@@ -903,9 +997,12 @@ class DwdLoadTask(BaseTask):
|
||||
if lc not in added:
|
||||
if lc in mapping:
|
||||
src, cast_type = mapping[lc]
|
||||
if ods_alias and not src.startswith("detail."):
|
||||
src = self._qualify_column_ref(src, ods_alias)
|
||||
select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
|
||||
elif lc in ods_set:
|
||||
select_exprs.append(f'"{lc}" AS "{lc}"')
|
||||
col_ref = f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"'
|
||||
select_exprs.append(f'{col_ref} AS "{lc}"')
|
||||
added.add(lc)
|
||||
|
||||
if not select_exprs:
|
||||
@@ -917,14 +1014,19 @@ class DwdLoadTask(BaseTask):
|
||||
lc = key.lower()
|
||||
if lc in mapping:
|
||||
src, cast_type = mapping[lc]
|
||||
if ods_alias and not src.startswith("detail."):
|
||||
src = self._qualify_column_ref(src, ods_alias)
|
||||
key_exprs.append(self._cast_expr(src, cast_type))
|
||||
elif lc in ods_set:
|
||||
key_exprs.append(f'"{lc}"')
|
||||
key_exprs.append(f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"')
|
||||
|
||||
select_cols_sql = ", ".join(select_exprs)
|
||||
where_sql = self._append_where_condition("", '"fetched_at" IS NOT NULL')
|
||||
fetched_at_ref = f'{ods_alias}."fetched_at"' if ods_alias else '"fetched_at"'
|
||||
where_sql = self._append_where_condition("", f'{fetched_at_ref} IS NOT NULL')
|
||||
# CHANGE 2026-03-05: order_col 也需要加别名前缀
|
||||
qualified_order_col = f'{ods_alias}."{order_col}"' if ods_alias and order_col else (f'"{order_col}"' if order_col else None)
|
||||
sql = self._latest_snapshot_select_sql(
|
||||
select_cols_sql, table_sql, key_exprs, order_col, where_sql
|
||||
select_cols_sql, table_sql, key_exprs, qualified_order_col, where_sql
|
||||
)
|
||||
cur.execute(sql)
|
||||
rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()]
|
||||
@@ -1006,7 +1108,7 @@ class DwdLoadTask(BaseTask):
|
||||
|
||||
# 批量插入新版本
|
||||
if to_insert:
|
||||
self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now)
|
||||
self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now, dwd_types=dwd_types)
|
||||
|
||||
processed = len(src_rows_by_pk)
|
||||
updated = len(to_close)
|
||||
@@ -1050,11 +1152,16 @@ class DwdLoadTask(BaseTask):
|
||||
dwd_cols: Sequence[str],
|
||||
rows_with_version: Sequence[tuple[Dict[str, Any], int]],
|
||||
now: datetime,
|
||||
dwd_types: Dict[str, str] | None = None,
|
||||
) -> None:
|
||||
"""批量插入新的 SCD2 版本行。"""
|
||||
sorted_cols = [c.lower() for c in sorted(dwd_cols)]
|
||||
insert_cols_sql = ", ".join(f'"{c}"' for c in sorted_cols)
|
||||
table_sql = self._format_table(table, "dwd")
|
||||
# 预计算数组类型列集合,避免 list 值被误包装为 Json
|
||||
_array_cols: set[str] = set()
|
||||
if dwd_types:
|
||||
_array_cols = {c for c, t in dwd_types.items() if "ARRAY" in t.upper() or "[]" in t}
|
||||
|
||||
def build_row(src_row: Dict[str, Any], version: int) -> list[Any]:
|
||||
values: list[Any] = []
|
||||
@@ -1068,7 +1175,15 @@ class DwdLoadTask(BaseTask):
|
||||
elif c == "scd2_version":
|
||||
values.append(version)
|
||||
else:
|
||||
values.append(src_row.get(c))
|
||||
val = src_row.get(c)
|
||||
# CHANGE 2026-03-07: 区分数组列和 JSONB 列
|
||||
# 数组列(TEXT[] 等)的 list 值直接传递,psycopg2 自动转为 PG 数组格式
|
||||
# JSONB 列的 dict/list 值需要 Json() 包装
|
||||
if isinstance(val, list) and c not in _array_cols:
|
||||
val = Json(val)
|
||||
elif isinstance(val, dict):
|
||||
val = Json(val)
|
||||
values.append(val)
|
||||
return values
|
||||
|
||||
values_rows = [build_row(r, ver) for r, ver in rows_with_version]
|
||||
@@ -1395,6 +1510,23 @@ class DwdLoadTask(BaseTask):
|
||||
# CHANGE 2026-02-22: BUG 12 fix — 哨兵日期阈值,上游 API 用 0001-01-01 表示"未设置"
|
||||
_SENTINEL_DATE_THRESHOLD = "0002-01-01"
|
||||
|
||||
@staticmethod
|
||||
def _qualify_column_ref(src: str, alias: str) -> str:
|
||||
"""为裸列引用添加表别名前缀。
|
||||
|
||||
已包含 detail.、别名前缀、JSON 操作符、表达式(CASE/COALESCE 等)的源不做修改。
|
||||
仅对简单列名(如 "col" 或 col)添加 alias."col" 前缀。
|
||||
"""
|
||||
# 已有 detail. 或其他表前缀(含 .)→ 不修改
|
||||
if "." in src:
|
||||
return src
|
||||
# JSON 操作符、SQL 表达式 → 不修改
|
||||
if any(tok in src for tok in ("->", "#>>", "::", "CASE ", "COALESCE", "NULLIF", "(")):
|
||||
return src
|
||||
# 裸列名(可能带引号)→ 加别名前缀
|
||||
bare = src.strip('"')
|
||||
return f'{alias}."{bare}"'
|
||||
|
||||
def _cast_expr(self, col: str, cast_type: str | None) -> str:
|
||||
"""构造带可选 CAST 的列表达式。
|
||||
|
||||
|
||||
Reference in New Issue
Block a user