微信小程序页面迁移校验之前 P5任务处理之前

2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions
--- a/apps/etl/connectors/feiqiu/tasks/dwd/dwd_load_task.py
+++ b/apps/etl/connectors/feiqiu/tasks/dwd/dwd_load_task.py
@@ -5,12 +5,14 @@ from __future__ import annotations
 import os
 import re
 import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import date, datetime
 from decimal import Decimal, InvalidOperation
 from typing import Any, Dict, Iterable, List, Sequence

-from psycopg2.extras import RealDictCursor, execute_batch, execute_values
+from psycopg2.extras import Json, RealDictCursor, execute_batch, execute_values

+from database.connection import DatabaseConnection
 from tasks.base_task import BaseTask, TaskContext


@@ -70,6 +72,16 @@ class DwdLoadTask(BaseTask):
    _NUMERIC_RE = re.compile(r"^[+-]?\d+(?:\.\d+)?$")
    _BOOL_STRINGS = {"true", "false", "1", "0", "yes", "no", "y", "n", "t", "f"}

+    # Detail-table LEFT JOIN config: used when a DWD table needs fields from an extra ODS detail table.
+    # Columns listed in detail_columns are referenced in FACT_MAPPINGS as detail."col".
+    DETAIL_JOIN_CONFIG: dict[str, dict] = {
+        "dwd.dim_groupbuy_package_ex": {
+            "detail_table": "ods.group_buy_package_details",
+            "join_condition": 'ods_main."id" = detail."coupon_id"',
+            "detail_columns": ["table_area_ids", "table_area_names", "assistant_services", "groupon_site_infos"],
+        },
+    }
+
    def _strip_scd2_keys(self, pk_cols: Sequence[str]) -> list[str]:
        return [c for c in pk_cols if c.lower() not in self.SCD_COLS]

@@ -113,7 +125,10 @@ class DwdLoadTask(BaseTask):
    ) -> str:
        if key_exprs and order_col:
            distinct_on = ", ".join(key_exprs)
-            order_by = ", ".join([*key_exprs, f'"{order_col}" DESC NULLS LAST'])
+            # order_col 可能是预格式化的表达式（如 ods_main."fetched_at"），此时直接使用；
+            # 否则包裹双引号
+            order_col_expr = order_col if '"' in order_col else f'"{order_col}"'
+            order_by = ", ".join([*key_exprs, f'{order_col_expr} DESC NULLS LAST'])
            return (
                f"SELECT DISTINCT ON ({distinct_on}) {select_cols_sql} "
                f"FROM {ods_table_sql} {where_sql} ORDER BY {order_by}"
@@ -303,6 +318,11 @@ class DwdLoadTask(BaseTask):
            ("table_area_id_list", "table_area_id_list", None),
            ("package_type", "type", None),
            ("tenant_coupon_sale_order_item_id", "tenantcouponsaleorderitemid", None),
+            # CHANGE 2026-03-05: 团购详情字段（来自 ods.group_buy_package_details，通过 LEFT JOIN 关联）
+            ("table_area_ids", 'detail."table_area_ids"', None),
+            ("table_area_names", 'detail."table_area_names"', None),
+            ("assistant_services", 'detail."assistant_services"', None),
+            ("groupon_site_infos", 'detail."groupon_site_infos"', None),
        ],
        "dwd.dim_staff": [
            ("staff_id", "id", None),
@@ -311,16 +331,16 @@ class DwdLoadTask(BaseTask):
        ],
        "dwd.dim_staff_ex": [
            ("staff_id", "id", None),
-            ("rank_name", "rankname", None),
-            ("cashier_point_id", "cashierpointid", "bigint"),
-            ("cashier_point_name", "cashierpointname", None),
-            ("group_id", "groupid", "bigint"),
-            ("group_name", "groupname", None),
-            ("system_user_id", "systemuserid", "bigint"),
-            ("tenant_org_id", "tenantorgid", "bigint"),
+            ("rank_name", "rank_name", None),
+            ("cashier_point_id", "cashier_point_id", "bigint"),
+            ("cashier_point_name", "cashier_point_name", None),
+            ("group_id", "group_id", "bigint"),
+            ("group_name", "group_name", None),
+            ("system_user_id", "system_user_id", "bigint"),
+            ("tenant_org_id", "tenant_org_id", "bigint"),
            ("auth_code_create", "auth_code_create", "timestamptz"),
            ("create_time", "create_time", "timestamptz"),
-            ("user_roles", "userroles", "jsonb"),
+            ("user_roles", "user_roles", "jsonb"),
        ],
        # 事实表主键及关键差异列
        "dwd.dwd_table_fee_log": [
@@ -602,6 +622,7 @@ class DwdLoadTask(BaseTask):
        ],
        # 库存汇总：goods_stock_summary（ODS 列名全小写）
        # CHANGE 2026-02-21: BUG 10 fix — ODS 列名是小写（sitegoodsid），不是驼峰
+        # CHANGE 2026-03-01: 补 site_id 映射（ODS 入库时从 app.store_id 注入 siteid）
        "dwd.dwd_goods_stock_summary": [
            ("site_goods_id", '"sitegoodsid"', "bigint"),           # 门店商品 ID（PK）
            ("goods_name", '"goodsname"', None),                     # 商品名称
@@ -617,6 +638,7 @@ class DwdLoadTask(BaseTask):
            ("range_sale_money", '"rangesalemoney"', "numeric"),     # 销售金额
            ("range_inventory", '"rangeinventory"', "numeric"),      # 盘点调整量
            ("current_stock", '"currentstock"', "numeric"),          # 当前库存
+            ("site_id", '"siteid"', "bigint"),                       # 门店 ID（ODS 入库时注入）
        ],
        # 库存变动流水：goods_stock_movements（ODS 列名全小写）
        # CHANGE 2026-02-21: BUG 10 fix — ODS 列名是小写，不是驼峰
@@ -653,11 +675,12 @@ class DwdLoadTask(BaseTask):

    def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
        """
-        遍历映射关系，维度执行 SCD2 合并，事实表按时间增量插入。
+        并行遍历映射关系，维度执行 SCD2 合并，事实表按时间增量插入。

        说明：
-        - 为避免长事务导致锁堆积/中断后遗留 idle-in-tx，本任务按“每张表一次事务”提交；
-        - 单表失败会回滚该表并继续后续表，最终在结果中汇总错误信息。
+        - 使用 ThreadPoolExecutor 并行处理多张表，每张表使用独立数据库连接和事务；
+        - 单表失败会回滚该表并继续后续表，最终在结果中汇总错误信息；
+        - 并行线程数通过 AppConfig 的 dwd.parallel_workers 配置（默认 4）。
        """
        now = extracted["now"]
        summary: List[Dict[str, Any]] = []
@@ -668,54 +691,109 @@ class DwdLoadTask(BaseTask):
        if env_only and not only_tables_cfg:
            only_tables_cfg = [t.strip() for t in env_only.split(",") if t.strip()]
        only_tables = {str(t).strip().lower() for t in only_tables_cfg if str(t).strip()} if only_tables_cfg else set()
-        with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
-            for dwd_table, ods_table in self.TABLE_MAP.items():
-                if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
-                    continue
-                started = time.monotonic()
-                self.logger.info("DWD 装载开始：%s <= %s", dwd_table, ods_table)
+
+        parallel_workers = int(self.config.get("dwd.parallel_workers", 4))
+
+        # 筛选需要处理的表
+        tables_to_process: list[tuple[str, str]] = []
+        for dwd_table, ods_table in self.TABLE_MAP.items():
+            if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
+                continue
+            tables_to_process.append((dwd_table, ods_table))
+
+        if not tables_to_process:
+            return {"tables": summary, "errors": 0, "error_details": errors}
+
+        # 并行调度：每张表在独立线程中执行，使用独立数据库连接
+        with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
+            futures = {}
+            for dwd_table, ods_table in tables_to_process:
+                future = executor.submit(
+                    self._process_single_table,
+                    dwd_table, ods_table, now, context,
+                )
+                futures[future] = dwd_table
+
+            for future in as_completed(futures):
+                dwd_table = futures[future]
                try:
-                    dwd_cols = self._get_columns(cur, dwd_table)
-                    ods_cols = self._get_columns(cur, ods_table)
-                    if not dwd_cols:
-                        self.logger.warning("跳过 %s：未能获取 DWD 列信息", dwd_table)
-                        continue
-
-                    if self._table_base(dwd_table).startswith("dim_"):
-                        dim_counts = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
-                        self.db.conn.commit()
-                        summary.append({"table": dwd_table, "mode": "SCD2", **dim_counts})
-                    else:
-                        dwd_types = self._get_column_types(cur, dwd_table, "dwd")
-                        ods_types = self._get_column_types(cur, ods_table, "ods")
-                        fact_counts = self._merge_fact_increment(
-                            cur,
-                            dwd_table,
-                            ods_table,
-                            dwd_cols,
-                            ods_cols,
-                            dwd_types,
-                            ods_types,
-                            window_start=context.window_start,
-                            window_end=context.window_end,
-                        )
-                        self.db.conn.commit()
-                        summary.append({"table": dwd_table, "mode": "INCREMENT", **fact_counts})
-
-                    elapsed = time.monotonic() - started
-                    self.logger.info("DWD 装载完成：%s，用时 %.2fs", dwd_table, elapsed)
+                    table_result = future.result()
+                    summary.append(table_result)
                except Exception as exc:  # noqa: BLE001
-                    try:
-                        self.db.conn.rollback()
-                    except Exception:
-                        pass
-                    elapsed = time.monotonic() - started
-                    self.logger.exception("DWD 装载失败：%s，用时 %.2fs，err=%s", dwd_table, elapsed, exc)
+                    self.logger.error(
+                        "DWD 并行装载失败：%s，err=%s", dwd_table, exc,
+                    )
                    errors.append({"table": dwd_table, "error": str(exc)})
-                    continue

        return {"tables": summary, "errors": len(errors), "error_details": errors}

+    def _process_single_table(
+        self,
+        dwd_table: str,
+        ods_table: str,
+        now: datetime,
+        context: TaskContext,
+    ) -> Dict[str, Any]:
+        """Load a single DWD table using its own database connection and transaction.
+
+        A separate DatabaseConnection is created for this call and closed in ``finally``.
+        On failure the transaction is rolled back, the error is logged, and it is re-raised.
+        """
+        started = time.monotonic()
+        self.logger.info("DWD 装载开始：%s <= %s", dwd_table, ods_table)
+
+        # Open a connection dedicated to this call, built from self.db's settings.
+        thread_db = DatabaseConnection(
+            dsn=self.db._dsn,
+            session=self.db._session,
+            connect_timeout=self.db._connect_timeout,
+        )
+        try:
+            with thread_db.conn.cursor(cursor_factory=RealDictCursor) as cur:
+                dwd_cols = self._get_columns(cur, dwd_table)
+                ods_cols = self._get_columns(cur, ods_table)
+                if not dwd_cols:
+                    self.logger.warning("跳过 %s：未能获取 DWD 列信息", dwd_table)
+                    return {"table": dwd_table, "mode": "SKIPPED", "inserted": 0, "updated": 0}
+
+                if self._table_base(dwd_table).startswith("dim_"):
+                    dim_counts = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
+                    thread_db.conn.commit()
+                    result = {"table": dwd_table, "mode": "SCD2", **dim_counts}
+                else:
+                    dwd_types = self._get_column_types(cur, dwd_table, "dwd")
+                    ods_types = self._get_column_types(cur, ods_table, "ods")
+                    fact_counts = self._merge_fact_increment(
+                        cur,
+                        dwd_table,
+                        ods_table,
+                        dwd_cols,
+                        ods_cols,
+                        dwd_types,
+                        ods_types,
+                        window_start=context.window_start,
+                        window_end=context.window_end,
+                    )
+                    thread_db.conn.commit()
+                    result = {"table": dwd_table, "mode": "INCREMENT", **fact_counts}
+
+            elapsed = time.monotonic() - started
+            self.logger.info("DWD 装载完成：%s，用时 %.2fs", dwd_table, elapsed)
+            return result
+        except Exception as exc:
+            try:
+                thread_db.conn.rollback()
+            except Exception:
+                pass
+            elapsed = time.monotonic() - started
+            self.logger.exception(
+                "DWD 装载失败：%s，用时 %.2fs，err=%s", dwd_table, elapsed, exc,
+            )
+            # Re-raise so the caller's future.result() surfaces the failure.
+            raise
+        finally:
+            thread_db.close()
+
    # ---------------------- 辅助方法 ----------------------
    def _get_columns(self, cur, table: str) -> List[str]:
        """获取指定表的列名（小写）。"""
@@ -872,6 +950,17 @@ class DwdLoadTask(BaseTask):
        ods_types = self._get_column_types(cur, ods_table, "ods")
        ts_types = {"timestamp without time zone", "timestamp with time zone"}
        table_sql = self._format_table(ods_table, "ods")
+        # CHANGE 2026-03-05: 详情表 LEFT JOIN 支持 — 当 DWD 表配置了 DETAIL_JOIN_CONFIG 时，
+        #   给 ODS 主表加别名 ods_main，LEFT JOIN 详情表为 detail，
+        #   非 detail 列引用加 ods_main. 前缀避免歧义
+        detail_join = self.DETAIL_JOIN_CONFIG.get(dwd_table)
+        ods_alias = "ods_main" if detail_join else ""
+        if detail_join:
+            detail_table_sql = self._format_table(detail_join["detail_table"], "ods")
+            table_sql = (
+                f"{table_sql} AS ods_main "
+                f'LEFT JOIN {detail_table_sql} AS detail ON {detail_join["join_condition"]}'
+            )
        # 构造 SELECT 表达式，支持 JSON/expression 映射
        select_exprs: list[str] = []
        added: set[str] = set()
@@ -881,21 +970,26 @@ class DwdLoadTask(BaseTask):
                continue
            if lc in mapping:
                src, cast_type = mapping[lc]
+                # detail. 前缀的列直接使用（来自详情表），其他列加 ods_main. 前缀
+                if ods_alias and not src.startswith("detail."):
+                    src = self._qualify_column_ref(src, ods_alias)
                select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
                added.add(lc)
            elif lc in ods_set:
+                col_ref = f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"'
                # CHANGE 2026-02-22: BUG 12 — 同名列如果是时间类型，加哨兵值过滤
                if dwd_types.get(lc) in ts_types and ods_types.get(lc) in ts_types:
                    select_exprs.append(
-                        f'CASE WHEN "{lc}" >= \'{self._SENTINEL_DATE_THRESHOLD}\'::timestamp '
-                        f'THEN "{lc}" ELSE NULL END AS "{lc}"'
+                        f"CASE WHEN {col_ref} >= '{self._SENTINEL_DATE_THRESHOLD}'::timestamp "
+                        f'THEN {col_ref} ELSE NULL END AS "{lc}"'
                    )
                else:
-                    select_exprs.append(f'"{lc}" AS "{lc}"')
+                    select_exprs.append(f'{col_ref} AS "{lc}"')
                added.add(lc)
        # 分类维度需要额外读取 categoryboxes 以展开子类
        if dwd_table == "dwd.dim_goods_category" and "categoryboxes" not in added and "categoryboxes" in ods_set:
-            select_exprs.append('"categoryboxes" AS "categoryboxes"')
+            col_ref = f'{ods_alias}."categoryboxes"' if ods_alias else '"categoryboxes"'
+            select_exprs.append(f'{col_ref} AS "categoryboxes"')
            added.add("categoryboxes")
        # 主键兜底确保被选出
        for pk in business_keys:
@@ -903,9 +997,12 @@ class DwdLoadTask(BaseTask):
            if lc not in added:
                if lc in mapping:
                    src, cast_type = mapping[lc]
+                    if ods_alias and not src.startswith("detail."):
+                        src = self._qualify_column_ref(src, ods_alias)
                    select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
                elif lc in ods_set:
-                    select_exprs.append(f'"{lc}" AS "{lc}"')
+                    col_ref = f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"'
+                    select_exprs.append(f'{col_ref} AS "{lc}"')
                added.add(lc)

        if not select_exprs:
@@ -917,14 +1014,19 @@ class DwdLoadTask(BaseTask):
            lc = key.lower()
            if lc in mapping:
                src, cast_type = mapping[lc]
+                if ods_alias and not src.startswith("detail."):
+                    src = self._qualify_column_ref(src, ods_alias)
                key_exprs.append(self._cast_expr(src, cast_type))
            elif lc in ods_set:
-                key_exprs.append(f'"{lc}"')
+                key_exprs.append(f'{ods_alias}."{lc}"' if ods_alias else f'"{lc}"')

        select_cols_sql = ", ".join(select_exprs)
-        where_sql = self._append_where_condition("", '"fetched_at" IS NOT NULL')
+        fetched_at_ref = f'{ods_alias}."fetched_at"' if ods_alias else '"fetched_at"'
+        where_sql = self._append_where_condition("", f'{fetched_at_ref} IS NOT NULL')
+        # CHANGE 2026-03-05: order_col 也需要加别名前缀
+        qualified_order_col = f'{ods_alias}."{order_col}"' if ods_alias and order_col else (f'"{order_col}"' if order_col else None)
        sql = self._latest_snapshot_select_sql(
-            select_cols_sql, table_sql, key_exprs, order_col, where_sql
+            select_cols_sql, table_sql, key_exprs, qualified_order_col, where_sql
        )
        cur.execute(sql)
        rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()]
@@ -1006,7 +1108,7 @@ class DwdLoadTask(BaseTask):

        # 批量插入新版本
        if to_insert:
-            self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now)
+            self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now, dwd_types=dwd_types)

        processed = len(src_rows_by_pk)
        updated = len(to_close)
@@ -1050,11 +1152,16 @@ class DwdLoadTask(BaseTask):
        dwd_cols: Sequence[str],
        rows_with_version: Sequence[tuple[Dict[str, Any], int]],
        now: datetime,
+        dwd_types: Dict[str, str] | None = None,
    ) -> None:
        """批量插入新的 SCD2 版本行。"""
        sorted_cols = [c.lower() for c in sorted(dwd_cols)]
        insert_cols_sql = ", ".join(f'"{c}"' for c in sorted_cols)
        table_sql = self._format_table(table, "dwd")
+        # 预计算数组类型列集合，避免 list 值被误包装为 Json
+        _array_cols: set[str] = set()
+        if dwd_types:
+            _array_cols = {c for c, t in dwd_types.items() if "ARRAY" in t.upper() or "[]" in t}

        def build_row(src_row: Dict[str, Any], version: int) -> list[Any]:
            values: list[Any] = []
@@ -1068,7 +1175,15 @@ class DwdLoadTask(BaseTask):
                elif c == "scd2_version":
                    values.append(version)
                else:
-                    values.append(src_row.get(c))
+                    val = src_row.get(c)
+                    # CHANGE 2026-03-07: 区分数组列和 JSONB 列
+                    # 数组列（TEXT[] 等）的 list 值直接传递，psycopg2 自动转为 PG 数组格式
+                    # JSONB 列的 dict/list 值需要 Json() 包装
+                    if isinstance(val, list) and c not in _array_cols:
+                        val = Json(val)
+                    elif isinstance(val, dict):
+                        val = Json(val)
+                    values.append(val)
            return values

        values_rows = [build_row(r, ver) for r, ver in rows_with_version]
@@ -1395,6 +1510,23 @@ class DwdLoadTask(BaseTask):
    # CHANGE 2026-02-22: BUG 12 fix — 哨兵日期阈值，上游 API 用 0001-01-01 表示"未设置"
    _SENTINEL_DATE_THRESHOLD = "0002-01-01"

+    @staticmethod
+    def _qualify_column_ref(src: str, alias: str) -> str:
+        """Prefix a bare column reference with a table alias.
+
+        Sources containing a dot, a JSON operator, a cast, or an expression token are returned unchanged.
+        Only simple column names (e.g. "col" or col) are rewritten to alias."col".
+        """
+        # Already qualified (detail. or any other prefix containing a dot) -> leave as is
+        if "." in src:
+            return src
+        # JSON operators / SQL expressions -> leave as is
+        if any(tok in src for tok in ("->", "#>>", "::", "CASE ", "COALESCE", "NULLIF", "(")):
+            return src
+        # Bare column name (possibly quoted) -> strip outer quotes and add the alias prefix
+        bare = src.strip('"')
+        return f'{alias}."{bare}"'
+
    def _cast_expr(self, col: str, cast_type: str | None) -> str:
        """构造带可选 CAST 的列表达式。