ETL 完成

This commit is contained in:
Neo
2026-01-18 22:37:38 +08:00
parent 8da6cb6563
commit 7ca19a4a2c
159 changed files with 31225 additions and 467 deletions

View File

@@ -2,10 +2,11 @@
"""DWD 装载任务:从 ODS 增量写入 DWD维度 SCD2事实按时间增量"""
from __future__ import annotations
import time
from datetime import datetime
from typing import Any, Dict, Iterable, List, Sequence
from psycopg2.extras import RealDictCursor
from psycopg2.extras import RealDictCursor, execute_batch, execute_values
from .base_task import BaseTask, TaskContext
@@ -61,14 +62,15 @@ class DwdLoadTask(BaseTask):
}
SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
# 增量/窗口过滤优先使用业务时间fetched_at入库时间放最后避免回溯窗口被“当前入库时间”干扰。
FACT_ORDER_CANDIDATES = [
"fetched_at",
"pay_time",
"create_time",
"update_time",
"occur_time",
"settle_time",
"start_use_time",
"fetched_at",
]
# 特殊列映射dwd 列名 -> 源列表达式(可选 CAST
@@ -457,30 +459,69 @@ class DwdLoadTask(BaseTask):
return {"now": datetime.now()}
def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
    """Iterate the DWD<-ODS table map: SCD2-merge dimensions, time-increment facts.

    Notes:
    - One transaction per table (commit after each) so a long multi-table run
      does not pile up locks or leave idle-in-transaction sessions behind.
    - A failing table is rolled back and skipped; its error is collected and
      returned alongside the per-table summary instead of aborting the run.

    Args:
        extracted: output of the extract phase; must contain ``now`` (datetime).
        context:   task context carrying the optional run window bounds.

    Returns:
        ``{"tables": [...per-table summary...], "errors": [...failures...]}``
    """
    now = extracted["now"]
    summary: List[Dict[str, Any]] = []
    errors: List[Dict[str, Any]] = []
    # Optional allow-list of tables (matched on full name or base name).
    only_tables_cfg = self.config.get("dwd.only_tables") or []
    only_tables = {str(t).strip().lower() for t in only_tables_cfg if str(t).strip()} if only_tables_cfg else set()
    with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
        for dwd_table, ods_table in self.TABLE_MAP.items():
            if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
                continue
            started = time.monotonic()
            self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table)
            try:
                dwd_cols = self._get_columns(cur, dwd_table)
                ods_cols = self._get_columns(cur, ods_table)
                if not dwd_cols:
                    self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table)
                    continue
                if self._table_base(dwd_table).startswith("dim_"):
                    processed = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
                    self.db.conn.commit()
                    summary.append({"table": dwd_table, "mode": "SCD2", "processed": processed})
                else:
                    dwd_types = self._get_column_types(cur, dwd_table, "billiards_dwd")
                    ods_types = self._get_column_types(cur, ods_table, "billiards_ods")
                    # Honour the override window only when BOTH bounds are set.
                    use_window = bool(
                        self.config.get("run.window_override.start")
                        and self.config.get("run.window_override.end")
                    )
                    inserted = self._merge_fact_increment(
                        cur,
                        dwd_table,
                        ods_table,
                        dwd_cols,
                        ods_cols,
                        dwd_types,
                        ods_types,
                        window_start=context.window_start if use_window else None,
                        window_end=context.window_end if use_window else None,
                    )
                    self.db.conn.commit()
                    summary.append({"table": dwd_table, "mode": "INCREMENT", "inserted": inserted})
                elapsed = time.monotonic() - started
                self.logger.info("DWD 装载完成:%s,用时 %.2fs", dwd_table, elapsed)
            except Exception as exc:  # noqa: BLE001 — per-table isolation boundary
                # Rollback may itself fail on a dead connection; ignore so the
                # original error is the one reported.
                try:
                    self.db.conn.rollback()
                except Exception:
                    pass
                elapsed = time.monotonic() - started
                self.logger.exception("DWD 装载失败:%s,用时 %.2fs,err=%s", dwd_table, elapsed, exc)
                errors.append({"table": dwd_table, "error": str(exc)})
                continue
    return {"tables": summary, "errors": errors}
# ---------------------- helpers ----------------------
def _get_columns(self, cur, table: str) -> List[str]:
@@ -589,6 +630,135 @@ class DwdLoadTask(BaseTask):
expanded.append(child_row)
return expanded
def _merge_dim(
    self,
    cur,
    dwd_table: str,
    ods_table: str,
    dwd_cols: Sequence[str],
    ods_cols: Sequence[str],
    now: datetime,
) -> int:
    """Dispatch a dimension merge to the appropriate strategy.

    True SCD2 (close old version + insert new) is only possible when the
    primary key itself includes an SCD2 column (e.g. scd2_start_time or
    scd2_version); otherwise a Type1 upsert is used, which avoids duplicate
    key errors and keeps replays idempotent.
    """
    pk_cols = self._get_primary_keys(cur, dwd_table)
    if not pk_cols:
        raise ValueError(f"{dwd_table} 未配置主键,无法执行维表合并")
    lowered_pks = {pk.lower() for pk in pk_cols}
    versioned_pk = bool(lowered_pks & self.SCD_COLS)
    has_scd_cols = not self.SCD_COLS.isdisjoint(c.lower() for c in dwd_cols)
    if versioned_pk and has_scd_cols:
        return self._merge_dim_scd2(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
    return self._merge_dim_type1_upsert(cur, dwd_table, ods_table, dwd_cols, ods_cols, pk_cols, now)
def _merge_dim_type1_upsert(
    self,
    cur,
    dwd_table: str,
    ods_table: str,
    dwd_cols: Sequence[str],
    ods_cols: Sequence[str],
    pk_cols: Sequence[str],
    now: datetime,
) -> int:
    """Type1 upsert for dimensions whose PK cannot hold multiple versions.

    Projects the DWD columns out of ODS, deduplicates on the business key,
    then writes with INSERT ... ON CONFLICT DO UPDATE. Rows with a NULL key
    component are logged and skipped. Returns the number of rows written.
    """
    mapping = self._build_column_mapping(dwd_table, pk_cols, ods_cols)
    ods_lower = {c.lower() for c in ods_cols}
    src_table = self._format_table(ods_table, "billiards_ods")

    projections: list[str] = []
    covered: set[str] = set()

    def project(col: str) -> bool:
        # Emit one SELECT expression for *col*; True when ODS can supply it.
        if col in mapping:
            src, cast_type = mapping[col]
            projections.append(f'{self._cast_expr(src, cast_type)} AS "{col}"')
            return True
        if col in ods_lower:
            projections.append(f'"{col}" AS "{col}"')
            return True
        return False

    for col in dwd_cols:
        lc = col.lower()
        if lc in self.SCD_COLS:
            continue
        if project(lc):
            covered.add(lc)
    # Make sure every key column is projected even if not listed in dwd_cols.
    for pk in pk_cols:
        lc = pk.lower()
        if lc not in covered and project(lc):
            covered.add(lc)
    if not projections:
        return 0

    cur.execute(f"SELECT {', '.join(projections)} FROM {src_table}")
    rows = [{key.lower(): val for key, val in rec.items()} for rec in cur.fetchall()]
    if dwd_table == "billiards_dwd.dim_goods_category":
        rows = self._expand_goods_category_rows(rows)

    # Keep the first occurrence per business key; drop rows with NULL keys.
    pk_lower = [c.lower() for c in pk_cols]
    deduped: dict[tuple[Any, ...], Dict[str, Any]] = {}
    for row in rows:
        key = tuple(row.get(c) for c in pk_lower)
        if key in deduped:
            continue
        if any(part is None for part in key):
            self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, key)))
            continue
        deduped[key] = row
    if not deduped:
        return 0

    dst_table = self._format_table(dwd_table, "billiards_dwd")
    ordered_cols = [c.lower() for c in sorted(dwd_cols)]
    col_list_sql = ", ".join(f'"{c}"' for c in ordered_cols)

    # SCD2 bookkeeping columns get fixed defaults on insert.
    scd_defaults = {
        "scd2_start_time": now,
        "scd2_end_time": datetime(9999, 12, 31, 0, 0, 0),
        "scd2_is_current": 1,
        "scd2_version": 1,
    }

    def as_values(row: Dict[str, Any]) -> list[Any]:
        return [scd_defaults[c] if c in scd_defaults else row.get(c) for c in ordered_cols]

    conflict_sql = ", ".join(f'"{c.lower()}"' for c in pk_cols)
    pk_set = set(pk_lower)
    assignments: list[str] = []
    for c in ordered_cols:
        if c in pk_set:
            continue
        if c in ("scd2_start_time", "scd2_version"):
            # Preserve the first-seen start time / version once populated.
            assignments.append(f'"{c}" = COALESCE({dst_table}."{c}", EXCLUDED."{c}")')
        else:
            assignments.append(f'"{c}" = EXCLUDED."{c}"')

    upsert_sql = (
        f"INSERT INTO {dst_table} ({col_list_sql}) VALUES %s "
        f"ON CONFLICT ({conflict_sql}) DO UPDATE SET {', '.join(assignments)}"
    )
    execute_values(cur, upsert_sql, [as_values(r) for r in deduped.values()], page_size=500)
    return len(deduped)
def _merge_dim_scd2(
self,
cur,
@@ -646,8 +816,9 @@ class DwdLoadTask(BaseTask):
if dwd_table == "billiards_dwd.dim_goods_category":
rows = self._expand_goods_category_rows(rows)
inserted_or_updated = 0
# 归一化源行并按主键去重
seen_pk = set()
src_rows_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {}
for row in rows:
mapped_row: Dict[str, Any] = {}
for col in dwd_cols:
@@ -663,10 +834,110 @@ class DwdLoadTask(BaseTask):
pk_key = tuple(mapped_row.get(pk) for pk in pk_cols)
if pk_key in seen_pk:
continue
if any(v is None for v in pk_key):
self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_key)))
continue
seen_pk.add(pk_key)
if self._upsert_scd2_row(cur, dwd_table, dwd_cols, pk_cols, mapped_row, now):
inserted_or_updated += 1
return len(rows)
src_rows_by_pk[pk_key] = mapped_row
if not src_rows_by_pk:
return 0
# 预加载当前版本scd2_is_current=1避免逐行 SELECT 造成大量 round-trip
table_sql_dwd = self._format_table(dwd_table, "billiards_dwd")
where_current = " AND ".join([f"COALESCE(scd2_is_current,1)=1"])
cur.execute(f"SELECT * FROM {table_sql_dwd} WHERE {where_current}")
current_rows = cur.fetchall() or []
current_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {}
for r in current_rows:
rr = {k.lower(): v for k, v in r.items()}
pk_key = tuple(rr.get(pk) for pk in pk_cols)
current_by_pk[pk_key] = rr
# 计算需要关闭/插入的主键集合
to_close: list[tuple[Any, ...]] = []
to_insert: list[tuple[Dict[str, Any], int]] = []
for pk_key, incoming in src_rows_by_pk.items():
current = current_by_pk.get(pk_key)
if current and not self._is_row_changed(current, incoming, dwd_cols):
continue
if current:
version = (current.get("scd2_version") or 1) + 1
to_close.append(pk_key)
else:
version = 1
to_insert.append((incoming, version))
# 先关闭旧版本(同一批次统一 end_time
if to_close:
self._close_current_dim_bulk(cur, dwd_table, pk_cols, to_close, now)
# 批量插入新版本
if to_insert:
self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now)
return len(src_rows_by_pk)
def _close_current_dim_bulk(
    self,
    cur,
    table: str,
    pk_cols: Sequence[str],
    pk_keys: Sequence[tuple[Any, ...]],
    now: datetime,
) -> None:
    """Close the currently-open SCD2 version for each given key.

    Sets scd2_is_current=0 and stamps scd2_end_time with the batch time.
    Single-column keys use one UPDATE with ``= ANY(array)``; composite keys
    fall back to a batched per-key UPDATE (the changed-key set is normally
    far smaller than the full table).
    """
    target = self._format_table(table, "billiards_dwd")
    if len(pk_cols) > 1:
        predicate = " AND ".join(f'"{col}" = %s' for col in pk_cols)
        stmt = (
            f"UPDATE {target} SET scd2_end_time=%s, scd2_is_current=0 "
            f"WHERE COALESCE(scd2_is_current,1)=1 AND {predicate}"
        )
        execute_batch(cur, stmt, [(now, *key) for key in pk_keys], page_size=500)
        return
    key_col = pk_cols[0]
    cur.execute(
        f'UPDATE {target} SET scd2_end_time=%s, scd2_is_current=0 '
        f'WHERE COALESCE(scd2_is_current,1)=1 AND "{key_col}" = ANY(%s)',
        (now, [key[0] for key in pk_keys]),
    )
def _insert_dim_rows_bulk(
    self,
    cur,
    table: str,
    dwd_cols: Sequence[str],
    rows_with_version: Sequence[tuple[Dict[str, Any], int]],
    now: datetime,
) -> None:
    """Bulk-insert fresh SCD2 version rows with open-ended validity."""
    target = self._format_table(table, "billiards_dwd")
    ordered_cols = [c.lower() for c in sorted(dwd_cols)]
    columns_sql = ", ".join(f'"{c}"' for c in ordered_cols)
    # Sentinel far-future end time marks the version as still open.
    end_of_time = datetime(9999, 12, 31, 0, 0, 0)

    def materialize(row: Dict[str, Any], version: int) -> list[Any]:
        specials = {
            "scd2_start_time": now,
            "scd2_end_time": end_of_time,
            "scd2_is_current": 1,
            "scd2_version": version,
        }
        return [specials.get(c, row.get(c)) for c in ordered_cols]

    execute_values(
        cur,
        f"INSERT INTO {target} ({columns_sql}) VALUES %s",
        [materialize(row, ver) for row, ver in rows_with_version],
        page_size=500,
    )
def _upsert_scd2_row(
self,
@@ -762,6 +1033,8 @@ class DwdLoadTask(BaseTask):
ods_cols: Sequence[str],
dwd_types: Dict[str, str],
ods_types: Dict[str, str],
window_start: datetime | None = None,
window_end: datetime | None = None,
) -> int:
"""事实表按时间增量插入,默认按列名交集写入。"""
mapping_entries = self.FACT_MAPPINGS.get(dwd_table) or []
@@ -813,7 +1086,10 @@ class DwdLoadTask(BaseTask):
params: List[Any] = []
dwd_table_sql = self._format_table(dwd_table, "billiards_dwd")
ods_table_sql = self._format_table(ods_table, "billiards_ods")
if order_col:
if order_col and window_start and window_end:
where_sql = f'WHERE "{order_col}" >= %s AND "{order_col}" < %s'
params.extend([window_start, window_end])
elif order_col:
cur.execute(f'SELECT COALESCE(MAX("{order_col}"), %s) FROM {dwd_table_sql}', ("1970-01-01",))
row = cur.fetchone() or {}
watermark = list(row.values())[0] if row else "1970-01-01"