ETL 完成

This commit is contained in:
Neo
2026-01-18 22:37:38 +08:00
parent 8da6cb6563
commit 7ca19a4a2c
159 changed files with 31225 additions and 467 deletions

View File

@@ -2,10 +2,11 @@
"""DWD 装载任务:从 ODS 增量写入 DWD维度 SCD2事实按时间增量"""
from __future__ import annotations
import time
from datetime import datetime
from typing import Any, Dict, Iterable, List, Sequence
from psycopg2.extras import RealDictCursor
from psycopg2.extras import RealDictCursor, execute_batch, execute_values
from .base_task import BaseTask, TaskContext
@@ -61,14 +62,15 @@ class DwdLoadTask(BaseTask):
}
SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
# 增量/窗口过滤优先使用业务时间fetched_at入库时间放最后避免回溯窗口被“当前入库时间”干扰。
FACT_ORDER_CANDIDATES = [
"fetched_at",
"pay_time",
"create_time",
"update_time",
"occur_time",
"settle_time",
"start_use_time",
"fetched_at",
]
# 特殊列映射dwd 列名 -> 源列表达式(可选 CAST
@@ -457,30 +459,69 @@ class DwdLoadTask(BaseTask):
return {"now": datetime.now()}
def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
    """Iterate the DWD<-ODS table map: SCD2-merge dimensions, time-increment facts.

    Notes:
    - One transaction per table (commit after each) so a long multi-table run
      does not pile up locks or leave idle-in-transaction sessions behind.
    - A failing table is rolled back and skipped; its error is collected and
      returned alongside the per-table summary instead of aborting the run.

    Args:
        extracted: output of the extract phase; must contain ``now`` (datetime).
        context:   task context carrying the optional run window bounds.

    Returns:
        ``{"tables": [...per-table summary...], "errors": [...failures...]}``
    """
    now = extracted["now"]
    summary: List[Dict[str, Any]] = []
    errors: List[Dict[str, Any]] = []
    # Optional allow-list of tables (matched on full name or base name).
    only_tables_cfg = self.config.get("dwd.only_tables") or []
    only_tables = {str(t).strip().lower() for t in only_tables_cfg if str(t).strip()} if only_tables_cfg else set()
    with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
        for dwd_table, ods_table in self.TABLE_MAP.items():
            if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
                continue
            started = time.monotonic()
            self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table)
            try:
                dwd_cols = self._get_columns(cur, dwd_table)
                ods_cols = self._get_columns(cur, ods_table)
                if not dwd_cols:
                    self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table)
                    continue
                if self._table_base(dwd_table).startswith("dim_"):
                    processed = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
                    self.db.conn.commit()
                    summary.append({"table": dwd_table, "mode": "SCD2", "processed": processed})
                else:
                    dwd_types = self._get_column_types(cur, dwd_table, "billiards_dwd")
                    ods_types = self._get_column_types(cur, ods_table, "billiards_ods")
                    # Honour the override window only when BOTH bounds are set.
                    use_window = bool(
                        self.config.get("run.window_override.start")
                        and self.config.get("run.window_override.end")
                    )
                    inserted = self._merge_fact_increment(
                        cur,
                        dwd_table,
                        ods_table,
                        dwd_cols,
                        ods_cols,
                        dwd_types,
                        ods_types,
                        window_start=context.window_start if use_window else None,
                        window_end=context.window_end if use_window else None,
                    )
                    self.db.conn.commit()
                    summary.append({"table": dwd_table, "mode": "INCREMENT", "inserted": inserted})
                elapsed = time.monotonic() - started
                self.logger.info("DWD 装载完成:%s,用时 %.2fs", dwd_table, elapsed)
            except Exception as exc:  # noqa: BLE001 — per-table isolation boundary
                # Rollback may itself fail on a dead connection; ignore so the
                # original error is the one reported.
                try:
                    self.db.conn.rollback()
                except Exception:
                    pass
                elapsed = time.monotonic() - started
                self.logger.exception("DWD 装载失败:%s,用时 %.2fs,err=%s", dwd_table, elapsed, exc)
                errors.append({"table": dwd_table, "error": str(exc)})
                continue
    return {"tables": summary, "errors": errors}
# ---------------------- helpers ----------------------
def _get_columns(self, cur, table: str) -> List[str]:
@@ -589,6 +630,135 @@ class DwdLoadTask(BaseTask):
expanded.append(child_row)
return expanded
def _merge_dim(
    self,
    cur,
    dwd_table: str,
    ods_table: str,
    dwd_cols: Sequence[str],
    ods_cols: Sequence[str],
    now: datetime,
) -> int:
    """Dispatch a dimension merge to the appropriate strategy.

    True SCD2 (close old version + insert new) is only possible when the
    primary key itself includes an SCD2 column (e.g. scd2_start_time or
    scd2_version); otherwise a Type1 upsert is used, which avoids duplicate
    key errors and keeps replays idempotent.
    """
    pk_cols = self._get_primary_keys(cur, dwd_table)
    if not pk_cols:
        raise ValueError(f"{dwd_table} 未配置主键,无法执行维表合并")
    lowered_pks = {pk.lower() for pk in pk_cols}
    versioned_pk = bool(lowered_pks & self.SCD_COLS)
    has_scd_cols = not self.SCD_COLS.isdisjoint(c.lower() for c in dwd_cols)
    if versioned_pk and has_scd_cols:
        return self._merge_dim_scd2(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
    return self._merge_dim_type1_upsert(cur, dwd_table, ods_table, dwd_cols, ods_cols, pk_cols, now)
def _merge_dim_type1_upsert(
    self,
    cur,
    dwd_table: str,
    ods_table: str,
    dwd_cols: Sequence[str],
    ods_cols: Sequence[str],
    pk_cols: Sequence[str],
    now: datetime,
) -> int:
    """Type1 upsert for dimensions whose PK cannot hold multiple versions.

    Projects the DWD columns out of ODS, deduplicates on the business key,
    then writes with INSERT ... ON CONFLICT DO UPDATE. Rows with a NULL key
    component are logged and skipped. Returns the number of rows written.
    """
    mapping = self._build_column_mapping(dwd_table, pk_cols, ods_cols)
    ods_lower = {c.lower() for c in ods_cols}
    src_table = self._format_table(ods_table, "billiards_ods")

    projections: list[str] = []
    covered: set[str] = set()

    def project(col: str) -> bool:
        # Emit one SELECT expression for *col*; True when ODS can supply it.
        if col in mapping:
            src, cast_type = mapping[col]
            projections.append(f'{self._cast_expr(src, cast_type)} AS "{col}"')
            return True
        if col in ods_lower:
            projections.append(f'"{col}" AS "{col}"')
            return True
        return False

    for col in dwd_cols:
        lc = col.lower()
        if lc in self.SCD_COLS:
            continue
        if project(lc):
            covered.add(lc)
    # Make sure every key column is projected even if not listed in dwd_cols.
    for pk in pk_cols:
        lc = pk.lower()
        if lc not in covered and project(lc):
            covered.add(lc)
    if not projections:
        return 0

    cur.execute(f"SELECT {', '.join(projections)} FROM {src_table}")
    rows = [{key.lower(): val for key, val in rec.items()} for rec in cur.fetchall()]
    if dwd_table == "billiards_dwd.dim_goods_category":
        rows = self._expand_goods_category_rows(rows)

    # Keep the first occurrence per business key; drop rows with NULL keys.
    pk_lower = [c.lower() for c in pk_cols]
    deduped: dict[tuple[Any, ...], Dict[str, Any]] = {}
    for row in rows:
        key = tuple(row.get(c) for c in pk_lower)
        if key in deduped:
            continue
        if any(part is None for part in key):
            self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, key)))
            continue
        deduped[key] = row
    if not deduped:
        return 0

    dst_table = self._format_table(dwd_table, "billiards_dwd")
    ordered_cols = [c.lower() for c in sorted(dwd_cols)]
    col_list_sql = ", ".join(f'"{c}"' for c in ordered_cols)

    # SCD2 bookkeeping columns get fixed defaults on insert.
    scd_defaults = {
        "scd2_start_time": now,
        "scd2_end_time": datetime(9999, 12, 31, 0, 0, 0),
        "scd2_is_current": 1,
        "scd2_version": 1,
    }

    def as_values(row: Dict[str, Any]) -> list[Any]:
        return [scd_defaults[c] if c in scd_defaults else row.get(c) for c in ordered_cols]

    conflict_sql = ", ".join(f'"{c.lower()}"' for c in pk_cols)
    pk_set = set(pk_lower)
    assignments: list[str] = []
    for c in ordered_cols:
        if c in pk_set:
            continue
        if c in ("scd2_start_time", "scd2_version"):
            # Preserve the first-seen start time / version once populated.
            assignments.append(f'"{c}" = COALESCE({dst_table}."{c}", EXCLUDED."{c}")')
        else:
            assignments.append(f'"{c}" = EXCLUDED."{c}"')

    upsert_sql = (
        f"INSERT INTO {dst_table} ({col_list_sql}) VALUES %s "
        f"ON CONFLICT ({conflict_sql}) DO UPDATE SET {', '.join(assignments)}"
    )
    execute_values(cur, upsert_sql, [as_values(r) for r in deduped.values()], page_size=500)
    return len(deduped)
def _merge_dim_scd2(
self,
cur,
@@ -646,8 +816,9 @@ class DwdLoadTask(BaseTask):
if dwd_table == "billiards_dwd.dim_goods_category":
rows = self._expand_goods_category_rows(rows)
inserted_or_updated = 0
# 归一化源行并按主键去重
seen_pk = set()
src_rows_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {}
for row in rows:
mapped_row: Dict[str, Any] = {}
for col in dwd_cols:
@@ -663,10 +834,110 @@ class DwdLoadTask(BaseTask):
pk_key = tuple(mapped_row.get(pk) for pk in pk_cols)
if pk_key in seen_pk:
continue
if any(v is None for v in pk_key):
self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_key)))
continue
seen_pk.add(pk_key)
if self._upsert_scd2_row(cur, dwd_table, dwd_cols, pk_cols, mapped_row, now):
inserted_or_updated += 1
return len(rows)
src_rows_by_pk[pk_key] = mapped_row
if not src_rows_by_pk:
return 0
# 预加载当前版本scd2_is_current=1避免逐行 SELECT 造成大量 round-trip
table_sql_dwd = self._format_table(dwd_table, "billiards_dwd")
where_current = " AND ".join([f"COALESCE(scd2_is_current,1)=1"])
cur.execute(f"SELECT * FROM {table_sql_dwd} WHERE {where_current}")
current_rows = cur.fetchall() or []
current_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {}
for r in current_rows:
rr = {k.lower(): v for k, v in r.items()}
pk_key = tuple(rr.get(pk) for pk in pk_cols)
current_by_pk[pk_key] = rr
# 计算需要关闭/插入的主键集合
to_close: list[tuple[Any, ...]] = []
to_insert: list[tuple[Dict[str, Any], int]] = []
for pk_key, incoming in src_rows_by_pk.items():
current = current_by_pk.get(pk_key)
if current and not self._is_row_changed(current, incoming, dwd_cols):
continue
if current:
version = (current.get("scd2_version") or 1) + 1
to_close.append(pk_key)
else:
version = 1
to_insert.append((incoming, version))
# 先关闭旧版本(同一批次统一 end_time
if to_close:
self._close_current_dim_bulk(cur, dwd_table, pk_cols, to_close, now)
# 批量插入新版本
if to_insert:
self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now)
return len(src_rows_by_pk)
def _close_current_dim_bulk(
    self,
    cur,
    table: str,
    pk_cols: Sequence[str],
    pk_keys: Sequence[tuple[Any, ...]],
    now: datetime,
) -> None:
    """Close the currently-open SCD2 version for each given key.

    Sets scd2_is_current=0 and stamps scd2_end_time with the batch time.
    Single-column keys use one UPDATE with ``= ANY(array)``; composite keys
    fall back to a batched per-key UPDATE (the changed-key set is normally
    far smaller than the full table).
    """
    target = self._format_table(table, "billiards_dwd")
    if len(pk_cols) > 1:
        predicate = " AND ".join(f'"{col}" = %s' for col in pk_cols)
        stmt = (
            f"UPDATE {target} SET scd2_end_time=%s, scd2_is_current=0 "
            f"WHERE COALESCE(scd2_is_current,1)=1 AND {predicate}"
        )
        execute_batch(cur, stmt, [(now, *key) for key in pk_keys], page_size=500)
        return
    key_col = pk_cols[0]
    cur.execute(
        f'UPDATE {target} SET scd2_end_time=%s, scd2_is_current=0 '
        f'WHERE COALESCE(scd2_is_current,1)=1 AND "{key_col}" = ANY(%s)',
        (now, [key[0] for key in pk_keys]),
    )
def _insert_dim_rows_bulk(
    self,
    cur,
    table: str,
    dwd_cols: Sequence[str],
    rows_with_version: Sequence[tuple[Dict[str, Any], int]],
    now: datetime,
) -> None:
    """Bulk-insert fresh SCD2 version rows with open-ended validity."""
    target = self._format_table(table, "billiards_dwd")
    ordered_cols = [c.lower() for c in sorted(dwd_cols)]
    columns_sql = ", ".join(f'"{c}"' for c in ordered_cols)
    # Sentinel far-future end time marks the version as still open.
    end_of_time = datetime(9999, 12, 31, 0, 0, 0)

    def materialize(row: Dict[str, Any], version: int) -> list[Any]:
        specials = {
            "scd2_start_time": now,
            "scd2_end_time": end_of_time,
            "scd2_is_current": 1,
            "scd2_version": version,
        }
        return [specials.get(c, row.get(c)) for c in ordered_cols]

    execute_values(
        cur,
        f"INSERT INTO {target} ({columns_sql}) VALUES %s",
        [materialize(row, ver) for row, ver in rows_with_version],
        page_size=500,
    )
def _upsert_scd2_row(
self,
@@ -762,6 +1033,8 @@ class DwdLoadTask(BaseTask):
ods_cols: Sequence[str],
dwd_types: Dict[str, str],
ods_types: Dict[str, str],
window_start: datetime | None = None,
window_end: datetime | None = None,
) -> int:
"""事实表按时间增量插入,默认按列名交集写入。"""
mapping_entries = self.FACT_MAPPINGS.get(dwd_table) or []
@@ -813,7 +1086,10 @@ class DwdLoadTask(BaseTask):
params: List[Any] = []
dwd_table_sql = self._format_table(dwd_table, "billiards_dwd")
ods_table_sql = self._format_table(ods_table, "billiards_ods")
if order_col:
if order_col and window_start and window_end:
where_sql = f'WHERE "{order_col}" >= %s AND "{order_col}" < %s'
params.extend([window_start, window_end])
elif order_col:
cur.execute(f'SELECT COALESCE(MAX("{order_col}"), %s) FROM {dwd_table_sql}', ("1970-01-01",))
row = cur.fetchone() or {}
watermark = list(row.values())[0] if row else "1970-01-01"