ETL 完成
This commit is contained in:
@@ -2,10 +2,11 @@
|
||||
"""DWD 装载任务:从 ODS 增量写入 DWD(维度 SCD2,事实按时间增量)。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Iterable, List, Sequence
|
||||
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from psycopg2.extras import RealDictCursor, execute_batch, execute_values
|
||||
|
||||
from .base_task import BaseTask, TaskContext
|
||||
|
||||
@@ -61,14 +62,15 @@ class DwdLoadTask(BaseTask):
|
||||
}
|
||||
|
||||
SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
|
||||
# Incremental/window filtering prefers business timestamps. fetched_at (the
# ingestion time) is deliberately listed last so that a back-fill window is not
# skewed by the "current" ingestion time of re-fetched rows.
FACT_ORDER_CANDIDATES = [
    "pay_time",
    "create_time",
    "update_time",
    "occur_time",
    "settle_time",
    "start_use_time",
    "fetched_at",
]
|
||||
|
||||
# 特殊列映射:dwd 列名 -> 源列表达式(可选 CAST)
|
||||
@@ -457,30 +459,69 @@ class DwdLoadTask(BaseTask):
|
||||
return {"now": datetime.now()}
|
||||
|
||||
def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
    """
    Walk the table mapping: merge dimension tables, insert fact tables incrementally.

    Notes:
    - To avoid long transactions (lock pile-up, sessions left idle-in-transaction
      after an interruption) every table is committed in its own transaction.
    - A failing table is rolled back and the loop moves on to the next table;
      the failures are collected and returned with the result.

    Args:
        extracted: Output of the extract step; only ``extracted["now"]`` is read.
        context: Task context; ``window_start`` / ``window_end`` are used for fact
            tables only when both ``run.window_override.*`` config keys are set.

    Returns:
        ``{"tables": [...], "errors": [...]}`` where ``tables`` holds one entry per
        successfully loaded table and ``errors`` one entry per failed table.
    """
    now = extracted["now"]
    summary: List[Dict[str, Any]] = []
    errors: List[Dict[str, Any]] = []
    only_tables_cfg = self.config.get("dwd.only_tables") or []
    only_tables = {str(t).strip().lower() for t in only_tables_cfg if str(t).strip()} if only_tables_cfg else set()
    # The window override applies to the whole run, so evaluate it once.
    use_window = bool(
        self.config.get("run.window_override.start")
        and self.config.get("run.window_override.end")
    )
    with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
        for dwd_table, ods_table in self.TABLE_MAP.items():
            # The filter accepts either the full table name or its base name.
            if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables:
                continue
            started = time.monotonic()
            self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table)
            try:
                dwd_cols = self._get_columns(cur, dwd_table)
                ods_cols = self._get_columns(cur, ods_table)
                if not dwd_cols:
                    self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table)
                    continue

                if self._table_base(dwd_table).startswith("dim_"):
                    processed = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
                    entry = {"table": dwd_table, "mode": "SCD2", "processed": processed}
                else:
                    dwd_types = self._get_column_types(cur, dwd_table, "billiards_dwd")
                    ods_types = self._get_column_types(cur, ods_table, "billiards_ods")
                    inserted = self._merge_fact_increment(
                        cur,
                        dwd_table,
                        ods_table,
                        dwd_cols,
                        ods_cols,
                        dwd_types,
                        ods_types,
                        window_start=context.window_start if use_window else None,
                        window_end=context.window_end if use_window else None,
                    )
                    entry = {"table": dwd_table, "mode": "INCREMENT", "inserted": inserted}

                # One transaction per table: commit before recording success.
                self.db.conn.commit()
                summary.append(entry)

                elapsed = time.monotonic() - started
                self.logger.info("DWD 装载完成:%s,用时 %.2fs", dwd_table, elapsed)
            except Exception as exc:  # noqa: BLE001
                # Best-effort rollback so the next table starts from a clean
                # transaction; a rollback failure is logged, never raised.
                try:
                    self.db.conn.rollback()
                except Exception:  # noqa: BLE001
                    self.logger.warning("DWD 回滚失败:%s", dwd_table, exc_info=True)
                elapsed = time.monotonic() - started
                self.logger.exception("DWD 装载失败:%s,用时 %.2fs,err=%s", dwd_table, elapsed, exc)
                errors.append({"table": dwd_table, "error": str(exc)})
                continue

    return {"tables": summary, "errors": errors}
|
||||
|
||||
# ---------------------- helpers ----------------------
|
||||
def _get_columns(self, cur, table: str) -> List[str]:
|
||||
@@ -589,6 +630,135 @@ class DwdLoadTask(BaseTask):
|
||||
expanded.append(child_row)
|
||||
return expanded
|
||||
|
||||
def _merge_dim(
|
||||
self,
|
||||
cur,
|
||||
dwd_table: str,
|
||||
ods_table: str,
|
||||
dwd_cols: Sequence[str],
|
||||
ods_cols: Sequence[str],
|
||||
now: datetime,
|
||||
) -> int:
|
||||
"""
|
||||
维表合并策略:
|
||||
- 若主键包含 scd2 列(如 scd2_start_time/scd2_version),执行真正的 SCD2(关闭旧版+插入新版)。
|
||||
- 否则(多数现有表主键仅为业务主键),执行 Type1 Upsert,避免重复键异常并保证可重复回放。
|
||||
"""
|
||||
pk_cols = self._get_primary_keys(cur, dwd_table)
|
||||
if not pk_cols:
|
||||
raise ValueError(f"{dwd_table} 未配置主键,无法执行维表合并")
|
||||
|
||||
pk_has_scd = any(pk.lower() in self.SCD_COLS for pk in pk_cols)
|
||||
scd_cols_present = any(c.lower() in self.SCD_COLS for c in dwd_cols)
|
||||
if scd_cols_present and pk_has_scd:
|
||||
return self._merge_dim_scd2(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
|
||||
return self._merge_dim_type1_upsert(cur, dwd_table, ods_table, dwd_cols, ods_cols, pk_cols, now)
|
||||
|
||||
def _merge_dim_type1_upsert(
    self,
    cur,
    dwd_table: str,
    ods_table: str,
    dwd_cols: Sequence[str],
    ods_cols: Sequence[str],
    pk_cols: Sequence[str],
    now: datetime,
) -> int:
    """
    Type1 upsert for a dimension table (update on primary-key conflict).

    Used for tables that carry scd2_* columns but whose primary key cannot hold
    more than one version per business key.

    Args:
        cur: Open cursor; rows are read through ``row.items()``, so it must
            return mapping-style rows.
        dwd_table: Target DWD table.
        ods_table: Source ODS table.
        dwd_cols: Column names of the target table.
        ods_cols: Column names of the source table.
        pk_cols: Primary-key columns of the target table.
        now: Value written to scd2_start_time for newly inserted rows.

    Returns:
        Number of de-duplicated source rows sent to the upsert (0 when there is
        nothing to select or no usable row).
    """
    mapping = self._build_column_mapping(dwd_table, pk_cols, ods_cols)
    ods_set = {c.lower() for c in ods_cols}
    ods_table_sql = self._format_table(ods_table, "billiards_ods")

    def select_expr(lc: str) -> str | None:
        """Return the SELECT expression for a DWD column, or None if ODS has no source."""
        if lc in mapping:
            src, cast_type = mapping[lc]
            return f'{self._cast_expr(src, cast_type)} AS "{lc}"'
        if lc in ods_set:
            return f'"{lc}" AS "{lc}"'
        return None

    select_exprs: list[str] = []
    added: set[str] = set()
    for col in dwd_cols:
        lc = col.lower()
        if lc in self.SCD_COLS:
            # SCD2 bookkeeping columns are generated below, never read from ODS.
            continue
        expr = select_expr(lc)
        if expr is not None:
            select_exprs.append(expr)
            added.add(lc)

    # Make sure every primary-key column is selected even if it was skipped above.
    for pk in pk_cols:
        lc = pk.lower()
        if lc in added:
            continue
        expr = select_expr(lc)
        if expr is not None:
            select_exprs.append(expr)
        added.add(lc)

    if not select_exprs:
        return 0

    cur.execute(f"SELECT {', '.join(select_exprs)} FROM {ods_table_sql}")
    rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()]

    if dwd_table == "billiards_dwd.dim_goods_category":
        rows = self._expand_goods_category_rows(rows)

    # De-duplicate by primary key (first occurrence wins); rows with a missing
    # key part are logged and skipped.
    seen_pk: set[tuple[Any, ...]] = set()
    src_rows: list[Dict[str, Any]] = []
    pk_lower = [c.lower() for c in pk_cols]
    for row in rows:
        pk_key = tuple(row.get(pk) for pk in pk_lower)
        if pk_key in seen_pk:
            continue
        if any(v is None for v in pk_key):
            self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_key)))
            continue
        seen_pk.add(pk_key)
        src_rows.append(row)

    if not src_rows:
        return 0

    dwd_table_sql = self._format_table(dwd_table, "billiards_dwd")
    sorted_cols = [c.lower() for c in sorted(dwd_cols)]
    insert_cols_sql = ", ".join(f'"{c}"' for c in sorted_cols)

    # Values for the SCD2 bookkeeping columns of a freshly inserted row.
    scd_defaults: Dict[str, Any] = {
        "scd2_start_time": now,
        "scd2_end_time": datetime(9999, 12, 31, 0, 0, 0),
        "scd2_is_current": 1,
        "scd2_version": 1,
    }

    def build_row(src_row: Dict[str, Any]) -> list[Any]:
        """Order one source row like ``sorted_cols``, filling in the scd2_* defaults."""
        return [scd_defaults[c] if c in scd_defaults else src_row.get(c) for c in sorted_cols]

    pk_sql = ", ".join(f'"{c}"' for c in pk_lower)
    pk_lower_set = set(pk_lower)
    set_exprs: list[str] = []
    for c in sorted_cols:
        if c in pk_lower_set:
            continue
        if c in ("scd2_start_time", "scd2_version"):
            # Keep the value already stored for an existing row; only fill it if NULL.
            set_exprs.append(f'"{c}" = COALESCE({dwd_table_sql}."{c}", EXCLUDED."{c}")')
        else:
            set_exprs.append(f'"{c}" = EXCLUDED."{c}"')

    # A table made up of primary-key columns only has nothing to update; an empty
    # "DO UPDATE SET" would be a SQL syntax error, so fall back to DO NOTHING.
    conflict_action = f"DO UPDATE SET {', '.join(set_exprs)}" if set_exprs else "DO NOTHING"
    upsert_sql = (
        f"INSERT INTO {dwd_table_sql} ({insert_cols_sql}) VALUES %s "
        f"ON CONFLICT ({pk_sql}) {conflict_action}"
    )
    execute_values(cur, upsert_sql, [build_row(r) for r in src_rows], page_size=500)
    return len(src_rows)
|
||||
|
||||
def _merge_dim_scd2(
|
||||
self,
|
||||
cur,
|
||||
@@ -646,8 +816,9 @@ class DwdLoadTask(BaseTask):
|
||||
if dwd_table == "billiards_dwd.dim_goods_category":
|
||||
rows = self._expand_goods_category_rows(rows)
|
||||
|
||||
inserted_or_updated = 0
|
||||
# 归一化源行并按主键去重
|
||||
seen_pk = set()
|
||||
src_rows_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {}
|
||||
for row in rows:
|
||||
mapped_row: Dict[str, Any] = {}
|
||||
for col in dwd_cols:
|
||||
@@ -663,10 +834,110 @@ class DwdLoadTask(BaseTask):
|
||||
pk_key = tuple(mapped_row.get(pk) for pk in pk_cols)
|
||||
if pk_key in seen_pk:
|
||||
continue
|
||||
if any(v is None for v in pk_key):
|
||||
self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_key)))
|
||||
continue
|
||||
seen_pk.add(pk_key)
|
||||
if self._upsert_scd2_row(cur, dwd_table, dwd_cols, pk_cols, mapped_row, now):
|
||||
inserted_or_updated += 1
|
||||
return len(rows)
|
||||
src_rows_by_pk[pk_key] = mapped_row
|
||||
|
||||
if not src_rows_by_pk:
|
||||
return 0
|
||||
|
||||
# 预加载当前版本(scd2_is_current=1),避免逐行 SELECT 造成大量 round-trip
|
||||
table_sql_dwd = self._format_table(dwd_table, "billiards_dwd")
|
||||
where_current = " AND ".join([f"COALESCE(scd2_is_current,1)=1"])
|
||||
cur.execute(f"SELECT * FROM {table_sql_dwd} WHERE {where_current}")
|
||||
current_rows = cur.fetchall() or []
|
||||
current_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {}
|
||||
for r in current_rows:
|
||||
rr = {k.lower(): v for k, v in r.items()}
|
||||
pk_key = tuple(rr.get(pk) for pk in pk_cols)
|
||||
current_by_pk[pk_key] = rr
|
||||
|
||||
# 计算需要关闭/插入的主键集合
|
||||
to_close: list[tuple[Any, ...]] = []
|
||||
to_insert: list[tuple[Dict[str, Any], int]] = []
|
||||
for pk_key, incoming in src_rows_by_pk.items():
|
||||
current = current_by_pk.get(pk_key)
|
||||
if current and not self._is_row_changed(current, incoming, dwd_cols):
|
||||
continue
|
||||
if current:
|
||||
version = (current.get("scd2_version") or 1) + 1
|
||||
to_close.append(pk_key)
|
||||
else:
|
||||
version = 1
|
||||
to_insert.append((incoming, version))
|
||||
|
||||
# 先关闭旧版本(同一批次统一 end_time)
|
||||
if to_close:
|
||||
self._close_current_dim_bulk(cur, dwd_table, pk_cols, to_close, now)
|
||||
|
||||
# 批量插入新版本
|
||||
if to_insert:
|
||||
self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now)
|
||||
|
||||
return len(src_rows_by_pk)
|
||||
|
||||
def _close_current_dim_bulk(
|
||||
self,
|
||||
cur,
|
||||
table: str,
|
||||
pk_cols: Sequence[str],
|
||||
pk_keys: Sequence[tuple[Any, ...]],
|
||||
now: datetime,
|
||||
) -> None:
|
||||
"""批量关闭当前版本(scd2_is_current=0 + 填充结束时间)。"""
|
||||
table_sql = self._format_table(table, "billiards_dwd")
|
||||
if len(pk_cols) == 1:
|
||||
pk = pk_cols[0]
|
||||
ids = [k[0] for k in pk_keys]
|
||||
cur.execute(
|
||||
f'UPDATE {table_sql} SET scd2_end_time=%s, scd2_is_current=0 '
|
||||
f'WHERE COALESCE(scd2_is_current,1)=1 AND "{pk}" = ANY(%s)',
|
||||
(now, ids),
|
||||
)
|
||||
return
|
||||
|
||||
# 复合主键:对“发生变更的键”逐条关闭(数量通常远小于全量行数)
|
||||
where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols)
|
||||
sql = (
|
||||
f"UPDATE {table_sql} SET scd2_end_time=%s, scd2_is_current=0 "
|
||||
f"WHERE COALESCE(scd2_is_current,1)=1 AND {where_clause}"
|
||||
)
|
||||
args_list = [(now, *pk_key) for pk_key in pk_keys]
|
||||
execute_batch(cur, sql, args_list, page_size=500)
|
||||
|
||||
def _insert_dim_rows_bulk(
    self,
    cur,
    table: str,
    dwd_cols: Sequence[str],
    rows_with_version: Sequence[tuple[Dict[str, Any], int]],
    now: datetime,
) -> None:
    """
    Insert new SCD2 version rows in bulk.

    Each ``(row, version)`` pair becomes one inserted row. The scd2_* columns are
    generated here (start = ``now``, end = 9999-12-31, is_current = 1, version as
    given); every other column is read from the row dict by lower-cased name.
    """
    columns = [name.lower() for name in sorted(dwd_cols)]
    column_list_sql = ", ".join(f'"{name}"' for name in columns)
    target_sql = self._format_table(table, "billiards_dwd")

    def materialize(src_row: Dict[str, Any], version: int) -> list[Any]:
        # Generated SCD2 values take precedence over anything present in src_row.
        generated = {
            "scd2_start_time": now,
            "scd2_end_time": datetime(9999, 12, 31, 0, 0, 0),
            "scd2_is_current": 1,
            "scd2_version": version,
        }
        return [
            generated[name] if name in generated else src_row.get(name)
            for name in columns
        ]

    payload = [materialize(src_row, version) for src_row, version in rows_with_version]
    statement = f"INSERT INTO {target_sql} ({column_list_sql}) VALUES %s"
    execute_values(cur, statement, payload, page_size=500)
|
||||
|
||||
def _upsert_scd2_row(
|
||||
self,
|
||||
@@ -762,6 +1033,8 @@ class DwdLoadTask(BaseTask):
|
||||
ods_cols: Sequence[str],
|
||||
dwd_types: Dict[str, str],
|
||||
ods_types: Dict[str, str],
|
||||
window_start: datetime | None = None,
|
||||
window_end: datetime | None = None,
|
||||
) -> int:
|
||||
"""事实表按时间增量插入,默认按列名交集写入。"""
|
||||
mapping_entries = self.FACT_MAPPINGS.get(dwd_table) or []
|
||||
@@ -813,7 +1086,10 @@ class DwdLoadTask(BaseTask):
|
||||
params: List[Any] = []
|
||||
dwd_table_sql = self._format_table(dwd_table, "billiards_dwd")
|
||||
ods_table_sql = self._format_table(ods_table, "billiards_ods")
|
||||
if order_col:
|
||||
if order_col and window_start and window_end:
|
||||
where_sql = f'WHERE "{order_col}" >= %s AND "{order_col}" < %s'
|
||||
params.extend([window_start, window_end])
|
||||
elif order_col:
|
||||
cur.execute(f'SELECT COALESCE(MAX("{order_col}"), %s) FROM {dwd_table_sql}', ("1970-01-01",))
|
||||
row = cur.fetchone() or {}
|
||||
watermark = list(row.values())[0] if row else "1970-01-01"
|
||||
|
||||
Reference in New Issue
Block a user