This commit is contained in:
Neo
2026-02-04 21:39:01 +08:00
parent ee773a9b52
commit a3f4d04335
148 changed files with 31455 additions and 182 deletions

View File

@@ -1,4 +1,4 @@
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""ODS ingestion tasks."""
from __future__ import annotations
@@ -305,7 +305,9 @@ class BaseOdsTask(BaseTask):
source_endpoint: str | None,
) -> tuple[int, int]:
"""
按 DB 表结构动态写入 ODS(只插新数据,ON CONFLICT DO NOTHING)。
按 DB 表结构动态写入 ODS。
- 新记录:插入
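- 已存在的记录(仅当表有主键时):按 run.ods_conflict_mode 处理
  - "nothing":跳过(ON CONFLICT DO NOTHING);旧配置 run.ods_backfill_null_columns 为 False 时强制使用此模式
  - "backfill":回填 NULL 列(用新值填充数据库中为 NULL 的字段)
  - "update"(默认):用新值覆盖所有发生变化的字段
  主键列和元数据列(payload、source_file、source_endpoint、fetched_at、content_hash)不参与回填/更新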
返回 (inserted, skipped)。
"""
if not records:
@@ -324,9 +326,54 @@ class BaseOdsTask(BaseTask):
col_names = [c[0] for c in cols_info]
quoted_cols = ", ".join(f'\"{c}\"' for c in col_names)
sql = f"INSERT INTO {table} ({quoted_cols}) VALUES %s"
# 冲突处理模式:
# "nothing" - 跳过已存在记录 (DO NOTHING)
# "backfill" - 只回填 NULL 列 (COALESCE)
# "update" - 全字段对比更新 (覆盖所有变化的字段)
conflict_mode = str(self.config.get("run.ods_conflict_mode", "update")).lower()
# 兼容旧配置
if self.config.get("run.ods_backfill_null_columns") is False:
conflict_mode = "nothing"
if pk_cols:
pk_clause = ", ".join(f'\"{c}\"' for c in pk_cols)
sql += f" ON CONFLICT ({pk_clause}) DO NOTHING"
if conflict_mode in ("backfill", "update"):
# 排除主键列和元数据列
meta_cols = {"payload", "source_file", "source_endpoint", "fetched_at", "content_hash"}
pk_cols_lower = {c.lower() for c in pk_cols}
update_cols = [
c for c in col_names
if c.lower() not in pk_cols_lower and c.lower() not in meta_cols
]
if update_cols:
if conflict_mode == "backfill":
# 回填模式:只填充 NULL 列
set_clause = ", ".join(
f'"{c}" = COALESCE({table}."{c}", EXCLUDED."{c}")'
for c in update_cols
)
where_clause = " OR ".join(f'{table}."{c}" IS NULL' for c in update_cols)
sql += f" ON CONFLICT ({pk_clause}) DO UPDATE SET {set_clause} WHERE {where_clause}"
else:
# update 模式:全字段对比更新
set_clause = ", ".join(
f'"{c}" = EXCLUDED."{c}"'
for c in update_cols
)
# 只在有字段变化时才更新
where_clause = " OR ".join(
f'{table}."{c}" IS DISTINCT FROM EXCLUDED."{c}"'
for c in update_cols
)
sql += f" ON CONFLICT ({pk_clause}) DO UPDATE SET {set_clause} WHERE {where_clause}"
else:
sql += f" ON CONFLICT ({pk_clause}) DO NOTHING"
else:
sql += f" ON CONFLICT ({pk_clause}) DO NOTHING"
now = datetime.now(self.tz)
json_dump = lambda v: json.dumps(v, ensure_ascii=False) # noqa: E731
@@ -499,6 +546,14 @@ class BaseOdsTask(BaseTask):
if value is None:
return None
dt = (data_type or "").lower()
if dt == "boolean":
if isinstance(value, bool):
return value
if isinstance(value, (int, float)):
return bool(value)
if isinstance(value, str):
return value.lower() in ("true", "1", "yes", "t")
return bool(value)
if dt in ("integer", "bigint", "smallint"):
if isinstance(value, bool):
return int(value)