Database: updates to data validation, write logic, and related handling.

This commit is contained in:
Neo
2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions


@@ -0,0 +1,324 @@
# -*- coding: utf-8 -*-
"""
Migrate to "snapshot-style ODS + DWD SCD2":
1) Add a content_hash column to every ODS table and make (business key, content_hash) the new primary key;
2) Compute content_hash from the payload to avoid duplicate snapshots;
3) Add the SCD2 columns to every DWD dimension table and change the primary key to (business key, scd2_start_time).
Usage:
    PYTHONPATH=. python -m etl_billiards.scripts.migrate_snapshot_ods --dsn "postgresql://..."
Optional arguments:
    --only-ods / --only-dwd
    --dry-run
    --batch-size 500
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
from typing import Any, Iterable, List, Sequence
import psycopg2
from psycopg2.extras import execute_batch, RealDictCursor
def _hash_default(value):
return value.isoformat() if hasattr(value, "isoformat") else str(value)
def _sanitize_record_for_hash(record: Any) -> Any:
exclude = {
"data",
"payload",
"source_file",
"source_endpoint",
"fetched_at",
"content_hash",
"record_index",
}
def _strip(value):
if isinstance(value, dict):
cleaned = {}
for k, v in value.items():
if isinstance(k, str) and k.lower() in exclude:
continue
cleaned[k] = _strip(v)
return cleaned
if isinstance(value, list):
return [_strip(v) for v in value]
return value
return _strip(record or {})
def _compute_content_hash(record: Any) -> str:
cleaned = _sanitize_record_for_hash(record)
payload = json.dumps(
cleaned,
ensure_ascii=False,
sort_keys=True,
separators=(",", ":"),
default=_hash_default,
)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()
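# Illustrative check (hypothetical field names): two snapshots of the same business record
# that differ only in excluded metadata such as fetched_at hash to the same value, so the
# loader can skip writing a duplicate ODS snapshot:
#   a = {"order_id": 1, "amount": 128, "fetched_at": "2026-01-01T00:00:00+08:00"}
#   b = {"order_id": 1, "amount": 128, "fetched_at": "2026-02-01T00:00:00+08:00"}
#   _compute_content_hash(a) == _compute_content_hash(b)  # True: fetched_at is stripped before hashing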
def _fetch_tables(cur, schema: str) -> List[str]:
cur.execute(
"""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = %s AND table_type = 'BASE TABLE'
ORDER BY table_name
""",
(schema,),
)
return [r[0] for r in cur.fetchall()]
def _fetch_columns(cur, schema: str, table: str) -> List[str]:
cur.execute(
"""
SELECT column_name
FROM information_schema.columns
WHERE table_schema = %s AND table_name = %s
ORDER BY ordinal_position
""",
(schema, table),
)
cols = []
for row in cur.fetchall():
if isinstance(row, dict):
cols.append(row.get("column_name"))
else:
cols.append(row[0])
return [c for c in cols if c]
def _fetch_pk_constraint(cur, schema: str, table: str) -> tuple[str | None, list[str]]:
cur.execute(
"""
SELECT tc.constraint_name, kcu.column_name, kcu.ordinal_position
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu
ON tc.constraint_name = kcu.constraint_name
AND tc.table_schema = kcu.table_schema
WHERE tc.constraint_type = 'PRIMARY KEY'
AND tc.table_schema = %s
AND tc.table_name = %s
ORDER BY kcu.ordinal_position
""",
(schema, table),
)
rows = cur.fetchall()
if not rows:
return None, []
if isinstance(rows[0], dict):
name = rows[0].get("constraint_name")
cols = [r.get("column_name") for r in rows]
else:
name = rows[0][0]
cols = [r[1] for r in rows]
return name, [c for c in cols if c]
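# Shape of the result above, shown with hypothetical names: a table whose primary key
# constraint "ods_order_pkey" covers (site_id, order_id) yields
#   ("ods_order_pkey", ["site_id", "order_id"])
# while a table without a primary key yields (None, []).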
def _ensure_content_hash_column(cur, schema: str, table: str, dry_run: bool) -> None:
cols = _fetch_columns(cur, schema, table)
if any(c.lower() == "content_hash" for c in cols):
return
sql = f'ALTER TABLE "{schema}"."{table}" ADD COLUMN content_hash TEXT'
if dry_run:
print(f"[DRY] {sql}")
return
print(f"[ODS] 添加 content_hash: {schema}.{table}")
cur.execute(sql)
def _backfill_content_hash(conn, schema: str, table: str, batch_size: int, dry_run: bool) -> int:
updated = 0
with conn.cursor(cursor_factory=RealDictCursor) as cur:
cols = _fetch_columns(cur, schema, table)
if "content_hash" not in [c.lower() for c in cols]:
return 0
pk_name, pk_cols = _fetch_pk_constraint(cur, schema, table)
if not pk_cols:
return 0
# drop content_hash from the key columns
pk_cols = [c for c in pk_cols if c.lower() != "content_hash"]
select_cols = [*pk_cols]
if any(c.lower() == "payload" for c in cols):
select_cols.append("payload")
else:
select_cols.extend([c for c in cols if c.lower() not in {"content_hash"}])  # no payload column: hash the full row instead
select_cols_sql = ", ".join(f'"{c}"' for c in select_cols)
sql = f'SELECT {select_cols_sql} FROM "{schema}"."{table}" WHERE content_hash IS NULL'
cur.execute(sql)
rows = cur.fetchall()
if not rows:
return 0
def build_row(row: dict) -> tuple:
payload = row.get("payload")
if payload is None:
payload = {k: v for k, v in row.items() if k.lower() not in {"content_hash", "payload"}}
content_hash = _compute_content_hash(payload)
key_vals = [row.get(k) for k in pk_cols]
return (content_hash, *key_vals)
updates = [build_row(r) for r in rows]
if dry_run:
print(f"[DRY] {schema}.{table}: 预计更新 {len(updates)} 行 content_hash")
return len(updates)
where_clause = " AND ".join([f'"{c}" = %s' for c in pk_cols])
update_sql = (
f'UPDATE "{schema}"."{table}" SET content_hash = %s '
f'WHERE {where_clause} AND content_hash IS NULL'
)
with conn.cursor() as cur2:
execute_batch(cur2, update_sql, updates, page_size=batch_size)
updated = cur2.rowcount or len(updates)  # rowcount after execute_batch is not a total across pages; fall back to the planned count
print(f"[ODS] {schema}.{table}: 更新 content_hash {updated}")
return updated
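# For reference, the per-row UPDATE built above looks like this for a hypothetical table
# billiards_ods.ods_order with primary key (site_id, order_id):
#   UPDATE "billiards_ods"."ods_order" SET content_hash = %s
#   WHERE "site_id" = %s AND "order_id" = %s AND content_hash IS NULL
# execute_batch then replays it with pages of --batch-size parameter tuples.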
def _ensure_ods_primary_key(cur, schema: str, table: str, dry_run: bool) -> None:
name, pk_cols = _fetch_pk_constraint(cur, schema, table)
if not pk_cols:
return
if any(c.lower() == "content_hash" for c in pk_cols):
return
new_pk = pk_cols + ["content_hash"]
drop_sql = f'ALTER TABLE "{schema}"."{table}" DROP CONSTRAINT "{name}"'
cols_sql = ", ".join([f'"{c}"' for c in new_pk])
add_sql = f'ALTER TABLE "{schema}"."{table}" ADD PRIMARY KEY ({cols_sql})'
if dry_run:
print(f"[DRY] {drop_sql}")
print(f"[DRY] {add_sql}")
return
print(f"[ODS] 变更主键: {schema}.{table} -> ({', '.join(new_pk)})")
cur.execute(drop_sql)
cur.execute(add_sql)
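# Illustrative DDL for a hypothetical billiards_ods.ods_order table whose current primary
# key "ods_order_pkey" covers (site_id, order_id):
#   ALTER TABLE "billiards_ods"."ods_order" DROP CONSTRAINT "ods_order_pkey"
#   ALTER TABLE "billiards_ods"."ods_order" ADD PRIMARY KEY ("site_id", "order_id", "content_hash")
# ADD PRIMARY KEY requires content_hash to be non-NULL on every row, which is why the
# backfill step runs before this one.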
def _migrate_ods(conn, schema: str, batch_size: int, dry_run: bool) -> None:
with conn.cursor() as cur:
tables = _fetch_tables(cur, schema)
for table in tables:
with conn.cursor() as cur:
_ensure_content_hash_column(cur, schema, table, dry_run)
conn.commit()
_backfill_content_hash(conn, schema, table, batch_size, dry_run)
with conn.cursor() as cur:
_ensure_ods_primary_key(cur, schema, table, dry_run)
conn.commit()
def _backfill_scd2_fields(cur, schema: str, table: str, columns: Sequence[str], dry_run: bool) -> None:
lower = {c.lower() for c in columns}
fallback_cols = [
"updated_at",
"update_time",
"created_at",
"create_time",
"fetched_at",
]
fallback = None
for col in fallback_cols:
if col in lower:
fallback = f'"{col}"'
break
if fallback is None:
fallback = "now()"
sql = (
f'UPDATE "{schema}"."{table}" '
f'SET scd2_start_time = COALESCE(scd2_start_time, {fallback}), '
f"scd2_end_time = COALESCE(scd2_end_time, TIMESTAMPTZ '9999-12-31'), "
f"scd2_is_current = COALESCE(scd2_is_current, 1), "
f"scd2_version = COALESCE(scd2_version, 1) "
f"WHERE scd2_start_time IS NULL OR scd2_end_time IS NULL OR scd2_is_current IS NULL OR scd2_version IS NULL"
)
if dry_run:
print(f"[DRY] {sql}")
return
cur.execute(sql)
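# Sketch of the statement built above for a hypothetical billiards_dwd.dim_member table
# that has an updated_at column (picked as the fallback start time):
#   UPDATE "billiards_dwd"."dim_member"
#   SET scd2_start_time = COALESCE(scd2_start_time, "updated_at"),
#       scd2_end_time   = COALESCE(scd2_end_time, TIMESTAMPTZ '9999-12-31'),
#       scd2_is_current = COALESCE(scd2_is_current, 1),
#       scd2_version    = COALESCE(scd2_version, 1)
#   WHERE scd2_start_time IS NULL OR scd2_end_time IS NULL
#      OR scd2_is_current IS NULL OR scd2_version IS NULL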
def _ensure_dwd_primary_key(cur, schema: str, table: str, dry_run: bool) -> None:
name, pk_cols = _fetch_pk_constraint(cur, schema, table)
if not pk_cols:
return
if any(c.lower() == "scd2_start_time" for c in pk_cols):
return
new_pk = pk_cols + ["scd2_start_time"]
drop_sql = f'ALTER TABLE "{schema}"."{table}" DROP CONSTRAINT "{name}"'
cols_sql = ", ".join([f'"{c}"' for c in new_pk])
add_sql = f'ALTER TABLE "{schema}"."{table}" ADD PRIMARY KEY ({cols_sql})'
if dry_run:
print(f"[DRY] {drop_sql}")
print(f"[DRY] {add_sql}")
return
print(f"[DWD] 变更主键: {schema}.{table} -> ({', '.join(new_pk)})")
cur.execute(drop_sql)
cur.execute(add_sql)
def _migrate_dwd(conn, schema: str, dry_run: bool) -> None:
with conn.cursor() as cur:
cur.execute(
"""
SELECT DISTINCT table_name
FROM information_schema.columns
WHERE table_schema = %s AND column_name ILIKE 'scd2_start_time'
ORDER BY table_name
""",
(schema,),
)
tables = [r[0] for r in cur.fetchall()]
for table in tables:
with conn.cursor() as cur:
cols = _fetch_columns(cur, schema, table)
_backfill_scd2_fields(cur, schema, table, cols, dry_run)
conn.commit()
with conn.cursor() as cur:
_ensure_dwd_primary_key(cur, schema, table, dry_run)
conn.commit()
def main() -> int:
parser = argparse.ArgumentParser(description="Migrate ODS to snapshots + DWD to SCD2")
parser.add_argument("--dsn", dest="dsn", help="PostgreSQL DSN也可用环境变量 PG_DSN")
parser.add_argument("--schema-ods", dest="schema_ods", default="billiards_ods")
parser.add_argument("--schema-dwd", dest="schema_dwd", default="billiards_dwd")
parser.add_argument("--batch-size", dest="batch_size", type=int, default=500)
parser.add_argument("--only-ods", dest="only_ods", action="store_true")
parser.add_argument("--only-dwd", dest="only_dwd", action="store_true")
parser.add_argument("--dry-run", dest="dry_run", action="store_true")
args = parser.parse_args()
dsn = args.dsn or os.environ.get("PG_DSN")
if not dsn:
print("缺少 DSN--dsn 或环境变量 PG_DSN")
return 2
conn = psycopg2.connect(dsn)
conn.autocommit = False
try:
if not args.only_dwd:
_migrate_ods(conn, args.schema_ods, args.batch_size, args.dry_run)
if not args.only_ods:
_migrate_dwd(conn, args.schema_dwd, args.dry_run)
return 0
finally:
conn.close()
if __name__ == "__main__":
raise SystemExit(main())
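# Example invocations (sketch; the DSN shown is a placeholder, not a real connection string).
# With PG_DSN exported, --dsn can be omitted:
#   PYTHONPATH=. python -m etl_billiards.scripts.migrate_snapshot_ods \
#       --dsn "postgresql://user:pass@localhost:5432/billiards" --dry-run
#   PG_DSN="postgresql://user:pass@localhost:5432/billiards" PYTHONPATH=. \
#       python -m etl_billiards.scripts.migrate_snapshot_ods --only-ods --batch-size 1000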