Database: update data-validation and write logic.
@@ -147,25 +147,164 @@ def _amount_columns(cols: List[str], types: Dict[str, str]) -> List[str]:
     return out
 
 
-def _count_table(cur, schema: str, table: str, time_col: str | None, window: IntegrityWindow | None) -> int:
-    where = ""
+def _build_hash_expr(alias: str, cols: list[str]) -> str:
+    if not cols:
+        return "NULL"
+    parts = ", ".join([f"COALESCE({alias}.\"{c}\"::text,'')" for c in cols])
+    return f"md5(concat_ws('||', {parts}))"
+
+
+def _build_snapshot_subquery(
+    schema: str,
+    table: str,
+    cols: list[str],
+    key_cols: list[str],
+    order_col: str | None,
+    where_sql: str,
+) -> str:
+    cols_sql = ", ".join([f'"{c}"' for c in cols])
+    if key_cols and order_col:
+        keys = ", ".join([f'"{c}"' for c in key_cols])
+        order_by = ", ".join([*(f'"{c}"' for c in key_cols), f'"{order_col}" DESC NULLS LAST'])
+        return (
+            f'SELECT DISTINCT ON ({keys}) {cols_sql} '
+            f'FROM "{schema}"."{table}" {where_sql} '
+            f"ORDER BY {order_by}"
+        )
+    return f'SELECT {cols_sql} FROM "{schema}"."{table}" {where_sql}'
+
+
+def _build_snapshot_expr_subquery(
+    schema: str,
+    table: str,
+    select_exprs: list[str],
+    key_exprs: list[str],
+    order_col: str | None,
+    where_sql: str,
+) -> str:
+    select_cols_sql = ", ".join(select_exprs)
+    table_sql = f'"{schema}"."{table}"'
+    if key_exprs and order_col:
+        distinct_on = ", ".join(key_exprs)
+        order_by = ", ".join([*key_exprs, f'"{order_col}" DESC NULLS LAST'])
+        return (
+            f"SELECT DISTINCT ON ({distinct_on}) {select_cols_sql} "
+            f"FROM {table_sql} {where_sql} "
+            f"ORDER BY {order_by}"
+        )
+    return f"SELECT {select_cols_sql} FROM {table_sql} {where_sql}"
+
+
+def _cast_expr(col: str, cast_type: str | None) -> str:
+    if col.upper() == "NULL":
+        base = "NULL"
+    else:
+        is_expr = not col.isidentifier() or "->" in col or "#>>" in col or "::" in col or "'" in col
+        base = col if is_expr else f'"{col}"'
+    if cast_type:
+        cast_lower = cast_type.lower()
+        if cast_lower in {"bigint", "integer", "numeric", "decimal"}:
+            return f"CAST(NULLIF(CAST({base} AS text), '') AS numeric):: {cast_type}"
+        if cast_lower == "timestamptz":
+            return f"({base})::timestamptz"
+        return f"{base}::{cast_type}"
+    return base
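_cast_expr normalizes an ODS source column (or a JSON extraction expression) to the DWD column's type before comparison; sketch outputs for hypothetical inputs:

    _cast_expr("amount", "numeric")           # -> CAST(NULLIF(CAST("amount" AS text), '') AS numeric):: numeric
    _cast_expr("payload->>'uid'", "bigint")   # -> CAST(NULLIF(CAST(payload->>'uid' AS text), '') AS numeric):: bigint
    _cast_expr("create_time", "timestamptz")  # -> ("create_time")::timestamptz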
+
+
+def _fetch_pk_columns(cur, schema: str, table: str) -> List[str]:
+    cur.execute(
+        """
+        SELECT kcu.column_name
+        FROM information_schema.table_constraints tc
+        JOIN information_schema.key_column_usage kcu
+          ON tc.constraint_name = kcu.constraint_name
+         AND tc.table_schema = kcu.table_schema
+        WHERE tc.constraint_type = 'PRIMARY KEY'
+          AND tc.table_schema = %s
+          AND tc.table_name = %s
+        ORDER BY kcu.ordinal_position
+        """,
+        (schema, table),
+    )
+    return [r[0] for r in cur.fetchall()]
+
+
+def _pick_snapshot_order_column(cols: Iterable[str]) -> str | None:
+    lower = {c.lower() for c in cols}
+    for candidate in ("fetched_at", "update_time", "create_time"):
+        if candidate in lower:
+            return candidate
+    return None
+
+
+def _count_table(
+    cur,
+    schema: str,
+    table: str,
+    time_col: str | None,
+    window: IntegrityWindow | None,
+    *,
+    pk_cols: List[str] | None = None,
+    snapshot_order_col: str | None = None,
+    current_only: bool = False,
+) -> int:
+    where_parts: List[str] = []
     params: List[Any] = []
+    if current_only:
+        where_parts.append("COALESCE(scd2_is_current,1)=1")
     if time_col and window:
-        where = f'WHERE "{time_col}" >= %s AND "{time_col}" < %s'
-        params = [window.start, window.end]
-    sql = f'SELECT COUNT(1) FROM "{schema}"."{table}" {where}'
+        where_parts.append(f'"{time_col}" >= %s AND "{time_col}" < %s')
+        params.extend([window.start, window.end])
+    where = f"WHERE {' AND '.join(where_parts)}" if where_parts else ""
+
+    if pk_cols and snapshot_order_col:
+        keys = ", ".join(f'"{c}"' for c in pk_cols)
+        order_by = ", ".join([*(f'"{c}"' for c in pk_cols), f'"{snapshot_order_col}" DESC NULLS LAST'])
+        sql = (
+            f'SELECT COUNT(1) FROM ('
+            f'SELECT DISTINCT ON ({keys}) 1 FROM "{schema}"."{table}" {where} '
+            f'ORDER BY {order_by}'
+            f') t'
+        )
+    else:
+        sql = f'SELECT COUNT(1) FROM "{schema}"."{table}" {where}'
     cur.execute(sql, params)
     row = cur.fetchone()
     return int(row[0] if row else 0)
 
 
-def _sum_column(cur, schema: str, table: str, col: str, time_col: str | None, window: IntegrityWindow | None) -> float:
-    where = ""
+def _sum_column(
+    cur,
+    schema: str,
+    table: str,
+    col: str,
+    time_col: str | None,
+    window: IntegrityWindow | None,
+    *,
+    pk_cols: List[str] | None = None,
+    snapshot_order_col: str | None = None,
+    current_only: bool = False,
+) -> float:
+    where_parts: List[str] = []
     params: List[Any] = []
+    if current_only:
+        where_parts.append("COALESCE(scd2_is_current,1)=1")
     if time_col and window:
-        where = f'WHERE "{time_col}" >= %s AND "{time_col}" < %s'
-        params = [window.start, window.end]
-    sql = f'SELECT COALESCE(SUM("{col}"), 0) FROM "{schema}"."{table}" {where}'
+        where_parts.append(f'"{time_col}" >= %s AND "{time_col}" < %s')
+        params.extend([window.start, window.end])
+    where = f"WHERE {' AND '.join(where_parts)}" if where_parts else ""
+
+    if pk_cols and snapshot_order_col:
+        keys = ", ".join(f'"{c}"' for c in pk_cols)
+        order_by = ", ".join([*(f'"{c}"' for c in pk_cols), f'"{snapshot_order_col}" DESC NULLS LAST'])
+        sql = (
+            f'SELECT COALESCE(SUM("{col}"), 0) FROM ('
+            f'SELECT DISTINCT ON ({keys}) "{col}" FROM "{schema}"."{table}" {where} '
+            f'ORDER BY {order_by}'
+            f') t'
+        )
+    else:
+        sql = f'SELECT COALESCE(SUM("{col}"), 0) FROM "{schema}"."{table}" {where}'
    cur.execute(sql, params)
    row = cur.fetchone()
    return float(row[0] if row else 0)
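When pk_cols and snapshot_order_col are passed, both aggregate helpers above wrap the scan in a DISTINCT ON subquery so that repeated snapshots of one business key are counted or summed once; with hypothetical names, _count_table emits SQL of this shape:

    # _count_table(cur, "ods", "t_order", "create_time", window,
    #              pk_cols=["id"], snapshot_order_col="fetched_at")
    # SELECT COUNT(1) FROM (
    #   SELECT DISTINCT ON ("id") 1 FROM "ods"."t_order"
    #   WHERE "create_time" >= %s AND "create_time" < %s
    #   ORDER BY "id", "fetched_at" DESC NULLS LAST
    # ) t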
@@ -176,14 +315,21 @@ def run_dwd_vs_ods_check(
     cfg: AppConfig,
     window: IntegrityWindow | None,
     include_dimensions: bool,
+    compare_content: bool | None = None,
+    content_sample_limit: int | None = None,
 ) -> Dict[str, Any]:
     dsn = cfg["db"]["dsn"]
     session = cfg["db"].get("session")
     db_conn = DatabaseConnection(dsn=dsn, session=session)
+    if compare_content is None:
+        compare_content = bool(cfg.get("integrity.compare_content", True))
+    if content_sample_limit is None:
+        content_sample_limit = cfg.get("integrity.content_sample_limit") or 50
     try:
         with db_conn.conn.cursor() as cur:
             results: List[Dict[str, Any]] = []
             table_map = DwdLoadTask.TABLE_MAP
+            total_mismatch = 0
             for dwd_table, ods_table in table_map.items():
                 if not include_dimensions and ".dim_" in dwd_table:
                     continue
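A hypothetical call, assuming the parameters shown in this hunk are the only required ones; passing None lets the integrity.* config keys read above supply defaults:

    payload = run_dwd_vs_ods_check(
        cfg=cfg,                    # AppConfig with db.dsn and optional db.session
        window=window,              # IntegrityWindow, or None to compare whole tables
        include_dimensions=False,   # skip .dim_ tables
        compare_content=None,       # None -> integrity.compare_content (defaults True)
        content_sample_limit=None,  # None -> integrity.content_sample_limit or 50
    )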
@@ -193,16 +339,55 @@ def run_dwd_vs_ods_check(
                     dwd_cols, dwd_types = _fetch_columns(cur, schema_dwd, name_dwd)
                     ods_cols, ods_types = _fetch_columns(cur, schema_ods, name_ods)
                     time_col = _pick_time_column(dwd_cols, ods_cols)
-                    count_dwd = _count_table(cur, schema_dwd, name_dwd, time_col, window)
-                    count_ods = _count_table(cur, schema_ods, name_ods, time_col, window)
+                    pk_dwd = _fetch_pk_columns(cur, schema_dwd, name_dwd)
+                    pk_ods_raw = _fetch_pk_columns(cur, schema_ods, name_ods)
+                    pk_ods = [c for c in pk_ods_raw if c.lower() != "content_hash"]
+                    ods_has_snapshot = any(c.lower() == "content_hash" for c in ods_cols)
+                    ods_snapshot_order = _pick_snapshot_order_column(ods_cols) if ods_has_snapshot else None
+                    dwd_current_only = any(c.lower() == "scd2_is_current" for c in dwd_cols)
+
+                    count_dwd = _count_table(
+                        cur,
+                        schema_dwd,
+                        name_dwd,
+                        time_col,
+                        window,
+                        current_only=dwd_current_only,
+                    )
+                    count_ods = _count_table(
+                        cur,
+                        schema_ods,
+                        name_ods,
+                        time_col,
+                        window,
+                        pk_cols=pk_ods if ods_has_snapshot else None,
+                        snapshot_order_col=ods_snapshot_order if ods_has_snapshot else None,
+                    )
 
                     dwd_amount_cols = _amount_columns(dwd_cols, dwd_types)
                     ods_amount_cols = _amount_columns(ods_cols, ods_types)
                     common_amount_cols = sorted(set(dwd_amount_cols) & set(ods_amount_cols))
                     amounts: List[Dict[str, Any]] = []
                     for col in common_amount_cols:
-                        dwd_sum = _sum_column(cur, schema_dwd, name_dwd, col, time_col, window)
-                        ods_sum = _sum_column(cur, schema_ods, name_ods, col, time_col, window)
+                        dwd_sum = _sum_column(
+                            cur,
+                            schema_dwd,
+                            name_dwd,
+                            col,
+                            time_col,
+                            window,
+                            current_only=dwd_current_only,
+                        )
+                        ods_sum = _sum_column(
+                            cur,
+                            schema_ods,
+                            name_ods,
+                            col,
+                            time_col,
+                            window,
+                            pk_cols=pk_ods if ods_has_snapshot else None,
+                            snapshot_order_col=ods_snapshot_order if ods_has_snapshot else None,
+                        )
                         amounts.append(
                             {
                                 "column": col,
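The flags feeding these calls come from schema probing: a content_hash column marks an ODS snapshot table (deduplicate to the latest row per primary key), while scd2_is_current marks an SCD2-style DWD table (count only current rows). A sketch with hypothetical column lists:

    ods_cols = ["id", "amount", "fetched_at", "content_hash"]
    dwd_cols = ["id", "amount", "create_time", "scd2_is_current"]
    any(c.lower() == "content_hash" for c in ods_cols)     # True  -> ods_has_snapshot
    any(c.lower() == "scd2_is_current" for c in dwd_cols)  # True  -> dwd_current_only
    _pick_snapshot_order_column(ods_cols)                  # "fetched_at"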
@@ -212,6 +397,151 @@ def run_dwd_vs_ods_check(
                             }
                         )
 
+                    mismatch = None
+                    mismatch_samples: list[dict] = []
+                    mismatch_error = None
+                    if compare_content:
+                        dwd_cols_lower = [c.lower() for c in dwd_cols]
+                        ods_cols_lower = [c.lower() for c in ods_cols]
+                        dwd_col_set = set(dwd_cols_lower)
+                        ods_col_set = set(ods_cols_lower)
+                        scd_cols = {c.lower() for c in DwdLoadTask.SCD_COLS}
+                        ods_exclude = {
+                            "payload", "source_file", "source_endpoint", "fetched_at", "content_hash", "record_index"
+                        }
+                        numeric_types = {
+                            "integer",
+                            "bigint",
+                            "smallint",
+                            "numeric",
+                            "double precision",
+                            "real",
+                            "decimal",
+                        }
+                        text_types = {"text", "character varying", "varchar"}
+                        mapping = {
+                            dst.lower(): (src, cast_type)
+                            for dst, src, cast_type in (DwdLoadTask.FACT_MAPPINGS.get(dwd_table) or [])
+                        }
+                        business_keys = [c for c in pk_dwd if c.lower() not in scd_cols]
+                        def resolve_ods_expr(col: str) -> str | None:
+                            mapped = mapping.get(col)
+                            if mapped:
+                                src, cast_type = mapped
+                                return _cast_expr(src, cast_type)
+                            if col in ods_col_set:
+                                d_type = dwd_types.get(col)
+                                o_type = ods_types.get(col)
+                                if d_type in numeric_types and o_type in text_types:
+                                    return _cast_expr(col, d_type)
+                                return f'"{col}"'
+                            if "id" in ods_col_set and col.endswith("_id"):
+                                d_type = dwd_types.get(col)
+                                o_type = ods_types.get("id")
+                                if d_type in numeric_types and o_type in text_types:
+                                    return _cast_expr("id", d_type)
+                                return '"id"'
+                            return None
+
+                        key_exprs: list[str] = []
+                        join_keys: list[str] = []
+                        for key in business_keys:
+                            key_lower = key.lower()
+                            expr = resolve_ods_expr(key_lower)
+                            if expr is None:
+                                key_exprs = []
+                                join_keys = []
+                                break
+                            key_exprs.append(expr)
+                            join_keys.append(key_lower)
+
+                        compare_cols: list[str] = []
+                        for col in dwd_col_set:
+                            if col in ods_exclude or col in scd_cols:
+                                continue
+                            if col in {k.lower() for k in business_keys}:
+                                continue
+                            if dwd_types.get(col) in ("json", "jsonb"):
+                                continue
+                            if ods_types.get(col) in ("json", "jsonb"):
+                                continue
+                            if resolve_ods_expr(col) is None:
+                                continue
+                            compare_cols.append(col)
+                        compare_cols = sorted(set(compare_cols))
+
+                        if join_keys and compare_cols:
+                            where_parts_dwd: list[str] = []
+                            params_dwd: list[Any] = []
+                            if dwd_current_only:
+                                where_parts_dwd.append("COALESCE(scd2_is_current,1)=1")
+                            if time_col and window:
+                                where_parts_dwd.append(f"\"{time_col}\" >= %s AND \"{time_col}\" < %s")
+                                params_dwd.extend([window.start, window.end])
+                            where_dwd = f"WHERE {' AND '.join(where_parts_dwd)}" if where_parts_dwd else ""
+
+                            where_parts_ods: list[str] = []
+                            params_ods: list[Any] = []
+                            if time_col and window:
+                                where_parts_ods.append(f"\"{time_col}\" >= %s AND \"{time_col}\" < %s")
+                                params_ods.extend([window.start, window.end])
+                            where_ods = f"WHERE {' AND '.join(where_parts_ods)}" if where_parts_ods else ""
+
+                            ods_select_exprs: list[str] = []
+                            needed_cols = sorted(set(join_keys + compare_cols))
+                            for col in needed_cols:
+                                expr = resolve_ods_expr(col)
+                                if expr is None:
+                                    continue
+                                ods_select_exprs.append(f"{expr} AS \"{col}\"")
+
+                            if not ods_select_exprs:
+                                mismatch_error = "join_keys_or_compare_cols_unavailable"
+                            else:
+                                ods_sql = _build_snapshot_expr_subquery(
+                                    schema_ods,
+                                    name_ods,
+                                    ods_select_exprs,
+                                    key_exprs,
+                                    ods_snapshot_order,
+                                    where_ods,
+                                )
+                                dwd_cols_sql = ", ".join([f"\"{c}\"" for c in needed_cols])
+                                dwd_sql = f"SELECT {dwd_cols_sql} FROM \"{schema_dwd}\".\"{name_dwd}\" {where_dwd}"
+
+                                join_cond = " AND ".join([f"d.\"{k}\" = o.\"{k}\"" for k in join_keys])
+                                hash_o = _build_hash_expr("o", compare_cols)
+                                hash_d = _build_hash_expr("d", compare_cols)
+
+                                mismatch_sql = (
+                                    f"WITH ods_latest AS ({ods_sql}), dwd_filtered AS ({dwd_sql}) "
+                                    f"SELECT COUNT(1) FROM ("
+                                    f"SELECT 1 FROM ods_latest o JOIN dwd_filtered d ON {join_cond} "
+                                    f"WHERE {hash_o} <> {hash_d}"
+                                    f") t"
+                                )
+                                params = params_ods + params_dwd
+                                cur.execute(mismatch_sql, params)
+                                row = cur.fetchone()
+                                mismatch = int(row[0] if row and row[0] is not None else 0)
+                                total_mismatch += mismatch
+
+                                if content_sample_limit and mismatch > 0:
+                                    select_keys_sql = ", ".join([f"d.\"{k}\" AS \"{k}\"" for k in join_keys])
+                                    sample_sql = (
+                                        f"WITH ods_latest AS ({ods_sql}), dwd_filtered AS ({dwd_sql}) "
+                                        f"SELECT {select_keys_sql}, {hash_o} AS ods_hash, {hash_d} AS dwd_hash "
+                                        f"FROM ods_latest o JOIN dwd_filtered d ON {join_cond} "
+                                        f"WHERE {hash_o} <> {hash_d} LIMIT %s"
+                                    )
+                                    cur.execute(sample_sql, params + [int(content_sample_limit)])
+                                    rows = cur.fetchall() or []
+                                    if rows:
+                                        columns = [desc[0] for desc in (cur.description or [])]
+                                        mismatch_samples = [dict(zip(columns, r)) for r in rows]
+                        else:
+                            mismatch_error = "join_keys_or_compare_cols_unavailable"
+
                     results.append(
                         {
                             "dwd_table": dwd_table,
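Assembled end to end, for a hypothetical table whose join key is id and whose only compared column is amount, mismatch_sql is roughly:

    # WITH ods_latest AS (
    #   SELECT DISTINCT ON ("id") "amount" AS "amount", "id" AS "id"
    #   FROM "ods"."t_order" ORDER BY "id", "fetched_at" DESC NULLS LAST
    # ), dwd_filtered AS (
    #   SELECT "amount", "id" FROM "dwd"."fact_order" WHERE COALESCE(scd2_is_current,1)=1
    # )
    # SELECT COUNT(1) FROM (
    #   SELECT 1 FROM ods_latest o JOIN dwd_filtered d ON d."id" = o."id"
    #   WHERE md5(concat_ws('||', COALESCE(o."amount"::text,'')))
    #      <> md5(concat_ws('||', COALESCE(d."amount"::text,'')))
    # ) t
    # (window parameters bind ODS side first: params = params_ods + params_dwd)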
@@ -220,6 +550,9 @@ def run_dwd_vs_ods_check(
                             "window_col": time_col,
                             "count": {"dwd": count_dwd, "ods": count_ods, "diff": count_dwd - count_ods},
                             "amounts": amounts,
+                            "mismatch": mismatch,
+                            "mismatch_samples": mismatch_samples,
+                            "mismatch_error": mismatch_error,
                         }
                     )
                 except Exception as exc:  # noqa: BLE001
@@ -231,6 +564,8 @@ def run_dwd_vs_ods_check(
                             "window_col": None,
                             "count": {"dwd": None, "ods": None, "diff": None},
                             "amounts": [],
+                            "mismatch": None,
+                            "mismatch_samples": [],
                             "error": f"{type(exc).__name__}: {exc}",
                         }
                     )
@@ -243,6 +578,7 @@ def run_dwd_vs_ods_check(
         return {
             "tables": results,
             "total_count_diff": total_count_diff,
+            "total_mismatch": total_mismatch,
         }
     finally:
         db_conn.close()
@@ -262,6 +598,8 @@ def run_integrity_window(
     task_codes: str,
     logger,
     write_report: bool,
+    compare_content: bool | None = None,
+    content_sample_limit: int | None = None,
     report_path: Path | None = None,
     window_split_unit: str | None = None,
     window_compensation_hours: int | None = None,
@@ -274,6 +612,11 @@ def run_integrity_window(
         window_days = 0
         window_hours = max(1, total_seconds // 3600 or 1)
 
+    if compare_content is None:
+        compare_content = bool(cfg.get("integrity.compare_content", True))
+    if content_sample_limit is None:
+        content_sample_limit = cfg.get("integrity.content_sample_limit")
+
     ods_payload = run_gap_check(
         cfg=cfg,
         start=window.start,
@@ -290,6 +633,8 @@ def run_integrity_window(
         cutoff_overlap_hours=24,
         allow_small_window=True,
         logger=logger,
+        compare_content=bool(compare_content),
+        content_sample_limit=content_sample_limit,
         window_split_unit=window_split_unit,
         window_compensation_hours=window_compensation_hours,
     )
@@ -298,6 +643,8 @@ def run_integrity_window(
         cfg=cfg,
         window=window,
         include_dimensions=include_dimensions,
+        compare_content=compare_content,
+        content_sample_limit=content_sample_limit,
     )
 
     report = {
@@ -331,12 +678,15 @@ def run_integrity_history(
     task_codes: str,
     logger,
     write_report: bool,
+    compare_content: bool | None = None,
+    content_sample_limit: int | None = None,
     report_path: Path | None = None,
 ) -> Dict[str, Any]:
     tz = ZoneInfo(cfg.get("app.timezone", "Asia/Taipei"))
     windows = build_history_windows(start_dt, end_dt, tz)
     results: List[Dict[str, Any]] = []
     total_missing = 0
+    total_mismatch = 0
     total_errors = 0
 
     for window in windows:
@@ -348,9 +698,12 @@ def run_integrity_history(
             task_codes=task_codes,
             logger=logger,
             write_report=False,
+            compare_content=compare_content,
+            content_sample_limit=content_sample_limit,
         )
         results.append(payload)
         total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
+        total_mismatch += int(payload.get("api_to_ods", {}).get("total_mismatch") or 0)
         total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)
 
     report = {
@@ -359,6 +712,7 @@ def run_integrity_history(
         "end": _ensure_tz(end_dt, tz).isoformat(),
         "windows": results,
         "total_missing": total_missing,
+        "total_mismatch": total_mismatch,
         "total_errors": total_errors,
         "generated_at": datetime.now(tz).isoformat(),
     }