# -*- coding: utf-8 -*-
# AI_CHANGELOG [2026-02-14] Default timezone corrected from Asia/Taipei to Asia/Shanghai (3 occurrences)
"""Integrity checks across API -> ODS -> DWD."""
from __future__ import annotations

from dataclasses import dataclass
from datetime import date, datetime, time, timedelta
from pathlib import Path
from typing import Any, Dict, Iterable, List, Tuple
from zoneinfo import ZoneInfo

import json

from config.settings import AppConfig
from database.connection import DatabaseConnection
from tasks.dwd.dwd_load_task import DwdLoadTask
from scripts.check.check_ods_gaps import run_gap_check

AMOUNT_KEYWORDS = ("amount", "money", "fee", "balance")


@dataclass(frozen=True)
class IntegrityWindow:
    start: datetime
    end: datetime
    label: str
    granularity: str


def _ensure_tz(dt: datetime, tz: ZoneInfo) -> datetime:
    if dt.tzinfo is None:
        return dt.replace(tzinfo=tz)
    return dt.astimezone(tz)


def _month_start(day: date) -> date:
    return date(day.year, day.month, 1)


def _next_month(day: date) -> date:
    if day.month == 12:
        return date(day.year + 1, 1, 1)
    return date(day.year, day.month + 1, 1)


def _date_to_start(dt: date, tz: ZoneInfo) -> datetime:
    return datetime.combine(dt, time.min).replace(tzinfo=tz)


def _date_to_end_exclusive(dt: date, tz: ZoneInfo) -> datetime:
    return datetime.combine(dt, time.min).replace(tzinfo=tz) + timedelta(days=1)


def build_history_windows(start_dt: datetime, end_dt: datetime, tz: ZoneInfo) -> List[IntegrityWindow]:
    """Build weekly windows for current month, monthly windows for earlier months."""
    start_dt = _ensure_tz(start_dt, tz)
    end_dt = _ensure_tz(end_dt, tz)
    if end_dt <= start_dt:
        return []

    start_date = start_dt.date()
    end_date = end_dt.date()
    current_month_start = _month_start(end_date)

    windows: List[IntegrityWindow] = []
    cur = start_date
    while cur <= end_date:
        month_start = _month_start(cur)
        month_end_exclusive = _next_month(cur)
        range_start = max(cur, month_start)
        range_end = min(end_date, month_end_exclusive - timedelta(days=1))

        if month_start == current_month_start:
            week_start = range_start
            while week_start <= range_end:
                week_end = min(week_start + timedelta(days=6), range_end)
                w_start_dt = _date_to_start(week_start, tz)
                w_end_dt = _date_to_end_exclusive(week_end, tz)
                if w_start_dt < end_dt and w_end_dt > start_dt:
                    windows.append(
                        IntegrityWindow(
                            start=max(w_start_dt, start_dt),
                            end=min(w_end_dt, end_dt),
                            label=f"week_{week_start.isoformat()}",
                            granularity="week",
                        )
                    )
                week_start = week_end + timedelta(days=1)
        else:
            m_start_dt = _date_to_start(range_start, tz)
            m_end_dt = _date_to_end_exclusive(range_end, tz)
            if m_start_dt < end_dt and m_end_dt > start_dt:
                windows.append(
                    IntegrityWindow(
                        start=max(m_start_dt, start_dt),
                        end=min(m_end_dt, end_dt),
                        label=f"month_{month_start.isoformat()}",
                        granularity="month",
                    )
                )
        cur = month_end_exclusive

    return windows
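

# Illustrative sketch (hypothetical dates, not part of the pipeline): shows the
# window shapes build_history_windows produces. Earlier months collapse into a
# single monthly window; the month containing ``end_dt`` is split into 7-day
# weekly windows. All windows are half-open [start, end).
def _demo_build_history_windows() -> None:
    tz = ZoneInfo("Asia/Shanghai")
    windows = build_history_windows(
        datetime(2026, 1, 10, tzinfo=tz),
        datetime(2026, 2, 14, tzinfo=tz),
        tz,
    )
    # Expected labels: month_2026-01-01, week_2026-02-01, week_2026-02-08.
    for w in windows:
        print(w.granularity, w.label, w.start.isoformat(), "->", w.end.isoformat())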


def _split_table(name: str, default_schema: str) -> Tuple[str, str]:
    if "." in name:
        schema, table = name.split(".", 1)
        return schema, table
    return default_schema, name


def _pick_time_column(dwd_cols: Iterable[str], ods_cols: Iterable[str]) -> str | None:
    lower_cols = {c.lower() for c in dwd_cols} & {c.lower() for c in ods_cols}
    for candidate in DwdLoadTask.FACT_ORDER_CANDIDATES:
        if candidate.lower() in lower_cols:
            return candidate.lower()
    return None


def _fetch_columns(cur, schema: str, table: str) -> Tuple[List[str], Dict[str, str]]:
    cur.execute(
        """
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (schema, table),
    )
    cols: List[str] = []
    types: Dict[str, str] = {}
    for name, data_type in cur.fetchall():
        cols.append(name)
        types[name.lower()] = (data_type or "").lower()
    return cols, types


def _amount_columns(cols: List[str], types: Dict[str, str]) -> List[str]:
    numeric_types = {"numeric", "double precision", "integer", "bigint", "smallint", "real", "decimal"}
    out: List[str] = []
    for col in cols:
        lc = col.lower()
        if types.get(lc) not in numeric_types:
            continue
        if any(key in lc for key in AMOUNT_KEYWORDS):
            out.append(lc)
    return out
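

# Illustrative sketch (made-up column names): only numeric columns whose
# lower-cased name contains one of AMOUNT_KEYWORDS survive the filter; names
# are returned lower-cased.
def _demo_amount_columns() -> None:
    cols = ["pay_amount", "order_no", "FeeTotal", "remark"]
    types = {"pay_amount": "numeric", "order_no": "text", "feetotal": "bigint", "remark": "text"}
    assert _amount_columns(cols, types) == ["pay_amount", "feetotal"]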


def _build_hash_expr(alias: str, cols: list[str]) -> str:
    if not cols:
        return "NULL"
    parts = ", ".join([f"COALESCE({alias}.\"{c}\"::text,'')" for c in cols])
    return f"md5(concat_ws('||', {parts}))"


def _build_snapshot_subquery(
    schema: str,
    table: str,
    cols: list[str],
    key_cols: list[str],
    order_col: str | None,
    where_sql: str,
) -> str:
    cols_sql = ", ".join([f'"{c}"' for c in cols])
    if key_cols and order_col:
        keys = ", ".join([f'"{c}"' for c in key_cols])
        order_by = ", ".join([*(f'"{c}"' for c in key_cols), f'"{order_col}" DESC NULLS LAST'])
        return (
            f'SELECT DISTINCT ON ({keys}) {cols_sql} '
            f'FROM "{schema}"."{table}" {where_sql} '
            f"ORDER BY {order_by}"
        )
    return f'SELECT {cols_sql} FROM "{schema}"."{table}" {where_sql}'
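

# Illustrative sketch (hypothetical table and columns): DISTINCT ON keeps
# exactly one row per business key, taking the newest snapshot by order_col.
def _demo_build_snapshot_subquery() -> None:
    sql = _build_snapshot_subquery(
        "billiards_ods", "ods_order", ["id", "pay_amount"], ["id"], "fetched_at", "WHERE 1=1"
    )
    assert sql == (
        'SELECT DISTINCT ON ("id") "id", "pay_amount" '
        'FROM "billiards_ods"."ods_order" WHERE 1=1 '
        'ORDER BY "id", "fetched_at" DESC NULLS LAST'
    )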


def _build_snapshot_expr_subquery(
    schema: str,
    table: str,
    select_exprs: list[str],
    key_exprs: list[str],
    order_col: str | None,
    where_sql: str,
) -> str:
    select_cols_sql = ", ".join(select_exprs)
    table_sql = f'"{schema}"."{table}"'
    if key_exprs and order_col:
        distinct_on = ", ".join(key_exprs)
        order_by = ", ".join([*key_exprs, f'"{order_col}" DESC NULLS LAST'])
        return (
            f"SELECT DISTINCT ON ({distinct_on}) {select_cols_sql} "
            f"FROM {table_sql} {where_sql} "
            f"ORDER BY {order_by}"
        )
    return f"SELECT {select_cols_sql} FROM {table_sql} {where_sql}"


def _cast_expr(col: str, cast_type: str | None) -> str:
    if col.upper() == "NULL":
        base = "NULL"
    else:
        is_expr = not col.isidentifier() or "->" in col or "#>>" in col or "::" in col or "'" in col
        base = col if is_expr else f'"{col}"'
    if cast_type:
        cast_lower = cast_type.lower()
        if cast_lower in {"bigint", "integer", "numeric", "decimal"}:
            return f"CAST(NULLIF(CAST({base} AS text), '') AS numeric)::{cast_type}"
        if cast_lower == "timestamptz":
            return f"({base})::timestamptz"
        return f"{base}::{cast_type}"
    return base
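

# Illustrative sketch: bare identifiers are quoted, JSON/operator expressions
# pass through unquoted, and integer-like casts go through NULLIF so empty
# strings become NULL instead of raising a cast error.
def _demo_cast_expr() -> None:
    assert _cast_expr("pay_amount", "bigint") == (
        "CAST(NULLIF(CAST(\"pay_amount\" AS text), '') AS numeric)::bigint"
    )
    assert _cast_expr("payload->>'id'", "bigint") == (
        "CAST(NULLIF(CAST(payload->>'id' AS text), '') AS numeric)::bigint"
    )
    assert _cast_expr("create_time", "timestamptz") == '("create_time")::timestamptz'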


def _fetch_pk_columns(cur, schema: str, table: str) -> List[str]:
    cur.execute(
        """
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON tc.constraint_name = kcu.constraint_name
         AND tc.table_schema = kcu.table_schema
        WHERE tc.constraint_type = 'PRIMARY KEY'
          AND tc.table_schema = %s
          AND tc.table_name = %s
        ORDER BY kcu.ordinal_position
        """,
        (schema, table),
    )
    return [r[0] for r in cur.fetchall()]


def _pick_snapshot_order_column(cols: Iterable[str]) -> str | None:
    lower = {c.lower() for c in cols}
    for candidate in ("fetched_at", "update_time", "create_time"):
        if candidate in lower:
            return candidate
    return None


def _count_table(
    cur,
    schema: str,
    table: str,
    time_col: str | None,
    window: IntegrityWindow | None,
    *,
    pk_cols: List[str] | None = None,
    snapshot_order_col: str | None = None,
    current_only: bool = False,
) -> int:
    where_parts: List[str] = []
    params: List[Any] = []
    if current_only:
        where_parts.append("COALESCE(scd2_is_current,1)=1")
    if time_col and window:
        where_parts.append(f'"{time_col}" >= %s AND "{time_col}" < %s')
        params.extend([window.start, window.end])
    where = f"WHERE {' AND '.join(where_parts)}" if where_parts else ""

    if pk_cols and snapshot_order_col:
        keys = ", ".join(f'"{c}"' for c in pk_cols)
        order_by = ", ".join([*(f'"{c}"' for c in pk_cols), f'"{snapshot_order_col}" DESC NULLS LAST'])
        sql = (
            f'SELECT COUNT(1) FROM ('
            f'SELECT DISTINCT ON ({keys}) 1 FROM "{schema}"."{table}" {where} '
            f'ORDER BY {order_by}'
            f') t'
        )
    else:
        sql = f'SELECT COUNT(1) FROM "{schema}"."{table}" {where}'
    cur.execute(sql, params)
    row = cur.fetchone()
    return int(row[0] if row else 0)
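

# Illustrative sketch of the SQL _count_table emits for a snapshot-style ODS
# table (hypothetical names; line breaks added for readability): one row per
# primary key, newest snapshot only, restricted to the integrity window.
#
#   SELECT COUNT(1) FROM (
#     SELECT DISTINCT ON ("id") 1 FROM "billiards_ods"."ods_order"
#     WHERE "create_time" >= %s AND "create_time" < %s
#     ORDER BY "id", "fetched_at" DESC NULLS LAST
#   ) t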


def _sum_column(
    cur,
    schema: str,
    table: str,
    col: str,
    time_col: str | None,
    window: IntegrityWindow | None,
    *,
    pk_cols: List[str] | None = None,
    snapshot_order_col: str | None = None,
    current_only: bool = False,
) -> float:
    where_parts: List[str] = []
    params: List[Any] = []
    if current_only:
        where_parts.append("COALESCE(scd2_is_current,1)=1")
    if time_col and window:
        where_parts.append(f'"{time_col}" >= %s AND "{time_col}" < %s')
        params.extend([window.start, window.end])
    where = f"WHERE {' AND '.join(where_parts)}" if where_parts else ""

    if pk_cols and snapshot_order_col:
        keys = ", ".join(f'"{c}"' for c in pk_cols)
        order_by = ", ".join([*(f'"{c}"' for c in pk_cols), f'"{snapshot_order_col}" DESC NULLS LAST'])
        sql = (
            f'SELECT COALESCE(SUM("{col}"), 0) FROM ('
            f'SELECT DISTINCT ON ({keys}) "{col}" FROM "{schema}"."{table}" {where} '
            f'ORDER BY {order_by}'
            f') t'
        )
    else:
        sql = f'SELECT COALESCE(SUM("{col}"), 0) FROM "{schema}"."{table}" {where}'
    cur.execute(sql, params)
    row = cur.fetchone()
    return float(row[0] if row else 0)


def run_dwd_vs_ods_check(
    *,
    cfg: AppConfig,
    window: IntegrityWindow | None,
    include_dimensions: bool,
    compare_content: bool | None = None,
    content_sample_limit: int | None = None,
) -> Dict[str, Any]:
    dsn = cfg["db"]["dsn"]
    session = cfg["db"].get("session")
    db_conn = DatabaseConnection(dsn=dsn, session=session)
    if compare_content is None:
        compare_content = bool(cfg.get("integrity.compare_content", True))
    if content_sample_limit is None:
        content_sample_limit = cfg.get("integrity.content_sample_limit") or 50
    try:
        with db_conn.conn.cursor() as cur:
            results: List[Dict[str, Any]] = []
            table_map = DwdLoadTask.TABLE_MAP
            total_mismatch = 0
            for dwd_table, ods_table in table_map.items():
                if not include_dimensions and ".dim_" in dwd_table:
                    continue
                schema_dwd, name_dwd = _split_table(dwd_table, "billiards_dwd")
                schema_ods, name_ods = _split_table(ods_table, "billiards_ods")
                try:
                    dwd_cols, dwd_types = _fetch_columns(cur, schema_dwd, name_dwd)
                    ods_cols, ods_types = _fetch_columns(cur, schema_ods, name_ods)
                    time_col = _pick_time_column(dwd_cols, ods_cols)
                    pk_dwd = _fetch_pk_columns(cur, schema_dwd, name_dwd)
                    pk_ods_raw = _fetch_pk_columns(cur, schema_ods, name_ods)
                    pk_ods = [c for c in pk_ods_raw if c.lower() != "content_hash"]
                    ods_has_snapshot = any(c.lower() == "content_hash" for c in ods_cols)
                    ods_snapshot_order = _pick_snapshot_order_column(ods_cols) if ods_has_snapshot else None
                    dwd_current_only = any(c.lower() == "scd2_is_current" for c in dwd_cols)

                    count_dwd = _count_table(
                        cur,
                        schema_dwd,
                        name_dwd,
                        time_col,
                        window,
                        current_only=dwd_current_only,
                    )
                    count_ods = _count_table(
                        cur,
                        schema_ods,
                        name_ods,
                        time_col,
                        window,
                        pk_cols=pk_ods if ods_has_snapshot else None,
                        snapshot_order_col=ods_snapshot_order if ods_has_snapshot else None,
                    )

                    dwd_amount_cols = _amount_columns(dwd_cols, dwd_types)
                    ods_amount_cols = _amount_columns(ods_cols, ods_types)
                    common_amount_cols = sorted(set(dwd_amount_cols) & set(ods_amount_cols))
                    amounts: List[Dict[str, Any]] = []
                    for col in common_amount_cols:
                        dwd_sum = _sum_column(
                            cur,
                            schema_dwd,
                            name_dwd,
                            col,
                            time_col,
                            window,
                            current_only=dwd_current_only,
                        )
                        ods_sum = _sum_column(
                            cur,
                            schema_ods,
                            name_ods,
                            col,
                            time_col,
                            window,
                            pk_cols=pk_ods if ods_has_snapshot else None,
                            snapshot_order_col=ods_snapshot_order if ods_has_snapshot else None,
                        )
                        amounts.append(
                            {
                                "column": col,
                                "dwd_sum": dwd_sum,
                                "ods_sum": ods_sum,
                                "diff": dwd_sum - ods_sum,
                            }
                        )

                    mismatch = None
                    mismatch_samples: list[dict] = []
                    mismatch_error = None
                    if compare_content:
                        dwd_col_set = {c.lower() for c in dwd_cols}
                        ods_col_set = {c.lower() for c in ods_cols}
                        scd_cols = {c.lower() for c in DwdLoadTask.SCD_COLS}
                        ods_exclude = {
                            "payload", "source_file", "source_endpoint", "fetched_at", "content_hash", "record_index"
                        }
                        numeric_types = {
                            "integer",
                            "bigint",
                            "smallint",
                            "numeric",
                            "double precision",
                            "real",
                            "decimal",
                        }
                        text_types = {"text", "character varying", "varchar"}
                        mapping = {
                            dst.lower(): (src, cast_type)
                            for dst, src, cast_type in (DwdLoadTask.FACT_MAPPINGS.get(dwd_table) or [])
                        }
                        business_keys = [c for c in pk_dwd if c.lower() not in scd_cols]

                        def resolve_ods_expr(col: str) -> str | None:
                            mapped = mapping.get(col)
                            if mapped:
                                src, cast_type = mapped
                                return _cast_expr(src, cast_type)
                            if col in ods_col_set:
                                d_type = dwd_types.get(col)
                                o_type = ods_types.get(col)
                                if d_type in numeric_types and o_type in text_types:
                                    return _cast_expr(col, d_type)
                                return f'"{col}"'
                            if "id" in ods_col_set and col.endswith("_id"):
                                d_type = dwd_types.get(col)
                                o_type = ods_types.get("id")
                                if d_type in numeric_types and o_type in text_types:
                                    return _cast_expr("id", d_type)
                                return '"id"'
                            return None

                        key_exprs: list[str] = []
                        join_keys: list[str] = []
                        for key in business_keys:
                            key_lower = key.lower()
                            expr = resolve_ods_expr(key_lower)
                            if expr is None:
                                key_exprs = []
                                join_keys = []
                                break
                            key_exprs.append(expr)
                            join_keys.append(key_lower)

                        compare_cols: list[str] = []
                        for col in dwd_col_set:
                            if col in ods_exclude or col in scd_cols:
                                continue
                            if col in {k.lower() for k in business_keys}:
                                continue
                            if dwd_types.get(col) in ("json", "jsonb"):
                                continue
                            if ods_types.get(col) in ("json", "jsonb"):
                                continue
                            if resolve_ods_expr(col) is None:
                                continue
                            compare_cols.append(col)
                        compare_cols = sorted(set(compare_cols))

                        if join_keys and compare_cols:
                            where_parts_dwd: list[str] = []
                            params_dwd: list[Any] = []
                            if dwd_current_only:
                                where_parts_dwd.append("COALESCE(scd2_is_current,1)=1")
                            if time_col and window:
                                where_parts_dwd.append(f'"{time_col}" >= %s AND "{time_col}" < %s')
                                params_dwd.extend([window.start, window.end])
                            where_dwd = f"WHERE {' AND '.join(where_parts_dwd)}" if where_parts_dwd else ""

                            where_parts_ods: list[str] = []
                            params_ods: list[Any] = []
                            if time_col and window:
                                where_parts_ods.append(f'"{time_col}" >= %s AND "{time_col}" < %s')
                                params_ods.extend([window.start, window.end])
                            where_ods = f"WHERE {' AND '.join(where_parts_ods)}" if where_parts_ods else ""

                            ods_select_exprs: list[str] = []
                            needed_cols = sorted(set(join_keys + compare_cols))
                            for col in needed_cols:
                                expr = resolve_ods_expr(col)
                                if expr is None:
                                    continue
                                ods_select_exprs.append(f'{expr} AS "{col}"')

                            if not ods_select_exprs:
                                mismatch_error = "join_keys_or_compare_cols_unavailable"
                            else:
                                ods_sql = _build_snapshot_expr_subquery(
                                    schema_ods,
                                    name_ods,
                                    ods_select_exprs,
                                    key_exprs,
                                    ods_snapshot_order,
                                    where_ods,
                                )
                                dwd_cols_sql = ", ".join([f'"{c}"' for c in needed_cols])
                                dwd_sql = f'SELECT {dwd_cols_sql} FROM "{schema_dwd}"."{name_dwd}" {where_dwd}'

                                join_cond = " AND ".join([f'd."{k}" = o."{k}"' for k in join_keys])
                                hash_o = _build_hash_expr("o", compare_cols)
                                hash_d = _build_hash_expr("d", compare_cols)

                                mismatch_sql = (
                                    f"WITH ods_latest AS ({ods_sql}), dwd_filtered AS ({dwd_sql}) "
                                    f"SELECT COUNT(1) FROM ("
                                    f"SELECT 1 FROM ods_latest o JOIN dwd_filtered d ON {join_cond} "
                                    f"WHERE {hash_o} <> {hash_d}"
                                    f") t"
                                )
                                params = params_ods + params_dwd
                                cur.execute(mismatch_sql, params)
                                row = cur.fetchone()
                                mismatch = int(row[0] if row and row[0] is not None else 0)
                                total_mismatch += mismatch

                                if content_sample_limit and mismatch > 0:
                                    select_keys_sql = ", ".join([f'd."{k}" AS "{k}"' for k in join_keys])
                                    sample_sql = (
                                        f"WITH ods_latest AS ({ods_sql}), dwd_filtered AS ({dwd_sql}) "
                                        f"SELECT {select_keys_sql}, {hash_o} AS ods_hash, {hash_d} AS dwd_hash "
                                        f"FROM ods_latest o JOIN dwd_filtered d ON {join_cond} "
                                        f"WHERE {hash_o} <> {hash_d} LIMIT %s"
                                    )
                                    cur.execute(sample_sql, params + [int(content_sample_limit)])
                                    rows = cur.fetchall() or []
                                    if rows:
                                        columns = [desc[0] for desc in (cur.description or [])]
                                        mismatch_samples = [dict(zip(columns, r)) for r in rows]
                        else:
                            mismatch_error = "join_keys_or_compare_cols_unavailable"

                    results.append(
                        {
                            "dwd_table": dwd_table,
                            "ods_table": ods_table,
                            "windowed": bool(time_col and window),
                            "window_col": time_col,
                            "count": {"dwd": count_dwd, "ods": count_ods, "diff": count_dwd - count_ods},
                            "amounts": amounts,
                            "mismatch": mismatch,
                            "mismatch_samples": mismatch_samples,
                            "mismatch_error": mismatch_error,
                        }
                    )
                except Exception as exc:  # noqa: BLE001
                    results.append(
                        {
                            "dwd_table": dwd_table,
                            "ods_table": ods_table,
                            "windowed": bool(window),
                            "window_col": None,
                            "count": {"dwd": None, "ods": None, "diff": None},
                            "amounts": [],
                            "mismatch": None,
                            "mismatch_samples": [],
                            "error": f"{type(exc).__name__}: {exc}",
                        }
                    )

            total_count_diff = sum(
                int(item.get("count", {}).get("diff") or 0)
                for item in results
                if isinstance(item.get("count", {}).get("diff"), (int, float))
            )
            return {
                "tables": results,
                "total_count_diff": total_count_diff,
                "total_mismatch": total_mismatch,
            }
    finally:
        db_conn.close()
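

# Usage sketch (hedged): how a caller might run the ODS -> DWD comparison for
# a single day. ``load_config`` is a hypothetical loader; substitute however
# this project actually builds its AppConfig.
#
#   tz = ZoneInfo("Asia/Shanghai")
#   win = IntegrityWindow(
#       start=datetime(2026, 2, 13, tzinfo=tz),
#       end=datetime(2026, 2, 14, tzinfo=tz),
#       label="day_2026-02-13",
#       granularity="day",
#   )
#   payload = run_dwd_vs_ods_check(cfg=load_config(), window=win, include_dimensions=False)
#   print(payload["total_count_diff"], payload["total_mismatch"])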


def _default_report_path(prefix: str) -> Path:
    root = Path(__file__).resolve().parents[1]
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return root / "reports" / f"{prefix}_{stamp}.json"


def run_integrity_window(
    *,
    cfg: AppConfig,
    window: IntegrityWindow,
    include_dimensions: bool,
    task_codes: str,
    logger,
    write_report: bool,
    compare_content: bool | None = None,
    content_sample_limit: int | None = None,
    report_path: Path | None = None,
    window_split_unit: str | None = None,
    window_compensation_hours: int | None = None,
) -> Dict[str, Any]:
    total_seconds = max(0, int((window.end - window.start).total_seconds()))
    if total_seconds >= 86400:
        window_days = max(1, total_seconds // 86400)
        window_hours = 0
    else:
        window_days = 0
        window_hours = max(1, total_seconds // 3600)

    if compare_content is None:
        compare_content = bool(cfg.get("integrity.compare_content", True))
    if content_sample_limit is None:
        content_sample_limit = cfg.get("integrity.content_sample_limit")

    ods_payload = run_gap_check(
        cfg=cfg,
        start=window.start,
        end=window.end,
        window_days=window_days,
        window_hours=window_hours,
        page_size=int(cfg.get("api.page_size") or 200),
        chunk_size=500,
        sample_limit=50,
        sleep_per_window=0,
        sleep_per_page=0,
        task_codes=task_codes,
        from_cutoff=False,
        cutoff_overlap_hours=24,
        allow_small_window=True,
        logger=logger,
        compare_content=bool(compare_content),
        content_sample_limit=content_sample_limit,
        window_split_unit=window_split_unit,
        window_compensation_hours=window_compensation_hours,
    )

    dwd_payload = run_dwd_vs_ods_check(
        cfg=cfg,
        window=window,
        include_dimensions=include_dimensions,
        compare_content=compare_content,
        content_sample_limit=content_sample_limit,
    )

    report = {
        "mode": "window",
        "window": {
            "start": window.start.isoformat(),
            "end": window.end.isoformat(),
            "label": window.label,
            "granularity": window.granularity,
        },
        "api_to_ods": ods_payload,
        "ods_to_dwd": dwd_payload,
        "generated_at": datetime.now(ZoneInfo(cfg.get("app.timezone", "Asia/Shanghai"))).isoformat(),
    }

    if write_report:
        path = report_path or _default_report_path("data_integrity_window")
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
        report["report_path"] = str(path)

    return report
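

# Illustrative sketch of the report shape written by run_integrity_window
# (values are placeholders, grounded in the dict built above):
#
#   {
#     "mode": "window",
#     "window": {"start": "...", "end": "...", "label": "week_2026-02-08", "granularity": "week"},
#     "api_to_ods": {...},   # payload from run_gap_check
#     "ods_to_dwd": {...},   # payload from run_dwd_vs_ods_check
#     "generated_at": "...",
#     "report_path": "<repo_root>/reports/data_integrity_window_<YYYYMMDD_HHMMSS>.json"  # only when write_report=True
#   }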


def run_integrity_history(
    *,
    cfg: AppConfig,
    start_dt: datetime,
    end_dt: datetime,
    include_dimensions: bool,
    task_codes: str,
    logger,
    write_report: bool,
    compare_content: bool | None = None,
    content_sample_limit: int | None = None,
    report_path: Path | None = None,
) -> Dict[str, Any]:
    tz = ZoneInfo(cfg.get("app.timezone", "Asia/Shanghai"))
    windows = build_history_windows(start_dt, end_dt, tz)
    results: List[Dict[str, Any]] = []
    total_missing = 0
    total_mismatch = 0
    total_errors = 0

    for window in windows:
        logger.info("Checking window start=%s end=%s", window.start, window.end)
        payload = run_integrity_window(
            cfg=cfg,
            window=window,
            include_dimensions=include_dimensions,
            task_codes=task_codes,
            logger=logger,
            write_report=False,
            compare_content=compare_content,
            content_sample_limit=content_sample_limit,
        )
        results.append(payload)
        total_missing += int(payload.get("api_to_ods", {}).get("total_missing") or 0)
        total_mismatch += int(payload.get("api_to_ods", {}).get("total_mismatch") or 0)
        total_errors += int(payload.get("api_to_ods", {}).get("total_errors") or 0)

    report = {
        "mode": "history",
        "start": _ensure_tz(start_dt, tz).isoformat(),
        "end": _ensure_tz(end_dt, tz).isoformat(),
        "windows": results,
        "total_missing": total_missing,
        "total_mismatch": total_mismatch,
        "total_errors": total_errors,
        "generated_at": datetime.now(tz).isoformat(),
    }

    if write_report:
        path = report_path or _default_report_path("data_integrity_history")
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(report, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
        report["report_path"] = str(path)

    return report


def compute_last_etl_end(cfg: AppConfig) -> datetime | None:
    dsn = cfg["db"]["dsn"]
    session = cfg["db"].get("session")
    db_conn = DatabaseConnection(dsn=dsn, session=session)
    try:
        rows = db_conn.query(
            "SELECT MAX(window_end) AS mx FROM etl_admin.etl_run WHERE store_id = %s",
            (cfg.get("app.store_id"),),
        )
        mx = rows[0]["mx"] if rows else None
        if isinstance(mx, datetime):
            tz = ZoneInfo(cfg.get("app.timezone", "Asia/Shanghai"))
            return _ensure_tz(mx, tz)
    finally:
        db_conn.close()
    return None
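

# Usage sketch (hedged): compute_last_etl_end can seed an incremental
# integrity run from the last successful ETL watermark. ``load_config`` is a
# hypothetical loader; wire in this project's real config entry point.
#
#   cfg = load_config()
#   last_end = compute_last_etl_end(cfg)
#   if last_end is not None:
#       tz = ZoneInfo(cfg.get("app.timezone", "Asia/Shanghai"))
#       run_integrity_history(
#           cfg=cfg,
#           start_dt=last_end,
#           end_dt=datetime.now(tz),
#           include_dimensions=False,
#           task_codes="",                          # use this project's real task codes
#           logger=logging.getLogger(__name__),     # assumes "import logging"
#           write_report=True,
#       )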