190 lines
6.6 KiB
Python
190 lines
6.6 KiB
Python
"""
|
||
字段消失扫描器:检测 DWD 表中字段值从某天起突然全部为空的异常
|
||
判定条件:连续 ≥3 天 且 连续空记录 ≥20 条
|
||
报告类型:
|
||
- ONGOING:从某天起至今持续为空(如 DQ-6 member_phone)
|
||
- RECOVERED:中途消失后又恢复
|
||
输出:终端 + CSV → export/SYSTEM/REPORTS/field_scan/
|
||
"""
|
||
import os
|
||
import csv
|
||
from datetime import date, timedelta
|
||
from dataclasses import dataclass
|
||
from dotenv import load_dotenv
|
||
|
||
load_dotenv()
|
||
|
||
PG_DSN = os.environ.get("TEST_DB_DSN") or os.environ.get("PG_DSN")
|
||
if not PG_DSN:
|
||
raise RuntimeError("TEST_DB_DSN / PG_DSN 未配置")
|
||
|
||
SYSTEM_ANALYZE_ROOT = os.environ.get("SYSTEM_ANALYZE_ROOT")
|
||
if not SYSTEM_ANALYZE_ROOT:
|
||
raise RuntimeError("SYSTEM_ANALYZE_ROOT 未配置")
|
||
|
||
import psycopg2
|
||
|
||
# ── 扫描配置 ──────────────────────────────────────────────
|
||
# (schema.table, time_column, field, filter_sql)
|
||
# filter_sql 用于限定有意义的行(如只看会员订单)
|
||
SCAN_TARGETS = [
|
||
("dwd.dwd_settlement_head", "pay_time", "member_phone",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
("dwd.dwd_settlement_head", "pay_time", "member_name",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
("dwd.dwd_settlement_head", "pay_time", "member_card_type_name",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
("dwd.dwd_settlement_head", "pay_time", "is_bind_member",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
]
|
||
|
||
# 阈值
|
||
MIN_CONSECUTIVE_DAYS = 3
|
||
MIN_CONSECUTIVE_ROWS = 20
|
||
|
||
|
||
@dataclass
class Gap:
    """One interval during which a field's values disappeared."""
    table: str        # fully-qualified table name (schema.table)
    field: str        # column that went empty
    start_date: date  # first empty day of the run
    end_date: date    # last empty day of the run
    days: int         # number of empty days in the run
    total_rows: int   # total rows inside the interval
    null_rows: int    # empty rows inside the interval
    recovered: bool   # True if the field came back afterwards
|
||
|
||
|
||
def build_daily_sql(table: str, time_col: str, field: str, filter_sql: str) -> str:
    """Build a per-day non-null-rate aggregation query for one field.

    Groups directly on ``time_col::date`` (no generate_series), returning
    one row per day that has data: (day, total, non_null).

    The redundant ``HAVING COUNT(*) > 0`` from the earlier version is
    dropped: a GROUP BY group always contains at least one row, so the
    clause could never filter anything.

    NOTE(review): identifiers and ``filter_sql`` are interpolated straight
    into the SQL text. This is only safe because SCAN_TARGETS is trusted,
    hard-coded configuration — never feed user input through here.
    """
    where_clause = f"WHERE {filter_sql}" if filter_sql else ""
    # A value counts as "present" only when non-NULL and not the empty /
    # zero sentinel strings after casting to text.
    return f"""
    SELECT
        {time_col}::date AS day,
        COUNT(*) AS total,
        COUNT(CASE
                  WHEN {field} IS NOT NULL
                   AND {field}::text != ''
                   AND {field}::text != '0'
                  THEN 1
              END) AS non_null
    FROM {table}
    {where_clause}
    GROUP BY {time_col}::date
    ORDER BY day
    """
|
||
|
||
|
||
def detect_gaps(daily_stats: list[tuple[date, int, int]],
                table: str, field: str) -> list[Gap]:
    """Scan per-day stats for runs of fully-empty days.

    A run is every maximal stretch of days whose ``non_null`` count is 0;
    it is reported only when it spans at least MIN_CONSECUTIVE_DAYS days
    and MIN_CONSECUTIVE_ROWS empty rows. A run that is still open at the
    end of the stats is reported with ``recovered=False``.

    NOTE(review): "consecutive" here means consecutive entries in
    ``daily_stats`` — days with zero matching rows are absent from the
    stats entirely, so calendar adjacency is not verified. Confirm this
    is the intended semantics.
    """
    found: list[Gap] = []
    run_start = None   # first day of the current empty run, or None
    run_days = 0       # empty days accumulated in the current run
    run_rows = 0       # rows accumulated (all empty, since non_null == 0)

    def flush(end: date, recovered: bool) -> None:
        # Record the finished run only if it clears both thresholds.
        if run_days >= MIN_CONSECUTIVE_DAYS and run_rows >= MIN_CONSECUTIVE_ROWS:
            found.append(Gap(
                table=table, field=field,
                start_date=run_start, end_date=end,
                days=run_days, total_rows=run_rows,
                null_rows=run_rows, recovered=recovered,
            ))

    for day, total, non_null in daily_stats:
        if non_null == 0:
            if run_start is None:
                run_start = day
                run_days = run_rows = 0
            run_days += 1
            run_rows += total
        elif run_start is not None:
            # Field came back: the run ends the day before recovery.
            flush(day - timedelta(days=1), recovered=True)
            run_start = None

    if run_start is not None:
        # Still empty at the end of the data: report as ongoing.
        flush(daily_stats[-1][0], recovered=False)

    return found
|
||
|
||
|
||
def _write_csv_report(all_gaps: list[Gap], csv_path: str) -> None:
    """Write the gap list to a CSV (UTF-8 with BOM, Excel-friendly)."""
    with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["表", "字段", "状态", "消失起始日", "消失结束日",
                         "持续天数", "区间总行数", "空行数"])
        for g in all_gaps:
            writer.writerow([
                g.table, g.field,
                "ONGOING" if not g.recovered else "RECOVERED",
                g.start_date, g.end_date,
                g.days, g.total_rows, g.null_rows
            ])


def _print_summary(all_gaps: list[Gap]) -> None:
    """Print an aligned terminal summary table of every detected gap."""
    print(f"\n{'='*90}")
    print(f"{'表':<35} {'字段':<20} {'状态':<12} {'起始':<12} {'结束':<12} {'天数':>5} {'空行':>6}")
    print(f"{'='*90}")
    for g in all_gaps:
        status = "ONGOING" if not g.recovered else "RECOVERED"
        print(f"{g.table:<35} {g.field:<20} {status:<12} "
              f"{str(g.start_date):<12} {str(g.end_date):<12} {g.days:>5} {g.null_rows:>6}")


def run_scan():
    """Scan every SCAN_TARGETS entry, report gaps to terminal and CSV."""
    all_gaps: list[Gap] = []

    # psycopg2's `with connection` only wraps a transaction — it commits or
    # rolls back but does NOT close the connection, so close it explicitly.
    conn = psycopg2.connect(PG_DSN, connect_timeout=15,
                            options="-c statement_timeout=120000")
    try:
        with conn:
            with conn.cursor() as cur:
                for table, time_col, field, filter_sql in SCAN_TARGETS:
                    print(f"扫描 {table}.{field} ...")
                    cur.execute(build_daily_sql(table, time_col, field, filter_sql))
                    rows = cur.fetchall()
                    if not rows:
                        print(" ⏭️ 无数据")
                        continue

                    gaps = detect_gaps(rows, table, field)
                    if not gaps:
                        print(" ✅ 无异常")
                        continue
                    for g in gaps:
                        status = "🔴 ONGOING" if not g.recovered else "🟡 RECOVERED"
                        print(f" {status} {g.field}: {g.start_date} → {g.end_date} "
                              f"({g.days}天, {g.null_rows}条全空)")
                    all_gaps.extend(gaps)
    finally:
        conn.close()

    if not all_gaps:
        print("\n✅ 所有字段正常,未发现消失段")
        return

    # Report lands in a sibling "field_scan" dir of SYSTEM_ANALYZE_ROOT's parent.
    report_dir = os.path.join(os.path.dirname(SYSTEM_ANALYZE_ROOT), "field_scan")
    os.makedirs(report_dir, exist_ok=True)
    csv_path = os.path.join(report_dir, "field_disappearance_report.csv")
    _write_csv_report(all_gaps, csv_path)

    print(f"\n📊 发现 {len(all_gaps)} 个字段消失段")
    print(f" 报告已生成: {csv_path}")
    _print_summary(all_gaps)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
run_scan()
|