190 lines
6.6 KiB
Python
190 lines
6.6 KiB
Python
"""
|
||
字段消失扫描器:检测 DWD 表中字段值从某天起突然全部为空的异常
|
||
判定条件:连续 ≥3 天 且 连续空记录 ≥20 条
|
||
报告类型:
|
||
- ONGOING:从某天起至今持续为空(如 DQ-6 member_phone)
|
||
- RECOVERED:中途消失后又恢复
|
||
输出:终端 + CSV → export/SYSTEM/REPORTS/field_scan/
|
||
"""
|
||
import os
|
||
import csv
|
||
from datetime import date, timedelta
|
||
from dataclasses import dataclass
|
||
from dotenv import load_dotenv
|
||
|
||
load_dotenv()
|
||
|
||
PG_DSN = os.environ.get("TEST_DB_DSN") or os.environ.get("PG_DSN")
|
||
if not PG_DSN:
|
||
raise RuntimeError("TEST_DB_DSN / PG_DSN 未配置")
|
||
|
||
SYSTEM_ANALYZE_ROOT = os.environ.get("SYSTEM_ANALYZE_ROOT")
|
||
if not SYSTEM_ANALYZE_ROOT:
|
||
raise RuntimeError("SYSTEM_ANALYZE_ROOT 未配置")
|
||
|
||
import psycopg2
|
||
|
||
# ── 扫描配置 ──────────────────────────────────────────────
|
||
# (schema.table, time_column, field, filter_sql)
|
||
# filter_sql 用于限定有意义的行(如只看会员订单)
|
||
SCAN_TARGETS = [
|
||
("dwd.dwd_settlement_head", "pay_time", "member_phone",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
("dwd.dwd_settlement_head", "pay_time", "member_name",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
("dwd.dwd_settlement_head", "pay_time", "member_card_type_name",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
("dwd.dwd_settlement_head", "pay_time", "is_bind_member",
|
||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||
]
|
||
|
||
# 阈值
|
||
MIN_CONSECUTIVE_DAYS = 3
|
||
MIN_CONSECUTIVE_ROWS = 20
|
||
|
||
|
||
@dataclass
class Gap:
    """One interval during which a field's values disappeared."""
    table: str        # fully-qualified table name (schema.table)
    field: str        # column that went empty
    start_date: date  # first empty day of the run
    end_date: date    # last empty day of the run
    days: int         # number of empty days in the run
    total_rows: int   # total rows inside the interval
    null_rows: int    # empty rows inside the interval
    recovered: bool   # True if the field came back afterwards
|
||
|
||
|
||
def build_daily_sql(table: str, time_col: str, field: str, filter_sql: str) -> str:
    """Build a per-day non-null-rate aggregation query for one field.

    Groups directly on ``time_col::date`` (no generate_series), returning
    one row per day that has data: (day, total, non_null).

    The redundant ``HAVING COUNT(*) > 0`` from the earlier version is
    dropped: a GROUP BY group always contains at least one row, so the
    clause could never filter anything.

    NOTE(review): identifiers and ``filter_sql`` are interpolated straight
    into the SQL text. This is only safe because SCAN_TARGETS is trusted,
    hard-coded configuration — never feed user input through here.
    """
    where_clause = f"WHERE {filter_sql}" if filter_sql else ""
    # A value counts as "present" only when non-NULL and not the empty /
    # zero sentinel strings after casting to text.
    return f"""
    SELECT
        {time_col}::date AS day,
        COUNT(*) AS total,
        COUNT(CASE
                  WHEN {field} IS NOT NULL
                   AND {field}::text != ''
                   AND {field}::text != '0'
                  THEN 1
              END) AS non_null
    FROM {table}
    {where_clause}
    GROUP BY {time_col}::date
    ORDER BY day
    """
|
||
|
||
|
||
def detect_gaps(daily_stats: list[tuple[date, int, int]],
                table: str, field: str) -> list[Gap]:
    """Scan per-day stats for runs of fully-empty days.

    A run is every maximal stretch of days whose ``non_null`` count is 0;
    it is reported only when it spans at least MIN_CONSECUTIVE_DAYS days
    and MIN_CONSECUTIVE_ROWS empty rows. A run that is still open at the
    end of the stats is reported with ``recovered=False``.

    NOTE(review): "consecutive" here means consecutive entries in
    ``daily_stats`` — days with zero matching rows are absent from the
    stats entirely, so calendar adjacency is not verified. Confirm this
    is the intended semantics.
    """
    found: list[Gap] = []
    run_start = None   # first day of the current empty run, or None
    run_days = 0       # empty days accumulated in the current run
    run_rows = 0       # rows accumulated (all empty, since non_null == 0)

    def flush(end: date, recovered: bool) -> None:
        # Record the finished run only if it clears both thresholds.
        if run_days >= MIN_CONSECUTIVE_DAYS and run_rows >= MIN_CONSECUTIVE_ROWS:
            found.append(Gap(
                table=table, field=field,
                start_date=run_start, end_date=end,
                days=run_days, total_rows=run_rows,
                null_rows=run_rows, recovered=recovered,
            ))

    for day, total, non_null in daily_stats:
        if non_null == 0:
            if run_start is None:
                run_start = day
                run_days = run_rows = 0
            run_days += 1
            run_rows += total
        elif run_start is not None:
            # Field came back: the run ends the day before recovery.
            flush(day - timedelta(days=1), recovered=True)
            run_start = None

    if run_start is not None:
        # Still empty at the end of the data: report as ongoing.
        flush(daily_stats[-1][0], recovered=False)

    return found
|
||
|
||
|
||
def _write_csv_report(all_gaps: list[Gap], csv_path: str) -> None:
    """Write the gap list to a CSV (UTF-8 with BOM, Excel-friendly)."""
    with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["表", "字段", "状态", "消失起始日", "消失结束日",
                         "持续天数", "区间总行数", "空行数"])
        for g in all_gaps:
            writer.writerow([
                g.table, g.field,
                "ONGOING" if not g.recovered else "RECOVERED",
                g.start_date, g.end_date,
                g.days, g.total_rows, g.null_rows
            ])


def _print_summary(all_gaps: list[Gap]) -> None:
    """Print an aligned terminal summary table of every detected gap."""
    print(f"\n{'='*90}")
    print(f"{'表':<35} {'字段':<20} {'状态':<12} {'起始':<12} {'结束':<12} {'天数':>5} {'空行':>6}")
    print(f"{'='*90}")
    for g in all_gaps:
        status = "ONGOING" if not g.recovered else "RECOVERED"
        print(f"{g.table:<35} {g.field:<20} {status:<12} "
              f"{str(g.start_date):<12} {str(g.end_date):<12} {g.days:>5} {g.null_rows:>6}")


def run_scan():
    """Scan every SCAN_TARGETS entry, report gaps to terminal and CSV."""
    all_gaps: list[Gap] = []

    # psycopg2's `with connection` only wraps a transaction — it commits or
    # rolls back but does NOT close the connection, so close it explicitly.
    conn = psycopg2.connect(PG_DSN, connect_timeout=15,
                            options="-c statement_timeout=120000")
    try:
        with conn:
            with conn.cursor() as cur:
                for table, time_col, field, filter_sql in SCAN_TARGETS:
                    print(f"扫描 {table}.{field} ...")
                    cur.execute(build_daily_sql(table, time_col, field, filter_sql))
                    rows = cur.fetchall()
                    if not rows:
                        print(" ⏭️ 无数据")
                        continue

                    gaps = detect_gaps(rows, table, field)
                    if not gaps:
                        print(" ✅ 无异常")
                        continue
                    for g in gaps:
                        status = "🔴 ONGOING" if not g.recovered else "🟡 RECOVERED"
                        print(f" {status} {g.field}: {g.start_date} → {g.end_date} "
                              f"({g.days}天, {g.null_rows}条全空)")
                    all_gaps.extend(gaps)
    finally:
        conn.close()

    if not all_gaps:
        print("\n✅ 所有字段正常,未发现消失段")
        return

    # Report lands in a sibling "field_scan" dir of SYSTEM_ANALYZE_ROOT's parent.
    report_dir = os.path.join(os.path.dirname(SYSTEM_ANALYZE_ROOT), "field_scan")
    os.makedirs(report_dir, exist_ok=True)
    csv_path = os.path.join(report_dir, "field_disappearance_report.csv")
    _write_csv_report(all_gaps, csv_path)

    print(f"\n📊 发现 {len(all_gaps)} 个字段消失段")
    print(f" 报告已生成: {csv_path}")
    _print_summary(all_gaps)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
run_scan()
|