1
This commit is contained in:
189
scripts/ops/field_disappearance_scan.py
Normal file
189
scripts/ops/field_disappearance_scan.py
Normal file
@@ -0,0 +1,189 @@
|
||||
"""
|
||||
字段消失扫描器:检测 DWD 表中字段值从某天起突然全部为空的异常
|
||||
判定条件:连续 ≥3 天 且 连续空记录 ≥20 条
|
||||
报告类型:
|
||||
- ONGOING:从某天起至今持续为空(如 DQ-6 member_phone)
|
||||
- RECOVERED:中途消失后又恢复
|
||||
输出:终端 + CSV → export/SYSTEM/REPORTS/field_scan/
|
||||
"""
|
||||
import os
|
||||
import csv
|
||||
from datetime import date, timedelta
|
||||
from dataclasses import dataclass
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
PG_DSN = os.environ.get("TEST_DB_DSN") or os.environ.get("PG_DSN")
|
||||
if not PG_DSN:
|
||||
raise RuntimeError("TEST_DB_DSN / PG_DSN 未配置")
|
||||
|
||||
SYSTEM_ANALYZE_ROOT = os.environ.get("SYSTEM_ANALYZE_ROOT")
|
||||
if not SYSTEM_ANALYZE_ROOT:
|
||||
raise RuntimeError("SYSTEM_ANALYZE_ROOT 未配置")
|
||||
|
||||
import psycopg2
|
||||
|
||||
# ── 扫描配置 ──────────────────────────────────────────────
|
||||
# (schema.table, time_column, field, filter_sql)
|
||||
# filter_sql 用于限定有意义的行(如只看会员订单)
|
||||
SCAN_TARGETS = [
|
||||
("dwd.dwd_settlement_head", "pay_time", "member_phone",
|
||||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||||
("dwd.dwd_settlement_head", "pay_time", "member_name",
|
||||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||||
("dwd.dwd_settlement_head", "pay_time", "member_card_type_name",
|
||||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||||
("dwd.dwd_settlement_head", "pay_time", "is_bind_member",
|
||||
"settle_type IN (1,3) AND member_id IS NOT NULL AND member_id != 0"),
|
||||
]
|
||||
|
||||
# 阈值
|
||||
MIN_CONSECUTIVE_DAYS = 3
|
||||
MIN_CONSECUTIVE_ROWS = 20
|
||||
|
||||
|
||||
@dataclass
class Gap:
    """One interval during which a field's values disappeared entirely."""
    table: str        # schema-qualified table name the gap was found in
    field: str        # column whose values went empty
    start_date: date  # first empty day
    end_date: date    # last empty day
    days: int         # number of empty days observed in the interval
    total_rows: int   # total rows within the interval
    null_rows: int    # empty rows within the interval
    recovered: bool   # whether the field came back afterwards
|
||||
|
||||
|
||||
def build_daily_sql(table: str, time_col: str, field: str, filter_sql: str) -> str:
    """Build the SQL that computes a per-day non-null rate for *field*.

    Groups directly on ``time_col::date`` (no generate_series), counting
    total rows and rows where the field is non-null, non-empty and != '0'.

    Args:
        table: schema-qualified table name.
        time_col: timestamp column the rows are bucketed by.
        field: column whose emptiness is being measured.
        filter_sql: optional extra WHERE predicate; "" means no extra filter.

    Returns:
        A SELECT statement yielding (day, total, non_null) ordered by day.

    SECURITY NOTE: identifiers and the filter are interpolated directly into
    the SQL string — they must only ever come from the trusted SCAN_TARGETS
    config, never from user input.
    """
    # Rows with a NULL timestamp cannot be attributed to a day and would
    # otherwise form a spurious NULL group, so exclude them up front.
    conditions = [f"{time_col} IS NOT NULL"]
    if filter_sql:
        # Parenthesize the caller's predicate so an OR inside it cannot
        # change precedence when combined with AND.
        conditions.append(f"({filter_sql})")
    where = "WHERE " + " AND ".join(conditions)
    return f"""
    SELECT
        {time_col}::date AS day,
        COUNT(*) AS total,
        COUNT(CASE
                  WHEN {field} IS NOT NULL
                   AND {field}::text != ''
                   AND {field}::text != '0'
                  THEN 1
              END) AS non_null
    FROM {table}
    {where}
    GROUP BY {time_col}::date
    ORDER BY day
    """
|
||||
|
||||
|
||||
def detect_gaps(daily_stats: list[tuple[date, int, int]],
                table: str, field: str, *,
                min_days: int | None = None,
                min_rows: int | None = None) -> list[Gap]:
    """Detect runs of consecutive all-empty days in per-day statistics.

    Args:
        daily_stats: (day, total_rows, non_null_rows) tuples ordered by day.
        table: table name copied verbatim into each reported Gap.
        field: field name copied verbatim into each reported Gap.
        min_days: day threshold override; None uses MIN_CONSECUTIVE_DAYS.
        min_rows: row threshold override; None uses MIN_CONSECUTIVE_ROWS.

    Returns:
        Gaps meeting BOTH thresholds. A gap still open at the end of the
        data is reported with recovered=False.
    """
    # Resolve thresholds lazily so the module defaults stay authoritative
    # while tests can pass explicit values.
    if min_days is None:
        min_days = MIN_CONSECUTIVE_DAYS
    if min_rows is None:
        min_rows = MIN_CONSECUTIVE_ROWS

    gaps: list[Gap] = []
    in_gap = False
    gap_start = None
    gap_last = None  # last *empty* day actually observed in the data
    gap_rows = 0
    gap_null = 0
    gap_days = 0

    for day, total, non_null in daily_stats:
        if non_null == 0:
            if not in_gap:
                in_gap = True
                gap_start = day
                gap_rows = 0
                gap_null = 0
                gap_days = 0
            gap_last = day
            gap_days += 1
            gap_rows += total
            # non_null == 0 here, so every row this day counts as empty.
            gap_null += total
        elif in_gap:
            # Gap just ended; report it only if both thresholds are met.
            if gap_days >= min_days and gap_null >= min_rows:
                gaps.append(Gap(
                    table=table, field=field,
                    start_date=gap_start,
                    # BUGFIX: use the last empty day we actually saw, not
                    # `day - 1` — when the data skips calendar days, `day - 1`
                    # may be a date with no rows at all.
                    end_date=gap_last,
                    days=gap_days, total_rows=gap_rows,
                    null_rows=gap_null, recovered=True,
                ))
            in_gap = False

    # Data ended while still inside an empty run: report it as ongoing.
    if in_gap and gap_days >= min_days and gap_null >= min_rows:
        gaps.append(Gap(
            table=table, field=field,
            start_date=gap_start,
            end_date=gap_last,
            days=gap_days, total_rows=gap_rows,
            null_rows=gap_null, recovered=False,
        ))

    return gaps
|
||||
|
||||
|
||||
def run_scan() -> None:
    """Scan every SCAN_TARGETS entry for field-disappearance gaps.

    For each (table, time_col, field, filter) target: fetch per-day stats,
    run gap detection, print findings, and — if anything was found — write
    a CSV report plus a terminal summary table.
    """
    all_gaps: list[Gap] = []

    # Bounded connect + per-statement timeout so a slow DB can't hang the scan.
    with psycopg2.connect(PG_DSN, connect_timeout=15,
                          options="-c statement_timeout=120000") as conn:
        with conn.cursor() as cur:
            for table, time_col, field, filter_sql in SCAN_TARGETS:
                print(f"扫描 {table}.{field} ...")
                sql = build_daily_sql(table, time_col, field, filter_sql)
                cur.execute(sql)
                rows = cur.fetchall()
                if not rows:
                    print(f" ⏭️ 无数据")
                    continue

                gaps = detect_gaps(rows, table, field)
                if gaps:
                    for g in gaps:
                        status = "🔴 ONGOING" if not g.recovered else "🟡 RECOVERED"
                        print(f" {status} {g.field}: {g.start_date} → {g.end_date} "
                              f"({g.days}天, {g.null_rows}条全空)")
                    all_gaps.extend(gaps)
                else:
                    print(f" ✅ 无异常")

    # Report output: nothing found means nothing to write.
    if not all_gaps:
        print("\n✅ 所有字段正常,未发现消失段")
        return

    # NOTE(review): dirname(SYSTEM_ANALYZE_ROOT) puts the report in a sibling
    # "field_scan" directory next to the analyze root — confirm this matches
    # the export/SYSTEM/REPORTS/field_scan/ path promised by the module doc.
    report_dir = os.path.join(os.path.dirname(SYSTEM_ANALYZE_ROOT), "field_scan")
    os.makedirs(report_dir, exist_ok=True)
    csv_path = os.path.join(report_dir, "field_disappearance_report.csv")

    # utf-8-sig adds a BOM so Excel opens the Chinese headers correctly.
    with open(csv_path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(["表", "字段", "状态", "消失起始日", "消失结束日",
                         "持续天数", "区间总行数", "空行数"])
        for g in all_gaps:
            writer.writerow([
                g.table, g.field,
                "ONGOING" if not g.recovered else "RECOVERED",
                g.start_date, g.end_date,
                g.days, g.total_rows, g.null_rows
            ])

    print(f"\n📊 发现 {len(all_gaps)} 个字段消失段")
    print(f" 报告已生成: {csv_path}")

    # Terminal summary table.
    print(f"\n{'='*90}")
    print(f"{'表':<35} {'字段':<20} {'状态':<12} {'起始':<12} {'结束':<12} {'天数':>5} {'空行':>6}")
    print(f"{'='*90}")
    for g in all_gaps:
        status = "ONGOING" if not g.recovered else "RECOVERED"
        print(f"{g.table:<35} {g.field:<20} {status:<12} "
              f"{str(g.start_date):<12} {str(g.end_date):<12} {g.days:>5} {g.null_rows:>6}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_scan()
|
||||
Reference in New Issue
Block a user