"""Diagnose which SPI values exceed the numeric(10,4) column precision.

The script re-runs a simplified version of the SPI level/speed/raw score
computation against ``dws.dws_member_consumption`` for one site and reports:

* the extreme values of the input consumption features,
* the largest simulated level / speed / raw scores,
* members whose scores would overflow ``numeric(10,4)``,
* members whose scores are inf/NaN or cannot be computed at all.

The connection string is read from the ``PG_DSN`` environment variable
(optionally loaded from a ``.env`` file two directories above this file).
"""
import math
import os
import sys
from contextlib import closing
from pathlib import Path

# Largest magnitude that fits in a numeric(10,4) column.
# Defined at module level so the final report can use it even when the site
# has no members (previously it was only bound inside the per-member loop).
RAW_MAX = 999999.9999

# Site whose SPI features are inspected.
SITE_ID = 2790685415443269

# Default scoring parameters used by the simulation.
DEFAULT_PARAMS = {
    'amount_base_spend_30': 500.0,
    'amount_base_spend_90': 1500.0,
    'amount_base_ticket_90': 200.0,
    'amount_base_recharge_90': 1000.0,
    'amount_base_speed_abs': 100.0,
    'amount_base_ewma_90': 50.0,
    'w_level_spend_30': 0.30,
    'w_level_spend_90': 0.30,
    'w_level_ticket_90': 0.20,
    'w_level_recharge_90': 0.20,
    'w_speed_abs': 0.40,
    'w_speed_rel': 0.30,
    'w_speed_ewma': 0.30,
    'speed_epsilon': 1e-6,
}


def compute_scores(spend_30, spend_90, avg_ticket, visit_days_30,
                   recharge_90=0.0, v_ewma=0.0, params=None):
    """Return the simulated ``(level, speed, raw)`` scores for one member.

    Args:
        spend_30: Spend over the last 30 days (float).
        spend_90: Spend over the last 90 days (float).
        avg_ticket: Average ticket over the last 90 days (float).
        visit_days_30: Number of visit days in the last 30 days (int).
        recharge_90: Recharge amount over 90 days; the simulation simplifies
            this to 0.0 by default.
        v_ewma: EWMA speed component; simplified to 0.0 by default.
        params: Scoring parameters; ``DEFAULT_PARAMS`` when omitted.

    Raises:
        ValueError: If a logarithm argument is out of domain (e.g. a strongly
            negative spend).
        ZeroDivisionError: If a base amount or the 90-day velocity
            denominator is zero.
    """
    if params is None:
        params = DEFAULT_PARAMS

    # Level: weighted sum of log-scaled amounts.
    level = (
        params['w_level_spend_30'] * math.log1p(spend_30 / params['amount_base_spend_30'])
        + params['w_level_spend_90'] * math.log1p(spend_90 / params['amount_base_spend_90'])
        + params['w_level_ticket_90'] * math.log1p(avg_ticket / params['amount_base_ticket_90'])
        + params['w_level_recharge_90'] * math.log1p(recharge_90 / params['amount_base_recharge_90'])
    )

    # Speed: absolute per-visit-day spend plus the 30d-vs-90d daily spend ratio.
    eps = params['speed_epsilon']
    v_abs = math.log1p(spend_30 / (max(visit_days_30, 1) * params['amount_base_speed_abs']))
    v_30 = spend_30 / 30.0
    v_90 = spend_90 / 90.0
    v_rel = math.log((v_30 + eps) / (v_90 + eps))
    speed = (
        params['w_speed_abs'] * v_abs
        + params['w_speed_rel'] * max(0.0, v_rel)  # only acceleration counts
        + params['w_speed_ewma'] * v_ewma
    )

    # Raw: the third (0.10-weighted) component is fixed at 0.0 in this simulation.
    raw = 0.60 * level + 0.30 * speed + 0.10 * 0.0
    return level, speed, raw


def scan_members(rows, params=None):
    """Simulate scores for every row and collect extremes and anomalies.

    Args:
        rows: Iterable of 8-tuples ``(member_id, spend_30, spend_90,
            avg_ticket_90, orders_30, orders_90, visit_days_30,
            visit_days_90)`` as returned by the consumption query.
        params: Scoring parameters; ``DEFAULT_PARAMS`` when omitted.

    Returns:
        dict with keys:
            ``max_level`` / ``max_speed`` / ``max_raw``: ``(value, member_id)``
                pairs, ``(-inf, None)`` when no row produced a larger value;
            ``overflow``: ``(member_id, level, speed, raw)`` for scores whose
                magnitude exceeds ``RAW_MAX``;
            ``non_finite``: ``(member_id, level, speed, spend_30, spend_90)``
                for inf/NaN level or speed;
            ``errors``: ``(member_id, spend_30, spend_90, exc)`` for members
                whose scores could not be computed.
    """
    summary = {
        'max_level': (-float('inf'), None),
        'max_speed': (-float('inf'), None),
        'max_raw': (-float('inf'), None),
        'overflow': [],
        'non_finite': [],
        'errors': [],
    }

    for row in rows:
        mid = row[0]
        spend_30 = float(row[1])
        spend_90 = float(row[2])
        avg_ticket = float(row[3])
        visit_days_30 = int(row[6])

        try:
            level, speed, raw = compute_scores(
                spend_30, spend_90, avg_ticket, visit_days_30, params=params
            )
        except (ValueError, ZeroDivisionError) as exc:
            # A single out-of-domain member must not abort the whole scan;
            # record it so the report can point at the offending data.
            summary['errors'].append((mid, spend_30, spend_90, exc))
            continue

        if level > summary['max_level'][0]:
            summary['max_level'] = (level, mid)
        if speed > summary['max_speed'][0]:
            summary['max_speed'] = (speed, mid)
        if raw > summary['max_raw'][0]:
            summary['max_raw'] = (raw, mid)

        # Would any score overflow numeric(10,4)?
        if abs(level) > RAW_MAX or abs(speed) > RAW_MAX or abs(raw) > RAW_MAX:
            summary['overflow'].append((mid, level, speed, raw))

        # inf/NaN scores.
        if not math.isfinite(level) or not math.isfinite(speed):
            summary['non_finite'].append((mid, level, speed, spend_30, spend_90))

    return summary


def main():
    """Run the diagnosis against the database named by ``PG_DSN``.

    Raises:
        RuntimeError: If ``PG_DSN`` is not set.
    """
    # Third-party imports are deferred so the pure helpers above can be
    # imported without the database driver or python-dotenv installed.
    import psycopg2
    from dotenv import load_dotenv

    load_dotenv(Path(__file__).resolve().parents[2] / ".env")

    dsn = os.environ.get("PG_DSN")
    if not dsn:
        raise RuntimeError("PG_DSN 未设置")

    # closing() guarantees the cursor and connection are released even when a
    # query fails; psycopg2's own connection context manager only ends the
    # transaction and does not close the connection.
    with closing(psycopg2.connect(dsn)) as conn, closing(conn.cursor()) as cur:
        # 1. Extreme values of the consumption features.
        cur.execute("""
            SELECT
                MAX(ABS(spend_30)) as max_spend_30,
                MAX(ABS(spend_90)) as max_spend_90,
                MAX(ABS(recharge_90)) as max_recharge_90,
                MAX(ABS(avg_ticket_90)) as max_avg_ticket,
                MAX(ABS(daily_spend_ewma_90)) as max_ewma
            FROM dws.dws_member_consumption
            WHERE site_id = %s
        """, (SITE_ID,))
        row = cur.fetchone()
        if row:
            print(f"消费特征极值: spend_30={row[0]}, spend_90={row[1]}, recharge_90={row[2]}, avg_ticket={row[3]}, ewma={row[4]}")

        # 2. Simulate the score computation on the actual consumption data.
        cur.execute("""
            SELECT
                member_id,
                COALESCE(spend_30, 0),
                COALESCE(spend_90, 0),
                COALESCE(avg_ticket_90, 0),
                COALESCE(orders_30, 0),
                COALESCE(orders_90, 0),
                COALESCE(visit_days_30, 0),
                COALESCE(visit_days_90, 0)
            FROM dws.dws_member_consumption
            WHERE site_id = %s
        """, (SITE_ID,))
        rows = cur.fetchall()
        print(f"\n会员数: {len(rows)}")

        summary = scan_members(rows)

        for mid, level, speed, spend_30, spend_90 in summary['non_finite']:
            print(f" INF/NAN: member_id={mid}, level={level}, speed={speed}, spend_30={spend_30}, spend_90={spend_90}")
        for mid, spend_30, spend_90, exc in summary['errors']:
            print(f" MATH ERROR: member_id={mid}, spend_30={spend_30}, spend_90={spend_90}, error={exc}")

        max_level = summary['max_level']
        max_speed = summary['max_speed']
        max_raw = summary['max_raw']
        overflow_members = summary['overflow']
        print(f"\nMax level: {max_level[0]:.6f} (member_id={max_level[1]})")
        print(f"Max speed: {max_speed[0]:.6f} (member_id={max_speed[1]})")
        print(f"Max raw: {max_raw[0]:.6f} (member_id={max_raw[1]})")
        print(f"Overflow members (>{RAW_MAX}): {len(overflow_members)}")

        # Also look at the stored daily_spend_ewma_90 values for the biggest
        # spenders, to see whether dws_member_consumption holds extreme values.
        cur.execute("""
            SELECT member_id, spend_30, spend_90, avg_ticket_90, daily_spend_ewma_90
            FROM dws.dws_member_consumption
            WHERE site_id = %s
            ORDER BY spend_90 DESC
            LIMIT 5
        """, (SITE_ID,))
        print("\nTop 5 消费会员:")
        for r in cur.fetchall():
            print(f" member_id={r[0]}, spend_30={r[1]}, spend_90={r[2]}, avg_ticket={r[3]}, ewma={r[4]}")

        # Actual data range of the numeric(14,2) feature columns.
        cur.execute("""
            SELECT
                MAX(spend_30), MAX(spend_90), MAX(recharge_90),
                MAX(avg_ticket_90), MAX(daily_spend_ewma_90)
            FROM dws.dws_member_consumption
            WHERE site_id = %s
        """, (SITE_ID,))
        r = cur.fetchone()
        print(f"\n最大值: spend_30={r[0]}, spend_90={r[1]}, recharge_90={r[2]}, avg_ticket={r[3]}, ewma={r[4]}")

    print("\n诊断完成")


if __name__ == "__main__":
    main()