Files
Neo-ZQYY/scripts/ops/_debug_spi_values.py

157 lines
4.9 KiB
Python

"""诊断 SPI 哪些值超出 numeric 精度"""
import os, sys, math
from pathlib import Path
from dotenv import load_dotenv
# Site whose SPI input features this diagnostic inspects.
SITE_ID = 2790685415443269

# Largest magnitude that fits in a numeric(10,4) column.
RAW_MAX = 999999.9999

# Default parameters for the simulated score calculation.
DEFAULT_PARAMS = {
    'amount_base_spend_30': 500.0,
    'amount_base_spend_90': 1500.0,
    'amount_base_ticket_90': 200.0,
    'amount_base_recharge_90': 1000.0,
    'amount_base_speed_abs': 100.0,
    'amount_base_ewma_90': 50.0,
    'w_level_spend_30': 0.30,
    'w_level_spend_90': 0.30,
    'w_level_ticket_90': 0.20,
    'w_level_recharge_90': 0.20,
    'w_speed_abs': 0.40,
    'w_speed_rel': 0.30,
    'w_speed_ewma': 0.30,
    'speed_epsilon': 1e-6,
}

# 1. Largest absolute values of the consumption features.
EXTREMES_SQL = """
SELECT
MAX(ABS(spend_30)) as max_spend_30,
MAX(ABS(spend_90)) as max_spend_90,
MAX(ABS(recharge_90)) as max_recharge_90,
MAX(ABS(avg_ticket_90)) as max_avg_ticket,
MAX(ABS(daily_spend_ewma_90)) as max_ewma
FROM dws.dws_member_consumption
WHERE site_id = %s
"""

# 2. Per-member consumption features that feed the simulation.
MEMBER_FEATURES_SQL = """
SELECT member_id,
COALESCE(spend_30, 0), COALESCE(spend_90, 0),
COALESCE(avg_ticket_90, 0),
COALESCE(orders_30, 0), COALESCE(orders_90, 0),
COALESCE(visit_days_30, 0), COALESCE(visit_days_90, 0)
FROM dws.dws_member_consumption
WHERE site_id = %s
"""

# 3. The five members with the highest 90-day spend.
TOP_SPENDERS_SQL = """
SELECT member_id, spend_30, spend_90, avg_ticket_90, daily_spend_ewma_90
FROM dws.dws_member_consumption
WHERE site_id = %s
ORDER BY spend_90 DESC
LIMIT 5
"""

# 4. Signed maxima of the stored feature columns.
MAX_VALUES_SQL = """
SELECT
MAX(spend_30), MAX(spend_90), MAX(recharge_90),
MAX(avg_ticket_90), MAX(daily_spend_ewma_90)
FROM dws.dws_member_consumption
WHERE site_id = %s
"""


def safe_log1p(x: float) -> float:
    """Return ``log(1 + x)`` without raising on out-of-domain input.

    ``math.log1p`` raises ``ValueError`` for ``x <= -1``. A diagnostic run
    must keep going and report such rows, so this returns ``-inf`` at
    ``x == -1`` and NaN below it. Results for ``x > -1`` are exactly those
    of ``math.log1p``.
    """
    if x == -1.0:
        return float('-inf')
    if x < -1.0:
        return float('nan')
    return math.log1p(x)


def safe_log_ratio(numerator: float, denominator: float) -> float:
    """Return ``log(numerator / denominator)`` without raising.

    A zero denominator or a non-positive ratio yields ``inf``, ``-inf`` or
    NaN instead of ``ZeroDivisionError`` / ``ValueError``. Results for a
    positive finite ratio are exactly those of ``math.log``.
    """
    if denominator == 0.0:
        return float('inf') if numerator > 0.0 else float('nan')
    ratio = numerator / denominator
    if ratio > 0.0:
        return math.log(ratio)
    if ratio == 0.0:
        return float('-inf')
    # Negative or NaN ratio: the logarithm is undefined.
    return float('nan')


def compute_member_scores(spend_30: float, spend_90: float, avg_ticket: float,
                          visit_days_30: int, params: dict = None) -> tuple:
    """Simulate the level, speed and raw scores for one member.

    Args:
        spend_30: Spend over the last 30 days.
        spend_90: Spend over the last 90 days.
        avg_ticket: Average ticket over the last 90 days.
        visit_days_30: Visit days in the last 30 days; values below 1 are
            treated as 1 to keep the per-visit divisor non-zero.
        params: Parameter mapping with the same keys as ``DEFAULT_PARAMS``;
            ``None`` selects ``DEFAULT_PARAMS``.

    Returns:
        ``(level, speed, raw)`` as floats. Any of them may be ``inf`` or NaN
        when the inputs fall outside the domain of the logarithms.
    """
    p = DEFAULT_PARAMS if params is None else params

    # Simplification: recharge is not fed into the simulation.
    recharge_90 = 0.0

    level = (
        p['w_level_spend_30'] * safe_log1p(spend_30 / p['amount_base_spend_30'])
        + p['w_level_spend_90'] * safe_log1p(spend_90 / p['amount_base_spend_90'])
        + p['w_level_ticket_90'] * safe_log1p(avg_ticket / p['amount_base_ticket_90'])
        + p['w_level_recharge_90'] * safe_log1p(recharge_90 / p['amount_base_recharge_90'])
    )

    eps = p['speed_epsilon']
    v_abs = safe_log1p(spend_30 / (max(visit_days_30, 1) * p['amount_base_speed_abs']))
    v_30 = spend_30 / 30.0
    v_90 = spend_90 / 90.0
    v_rel = safe_log_ratio(v_30 + eps, v_90 + eps)
    # max(0.0, nan) evaluates to 0.0 and would hide a bad row, so only clamp
    # when the value is an actual number.
    if not math.isnan(v_rel):
        v_rel = max(0.0, v_rel)
    # Simplification: the EWMA component is not simulated.
    v_ewma = 0.0

    speed = (
        p['w_speed_abs'] * v_abs
        + p['w_speed_rel'] * v_rel
        + p['w_speed_ewma'] * v_ewma
    )

    # The third component is fixed at 0.0 in this simulation.
    raw = 0.60 * level + 0.30 * speed + 0.10 * 0.0
    return level, speed, raw


def simulate_scores(rows, params: dict = None, raw_max: float = RAW_MAX) -> dict:
    """Run the score simulation over member rows and collect the extremes.

    Args:
        rows: Iterable of sequences laid out like the ``MEMBER_FEATURES_SQL``
            result: index 0 member_id, 1 spend_30, 2 spend_90,
            3 avg_ticket_90, 6 visit_days_30 (indexes 4, 5 and 7 are unused).
        params: Parameter mapping passed on to ``compute_member_scores``.
        raw_max: Magnitude above which a score counts as an overflow.

    Returns:
        A dict with keys ``max_level``, ``max_speed`` and ``max_raw`` (each a
        ``(value, member_id)`` pair, ``(-inf, None)`` when nothing was seen),
        ``overflow_members`` (list of ``(member_id, level, speed, raw)``) and
        ``non_finite`` (list of
        ``(member_id, level, speed, spend_30, spend_90)`` for rows whose
        level or speed is ``inf`` or NaN).
    """
    max_level = (-float('inf'), None)
    max_speed = (-float('inf'), None)
    max_raw = (-float('inf'), None)
    overflow_members = []
    non_finite = []

    for row in rows:
        member_id = row[0]
        spend_30 = float(row[1])
        spend_90 = float(row[2])
        avg_ticket = float(row[3])
        visit_days_30 = int(row[6])

        level, speed, raw = compute_member_scores(
            spend_30, spend_90, avg_ticket, visit_days_30, params,
        )

        if level > max_level[0]:
            max_level = (level, member_id)
        if speed > max_speed[0]:
            max_speed = (speed, member_id)
        if raw > max_raw[0]:
            max_raw = (raw, member_id)

        if abs(level) > raw_max or abs(speed) > raw_max or abs(raw) > raw_max:
            overflow_members.append((member_id, level, speed, raw))

        if not (math.isfinite(level) and math.isfinite(speed)):
            non_finite.append((member_id, level, speed, spend_30, spend_90))

    return {
        'max_level': max_level,
        'max_speed': max_speed,
        'max_raw': max_raw,
        'overflow_members': overflow_members,
        'non_finite': non_finite,
    }


def main() -> None:
    """Query the consumption table for ``SITE_ID`` and print the diagnosis.

    Raises:
        RuntimeError: If ``PG_DSN`` is not set after loading the ``.env``
            file.
    """
    load_dotenv(Path(__file__).resolve().parents[2] / ".env")
    dsn = os.environ.get("PG_DSN")
    if not dsn:
        raise RuntimeError("PG_DSN 未设置")

    # Imported here so the pure helpers above can be imported without a
    # database driver installed.
    import psycopg2

    conn = psycopg2.connect(dsn)
    try:
        cur = conn.cursor()

        cur.execute(EXTREMES_SQL, (SITE_ID,))
        row = cur.fetchone()
        if row:
            print(f"消费特征极值: spend_30={row[0]}, spend_90={row[1]}, recharge_90={row[2]}, avg_ticket={row[3]}, ewma={row[4]}")

        cur.execute(MEMBER_FEATURES_SQL, (SITE_ID,))
        rows = cur.fetchall()
        print(f"\n会员数: {len(rows)}")

        summary = simulate_scores(rows)
        for mid, level, speed, spend_30, spend_90 in summary['non_finite']:
            print(f" INF/NAN: member_id={mid}, level={level}, speed={speed}, spend_30={spend_30}, spend_90={spend_90}")

        max_level = summary['max_level']
        max_speed = summary['max_speed']
        max_raw = summary['max_raw']
        overflow_members = summary['overflow_members']
        print(f"\nMax level: {max_level[0]:.6f} (member_id={max_level[1]})")
        print(f"Max speed: {max_speed[0]:.6f} (member_id={max_speed[1]})")
        print(f"Max raw: {max_raw[0]:.6f} (member_id={max_raw[1]})")
        print(f"Overflow members (>{RAW_MAX}): {len(overflow_members)}")

        # Also look at the stored daily_spend_ewma_90 values for extremes.
        cur.execute(TOP_SPENDERS_SQL, (SITE_ID,))
        print("\nTop 5 消费会员:")
        for r in cur.fetchall():
            print(f" member_id={r[0]}, spend_30={r[1]}, spend_90={r[2]}, avg_ticket={r[3]}, ewma={r[4]}")

        # Actual data range of the stored feature columns.
        cur.execute(MAX_VALUES_SQL, (SITE_ID,))
        r = cur.fetchone()
        print(f"\n最大值: spend_30={r[0]}, spend_90={r[1]}, recharge_90={r[2]}, avg_ticket={r[3]}, ewma={r[4]}")
    finally:
        # Release the connection even when a query or calculation fails.
        conn.close()

    print("\n诊断完成")


if __name__ == "__main__":
    main()