"""诊断 SPI 基数校准中位数为 0 的原因""" import os from pathlib import Path from dotenv import load_dotenv load_dotenv(Path(__file__).resolve().parents[2] / ".env") PG_DSN = os.environ.get("PG_DSN") if not PG_DSN: raise RuntimeError("PG_DSN 未设置") import psycopg2 import psycopg2.extras conn = psycopg2.connect(PG_DSN) conn.autocommit = True def q(sql, params=None): with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: cur.execute(sql, params) return cur.fetchall() # SPI 提取的是近 90 天有消费的会员 # ETL 运行时间是 2026-02-27 07:55,所以 NOW() - 90 days ≈ 2025-11-29 # NOW() - 30 days ≈ 2026-01-28 # 但测试数据的时间范围是 2025-11-01 ~ 2026-02-27 print("SPI 特征提取模拟(与 ETL 运行时一致的窗口)") print("=" * 60) # 模拟 SPI 的 SQL,看有多少会员有 spend_30 > 0 rows = q(""" WITH consume_source AS ( SELECT COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id, s.pay_time, COALESCE(s.pay_amount, 0) AS pay_amount FROM dwd.dwd_settlement_head s LEFT JOIN dwd.dim_member_card_account mca ON s.member_card_account_id = mca.member_card_id AND mca.scd2_is_current = 1 AND mca.register_site_id = s.site_id AND COALESCE(mca.is_delete, 0) = 0 WHERE s.site_id = (SELECT DISTINCT site_id FROM dwd.dwd_settlement_head LIMIT 1) AND s.settle_type IN (1, 3) AND s.pay_time >= NOW() - INTERVAL '90 days' ) SELECT canonical_member_id AS member_id, SUM(pay_amount) AS spend_90, SUM(CASE WHEN pay_time >= NOW() - INTERVAL '30 days' THEN pay_amount ELSE 0 END) AS spend_30, COUNT(*) AS orders_90, SUM(CASE WHEN pay_time >= NOW() - INTERVAL '30 days' THEN 1 ELSE 0 END) AS orders_30 FROM consume_source WHERE canonical_member_id > 0 GROUP BY canonical_member_id """) print(f"近 90 天有消费的会员数: {len(rows)}") if rows: spend_30_values = sorted([float(r['spend_30']) for r in rows]) spend_90_values = sorted([float(r['spend_90']) for r in rows]) n = len(spend_30_values) median_idx = n // 2 zero_30 = sum(1 for v in spend_30_values if v == 0) zero_90 = sum(1 for v in spend_90_values if v == 0) print(f"\nspend_30 分布:") print(f" 为 0 的会员: {zero_30}/{n} ({zero_30/n*100:.1f}%)") print(f" 中位数: {spend_30_values[median_idx]:.2f}") print(f" 最大值: {spend_30_values[-1]:.2f}") print(f"\nspend_90 分布:") print(f" 为 0 的会员: {zero_90}/{n} ({zero_90/n*100:.1f}%)") print(f" 中位数: {spend_90_values[median_idx]:.2f}") print(f" 最大值: {spend_90_values[-1]:.2f}") # 检查 pay_time 的实际范围 rows2 = q(""" SELECT MIN(pay_time) AS min_pay, MAX(pay_time) AS max_pay, NOW() - INTERVAL '90 days' AS cutoff_90, NOW() - INTERVAL '30 days' AS cutoff_30, NOW() AS now_ts FROM dwd.dwd_settlement_head WHERE settle_type IN (1, 3) """) if rows2: r = rows2[0] print(f"\n时间范围:") print(f" 数据最早: {r['min_pay']}") print(f" 数据最晚: {r['max_pay']}") print(f" NOW(): {r['now_ts']}") print(f" 90天截止: {r['cutoff_90']}") print(f" 30天截止: {r['cutoff_30']}") # 检查 avg_ticket_90 和 daily_spend 的中位数 avg_tickets = sorted([float(r['spend_90']) / max(int(r['orders_90']), 1) for r in rows]) print(f"\navg_ticket_90 中位数: {avg_tickets[median_idx]:.2f}") # 检查 recharge_90 rows3 = q(""" SELECT COUNT(*) AS cnt, SUM(recharge_amount) AS total FROM dwd.dwd_settlement_head WHERE settle_type = 2 AND pay_time >= NOW() - INTERVAL '90 days' """) if rows3: r = rows3[0] print(f"\n近 90 天充值记录: {r['cnt']} 条, 总额: {r['total']}") conn.close() print("\n诊断完成。")