Files
Neo-ZQYY/scripts/ops/_diagnose_spi_median.py

116 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""诊断 SPI 基数校准中位数为 0 的原因"""
import os
from pathlib import Path
from dotenv import load_dotenv

# The .env file is looked up two directory levels above this script's folder.
_ENV_FILE = Path(__file__).resolve().parents[2] / ".env"
load_dotenv(_ENV_FILE)

# Connection string for PostgreSQL; the script cannot do anything without it.
PG_DSN = os.getenv("PG_DSN")
if not PG_DSN:
    # Fail fast, before the database driver is imported or a connection is tried.
    raise RuntimeError("PG_DSN 未设置")

import psycopg2
import psycopg2.extras

# One shared connection for every query in this script. Autocommit is switched
# on so that no transaction is left open between statements.
conn = psycopg2.connect(PG_DSN)
conn.autocommit = True
def q(sql, params=None):
    """Execute ``sql`` on the shared module-level connection and fetch every row.

    Args:
        sql: SQL text to execute.
        params: Optional parameters handed to ``cursor.execute`` unchanged.

    Returns:
        The list produced by ``fetchall()``; because a ``RealDictCursor`` is
        used, each row can be indexed by column name.
    """
    dict_cursor = psycopg2.extras.RealDictCursor
    with conn.cursor(cursor_factory=dict_cursor) as cursor:
        cursor.execute(sql, params)
        fetched = cursor.fetchall()
    return fetched
# SPI extracts the members who have consumption within the last 90 days.
# The ETL ran at 2026-02-27 07:55, so NOW() - 90 days ≈ 2025-11-29
# and NOW() - 30 days ≈ 2026-01-28,
# whereas the test data covers 2025-11-01 ~ 2026-02-27.
# NOTE(review): NOW() in the SQL below is evaluated when this script runs, not
# at the ETL run time quoted above, so the windows only match the ETL's if the
# script is run close to that moment — confirm before comparing numbers.
print("SPI 特征提取模拟(与 ETL 运行时一致的窗口)")
print("=" * 60)
# Replay the SPI feature SQL to see how many members have spend_30 > 0.
# The query:
#   * resolves a canonical member id: s.member_id unless it is 0/NULL, otherwise
#     the tenant_member_id of the joined current, non-deleted card account;
#   * keeps settlements with settle_type 1 or 3 paid within the last 90 days;
#   * aggregates, per member with id > 0, the 90-day and 30-day spend and order
#     counts.
# NOTE(review): the site filter uses `SELECT DISTINCT site_id ... LIMIT 1` with
# no ORDER BY, so which site gets analysed is arbitrary if the table holds more
# than one site — confirm this is intended.
# NOTE(review): if a settlement row can match several card-account rows, the
# LEFT JOIN would duplicate it and inflate the sums — verify uniqueness.
rows = q("""
WITH consume_source AS (
SELECT
COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
s.pay_time,
COALESCE(s.pay_amount, 0) AS pay_amount
FROM dwd.dwd_settlement_head s
LEFT JOIN dwd.dim_member_card_account mca
ON s.member_card_account_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = s.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE s.site_id = (SELECT DISTINCT site_id FROM dwd.dwd_settlement_head LIMIT 1)
AND s.settle_type IN (1, 3)
AND s.pay_time >= NOW() - INTERVAL '90 days'
)
SELECT
canonical_member_id AS member_id,
SUM(pay_amount) AS spend_90,
SUM(CASE WHEN pay_time >= NOW() - INTERVAL '30 days' THEN pay_amount ELSE 0 END) AS spend_30,
COUNT(*) AS orders_90,
SUM(CASE WHEN pay_time >= NOW() - INTERVAL '30 days' THEN 1 ELSE 0 END) AS orders_30
FROM consume_source
WHERE canonical_member_id > 0
GROUP BY canonical_member_id
""")
# Report how many members the SPI-style query returned, then describe the
# distribution of their 30-day and 90-day spend.
print(f"近 90 天有消费的会员数: {len(rows)}")
if rows:
    n = len(rows)
    # Index of the middle element of a sorted list of n values (the upper of
    # the two middle elements when n is even).
    median_idx = n // 2

    # 30-day spend: share of members at exactly zero, median and maximum.
    spend_30_values = sorted(float(row['spend_30']) for row in rows)
    zero_30 = spend_30_values.count(0)
    print(f"\nspend_30 分布:")
    print(f" 为 0 的会员: {zero_30}/{n} ({zero_30/n*100:.1f}%)")
    print(f" 中位数: {spend_30_values[median_idx]:.2f}")
    print(f" 最大值: {spend_30_values[-1]:.2f}")

    # 90-day spend: same three figures.
    spend_90_values = sorted(float(row['spend_90']) for row in rows)
    zero_90 = spend_90_values.count(0)
    print(f"\nspend_90 分布:")
    print(f" 为 0 的会员: {zero_90}/{n} ({zero_90/n*100:.1f}%)")
    print(f" 中位数: {spend_90_values[median_idx]:.2f}")
    print(f" 最大值: {spend_90_values[-1]:.2f}")
# Compare the real pay_time span of the data with the 90-day / 30-day cut-offs
# as the database computes them from NOW().
rows2 = q("""
SELECT MIN(pay_time) AS min_pay, MAX(pay_time) AS max_pay,
NOW() - INTERVAL '90 days' AS cutoff_90,
NOW() - INTERVAL '30 days' AS cutoff_30,
NOW() AS now_ts
FROM dwd.dwd_settlement_head
WHERE settle_type IN (1, 3)
""")
# Only the first returned row is reported.
span = rows2[0] if rows2 else None
if span is not None:
    print(f"\n时间范围:")
    print(f" 数据最早: {span['min_pay']}")
    print(f" 数据最晚: {span['max_pay']}")
    print(f" NOW(): {span['now_ts']}")
    print(f" 90天截止: {span['cutoff_90']}")
    print(f" 30天截止: {span['cutoff_30']}")
# Median of avg_ticket_90, i.e. each member's spend_90 divided by orders_90
# (the divisor is clamped to at least 1 to rule out division by zero).
# Only avg_ticket_90 is evaluated here; daily_spend is not computed.
# The check is skipped when the query returned no members: an empty list has no
# median, and indexing it would abort the script before the remaining checks
# and the connection close can run.
if rows:
    avg_tickets = sorted(
        float(r['spend_90']) / max(int(r['orders_90']), 1) for r in rows
    )
    # Same middle-element convention as the spend medians: index len // 2.
    print(f"\navg_ticket_90 中位数: {avg_tickets[len(avg_tickets) // 2]:.2f}")
# Count the settle_type = 2 settlements of the last 90 days and total their
# recharge_amount (the check is labelled recharge_90).
rows3 = q("""
SELECT COUNT(*) AS cnt, SUM(recharge_amount) AS total
FROM dwd.dwd_settlement_head
WHERE settle_type = 2
AND pay_time >= NOW() - INTERVAL '90 days'
""")
# Only the first returned row is reported.
recharge = rows3[0] if rows3 else None
if recharge is not None:
    print(f"\n近 90 天充值记录: {recharge['cnt']} 条, 总额: {recharge['total']}")
# All diagnostic queries have run: release the database connection and print
# the completion message.
conn.close()
print("\n诊断完成。")