116 lines
4.0 KiB
Python
116 lines
4.0 KiB
Python
"""诊断 SPI 基数校准中位数为 0 的原因"""
|
||
import os
|
||
from pathlib import Path
|
||
from dotenv import load_dotenv
|
||
|
||
# Load environment variables from the .env file located two directory levels
# above the directory that contains this script.
load_dotenv(dotenv_path=Path(__file__).resolve().parents[2] / ".env")

# Connection string for PostgreSQL; the script cannot do anything without it.
PG_DSN = os.environ.get("PG_DSN")
if not PG_DSN:
    raise RuntimeError("PG_DSN 未设置")

# psycopg2 is imported only after the DSN check above has passed.
import psycopg2
import psycopg2.extras

# One shared connection for every query in this script, in autocommit mode.
conn = psycopg2.connect(PG_DSN)
conn.autocommit = True
|
||
|
||
def q(sql, params=None):
    """Execute ``sql`` on the module-level connection and return every row.

    Args:
        sql: SQL text handed to ``cursor.execute``.
        params: Optional query parameters forwarded to ``cursor.execute``.

    Returns:
        The result of ``fetchall()`` on a ``RealDictCursor``, so each row can
        be indexed by column name.
    """
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    # The cursor is closed when the ``with`` block exits.
    with cursor:
        cursor.execute(sql, params)
        fetched = cursor.fetchall()
    return fetched
|
||
|
||
# SPI extracts members who have spent within the last 90 days.
# The ETL ran at 2026-02-27 07:55, so NOW() - 90 days ≈ 2025-11-29
# and NOW() - 30 days ≈ 2026-01-28,
# whereas the test data covers 2025-11-01 ~ 2026-02-27.


def _print_distribution(label, sorted_values):
    """Print zero count, median and maximum for an ascending, non-empty list.

    The reported "median" is the element at index ``n // 2``, i.e. the upper
    of the two middle values when ``n`` is even.

    Args:
        label: Feature name shown in the section header (e.g. ``spend_30``).
        sorted_values: Non-empty list of floats sorted in ascending order.
    """
    n = len(sorted_values)
    zeros = sum(1 for v in sorted_values if v == 0)
    print(f"\n{label} 分布:")
    print(f" 为 0 的会员: {zeros}/{n} ({zeros/n*100:.1f}%)")
    print(f" 中位数: {sorted_values[n // 2]:.2f}")
    print(f" 最大值: {sorted_values[-1]:.2f}")


try:
    print("SPI 特征提取模拟(与 ETL 运行时一致的窗口)")
    print("=" * 60)

    # Replay the SPI SQL to see how many members end up with spend_30 > 0.
    # NOTE(review): the site is picked with DISTINCT ... LIMIT 1 and no
    # ORDER BY, so which site is analysed is up to the database when the
    # table holds more than one site_id.
    rows = q("""
        WITH consume_source AS (
            SELECT
                COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
                s.pay_time,
                COALESCE(s.pay_amount, 0) AS pay_amount
            FROM dwd.dwd_settlement_head s
            LEFT JOIN dwd.dim_member_card_account mca
                ON s.member_card_account_id = mca.member_card_id
                AND mca.scd2_is_current = 1
                AND mca.register_site_id = s.site_id
                AND COALESCE(mca.is_delete, 0) = 0
            WHERE s.site_id = (SELECT DISTINCT site_id FROM dwd.dwd_settlement_head LIMIT 1)
                AND s.settle_type IN (1, 3)
                AND s.pay_time >= NOW() - INTERVAL '90 days'
        )
        SELECT
            canonical_member_id AS member_id,
            SUM(pay_amount) AS spend_90,
            SUM(CASE WHEN pay_time >= NOW() - INTERVAL '30 days' THEN pay_amount ELSE 0 END) AS spend_30,
            COUNT(*) AS orders_90,
            SUM(CASE WHEN pay_time >= NOW() - INTERVAL '30 days' THEN 1 ELSE 0 END) AS orders_30
        FROM consume_source
        WHERE canonical_member_id > 0
        GROUP BY canonical_member_id
    """)

    print(f"近 90 天有消费的会员数: {len(rows)}")

    # Distribution statistics only make sense when at least one member was
    # returned; indexing or dividing by n would fail on an empty result.
    if rows:
        _print_distribution("spend_30", sorted(float(r["spend_30"]) for r in rows))
        _print_distribution("spend_90", sorted(float(r["spend_90"]) for r in rows))

    # Check the actual pay_time range against the 30/90-day cut-offs.  This
    # does not depend on `rows`, so it also runs when the window matched
    # nothing.  Unlike the query above, it is not restricted to one site_id.
    rows2 = q("""
        SELECT MIN(pay_time) AS min_pay, MAX(pay_time) AS max_pay,
               NOW() - INTERVAL '90 days' AS cutoff_90,
               NOW() - INTERVAL '30 days' AS cutoff_30,
               NOW() AS now_ts
        FROM dwd.dwd_settlement_head
        WHERE settle_type IN (1, 3)
    """)
    if rows2:
        r = rows2[0]
        print("\n时间范围:")
        print(f" 数据最早: {r['min_pay']}")
        print(f" 数据最晚: {r['max_pay']}")
        print(f" NOW(): {r['now_ts']}")
        print(f" 90天截止: {r['cutoff_90']}")
        print(f" 30天截止: {r['cutoff_30']}")

    # Check the median of avg_ticket_90 (only avg_ticket_90 is computed here).
    if rows:
        # max(..., 1) guards the division should orders_90 ever be 0.
        avg_tickets = sorted(
            float(r["spend_90"]) / max(int(r["orders_90"]), 1) for r in rows
        )
        print(f"\navg_ticket_90 中位数: {avg_tickets[len(avg_tickets) // 2]:.2f}")

    # Check recharge_90: settle_type = 2 records in the last 90 days.
    rows3 = q("""
        SELECT COUNT(*) AS cnt, SUM(recharge_amount) AS total
        FROM dwd.dwd_settlement_head
        WHERE settle_type = 2
            AND pay_time >= NOW() - INTERVAL '90 days'
    """)
    if rows3:
        r = rows3[0]
        print(f"\n近 90 天充值记录: {r['cnt']} 条, 总额: {r['total']}")
finally:
    # Release the connection even when one of the queries above fails.
    conn.close()

print("\n诊断完成。")
|