Files
Neo-ZQYY/scripts/ops/_diagnose_spi_v3.py

150 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""检查 ODS/DWD 数据为什么只到 2/14以及 SPI canonical_member_id 映射"""
import os
from pathlib import Path
from dotenv import load_dotenv
# Load settings from the .env file found at parents[2] of this script's
# resolved path, then read the PostgreSQL DSN from the environment.
_env_file = Path(__file__).resolve().parents[2] / ".env"
load_dotenv(_env_file)

PG_DSN = os.getenv("PG_DSN")
if not PG_DSN:
    # Fail fast: nothing below can run without a connection string.
    raise RuntimeError("PG_DSN 未设置")

# The database driver is imported only after the DSN check has passed.
import psycopg2
import psycopg2.extras

# One shared connection for the whole script. Autocommit is switched on, so
# statements are not grouped into a single long-running transaction.
conn = psycopg2.connect(PG_DSN)
conn.autocommit = True
def q(sql, params=None):
    """Run one SQL statement on the shared connection and return every row.

    Args:
        sql: The statement to execute (anything ``cursor.execute`` accepts).
        params: Optional sequence or mapping of bind values. Leave it as
            ``None`` for statements that contain no placeholders.

    Returns:
        The list produced by ``fetchall()``; rows come from a
        ``RealDictCursor`` and are therefore addressable by column name.
    """
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        # Forward ``params`` untouched. Substituting an empty tuple for None
        # makes psycopg2 treat the statement as a template, so a literal '%'
        # (e.g. LIKE '%settle%') is parsed as a placeholder and the call
        # fails. With None the text is sent to the server as written.
        cur.execute(sql, params)
        return cur.fetchall()
# 1. Find the actual names of the settlement- and payment-related ODS tables.
_settle_lookup_sql = """
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'ods' AND table_name LIKE '%settle%'
ORDER BY table_name
"""
_payment_lookup_sql = """
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'ods' AND table_name LIKE '%payment%'
ORDER BY table_name
"""

print("ODS 结算相关表:")
# Both result sets are printed under the same heading, settle matches first.
for _lookup_sql in (_settle_lookup_sql, _payment_lookup_sql):
    for _match in q(_lookup_sql):
        print(f" {_match['table_name']}")
# 2. List every table in the ods schema. The result is kept in rows3 because
#    the "latest data time" step further down iterates over it again.
rows3 = q("""
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'ods'
ORDER BY table_name
""")

ods_table_count = len(rows3)
print(f"\nODS 全部表 ({ods_table_count} 张):")
for table_row in rows3:
    print(f" {table_row['table_name']}")
# 3. Summarise how settlement rows map onto canonical_member_id.
#    In the query, canonical_member_id is s.member_id unless that is 0/NULL,
#    in which case it falls back to tenant_member_id of the joined card
#    account (scd2_is_current = 1, same register site, not deleted).
#    Rows are limited to one site (whichever the DISTINCT ... LIMIT 1
#    subquery returns), settle_type 1 or 3, and the last 90 days.
divider = "=" * 60
print("\n" + divider)
print("SPI canonical_member_id 映射分析")

# The aggregate query has no GROUP BY, so exactly one summary row comes back.
mapping_summary = q("""
WITH consume_source AS (
SELECT
s.member_id AS raw_member_id,
COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
s.pay_time,
COALESCE(s.pay_amount, 0) AS pay_amount
FROM dwd.dwd_settlement_head s
LEFT JOIN dwd.dim_member_card_account mca
ON s.member_card_account_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = s.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE s.site_id = (SELECT DISTINCT site_id FROM dwd.dwd_settlement_head LIMIT 1)
AND s.settle_type IN (1, 3)
AND s.pay_time >= NOW() - INTERVAL '90 days'
)
SELECT
COUNT(*) AS total_records,
COUNT(DISTINCT raw_member_id) AS raw_members,
COUNT(DISTINCT canonical_member_id) AS canonical_members,
COUNT(*) FILTER (WHERE canonical_member_id IS NULL OR canonical_member_id = 0) AS null_canonical,
COUNT(*) FILTER (WHERE raw_member_id != canonical_member_id) AS remapped
FROM consume_source
""")[0]

# (caption, result column) pairs, printed in this order.
summary_layout = (
    (" 总记录: ", "total_records"),
    (" 原始 member_id 去重: ", "raw_members"),
    (" canonical_member_id 去重: ", "canonical_members"),
    (" canonical 为 NULL/0: ", "null_canonical"),
    (" 被重映射的记录: ", "remapped"),
)
for caption, column in summary_layout:
    print(f"{caption}{mapping_summary[column]}")
# 4. Per canonical_member_id spend over the last 90 days and the last 30 days.
import statistics  # local import, mirroring this script's inline-import style

rows5 = q("""
WITH consume_source AS (
SELECT
COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
s.pay_time,
COALESCE(s.pay_amount, 0) AS pay_amount
FROM dwd.dwd_settlement_head s
LEFT JOIN dwd.dim_member_card_account mca
ON s.member_card_account_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = s.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE s.site_id = (SELECT DISTINCT site_id FROM dwd.dwd_settlement_head LIMIT 1)
AND s.settle_type IN (1, 3)
AND s.pay_time >= NOW() - INTERVAL '90 days'
)
SELECT
canonical_member_id,
SUM(pay_amount) AS spend_90,
SUM(CASE WHEN pay_time >= NOW() - INTERVAL '30 days' THEN pay_amount ELSE 0 END) AS spend_30
FROM consume_source
WHERE canonical_member_id > 0
GROUP BY canonical_member_id
""")

# Each row is one canonical member with at least one qualifying settlement in
# the 90-day window (the summed amount itself may still be zero).
total = len(rows5)
spend_30_vals = [float(rec['spend_30']) for rec in rows5]
spend_90_vals = [float(rec['spend_90']) for rec in rows5]
has_30 = sum(1 for amount in spend_30_vals if amount > 0)
no_30 = total - has_30


def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    # Guard the empty result set: the query can legitimately return no rows,
    # and an unguarded division would abort the remaining diagnostics.
    return part / whole * 100 if whole else 0.0


print("\nSPI 视角canonical_member_id:")
print(f" 90天有消费会员: {total}")
print(f" 30天有消费会员: {has_30} ({_pct(has_30, total):.1f}%)")
print(f" 30天无消费会员: {no_30} ({_pct(no_30, total):.1f}%)")

# 5. Medians. statistics.median averages the two middle values when the
#    count is even, whereas indexing at n // 2 would report the upper one.
median_30 = statistics.median(spend_30_vals) if spend_30_vals else 0
median_90 = statistics.median(spend_90_vals) if spend_90_vals else 0
print(f"\n spend_30 中位数: {median_30:.2f}")
print(f" spend_90 中位数: {median_90:.2f}")
# 6. Report the most recent timestamp found in ODS tables, as a proxy for how
#    far the API pull has progressed.
from psycopg2 import sql as pgsql  # local import, same driver the file already uses

print("\n" + "=" * 60)
print("ODS 各表最新数据时间:")
# NOTE(review): only the first five tables (alphabetical order) are inspected
# even though the heading reads as "every table" — confirm this is intended.
for r in rows3[:5]:
    tname = r['table_name']
    try:
        # The table name is passed as a bind value rather than pasted into
        # the SQL text, so quotes inside a name cannot break the statement.
        cols = q(
            """
SELECT column_name FROM information_schema.columns
WHERE table_schema = 'ods' AND table_name = %s
AND column_name IN ('pay_time', 'create_time', 'updated_at', 'etl_loaded_at')
ORDER BY column_name
""",
            (tname,),
        )
        if cols:
            # Alphabetically first candidate column wins (ORDER BY above).
            col = cols[0]['column_name']
            # Identifiers cannot be bind values; compose them with proper
            # quoting so mixed-case or reserved names still resolve.
            max_stmt = pgsql.SQL("SELECT MAX({col}) AS max_time FROM ods.{tbl}").format(
                col=pgsql.Identifier(col),
                tbl=pgsql.Identifier(tname),
            )
            maxr = q(max_stmt)
            # Tables whose column is entirely NULL (or empty) print nothing.
            if maxr and maxr[0]['max_time']:
                print(f" {tname}.{col}: {maxr[0]['max_time']}")
    except psycopg2.Error as e:
        # Best effort: a failing table is reported and the loop carries on.
        print(f" {tname}: 查询失败 ({e})")

conn.close()
print("\n诊断完成。")