Files
Neo-ZQYY/scripts/ops/_diagnose_etl_issues.py

173 lines
6.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""诊断 ETL 联调三个问题:
1. DWS_ASSISTANT_SALARY 为什么 ins=0
2. DWS_MEMBER_VISIT 唯一约束冲突原因
3. SPI 基数校准 WARNING 原因
"""
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# Load environment variables from the .env file found at parents[2] of this
# script's resolved path.
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# The connection string is mandatory; stop immediately when it is missing or empty.
PG_DSN = os.getenv("PG_DSN")
if not PG_DSN:
    raise RuntimeError("PG_DSN 未设置")

# The driver is imported after the DSN check, so the check above runs before
# psycopg2 is loaded.
import psycopg2
import psycopg2.extras

# One shared connection for the whole script, switched to autocommit mode.
conn = psycopg2.connect(PG_DSN)
conn.autocommit = True
def q(sql):
    """Run *sql* on the module-level connection and return every result row.

    The query is executed through a ``RealDictCursor``; callers in this script
    read the rows by column name. The cursor is closed before returning,
    including when the query raises.
    """
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cur.execute(sql)
        return cur.fetchall()
    finally:
        cur.close()
# Banner for problem 1 (source-data check for DWS_ASSISTANT_SALARY).
print("=" * 60, "问题 1: DWS_ASSISTANT_SALARY 源数据检查", "=" * 60, sep="\n")

# Check whether dws_assistant_monthly_summary holds any data: row count per
# stat_month, ordered by stat_month.
rows = q(
    "SELECT stat_month, COUNT(*) AS cnt "
    "FROM dws.dws_assistant_monthly_summary "
    "GROUP BY stat_month ORDER BY stat_month"
)
if not rows:
    print("dws_assistant_monthly_summary: 无数据!")
else:
    print("dws_assistant_monthly_summary 数据分布(按月):")
    for month_row in rows:
        print(f" {month_row['stat_month']}: {month_row['cnt']}")
# Check what is currently stored in salary_calc: row count per salary_month,
# ordered by salary_month.
rows2 = q(
    "SELECT salary_month, COUNT(*) AS cnt "
    "FROM dws.dws_assistant_salary_calc "
    "GROUP BY salary_month ORDER BY salary_month"
)
if not rows2:
    print("\ndws_assistant_salary_calc: 无数据")
else:
    print("\ndws_assistant_salary_calc 已有数据:")
    for salary_row in rows2:
        print(f" {salary_row['salary_month']}: {salary_row['cnt']}")
# Original author's note on the _should_skip_run logic: the ETL ran on 2/27,
# and day=27 > run_days=5, so the task was skipped.
# The conclusion below is fixed text; it is not derived from the query results
# printed above. The final empty entry emits the blank separator line.
for conclusion_line in (
    "\n结论: ETL 运行日期 2026-02-27day=27 > run_days(默认5)",
    " _should_skip_run() 返回 True任务被跳过这是设计行为。",
    " 工资计算仅在月初前5天运行可通过 dws.salary.run_days 配置)。",
    "",
):
    print(conclusion_line)
# Banner for problem 2 (unique-constraint conflict in DWS_MEMBER_VISIT).
print("=" * 60)
print("问题 2: DWS_MEMBER_VISIT 唯一约束冲突")
print("=" * 60)

# Check the source table for duplicated (site_id, member_id, order_settle_id) keys.
# Only the 10 largest groups are fetched for display. COUNT(*) OVER () is a
# window aggregate evaluated after GROUP BY/HAVING and before LIMIT, so every
# returned row also carries the total number of duplicated groups. Reporting
# len(rows3) instead would silently cap the figure at 10.
rows3 = q("""
    SELECT site_id, member_id, order_settle_id,
           COUNT(*) AS cnt,
           COUNT(*) OVER () AS total_groups
    FROM dwd.dwd_settlement_head
    WHERE member_id IS NOT NULL AND member_id != 0
    GROUP BY site_id, member_id, order_settle_id
    HAVING COUNT(*) > 1
    ORDER BY cnt DESC
    LIMIT 10
""")
if rows3:
    dup_group_total = rows3[0]["total_groups"]
    print(f"dwd_settlement_head 中有 {dup_group_total} 组重复 (site_id, member_id, order_settle_id):")
    for r in rows3:
        print(f" site={r['site_id']}, member={r['member_id']}, order={r['order_settle_id']}, cnt={r['cnt']}")
    # Make it explicit when the listing is only a sample of the duplicates.
    if dup_group_total > len(rows3):
        print(f" ... 仅显示前 {len(rows3)} 组")
else:
    print("dwd_settlement_head 中无重复 (site_id, member_id, order_settle_id)")
# Check for cross-window duplicates in the target table (original note:
# delete-before-insert removes rows by date window, but a single order may span
# windows). At most 10 duplicated key groups are fetched.
rows4 = q("""
SELECT site_id, member_id, order_settle_id, COUNT(*) AS cnt
FROM dws.dws_member_visit_detail
GROUP BY site_id, member_id, order_settle_id
HAVING COUNT(*) > 1
LIMIT 10
""")
if not rows4:
    print("\ndws_member_visit_detail 中无重复(当前数据干净)")
else:
    print("\ndws_member_visit_detail 中已有重复:")
    for dup in rows4:
        print(f" site={dup['site_id']}, member={dup['member_id']}, order={dup['order_settle_id']}, cnt={dup['cnt']}")
# Inspect the visit_date distribution to see whether any order crosses a window:
# one row per visit_date with its row count, ordered by visit_date.
rows5 = q("""
SELECT visit_date, COUNT(*) AS cnt
FROM dws.dws_member_visit_detail
GROUP BY visit_date
ORDER BY visit_date
""")
print(f"\ndws_member_visit_detail visit_date 分布: {len(rows5)} 个日期")
if rows5:
    # The result is ordered by visit_date, so the first and last rows are the
    # earliest and latest dates.
    earliest, latest = rows5[0], rows5[-1]
    print(f" 最早: {earliest['visit_date']} ({earliest['cnt']} 行)")
    print(f" 最晚: {latest['visit_date']} ({latest['cnt']} 行)")
    total = sum(day['cnt'] for day in rows5)
    print(f" 总计: {total}")
# Key check: does the same order_settle_id show up under more than one date?
# (Original note: the biz_date computation may push an order across windows.)
# pay_time::date is the date used for the comparison here.
# The message prints only a count, so the count must not be capped by LIMIT:
# COUNT(*) OVER () is evaluated after GROUP BY/HAVING and before LIMIT, and
# gives the total number of affected (order_settle_id, member_id) groups on
# every returned row.
rows6 = q("""
    WITH order_dates AS (
        SELECT order_settle_id, member_id, site_id,
               pay_time,
               pay_time::date AS pay_date
        FROM dwd.dwd_settlement_head
        WHERE member_id IS NOT NULL AND member_id != 0
    )
    SELECT order_settle_id, member_id,
           COUNT(DISTINCT pay_date) AS date_cnt,
           COUNT(*) OVER () AS total_orders
    FROM order_dates
    GROUP BY order_settle_id, member_id
    HAVING COUNT(DISTINCT pay_date) > 1
    LIMIT 5
""")
if rows6:
    print(f"\n同一 order 出现在多个 pay_date: {rows6[0]['total_orders']}")
else:
    print("\n同一 order 不跨日期")
# Check whether SCD2 makes one order map to several member_ids.
# At most 10 sample orders are listed. COUNT(*) OVER () is evaluated after
# GROUP BY/HAVING and before LIMIT, so it carries the real number of affected
# orders; len(rows7) would be capped at 10.
rows7 = q("""
    SELECT order_settle_id,
           COUNT(DISTINCT member_id) AS member_cnt,
           COUNT(*) OVER () AS total_orders
    FROM dwd.dwd_settlement_head
    WHERE member_id IS NOT NULL AND member_id != 0
    GROUP BY order_settle_id
    HAVING COUNT(DISTINCT member_id) > 1
    LIMIT 10
""")
if rows7:
    multi_member_total = rows7[0]["total_orders"]
    print(f"\n同一 order_settle_id 对应多个 member_id: {multi_member_total}")
    for r in rows7:
        print(f" order={r['order_settle_id']}, member_cnt={r['member_cnt']}")
    # Make it explicit when the listing is only a sample.
    if multi_member_total > len(rows7):
        print(f" ... 仅显示前 {len(rows7)} 组")
else:
    print("\n同一 order_settle_id 不对应多个 member_id")
# Blank line separating this section from the next banner.
print()
# Banner for problem 3 (SPI base calibration WARNING).
print("=" * 60, "问题 3: SPI 基数校准 WARNING", "=" * 60, sep="\n")

# Check the SPI source data: how many members have any spend, plus the median
# 30-day and 90-day spend across dws_member_consumption_summary.
rows8 = q("""
SELECT
COUNT(*) AS total_members,
COUNT(*) FILTER (WHERE spend_30d > 0) AS has_spend_30,
COUNT(*) FILTER (WHERE spend_90d > 0) AS has_spend_90,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY spend_30d) AS median_spend_30,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY spend_90d) AS median_spend_90
FROM dws.dws_member_consumption_summary
""")
if rows8:
    spend_summary = rows8[0]
    print("dws_member_consumption_summary:")
    # (label, result column) pairs, printed in this order.
    for label, column in (
        ("总会员数", "total_members"),
        ("30天有消费", "has_spend_30"),
        ("90天有消费", "has_spend_90"),
        ("30天消费中位数", "median_spend_30"),
        ("90天消费中位数", "median_spend_90"),
    ):
        print(f" {label}: {spend_summary[column]}")
# Check the SPI result table: row count and average display_score.
rows9 = q(
    "SELECT COUNT(*) AS cnt, AVG(display_score) AS avg_score "
    "FROM dws.dws_member_spending_power_index"
)
if rows9:
    spi_row = rows9[0]
    row_count, avg_score = spi_row["cnt"], spi_row["avg_score"]
    print(f"\ndws_member_spending_power_index: {row_count} 行, 平均分: {avg_score}")

# All checks are done: release the shared connection and report completion.
conn.close()
print("\n诊断完成。")