微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

View File

@@ -0,0 +1,172 @@
"""诊断 ETL 联调三个问题:
1. DWS_ASSISTANT_SALARY 为什么 ins=0
2. DWS_MEMBER_VISIT 唯一约束冲突原因
3. SPI 基数校准 WARNING 原因
"""
import os
import sys
from pathlib import Path

from dotenv import load_dotenv

# The .env file is looked up two directories above the folder that
# contains this script (parents[0] is the script's own folder).
_env_file = Path(__file__).resolve().parents[2] / ".env"
load_dotenv(_env_file)

# Connection string for PostgreSQL; the script refuses to run without it.
PG_DSN = os.environ.get("PG_DSN")
if not PG_DSN:
    raise RuntimeError("PG_DSN 未设置")

# The driver is imported only after the DSN check, so a missing DSN is
# reported before any driver import problem.
import psycopg2
import psycopg2.extras

# One shared connection for every query below. Autocommit is switched on,
# so no explicit commit/rollback is issued anywhere in this script.
conn = psycopg2.connect(PG_DSN)
conn.autocommit = True
def q(sql):
    """Run ``sql`` on the module-level connection and return all rows.

    The cursor is created with ``RealDictCursor``, so each returned row is
    read by column name (e.g. ``row["cnt"]``).

    Args:
        sql: A complete SQL statement; it is executed as-is, without
            bound parameters.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the statement.
    """
    dict_cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    with dict_cursor as cur:
        cur.execute(sql)
        fetched = cur.fetchall()
    return fetched
# ---------------------------------------------------------------------------
# Issue 1: source data behind DWS_ASSISTANT_SALARY
# ---------------------------------------------------------------------------
section_rule = "=" * 60
print(section_rule)
print("问题 1: DWS_ASSISTANT_SALARY 源数据检查")
print(section_rule)

# Does dws_assistant_monthly_summary hold any data? Count rows per month.
monthly_summary_sql = (
    "SELECT stat_month, COUNT(*) AS cnt "
    "FROM dws.dws_assistant_monthly_summary "
    "GROUP BY stat_month ORDER BY stat_month"
)
summary_by_month = q(monthly_summary_sql)
if not summary_by_month:
    print("dws_assistant_monthly_summary: 无数据!")
else:
    print("dws_assistant_monthly_summary 数据分布(按月):")
    for month_row in summary_by_month:
        print(f" {month_row['stat_month']}: {month_row['cnt']}")

# What is currently stored in the salary_calc table? Count rows per month.
salary_calc_sql = (
    "SELECT salary_month, COUNT(*) AS cnt "
    "FROM dws.dws_assistant_salary_calc "
    "GROUP BY salary_month ORDER BY salary_month"
)
salary_by_month = q(salary_calc_sql)
if not salary_by_month:
    print("\ndws_assistant_salary_calc: 无数据")
else:
    print("\ndws_assistant_salary_calc 已有数据:")
    for month_row in salary_by_month:
        print(f" {month_row['salary_month']}: {month_row['cnt']}")

# Author's reasoning about _should_skip_run: the ETL ran on 2/27, and
# day=27 > run_days=5, so the task is skipped.
# NOTE: the conclusion below is fixed text; it is not derived from the
# two query results above.
for conclusion_line in (
    "\n结论: ETL 运行日期 2026-02-27day=27 > run_days(默认5)",
    " _should_skip_run() 返回 True任务被跳过这是设计行为。",
    " 工资计算仅在月初前5天运行可通过 dws.salary.run_days 配置)。",
):
    print(conclusion_line)
print()
# ---------------------------------------------------------------------------
# Issue 2: unique-constraint conflict in DWS_MEMBER_VISIT
# ---------------------------------------------------------------------------
section_rule = "=" * 60
print(section_rule)
print("问题 2: DWS_MEMBER_VISIT 唯一约束冲突")
print(section_rule)


def _describe_duplicate(row):
    """Format one duplicate-key row (site, member, order, count) for output."""
    return (
        f" site={row['site_id']}, member={row['member_id']}, "
        f"order={row['order_settle_id']}, cnt={row['cnt']}"
    )


# Are there duplicate (site_id, member_id, order_settle_id) keys in the
# source table? The query is capped by LIMIT 10, so the printed group
# count can never exceed 10.
settlement_dup_sql = """
SELECT site_id, member_id, order_settle_id, COUNT(*) AS cnt
FROM dwd.dwd_settlement_head
WHERE member_id IS NOT NULL AND member_id != 0
GROUP BY site_id, member_id, order_settle_id
HAVING COUNT(*) > 1
ORDER BY cnt DESC
LIMIT 10
"""
settlement_dups = q(settlement_dup_sql)
if not settlement_dups:
    print("dwd_settlement_head 中无重复 (site_id, member_id, order_settle_id)")
else:
    print(f"dwd_settlement_head 中有 {len(settlement_dups)} 组重复 (site_id, member_id, order_settle_id):")
    for dup_row in settlement_dups:
        print(_describe_duplicate(dup_row))

# Could this be a cross-window duplicate? (Author's note: delete-before-insert
# removes rows by date window, but one order may span windows.) Look for the
# same key appearing more than once in the target table.
visit_dup_sql = """
SELECT site_id, member_id, order_settle_id, COUNT(*) AS cnt
FROM dws.dws_member_visit_detail
GROUP BY site_id, member_id, order_settle_id
HAVING COUNT(*) > 1
LIMIT 10
"""
visit_dups = q(visit_dup_sql)
if not visit_dups:
    print("\ndws_member_visit_detail 中无重复(当前数据干净)")
else:
    print("\ndws_member_visit_detail 中已有重复:")
    for dup_row in visit_dups:
        print(_describe_duplicate(dup_row))

# Distribution of visit_date, to see whether any order crosses a window.
visit_date_sql = """
SELECT visit_date, COUNT(*) AS cnt
FROM dws.dws_member_visit_detail
GROUP BY visit_date
ORDER BY visit_date
"""
visit_days = q(visit_date_sql)
print(f"\ndws_member_visit_detail visit_date 分布: {len(visit_days)} 个日期")
if visit_days:
    # Rows are ordered by visit_date, so the ends are the earliest/latest day.
    earliest_day, latest_day = visit_days[0], visit_days[-1]
    print(f" 最早: {earliest_day['visit_date']} ({earliest_day['cnt']} 行)")
    print(f" 最晚: {latest_day['visit_date']} ({latest_day['cnt']} 行)")
    visit_total = sum(day_row["cnt"] for day_row in visit_days)
    print(f" 总计: {visit_total}")

# Key check: does one order_settle_id fall on more than one date? (Author's
# note: the biz_date computation might push an order across windows.) The
# query compares distinct pay_time::date values per (order, member) and is
# capped by LIMIT 5.
multi_date_sql = """
WITH order_dates AS (
SELECT order_settle_id, member_id, site_id,
pay_time,
pay_time::date AS pay_date
FROM dwd.dwd_settlement_head
WHERE member_id IS NOT NULL AND member_id != 0
)
SELECT order_settle_id, member_id, COUNT(DISTINCT pay_date) AS date_cnt
FROM order_dates
GROUP BY order_settle_id, member_id
HAVING COUNT(DISTINCT pay_date) > 1
LIMIT 5
"""
multi_date_orders = q(multi_date_sql)
if not multi_date_orders:
    print("\n同一 order 不跨日期")
else:
    print(f"\n同一 order 出现在多个 pay_date: {len(multi_date_orders)}")

# Author's hypothesis: SCD2 may map one order to several member_ids.
# Count distinct member_id values per order_settle_id (capped by LIMIT 10).
multi_member_sql = """
SELECT order_settle_id, COUNT(DISTINCT member_id) AS member_cnt
FROM dwd.dwd_settlement_head
WHERE member_id IS NOT NULL AND member_id != 0
GROUP BY order_settle_id
HAVING COUNT(DISTINCT member_id) > 1
LIMIT 10
"""
multi_member_orders = q(multi_member_sql)
if not multi_member_orders:
    print("\n同一 order_settle_id 不对应多个 member_id")
else:
    print(f"\n同一 order_settle_id 对应多个 member_id: {len(multi_member_orders)}")
    for order_row in multi_member_orders:
        print(f" order={order_row['order_settle_id']}, member_cnt={order_row['member_cnt']}")
print()
# ---------------------------------------------------------------------------
# Issue 3: SPI base calibration WARNING
# ---------------------------------------------------------------------------
section_rule = "=" * 60
print(section_rule)
print("问题 3: SPI 基数校准 WARNING")
print(section_rule)

# SPI source data: how many members have any spend, plus the median spend
# over the 30-day and 90-day columns.
consumption_sql = """
SELECT
COUNT(*) AS total_members,
COUNT(*) FILTER (WHERE spend_30d > 0) AS has_spend_30,
COUNT(*) FILTER (WHERE spend_90d > 0) AS has_spend_90,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY spend_30d) AS median_spend_30,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY spend_90d) AS median_spend_90
FROM dws.dws_member_consumption_summary
"""
consumption_rows = q(consumption_sql)
if consumption_rows:
    stats = consumption_rows[0]
    print("dws_member_consumption_summary:")
    # (printed label, result column) pairs, in output order.
    for label, column in (
        ("总会员数", "total_members"),
        ("30天有消费", "has_spend_30"),
        ("90天有消费", "has_spend_90"),
        ("30天消费中位数", "median_spend_30"),
        ("90天消费中位数", "median_spend_90"),
    ):
        print(f" {label}: {stats[column]}")

# SPI result table: row count and average display_score.
spi_sql = (
    "SELECT COUNT(*) AS cnt, AVG(display_score) AS avg_score "
    "FROM dws.dws_member_spending_power_index"
)
spi_rows = q(spi_sql)
if spi_rows:
    spi_summary = spi_rows[0]
    print(f"\ndws_member_spending_power_index: {spi_summary['cnt']} 行, 平均分: {spi_summary['avg_score']}")

# All diagnostics are done; release the shared connection.
conn.close()
print("\n诊断完成。")