#!/usr/bin/env python3 """ 最终根因分析:ODS 重复数据和 DWD 处理逻辑 """ import os import psycopg2 from datetime import datetime from dotenv import load_dotenv def main(): # 加载环境变量 load_dotenv() test_db_dsn = os.environ.get('TEST_DB_DSN') system_log_root = os.environ.get('SYSTEM_LOG_ROOT') if not test_db_dsn or not system_log_root: raise RuntimeError("环境变量未设置") print("🚨 最终根因分析") print("=" * 50) with psycopg2.connect(test_db_dsn) as conn: with conn.cursor() as cur: # 1. 深入分析 ODS 重复数据 print("\n📊 1. ODS 重复数据分析") cur.execute(""" SELECT id, COUNT(*) as duplicate_count, ARRAY_AGG(DISTINCT paytime ORDER BY paytime) as pay_times, ARRAY_AGG(DISTINCT payamount ORDER BY payamount) as pay_amounts FROM ods.settlement_records WHERE paytime::date BETWEEN '2026-02-10' AND '2026-02-14' GROUP BY id HAVING COUNT(*) > 1 ORDER BY duplicate_count DESC LIMIT 10 """) duplicates = cur.fetchall() print(f"发现 {len(duplicates)} 个重复的订单ID (样本):") for oid, count, times, amounts in duplicates: print(f" ID {oid}: 重复 {count} 次") print(f" 时间: {times}") print(f" 金额: {amounts}") # 统计重复情况 cur.execute(""" SELECT COUNT(*) as duplicate_count, COUNT(DISTINCT id) as unique_ids FROM ods.settlement_records WHERE paytime::date BETWEEN '2026-02-10' AND '2026-02-14' """) dup_stats = cur.fetchone() print(f"\n重复统计 (2026-02-10 到 2026-02-14):") print(f" 总记录数: {dup_stats[0]:,}") print(f" 唯一ID数: {dup_stats[1]:,}") print(f" 重复倍数: {dup_stats[0] / dup_stats[1]:.2f}") # 2. 检查 DWD 如何处理重复数据 print("\n🔄 2. DWD 重复处理策略") # 检查 DWD 是否有重复的 order_settle_id cur.execute(""" SELECT order_settle_id, COUNT(*) as count FROM dwd.dwd_settlement_head GROUP BY order_settle_id HAVING COUNT(*) > 1 LIMIT 5 """) dwd_duplicates = cur.fetchall() if dwd_duplicates: print("DWD 中的重复记录:") for oid, count in dwd_duplicates: print(f" ID {oid}: {count} 次") else: print("DWD 中无重复记录 - 说明 DWD 有去重逻辑") # 3. 分析历史数据处理情况 print("\n📈 3. 历史数据处理分析") cur.execute(""" WITH monthly_stats AS ( SELECT DATE_TRUNC('month', paytime) as month, COUNT(*) as ods_count, COUNT(DISTINCT id) as ods_unique FROM ods.settlement_records GROUP BY DATE_TRUNC('month', paytime) ), dwd_monthly_stats AS ( SELECT DATE_TRUNC('month', pay_time) as month, COUNT(*) as dwd_count, COUNT(DISTINCT order_settle_id) as dwd_unique FROM dwd.dwd_settlement_head GROUP BY DATE_TRUNC('month', pay_time) ) SELECT o.month, o.ods_count, o.ods_unique, COALESCE(d.dwd_count, 0) as dwd_count, COALESCE(d.dwd_unique, 0) as dwd_unique, o.ods_count - COALESCE(d.dwd_count, 0) as missing_records, o.ods_unique - COALESCE(d.dwd_unique, 0) as missing_unique FROM monthly_stats o LEFT JOIN dwd_monthly_stats d ON o.month = d.month ORDER BY o.month DESC LIMIT 6 """) monthly_data = cur.fetchall() print("按月数据处理情况:") for month, ods_count, ods_unique, dwd_count, dwd_unique, missing_records, missing_unique in monthly_data: print(f" {month.strftime('%Y-%m')}:") print(f" ODS: {ods_count:,} 条 ({ods_unique:,} 唯一)") print(f" DWD: {dwd_count:,} 条 ({dwd_unique:,} 唯一)") print(f" 缺失: {missing_records:,} 条 ({missing_unique:,} 唯一)") if ods_unique > 0: coverage = (dwd_unique / ods_unique) * 100 print(f" 覆盖率: {coverage:.1f}%") # 4. 检查 SPI 计算基础数据 print("\n💰 4. SPI 计算基础数据验证") # 重新计算会员消费统计,使用正确的逻辑 cur.execute(""" WITH member_consumption AS ( SELECT member_id, COUNT(*) as order_count_30d, SUM(pay_amount) as total_amount_30d, AVG(pay_amount) as avg_amount_30d, PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY pay_amount) as median_amount_30d FROM dwd.dwd_settlement_head WHERE member_id > 0 AND pay_time >= CURRENT_DATE - INTERVAL '30 days' AND pay_amount > 0 -- 排除零和负数消费 GROUP BY member_id ) SELECT COUNT(*) as active_members_30d, AVG(total_amount_30d) as avg_total_30d, PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_amount_30d) as median_total_30d, AVG(avg_amount_30d) as avg_per_order_30d, PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY avg_amount_30d) as median_per_order_30d FROM member_consumption """) spi_stats = cur.fetchone() if spi_stats and spi_stats[0] > 0: print("修正后的 SPI 基础数据 (近30天,排除零消费):") print(f" 活跃会员数: {spi_stats[0]:,}") print(f" 平均总消费: {spi_stats[1]:.2f}") print(f" 中位数总消费: {spi_stats[2]:.2f}") print(f" 平均单次消费: {spi_stats[3]:.2f}") print(f" 中位数单次消费: {spi_stats[4]:.2f}") else: print("近30天无有效消费数据") # 5. 检查数据质量问题 print("\n🔍 5. 数据质量问题检查") # 检查负数和零消费 cur.execute(""" SELECT CASE WHEN pay_amount < 0 THEN '负数消费' WHEN pay_amount = 0 THEN '零消费' WHEN pay_amount > 0 AND pay_amount <= 10 THEN '小额消费(≤10)' WHEN pay_amount > 10 THEN '正常消费(>10)' END as amount_category, COUNT(*) as record_count, COUNT(DISTINCT member_id) as member_count, AVG(pay_amount) as avg_amount FROM dwd.dwd_settlement_head WHERE pay_time >= CURRENT_DATE - INTERVAL '90 days' GROUP BY CASE WHEN pay_amount < 0 THEN '负数消费' WHEN pay_amount = 0 THEN '零消费' WHEN pay_amount > 0 AND pay_amount <= 10 THEN '小额消费(≤10)' WHEN pay_amount > 10 THEN '正常消费(>10)' END ORDER BY record_count DESC """) quality_stats = cur.fetchall() print("90天内消费金额质量分析:") for category, record_count, member_count, avg_amount in quality_stats: print(f" {category}: {record_count:,} 条, {member_count:,} 会员, 平均 {avg_amount:.2f}") # 生成最终报告 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") report_path = f"{system_log_root}/final_root_cause_analysis_{timestamp}.md" report_content = f"""# ETL 数据问题最终根因分析报告 **生成时间**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")} ## 🎯 核心发现 ### 1. ODS 数据重复问题 - ODS 表中存在大量重复记录,每个订单ID平均重复2次 - 这导致 ODS 记录数看起来是实际订单数的2倍 ### 2. DWD 去重处理 - DWD 层正确实现了去重逻辑,每个 order_settle_id 只保留一条记录 - 这解释了为什么 DWD 记录数约为 ODS 的50% ### 3. 历史数据缺失 - 总体上 DWD 缺失约60%的历史数据 - 这可能是由于历史 ETL 执行不完整导致的 ### 4. SPI 警告根因 - 大量零消费和负数消费记录影响了中位数计算 - 近30天活跃会员数量极少,导致统计基数不足 ## 🔧 解决建议 1. **数据修复**: 运行完整的历史数据回填 2. **SPI 优化**: 在计算中排除零消费和负数消费 3. **监控改进**: 建立 ETL 数据完整性监控 4. **质量控制**: 加强数据质量检查和清洗 ## 📊 影响评估 - **数据完整性**: 需要修复历史缺失数据 - **SPI 准确性**: 需要优化计算逻辑 - **业务影响**: 当前 SPI 指标可能不准确 """ os.makedirs(os.path.dirname(report_path), exist_ok=True) with open(report_path, 'w', encoding='utf-8') as f: f.write(report_content) print(f"\n📝 最终根因分析报告: {report_path}") if __name__ == "__main__": main()