微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

View File

@@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""
最终根因分析ODS 重复数据和 DWD 处理逻辑
"""
import os
import psycopg2
from datetime import datetime
from dotenv import load_dotenv
def _analyze_ods_duplicates(cur):
    """Section 1: sample duplicated order IDs in the ODS window and print duplication stats.

    Fix: the duplication-ratio print is now guarded so an empty date window
    no longer raises ZeroDivisionError (mirrors the coverage guard in
    _monthly_coverage).
    """
    print("\n📊 1. ODS 重复数据分析")
    cur.execute("""
        SELECT
            id,
            COUNT(*) as duplicate_count,
            ARRAY_AGG(DISTINCT paytime ORDER BY paytime) as pay_times,
            ARRAY_AGG(DISTINCT payamount ORDER BY payamount) as pay_amounts
        FROM ods.settlement_records
        WHERE paytime::date BETWEEN '2026-02-10' AND '2026-02-14'
        GROUP BY id
        HAVING COUNT(*) > 1
        ORDER BY duplicate_count DESC
        LIMIT 10
    """)
    duplicates = cur.fetchall()
    print(f"发现 {len(duplicates)} 个重复的订单ID (样本):")
    for oid, count, times, amounts in duplicates:
        print(f" ID {oid}: 重复 {count}")
        print(f" 时间: {times}")
        print(f" 金额: {amounts}")
    # Overall duplication ratio for the same date window.
    cur.execute("""
        SELECT
            COUNT(*) as duplicate_count,
            COUNT(DISTINCT id) as unique_ids
        FROM ods.settlement_records
        WHERE paytime::date BETWEEN '2026-02-10' AND '2026-02-14'
    """)
    dup_stats = cur.fetchone()
    print("\n重复统计 (2026-02-10 到 2026-02-14):")
    print(f" 总记录数: {dup_stats[0]:,}")
    print(f" 唯一ID数: {dup_stats[1]:,}")
    if dup_stats[1]:  # an empty window would otherwise divide by zero
        print(f" 重复倍数: {dup_stats[0] / dup_stats[1]:.2f}")


def _check_dwd_dedup(cur):
    """Section 2: check whether DWD keeps duplicated order_settle_id rows.

    An empty result means the DWD layer already deduplicates.
    """
    print("\n🔄 2. DWD 重复处理策略")
    cur.execute("""
        SELECT
            order_settle_id,
            COUNT(*) as count
        FROM dwd.dwd_settlement_head
        GROUP BY order_settle_id
        HAVING COUNT(*) > 1
        LIMIT 5
    """)
    dwd_duplicates = cur.fetchall()
    if dwd_duplicates:
        print("DWD 中的重复记录:")
        for oid, count in dwd_duplicates:
            print(f" ID {oid}: {count}")
    else:
        print("DWD 中无重复记录 - 说明 DWD 有去重逻辑")


def _monthly_coverage(cur):
    """Section 3: compare monthly ODS vs DWD record/unique counts (last 6 months).

    Missing counts are computed against COALESCE(…, 0) so months absent from
    DWD still appear with an explicit gap.
    """
    print("\n📈 3. 历史数据处理分析")
    cur.execute("""
        WITH monthly_stats AS (
            SELECT
                DATE_TRUNC('month', paytime) as month,
                COUNT(*) as ods_count,
                COUNT(DISTINCT id) as ods_unique
            FROM ods.settlement_records
            GROUP BY DATE_TRUNC('month', paytime)
        ),
        dwd_monthly_stats AS (
            SELECT
                DATE_TRUNC('month', pay_time) as month,
                COUNT(*) as dwd_count,
                COUNT(DISTINCT order_settle_id) as dwd_unique
            FROM dwd.dwd_settlement_head
            GROUP BY DATE_TRUNC('month', pay_time)
        )
        SELECT
            o.month,
            o.ods_count,
            o.ods_unique,
            COALESCE(d.dwd_count, 0) as dwd_count,
            COALESCE(d.dwd_unique, 0) as dwd_unique,
            o.ods_count - COALESCE(d.dwd_count, 0) as missing_records,
            o.ods_unique - COALESCE(d.dwd_unique, 0) as missing_unique
        FROM monthly_stats o
        LEFT JOIN dwd_monthly_stats d ON o.month = d.month
        ORDER BY o.month DESC
        LIMIT 6
    """)
    monthly_data = cur.fetchall()
    print("按月数据处理情况:")
    for month, ods_count, ods_unique, dwd_count, dwd_unique, missing_records, missing_unique in monthly_data:
        print(f" {month.strftime('%Y-%m')}:")
        print(f" ODS: {ods_count:,} 条 ({ods_unique:,} 唯一)")
        print(f" DWD: {dwd_count:,} 条 ({dwd_unique:,} 唯一)")
        print(f" 缺失: {missing_records:,} 条 ({missing_unique:,} 唯一)")
        if ods_unique > 0:  # guard against division by zero
            coverage = (dwd_unique / ods_unique) * 100
            print(f" 覆盖率: {coverage:.1f}%")


def _spi_base_stats(cur):
    """Section 4: recompute 30-day member consumption stats for SPI validation.

    Zero and negative pay amounts are excluded so medians/averages reflect
    real spending only.
    """
    print("\n💰 4. SPI 计算基础数据验证")
    cur.execute("""
        WITH member_consumption AS (
            SELECT
                member_id,
                COUNT(*) as order_count_30d,
                SUM(pay_amount) as total_amount_30d,
                AVG(pay_amount) as avg_amount_30d,
                PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY pay_amount) as median_amount_30d
            FROM dwd.dwd_settlement_head
            WHERE member_id > 0
            AND pay_time >= CURRENT_DATE - INTERVAL '30 days'
            AND pay_amount > 0 -- exclude zero and negative consumption
            GROUP BY member_id
        )
        SELECT
            COUNT(*) as active_members_30d,
            AVG(total_amount_30d) as avg_total_30d,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_amount_30d) as median_total_30d,
            AVG(avg_amount_30d) as avg_per_order_30d,
            PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY avg_amount_30d) as median_per_order_30d
        FROM member_consumption
    """)
    spi_stats = cur.fetchone()
    if spi_stats and spi_stats[0] > 0:
        print("修正后的 SPI 基础数据 (近30天排除零消费):")
        print(f" 活跃会员数: {spi_stats[0]:,}")
        print(f" 平均总消费: {spi_stats[1]:.2f}")
        print(f" 中位数总消费: {spi_stats[2]:.2f}")
        print(f" 平均单次消费: {spi_stats[3]:.2f}")
        print(f" 中位数单次消费: {spi_stats[4]:.2f}")
    else:
        print("近30天无有效消费数据")


def _quality_check(cur):
    """Section 5: bucket the last 90 days of pay amounts into quality categories."""
    print("\n🔍 5. 数据质量问题检查")
    # Categorize negative, zero, small and normal consumption.
    cur.execute("""
        SELECT
            CASE
                WHEN pay_amount < 0 THEN '负数消费'
                WHEN pay_amount = 0 THEN '零消费'
                WHEN pay_amount > 0 AND pay_amount <= 10 THEN '小额消费(≤10)'
                WHEN pay_amount > 10 THEN '正常消费(>10)'
            END as amount_category,
            COUNT(*) as record_count,
            COUNT(DISTINCT member_id) as member_count,
            AVG(pay_amount) as avg_amount
        FROM dwd.dwd_settlement_head
        WHERE pay_time >= CURRENT_DATE - INTERVAL '90 days'
        GROUP BY
            CASE
                WHEN pay_amount < 0 THEN '负数消费'
                WHEN pay_amount = 0 THEN '零消费'
                WHEN pay_amount > 0 AND pay_amount <= 10 THEN '小额消费(≤10)'
                WHEN pay_amount > 10 THEN '正常消费(>10)'
            END
        ORDER BY record_count DESC
    """)
    quality_stats = cur.fetchall()
    print("90天内消费金额质量分析:")
    for category, record_count, member_count, avg_amount in quality_stats:
        print(f" {category}: {record_count:,} 条, {member_count:,} 会员, 平均 {avg_amount:.2f}")


def _write_report(system_log_root):
    """Write the markdown root-cause report under *system_log_root*.

    Returns the path of the generated file.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = f"{system_log_root}/final_root_cause_analysis_{timestamp}.md"
    report_content = f"""# ETL 数据问题最终根因分析报告
**生成时间**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
## 🎯 核心发现
### 1. ODS 数据重复问题
- ODS 表中存在大量重复记录每个订单ID平均重复2次
- 这导致 ODS 记录数看起来是实际订单数的2倍
### 2. DWD 去重处理
- DWD 层正确实现了去重逻辑,每个 order_settle_id 只保留一条记录
- 这解释了为什么 DWD 记录数约为 ODS 的50%
### 3. 历史数据缺失
- 总体上 DWD 缺失约60%的历史数据
- 这可能是由于历史 ETL 执行不完整导致的
### 4. SPI 警告根因
- 大量零消费和负数消费记录影响了中位数计算
- 近30天活跃会员数量极少导致统计基数不足
## 🔧 解决建议
1. **数据修复**: 运行完整的历史数据回填
2. **SPI 优化**: 在计算中排除零消费和负数消费
3. **监控改进**: 建立 ETL 数据完整性监控
4. **质量控制**: 加强数据质量检查和清洗
## 📊 影响评估
- **数据完整性**: 需要修复历史缺失数据
- **SPI 准确性**: 需要优化计算逻辑
- **业务影响**: 当前 SPI 指标可能不准确
"""
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report_content)
    return report_path


def main():
    """Run the final root-cause analysis of ODS duplicates and DWD processing.

    Reads TEST_DB_DSN and SYSTEM_LOG_ROOT from the environment (via .env),
    runs five diagnostic sections against the test database, and writes a
    markdown report under SYSTEM_LOG_ROOT.

    Raises:
        RuntimeError: if either required environment variable is unset.
    """
    # Load environment variables from a local .env file, if present.
    load_dotenv()
    test_db_dsn = os.environ.get('TEST_DB_DSN')
    system_log_root = os.environ.get('SYSTEM_LOG_ROOT')
    if not test_db_dsn or not system_log_root:
        raise RuntimeError("环境变量未设置")
    print("🚨 最终根因分析")
    print("=" * 50)
    # NOTE: psycopg2's "with conn" commits/rolls back but does not close the
    # connection; all sections share one read-only cursor.
    with psycopg2.connect(test_db_dsn) as conn:
        with conn.cursor() as cur:
            _analyze_ods_duplicates(cur)
            _check_dwd_dedup(cur)
            _monthly_coverage(cur)
            _spi_base_stats(cur)
            _quality_check(cur)
    report_path = _write_report(system_log_root)
    print(f"\n📝 最终根因分析报告: {report_path}")


if __name__ == "__main__":
    main()