194 lines
8.0 KiB
Python
194 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
重新验证 ODS 和 DWD 数据一致性,查明之前分析的矛盾
|
|
"""
|
|
|
|
import os
|
|
import psycopg2
|
|
from datetime import datetime
|
|
from dotenv import load_dotenv
|
|
|
|
def main():
|
|
# 加载环境变量
|
|
load_dotenv()
|
|
|
|
test_db_dsn = os.environ.get('TEST_DB_DSN')
|
|
if not test_db_dsn:
|
|
raise RuntimeError("TEST_DB_DSN 环境变量未设置")
|
|
|
|
print("🔍 重新验证数据一致性")
|
|
print("=" * 40)
|
|
|
|
with psycopg2.connect(test_db_dsn) as conn:
|
|
with conn.cursor() as cur:
|
|
|
|
# 1. 重新检查 ODS 和 DWD 的记录数对比
|
|
print("\n📊 1. 重新统计 ODS vs DWD 记录数")
|
|
|
|
# 检查具体日期的详细情况
|
|
test_dates = ['2026-02-13', '2026-02-12', '2026-02-11']
|
|
|
|
for test_date in test_dates:
|
|
print(f"\n--- {test_date} 详细分析 ---")
|
|
|
|
# ODS 记录数
|
|
cur.execute("""
|
|
SELECT COUNT(*) FROM ods.settlement_records
|
|
WHERE paytime::date = %s
|
|
""", (test_date,))
|
|
ods_count = cur.fetchone()[0]
|
|
|
|
# DWD 记录数
|
|
cur.execute("""
|
|
SELECT COUNT(*) FROM dwd.dwd_settlement_head
|
|
WHERE pay_time::date = %s
|
|
""", (test_date,))
|
|
dwd_count = cur.fetchone()[0]
|
|
|
|
print(f"ODS: {ods_count:,} 条")
|
|
print(f"DWD: {dwd_count:,} 条")
|
|
print(f"差异: {ods_count - dwd_count:,} 条")
|
|
|
|
if ods_count != dwd_count:
|
|
# 查找缺失的记录
|
|
cur.execute("""
|
|
SELECT o.id, o.paytime, o.payamount, o.settlestatus, o.settletype
|
|
FROM ods.settlement_records o
|
|
LEFT JOIN dwd.dwd_settlement_head d ON o.id = d.order_settle_id
|
|
WHERE o.paytime::date = %s AND d.order_settle_id IS NULL
|
|
LIMIT 5
|
|
""", (test_date,))
|
|
|
|
missing_records = cur.fetchall()
|
|
if missing_records:
|
|
print("缺失记录样本:")
|
|
for record in missing_records:
|
|
print(f" ID: {record[0]}, 时间: {record[1]}, 金额: {record[2]}")
|
|
|
|
# 查找多余的记录
|
|
cur.execute("""
|
|
SELECT d.order_settle_id, d.pay_time, d.pay_amount
|
|
FROM dwd.dwd_settlement_head d
|
|
LEFT JOIN ods.settlement_records o ON d.order_settle_id = o.id
|
|
WHERE d.pay_time::date = %s AND o.id IS NULL
|
|
LIMIT 5
|
|
""", (test_date,))
|
|
|
|
extra_records = cur.fetchall()
|
|
if extra_records:
|
|
print("多余记录样本:")
|
|
for record in extra_records:
|
|
print(f" ID: {record[0]}, 时间: {record[1]}, 金额: {record[2]}")
|
|
|
|
# 2. 检查数据类型和字段映射
|
|
print(f"\n🔍 2. 检查字段映射和数据类型")
|
|
|
|
# 检查 ODS 表结构
|
|
cur.execute("""
|
|
SELECT column_name, data_type, is_nullable
|
|
FROM information_schema.columns
|
|
WHERE table_schema = 'ods' AND table_name = 'settlement_records'
|
|
AND column_name IN ('id', 'paytime', 'payamount', 'memberid')
|
|
ORDER BY column_name
|
|
""")
|
|
|
|
ods_columns = cur.fetchall()
|
|
print("ODS 关键字段:")
|
|
for col_name, data_type, nullable in ods_columns:
|
|
print(f" {col_name}: {data_type} ({'NULL' if nullable == 'YES' else 'NOT NULL'})")
|
|
|
|
# 检查 DWD 表结构
|
|
cur.execute("""
|
|
SELECT column_name, data_type, is_nullable
|
|
FROM information_schema.columns
|
|
WHERE table_schema = 'dwd' AND table_name = 'dwd_settlement_head'
|
|
AND column_name IN ('order_settle_id', 'pay_time', 'pay_amount', 'member_id')
|
|
ORDER BY column_name
|
|
""")
|
|
|
|
dwd_columns = cur.fetchall()
|
|
print("DWD 关键字段:")
|
|
for col_name, data_type, nullable in dwd_columns:
|
|
print(f" {col_name}: {data_type} ({'NULL' if nullable == 'YES' else 'NOT NULL'})")
|
|
|
|
# 3. 检查是否有数据转换问题
|
|
print(f"\n🔄 3. 检查数据转换问题")
|
|
|
|
# 检查 ID 映射
|
|
cur.execute("""
|
|
SELECT
|
|
COUNT(*) as total_ods,
|
|
COUNT(DISTINCT o.id) as unique_ods_ids,
|
|
COUNT(d.order_settle_id) as matched_dwd,
|
|
COUNT(DISTINCT d.order_settle_id) as unique_dwd_ids
|
|
FROM ods.settlement_records o
|
|
LEFT JOIN dwd.dwd_settlement_head d ON o.id = d.order_settle_id
|
|
WHERE o.paytime::date BETWEEN '2026-02-10' AND '2026-02-14'
|
|
""")
|
|
|
|
id_mapping = cur.fetchone()
|
|
print("ID 映射统计:")
|
|
print(f" ODS 总记录: {id_mapping[0]:,}")
|
|
print(f" ODS 唯一ID: {id_mapping[1]:,}")
|
|
print(f" DWD 匹配记录: {id_mapping[2]:,}")
|
|
print(f" DWD 唯一ID: {id_mapping[3]:,}")
|
|
|
|
# 4. 检查最新的 ETL 处理时间
|
|
print(f"\n⏰ 4. 检查 ETL 处理时间戳")
|
|
|
|
# 检查 DWD 表是否有处理时间戳字段
|
|
cur.execute("""
|
|
SELECT column_name
|
|
FROM information_schema.columns
|
|
WHERE table_schema = 'dwd' AND table_name = 'dwd_settlement_head'
|
|
AND (column_name LIKE '%created%' OR column_name LIKE '%updated%' OR column_name LIKE '%processed%')
|
|
""")
|
|
|
|
timestamp_columns = cur.fetchall()
|
|
if timestamp_columns:
|
|
print("发现时间戳字段:")
|
|
for col in timestamp_columns:
|
|
print(f" {col[0]}")
|
|
else:
|
|
print("未发现时间戳字段")
|
|
|
|
# 5. 重新检查总体数据量
|
|
print(f"\n📈 5. 总体数据量对比")
|
|
|
|
cur.execute("SELECT COUNT(*) FROM ods.settlement_records")
|
|
total_ods = cur.fetchone()[0]
|
|
|
|
cur.execute("SELECT COUNT(*) FROM dwd.dwd_settlement_head")
|
|
total_dwd = cur.fetchone()[0]
|
|
|
|
print(f"ODS 总记录数: {total_ods:,}")
|
|
print(f"DWD 总记录数: {total_dwd:,}")
|
|
print(f"总体差异: {total_ods - total_dwd:,} 条")
|
|
|
|
if total_ods != total_dwd:
|
|
# 查找全局缺失模式
|
|
cur.execute("""
|
|
SELECT
|
|
CASE
|
|
WHEN o.id IS NULL THEN 'DWD多余'
|
|
WHEN d.order_settle_id IS NULL THEN 'ODS缺失'
|
|
ELSE '匹配'
|
|
END as status,
|
|
COUNT(*) as count
|
|
FROM ods.settlement_records o
|
|
FULL OUTER JOIN dwd.dwd_settlement_head d ON o.id = d.order_settle_id
|
|
GROUP BY
|
|
CASE
|
|
WHEN o.id IS NULL THEN 'DWD多余'
|
|
WHEN d.order_settle_id IS NULL THEN 'ODS缺失'
|
|
ELSE '匹配'
|
|
END
|
|
""")
|
|
|
|
status_summary = cur.fetchall()
|
|
print("数据匹配状态:")
|
|
for status, count in status_summary:
|
|
print(f" {status}: {count:,} 条")
|
|
|
|
if __name__ == "__main__":
|
|
main() |