This commit is contained in:
Neo
2026-02-04 21:39:01 +08:00
parent ee773a9b52
commit a3f4d04335
148 changed files with 31455 additions and 182 deletions

View File

@@ -0,0 +1,237 @@
# -*- coding: utf-8 -*-
"""
Full-data backfill verification script.

Original description: re-fetch API data from 2025-07-01 until now and load
it into the database.

NOTE(review): the code visible in this file only measures how well newly
added ODS/DWD columns are populated and writes a JSON report; no API fetch
or load step appears here -- confirm whether that step lives elsewhere.
"""
import json
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path
# Add the project path to sys.path; the project imports below are placed
# after this line, presumably because they rely on it -- keep the order.
project_root = Path(__file__).parent.parent / "etl_billiards"
sys.path.insert(0, str(project_root))
from dotenv import load_dotenv
# Load environment variables from the project's .env file (main() reads
# PG_DSN from the environment; presumably it is defined there).
load_dotenv(project_root / ".env")
from database.connection import DatabaseConnection
def check_ods_field_coverage(db: "DatabaseConnection"):
    """Report how well the newly added ODS columns are populated.

    For every (table, columns) entry of the built-in checklist this prints
    and collects the table's total row count, the number of non-NULL values
    of each column, the resulting coverage percentage and, for numeric
    columns only, the number of rows whose value equals 0.

    Args:
        db: Connection object exposing ``query(sql, params=None)`` whose
            result is a sequence of rows readable by column name.

    Returns:
        A list of dicts with the keys ``table``, ``column``, ``total``,
        ``non_null``, ``coverage`` and ``zero_count``. ``zero_count`` is
        ``None`` when the column is not numeric. Tables whose row count
        cannot be read, columns that do not exist and columns whose queries
        fail are printed and left out of the list.
    """
    # Newly added columns to verify, grouped by ODS table.
    fields_to_check = [
        ("billiards_ods.table_fee_transactions", ["activity_discount_amount", "real_service_money", "order_consumption_type"]),
        ("billiards_ods.assistant_service_records", ["real_service_money", "assistantteamname"]),
        ("billiards_ods.assistant_cancellation_records", ["tenant_id"]),
        ("billiards_ods.store_goods_sales_records", ["coupon_share_money"]),
        ("billiards_ods.payment_transactions", ["tenant_id"]),
        ("billiards_ods.member_profiles", ["pay_money_sum", "person_tenant_org_id", "recharge_money_sum", "register_source"]),
        ("billiards_ods.member_stored_value_cards", ["principal_balance", "member_grade", "rechargefreezebalance"]),
        ("billiards_ods.member_balance_changes", ["principal_after", "principal_before", "principal_data"]),
        ("billiards_ods.settlement_records", ["tenant_id"]),
        ("billiards_ods.recharge_settlements", ["tenant_id"]),
        ("billiards_ods.group_buy_packages", ["sort", "is_first_limit", "tenantcouponsaleorderitemid"]),
        ("billiards_ods.group_buy_redemption_records", ["coupon_sale_id", "member_discount_money"]),
        ("billiards_ods.site_tables_master", ["order_id"]),
        ("billiards_ods.store_goods_master", ["commodity_code", "not_sale"]),
        ("billiards_ods.table_fee_discount_records", ["table_name", "table_price", "charge_free"]),
        ("billiards_ods.tenant_goods_master", ["not_sale"]),
    ]
    # information_schema data_type values for which `column = 0` is a valid
    # comparison. Comparing e.g. a text column with the integer 0 raises in
    # PostgreSQL, which previously discarded the column's coverage result.
    zero_comparable_types = frozenset({
        "smallint",
        "integer",
        "bigint",
        "numeric",
        "real",
        "double precision",
    })

    print("\n" + "=" * 80)
    print("ODS 新增字段数据覆盖检查")
    print("=" * 80)

    results = []
    for table, columns in fields_to_check:
        print(f"\n检查表: {table}")
        # Total row count of the table. Table/column identifiers come from
        # the hard-coded checklist above, never from external input, so
        # interpolating them into the SQL text is safe here.
        try:
            total_rows = db.query(f"SELECT COUNT(*) as cnt FROM {table}")[0]["cnt"]
        except Exception as e:
            print(f" [错误] 无法获取记录数: {e}")
            continue

        # Every checklist entry is schema-qualified; split once per table.
        schema, name = table.split(".", 1)
        for col in columns:
            try:
                # One lookup answers both "does the column exist?" and
                # "what is its type?".
                col_info = db.query(
                    """
                    SELECT data_type FROM information_schema.columns
                    WHERE table_schema = %s AND table_name = %s AND column_name = %s
                    """,
                    (schema, name, col.lower()),
                )
                if not col_info:
                    print(f"{col}: [不存在]")
                    continue

                # Number of rows holding a non-NULL value.
                non_null_rows = db.query(
                    f'SELECT COUNT(*) as cnt FROM {table} WHERE "{col}" IS NOT NULL'
                )[0]["cnt"]

                # Only numeric columns can meaningfully be compared with 0.
                if col_info[0]["data_type"] in zero_comparable_types:
                    zero_rows = db.query(
                        f'SELECT COUNT(*) as cnt FROM {table} WHERE "{col}" = 0'
                    )[0]["cnt"]
                else:
                    zero_rows = None

                coverage = (non_null_rows / total_rows * 100) if total_rows > 0 else 0
                zero_text = zero_rows if zero_rows is not None else "N/A"
                print(f"{col}:")
                print(f" - 总记录: {total_rows}, 非空: {non_null_rows} ({coverage:.1f}%), 值为0: {zero_text}")
                results.append({
                    "table": table,
                    "column": col,
                    "total": total_rows,
                    "non_null": non_null_rows,
                    "coverage": coverage,
                    "zero_count": zero_rows,
                })
            except Exception as e:
                # Best effort: report the failure and move on to the next column.
                print(f"{col}: [错误] {e}")
    return results
def check_dwd_field_coverage(db: DatabaseConnection):
    """Report how well the newly added DWD columns are populated.

    Walks a built-in checklist of DWD tables and, for each listed column
    that exists, prints and collects the table's row count, the number of
    non-NULL values and the coverage percentage.

    Args:
        db: Connection whose ``query(sql, params)`` result rows are read
            by column name.

    Returns:
        A list of dicts with the keys ``table``, ``column``, ``total``,
        ``non_null`` and ``coverage``. Tables or columns that cannot be
        queried, and columns that do not exist, are printed and omitted.
    """
    # Newly added columns to verify, grouped by DWD table.
    dwd_targets = [
        ("billiards_dwd.dwd_table_fee_log", ["activity_discount_amount", "real_service_money"]),
        ("billiards_dwd.dwd_assistant_service_log", ["real_service_money"]),
        ("billiards_dwd.dwd_assistant_trash_event", ["tenant_id"]),
        ("billiards_dwd.dwd_store_goods_sale", ["coupon_share_money"]),
        ("billiards_dwd.dwd_payment", ["tenant_id"]),
        ("billiards_dwd.dim_member", ["pay_money_sum", "recharge_money_sum"]),
        ("billiards_dwd.dim_member_ex", ["person_tenant_org_id", "register_source"]),
        ("billiards_dwd.dim_member_card_account", ["principal_balance", "member_grade"]),
        ("billiards_dwd.dwd_member_balance_change", ["principal_after", "principal_before", "principal_change_amount"]),
        ("billiards_dwd.dwd_settlement_head", ["tenant_id"]),
        ("billiards_dwd.dwd_recharge_order", ["tenant_id"]),
        ("billiards_dwd.dim_groupbuy_package", ["sort", "is_first_limit"]),
        ("billiards_dwd.dwd_groupbuy_redemption", ["coupon_sale_id", "member_discount_money"]),
        ("billiards_dwd.dim_table", ["order_id"]),
        ("billiards_dwd.dim_store_goods", ["commodity_code", "not_sale"]),
        ("billiards_dwd.dwd_table_fee_adjust", ["table_name", "table_price", "charge_free"]),
        ("billiards_dwd.dim_tenant_goods", ["not_sale"]),
    ]

    banner = "=" * 80
    print("\n" + banner)
    print("DWD 新增字段数据覆盖检查")
    print(banner)

    # Parameterised existence check against the catalog.
    column_exists_sql = """
        SELECT COUNT(*) as cnt FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s AND column_name = %s
    """

    coverage_rows = []
    for qualified_table, new_columns in dwd_targets:
        print(f"\n检查表: {qualified_table}")

        # Total row count; skip the whole table when it cannot be read.
        try:
            count_result = db.query(f"SELECT COUNT(*) as cnt FROM {qualified_table}")
            row_total = count_result[0]["cnt"]
        except Exception as exc:
            print(f" [错误] 无法获取记录数: {exc}")
            continue

        for column in new_columns:
            try:
                # Does the column exist at all?
                schema_name, table_name = qualified_table.split(".", 1)
                lookup_params = (schema_name, table_name, column.lower())
                existence = db.query(column_exists_sql, lookup_params)
                if existence[0]["cnt"] == 0:
                    print(f"{column}: [不存在]")
                    continue

                # Count the non-NULL values.
                filled_result = db.query(
                    f'SELECT COUNT(*) as cnt FROM {qualified_table} WHERE "{column}" IS NOT NULL'
                )
                filled = filled_result[0]["cnt"]

                # Guard against dividing by zero on empty tables.
                if row_total > 0:
                    pct = filled / row_total * 100
                else:
                    pct = 0

                print(f"{column}: 总记录: {row_total}, 非空: {filled} ({pct:.1f}%)")
                entry = {
                    "table": qualified_table,
                    "column": column,
                    "total": row_total,
                    "non_null": filled,
                    "coverage": pct,
                }
                coverage_rows.append(entry)
            except Exception as exc:
                print(f"{column}: [错误] {exc}")
    return coverage_rows
def _coverage_status(coverage):
    """Return the status label shown for a coverage percentage."""
    if coverage < 50:
        return "[需关注]"
    if coverage < 80:
        return "[一般]"
    return "[良好]"


def main():
    """Run the ODS and DWD coverage checks, print a summary, save a report.

    The database DSN is read from the ``PG_DSN`` environment variable. The
    JSON report is written as ``field_coverage_report.json`` next to this
    script.

    Returns:
        True when the checks ran and the report was written, False when
        ``PG_DSN`` is not set. Exceptions raised while connecting, checking
        or writing the report propagate to the caller.
    """
    print("=" * 80)
    print("全量数据回写验证")
    print("时间:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    print("=" * 80)

    # Connect to the database.
    dsn = os.getenv("PG_DSN")
    if not dsn:
        print("[错误] 未找到 PG_DSN 环境变量")
        return False

    db = DatabaseConnection(dsn)
    try:
        # Check ODS, then DWD field coverage.
        ods_results = check_ods_field_coverage(db)
        dwd_results = check_dwd_field_coverage(db)
    finally:
        # Release the connection even when one of the checks raises.
        db.close()

    # Print the summary.
    print("\n" + "=" * 80)
    print("汇总")
    print("=" * 80)
    sections = (
        ("\nODS 新增字段覆盖率统计:", ods_results),
        ("\nDWD 新增字段覆盖率统计:", dwd_results),
    )
    for heading, rows in sections:
        print(heading)
        for r in rows:
            status = _coverage_status(r["coverage"])
            print(f" {r['table']}.{r['column']}: {r['coverage']:.1f}% {status}")

    # Save the report.
    report = {
        "generated_at": datetime.now().isoformat(),
        "ods_coverage": ods_results,
        "dwd_coverage": dwd_results,
    }
    report_file = Path(__file__).parent / "field_coverage_report.json"
    with open(report_file, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n报告已保存到: {report_file}")
    return True
if __name__ == "__main__":
    # Translate main()'s boolean outcome into the process exit status:
    # 0 on success, 1 on failure.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)