# -*- coding: utf-8 -*-
"""
Verification script for the full data backfill.

The original header describes the surrounding job as: re-fetch API data from
2025-07-01 up to now and load it into the database.  The code in this file
performs the verification part only: it measures, for a fixed list of newly
added ODS and DWD columns, how many rows carry a non-NULL value, prints a
summary, and writes the figures to ``field_coverage_report.json`` next to
this script.
"""
import json
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Make the ETL project importable before pulling in its modules.
project_root = Path(__file__).parent.parent / "etl_billiards"
sys.path.insert(0, str(project_root))

from dotenv import load_dotenv

load_dotenv(project_root / ".env")

from database.connection import DatabaseConnection

# information_schema.columns.data_type values for which ``column = 0`` is a
# valid comparison.  Other types (text, boolean, json, ...) make PostgreSQL
# reject the statement, so the zero count is skipped for them.
_NUMERIC_DATA_TYPES = frozenset({
    "smallint",
    "integer",
    "bigint",
    "numeric",
    "real",
    "double precision",
})


def _count(db, sql):
    """Run a ``SELECT COUNT(*) as cnt ...`` statement and return ``cnt``."""
    return db.query(sql)[0]["cnt"]


def _collect_coverage(db, fields_to_check, title, with_zero_count):
    """Print and collect non-NULL coverage for every listed column.

    Args:
        db: Connection object exposing ``query(sql, params=None)`` that
            returns a sequence of rows indexable by column name.
        fields_to_check: Iterable of ``(schema.table, [column, ...])`` pairs.
        title: Heading printed between the two separator lines.
        with_zero_count: When true, also count rows whose value equals 0
            (numeric columns only) and add ``zero_count`` to each record.

    Returns:
        A list with one dict per column that exists and could be measured.
        Tables or columns whose queries fail are reported on stdout and
        omitted from the list.
    """
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)

    results = []
    for table, columns in fields_to_check:
        print(f"\n检查表: {table}")

        # Total row count of the table; skip the table if it is unreachable.
        try:
            total_rows = _count(db, f"SELECT COUNT(*) as cnt FROM {table}")
        except Exception as e:
            print(f" [错误] 无法获取记录数: {e}")
            continue

        # Loop-invariant: split "schema.table" once per table.
        schema, _, name = table.partition(".")

        for col in columns:
            # Use the same (lower-cased) name for the catalog lookup and for
            # the quoted identifier in the data queries.
            column = col.lower()
            try:
                # One aggregate row is always returned: cnt tells whether the
                # column exists, data_type tells whether "= 0" is meaningful.
                meta = db.query(
                    """
                    SELECT COUNT(*) AS cnt, MAX(data_type) AS data_type
                    FROM information_schema.columns
                    WHERE table_schema = %s
                      AND table_name = %s
                      AND column_name = %s
                    """,
                    (schema, name, column),
                )[0]
                if meta["cnt"] == 0:
                    print(f" 列 {col}: [不存在]")
                    continue

                non_null_rows = _count(
                    db,
                    f'SELECT COUNT(*) as cnt FROM {table} WHERE "{column}" IS NOT NULL',
                )
                coverage = (non_null_rows / total_rows * 100) if total_rows > 0 else 0

                record = {
                    "table": table,
                    "column": col,
                    "total": total_rows,
                    "non_null": non_null_rows,
                    "coverage": coverage,
                }

                if with_zero_count:
                    data_type = str(meta["data_type"] or "").lower()
                    if data_type in _NUMERIC_DATA_TYPES:
                        zero_rows = _count(
                            db,
                            f'SELECT COUNT(*) as cnt FROM {table} WHERE "{column}" = 0',
                        )
                    else:
                        # Comparing a non-numeric column with 0 would raise
                        # and discard the coverage computed above.
                        zero_rows = None
                    record["zero_count"] = zero_rows
                    zero_text = "N/A" if zero_rows is None else zero_rows
                    print(f" 列 {col}:")
                    print(f" - 总记录: {total_rows}, 非空: {non_null_rows} ({coverage:.1f}%), 值为0: {zero_text}")
                else:
                    print(f" 列 {col}: 总记录: {total_rows}, 非空: {non_null_rows} ({coverage:.1f}%)")

                results.append(record)
            except Exception as e:
                print(f" 列 {col}: [错误] {e}")

    return results


def check_ods_field_coverage(db: DatabaseConnection):
    """Check how well the newly added ODS columns are populated.

    Returns a list of dicts with the keys ``table``, ``column``, ``total``,
    ``non_null``, ``coverage`` (percentage) and ``zero_count`` (``None`` for
    columns whose type cannot be compared with 0).
    """
    # Newly added columns to inspect, grouped by table.
    fields_to_check = [
        ("billiards_ods.table_fee_transactions", ["activity_discount_amount", "real_service_money", "order_consumption_type"]),
        ("billiards_ods.assistant_service_records", ["real_service_money", "assistantteamname"]),
        ("billiards_ods.assistant_cancellation_records", ["tenant_id"]),
        ("billiards_ods.store_goods_sales_records", ["coupon_share_money"]),
        ("billiards_ods.payment_transactions", ["tenant_id"]),
        ("billiards_ods.member_profiles", ["pay_money_sum", "person_tenant_org_id", "recharge_money_sum", "register_source"]),
        ("billiards_ods.member_stored_value_cards", ["principal_balance", "member_grade", "rechargefreezebalance"]),
        ("billiards_ods.member_balance_changes", ["principal_after", "principal_before", "principal_data"]),
        ("billiards_ods.settlement_records", ["tenant_id"]),
        ("billiards_ods.recharge_settlements", ["tenant_id"]),
        ("billiards_ods.group_buy_packages", ["sort", "is_first_limit", "tenantcouponsaleorderitemid"]),
        ("billiards_ods.group_buy_redemption_records", ["coupon_sale_id", "member_discount_money"]),
        ("billiards_ods.site_tables_master", ["order_id"]),
        ("billiards_ods.store_goods_master", ["commodity_code", "not_sale"]),
        ("billiards_ods.table_fee_discount_records", ["table_name", "table_price", "charge_free"]),
        ("billiards_ods.tenant_goods_master", ["not_sale"]),
    ]
    return _collect_coverage(db, fields_to_check, "ODS 新增字段数据覆盖检查", True)


def check_dwd_field_coverage(db: DatabaseConnection):
    """Check how well the newly added DWD columns are populated.

    Returns a list of dicts with the keys ``table``, ``column``, ``total``,
    ``non_null`` and ``coverage`` (percentage).
    """
    # Newly added columns to inspect, grouped by table.
    fields_to_check = [
        ("billiards_dwd.dwd_table_fee_log", ["activity_discount_amount", "real_service_money"]),
        ("billiards_dwd.dwd_assistant_service_log", ["real_service_money"]),
        ("billiards_dwd.dwd_assistant_trash_event", ["tenant_id"]),
        ("billiards_dwd.dwd_store_goods_sale", ["coupon_share_money"]),
        ("billiards_dwd.dwd_payment", ["tenant_id"]),
        ("billiards_dwd.dim_member", ["pay_money_sum", "recharge_money_sum"]),
        ("billiards_dwd.dim_member_ex", ["person_tenant_org_id", "register_source"]),
        ("billiards_dwd.dim_member_card_account", ["principal_balance", "member_grade"]),
        ("billiards_dwd.dwd_member_balance_change", ["principal_after", "principal_before", "principal_change_amount"]),
        ("billiards_dwd.dwd_settlement_head", ["tenant_id"]),
        ("billiards_dwd.dwd_recharge_order", ["tenant_id"]),
        ("billiards_dwd.dim_groupbuy_package", ["sort", "is_first_limit"]),
        ("billiards_dwd.dwd_groupbuy_redemption", ["coupon_sale_id", "member_discount_money"]),
        ("billiards_dwd.dim_table", ["order_id"]),
        ("billiards_dwd.dim_store_goods", ["commodity_code", "not_sale"]),
        ("billiards_dwd.dwd_table_fee_adjust", ["table_name", "table_price", "charge_free"]),
        ("billiards_dwd.dim_tenant_goods", ["not_sale"]),
    ]
    return _collect_coverage(db, fields_to_check, "DWD 新增字段数据覆盖检查", False)


def _coverage_status(coverage):
    """Map a coverage percentage to its status label (<50, <80, otherwise)."""
    if coverage < 50:
        return "[需关注]"
    if coverage < 80:
        return "[一般]"
    return "[良好]"


def _print_summary(heading, results):
    """Print one summary line per coverage record under *heading*."""
    print(heading)
    for r in results:
        status = _coverage_status(r["coverage"])
        print(f" {r['table']}.{r['column']}: {r['coverage']:.1f}% {status}")


def main():
    """Run both coverage checks, print a summary and save the JSON report.

    Returns:
        ``False`` when the ``PG_DSN`` environment variable is missing,
        ``True`` once the report has been written.
    """
    print("=" * 80)
    print("全量数据回写验证")
    print("时间:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    print("=" * 80)

    # Connect to the database.
    dsn = os.getenv("PG_DSN")
    if not dsn:
        print("[错误] 未找到 PG_DSN 环境变量")
        return False

    db = DatabaseConnection(dsn)
    try:
        ods_results = check_ods_field_coverage(db)
        dwd_results = check_dwd_field_coverage(db)
    finally:
        # Release the connection even if a check raises unexpectedly.
        db.close()

    # Summary.
    print("\n" + "=" * 80)
    print("汇总")
    print("=" * 80)

    _print_summary("\nODS 新增字段覆盖率统计:", ods_results)
    _print_summary("\nDWD 新增字段覆盖率统计:", dwd_results)

    # Persist the report next to this script.
    report = {
        "generated_at": datetime.now().isoformat(),
        "ods_coverage": ods_results,
        "dwd_coverage": dwd_results,
    }

    report_file = Path(__file__).parent / "field_coverage_report.json"
    with open(report_file, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print(f"\n报告已保存到: {report_file}")

    return True


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)