# -*- coding: utf-8 -*- """DWD 装载任务:从 ODS 增量写入 DWD(维度 SCD2,事实按时间增量)。""" from __future__ import annotations import os import re import time from datetime import date, datetime from decimal import Decimal, InvalidOperation from typing import Any, Dict, Iterable, List, Sequence from psycopg2.extras import RealDictCursor, execute_batch, execute_values from tasks.base_task import BaseTask, TaskContext class DwdLoadTask(BaseTask): """负责 DWD 装载:维度表做 SCD2 合并,事实表按时间增量写入。""" # DWD -> ODS 表映射(ODS 表名已与示例 JSON 前缀统一) TABLE_MAP: dict[str, str] = { # 维度 # 门店:改用台费流水中的 siteprofile 快照,补齐 org/地址等字段 "billiards_dwd.dim_site": "billiards_ods.table_fee_transactions", "billiards_dwd.dim_site_ex": "billiards_ods.table_fee_transactions", "billiards_dwd.dim_table": "billiards_ods.site_tables_master", "billiards_dwd.dim_table_ex": "billiards_ods.site_tables_master", "billiards_dwd.dim_assistant": "billiards_ods.assistant_accounts_master", "billiards_dwd.dim_assistant_ex": "billiards_ods.assistant_accounts_master", "billiards_dwd.dim_member": "billiards_ods.member_profiles", "billiards_dwd.dim_member_ex": "billiards_ods.member_profiles", "billiards_dwd.dim_member_card_account": "billiards_ods.member_stored_value_cards", "billiards_dwd.dim_member_card_account_ex": "billiards_ods.member_stored_value_cards", "billiards_dwd.dim_tenant_goods": "billiards_ods.tenant_goods_master", "billiards_dwd.dim_tenant_goods_ex": "billiards_ods.tenant_goods_master", "billiards_dwd.dim_store_goods": "billiards_ods.store_goods_master", "billiards_dwd.dim_store_goods_ex": "billiards_ods.store_goods_master", "billiards_dwd.dim_goods_category": "billiards_ods.stock_goods_category_tree", "billiards_dwd.dim_groupbuy_package": "billiards_ods.group_buy_packages", "billiards_dwd.dim_groupbuy_package_ex": "billiards_ods.group_buy_packages", # 事实 "billiards_dwd.dwd_settlement_head": "billiards_ods.settlement_records", "billiards_dwd.dwd_settlement_head_ex": "billiards_ods.settlement_records", "billiards_dwd.dwd_table_fee_log": "billiards_ods.table_fee_transactions", "billiards_dwd.dwd_table_fee_log_ex": "billiards_ods.table_fee_transactions", "billiards_dwd.dwd_table_fee_adjust": "billiards_ods.table_fee_discount_records", "billiards_dwd.dwd_table_fee_adjust_ex": "billiards_ods.table_fee_discount_records", "billiards_dwd.dwd_store_goods_sale": "billiards_ods.store_goods_sales_records", "billiards_dwd.dwd_store_goods_sale_ex": "billiards_ods.store_goods_sales_records", "billiards_dwd.dwd_assistant_service_log": "billiards_ods.assistant_service_records", "billiards_dwd.dwd_assistant_service_log_ex": "billiards_ods.assistant_service_records", "billiards_dwd.dwd_assistant_trash_event": "billiards_ods.assistant_cancellation_records", "billiards_dwd.dwd_assistant_trash_event_ex": "billiards_ods.assistant_cancellation_records", "billiards_dwd.dwd_member_balance_change": "billiards_ods.member_balance_changes", "billiards_dwd.dwd_member_balance_change_ex": "billiards_ods.member_balance_changes", "billiards_dwd.dwd_groupbuy_redemption": "billiards_ods.group_buy_redemption_records", "billiards_dwd.dwd_groupbuy_redemption_ex": "billiards_ods.group_buy_redemption_records", "billiards_dwd.dwd_platform_coupon_redemption": "billiards_ods.platform_coupon_redemption_records", "billiards_dwd.dwd_platform_coupon_redemption_ex": "billiards_ods.platform_coupon_redemption_records", "billiards_dwd.dwd_recharge_order": "billiards_ods.recharge_settlements", "billiards_dwd.dwd_recharge_order_ex": "billiards_ods.recharge_settlements", "billiards_dwd.dwd_payment": "billiards_ods.payment_transactions", "billiards_dwd.dwd_refund": "billiards_ods.refund_transactions", "billiards_dwd.dwd_refund_ex": "billiards_ods.refund_transactions", } SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"} # 增量/窗口过滤优先使用业务时间;fetched_at(入库时间)放最后,避免回溯窗口被“当前入库时间”干扰。 FACT_ORDER_CANDIDATES = [ "pay_time", "create_time", "update_time", "occur_time", "settle_time", "start_use_time", "fetched_at", ] # 对于会出现“回补旧记录”的事实表,额外补齐缺失主键记录 FACT_MISSING_FILL_TABLES = { "billiards_dwd.dwd_assistant_service_log", } _NUMERIC_RE = re.compile(r"^[+-]?\d+(?:\.\d+)?$") _BOOL_STRINGS = {"true", "false", "1", "0", "yes", "no", "y", "n", "t", "f"} def _strip_scd2_keys(self, pk_cols: Sequence[str]) -> list[str]: return [c for c in pk_cols if c.lower() not in self.SCD_COLS] @staticmethod def _pick_snapshot_order_column(ods_cols: Sequence[str]) -> str | None: lower_cols = {c.lower() for c in ods_cols} if "fetched_at" in lower_cols: return "fetched_at" return None @staticmethod def _append_where_condition(where_sql: str, condition: str) -> str: if not condition: return where_sql if not where_sql: return f"WHERE {condition}" return f"{where_sql} AND {condition}" def _log_missing_fetched_at(self, cur, ods_table: str) -> None: """记录 ODS fetched_at 为空的情况(不抛异常)。""" table_sql = self._format_table(ods_table, "billiards_ods") try: cur.execute(f"SELECT 1 FROM {table_sql} WHERE fetched_at IS NULL LIMIT 1") if cur.fetchone(): self.logger.error("ODS 表 %s 存在 fetched_at 为空的记录,已跳过", ods_table) except Exception as exc: # noqa: BLE001 self.logger.warning("检查 fetched_at 为空记录失败:%s, err=%s", ods_table, exc) @staticmethod def _latest_snapshot_select_sql( select_cols_sql: str, ods_table_sql: str, key_exprs: Sequence[str], order_col: str | None, where_sql: str = "", ) -> str: if key_exprs and order_col: distinct_on = ", ".join(key_exprs) order_by = ", ".join([*key_exprs, f'"{order_col}" DESC NULLS LAST']) return ( f"SELECT DISTINCT ON ({distinct_on}) {select_cols_sql} " f"FROM {ods_table_sql} {where_sql} ORDER BY {order_by}" ) return f"SELECT {select_cols_sql} FROM {ods_table_sql} {where_sql}" # 特殊列映射:dwd 列名 -> 源列表达式(可选 CAST) FACT_MAPPINGS: dict[str, list[tuple[str, str, str | None]]] = { # 维度表(补齐主键/字段差异) "billiards_dwd.dim_site": [ ("org_id", "siteprofile->>'org_id'", None), ("shop_name", "siteprofile->>'shop_name'", None), ("site_label", "siteprofile->>'site_label'", None), ("full_address", "siteprofile->>'full_address'", None), ("address", "siteprofile->>'address'", None), ("longitude", "siteprofile->>'longitude'", "numeric"), ("latitude", "siteprofile->>'latitude'", "numeric"), ("tenant_site_region_id", "siteprofile->>'tenant_site_region_id'", None), ("business_tel", "siteprofile->>'business_tel'", None), ("site_type", "siteprofile->>'site_type'", None), ("shop_status", "siteprofile->>'shop_status'", None), ("tenant_id", "siteprofile->>'tenant_id'", None), ], "billiards_dwd.dim_site_ex": [ ("auto_light", "siteprofile->>'auto_light'", None), ("attendance_enabled", "siteprofile->>'attendance_enabled'", None), ("attendance_distance", "siteprofile->>'attendance_distance'", None), ("prod_env", "siteprofile->>'prod_env'", None), ("light_status", "siteprofile->>'light_status'", None), ("light_type", "siteprofile->>'light_type'", None), ("light_token", "siteprofile->>'light_token'", None), ("address", "siteprofile->>'address'", None), ("avatar", "siteprofile->>'avatar'", None), ("wifi_name", "siteprofile->>'wifi_name'", None), ("wifi_password", "siteprofile->>'wifi_password'", None), ("customer_service_qrcode", "siteprofile->>'customer_service_qrcode'", None), ("customer_service_wechat", "siteprofile->>'customer_service_wechat'", None), ("fixed_pay_qrcode", "siteprofile->>'fixed_pay_qrCode'", None), ("longitude", "siteprofile->>'longitude'", "numeric"), ("latitude", "siteprofile->>'latitude'", "numeric"), ("tenant_site_region_id", "siteprofile->>'tenant_site_region_id'", None), ("site_type", "siteprofile->>'site_type'", None), ("site_label", "siteprofile->>'site_label'", None), ("shop_status", "siteprofile->>'shop_status'", None), ("create_time", "siteprofile->>'create_time'", "timestamptz"), ("update_time", "siteprofile->>'update_time'", "timestamptz"), ], "billiards_dwd.dim_table": [ ("table_id", "id", None), ("site_table_area_name", "areaname", None), ("tenant_table_area_id", "site_table_area_id", None), ("order_id", "order_id", None), ], "billiards_dwd.dim_table_ex": [ ("table_id", "id", None), ("table_cloth_use_time", "table_cloth_use_time", None), ], "billiards_dwd.dim_assistant": [("assistant_id", "id", None), ("user_id", "user_id", None)], "billiards_dwd.dim_assistant_ex": [ ("assistant_id", "id", None), ("introduce", "introduce", None), ("group_name", "group_name", None), ("light_equipment_id", "light_equipment_id", None), ], "billiards_dwd.dim_member": [ ("member_id", "id", None), ("pay_money_sum", "pay_money_sum", None), ("recharge_money_sum", "recharge_money_sum", None), ], "billiards_dwd.dim_member_ex": [ ("member_id", "id", None), ("register_site_name", "site_name", None), ("person_tenant_org_id", "person_tenant_org_id", None), ("person_tenant_org_name", "person_tenant_org_name", None), ("register_source", "register_source", None), ], "billiards_dwd.dim_member_card_account": [ ("member_card_id", "id", None), ("principal_balance", "principal_balance", None), ("member_grade", "member_grade", None), ], "billiards_dwd.dim_member_card_account_ex": [ ("member_card_id", "id", None), ("tenant_name", "tenantname", None), ("tenantavatar", "tenantavatar", None), ("card_no", "card_no", None), ("bind_password", "bind_password", None), ("use_scene", "use_scene", None), ("tableareaid", "tableareaid", None), ("goodscategoryid", "goodscategoryid", None), ("able_share_member_discount", "able_share_member_discount", "boolean"), ("electricity_deduct_radio", "electricity_deduct_radio", None), ("electricity_discount", "electricity_discount", None), ("electricity_card_deduct", "electricitycarddeduct", "boolean"), ("recharge_freeze_balance", "rechargefreezebalance", None), ], "billiards_dwd.dim_tenant_goods": [ ("tenant_goods_id", "id", None), ("category_name", "categoryname", None), ("not_sale", "not_sale", None), ], "billiards_dwd.dim_tenant_goods_ex": [ ("tenant_goods_id", "id", None), ("remark_name", "remark_name", None), ("goods_bar_code", "goods_bar_code", None), ("commodity_code_list", "commodity_code", None), ("is_in_site", "isinsite", "boolean"), ], "billiards_dwd.dim_store_goods": [ ("site_goods_id", "id", None), ("category_level1_name", "onecategoryname", None), ("category_level2_name", "twocategoryname", None), ("created_at", "create_time", None), ("updated_at", "update_time", None), ("avg_monthly_sales", "average_monthly_sales", None), ("batch_stock_qty", "stock", None), ("sale_qty", "sale_num", None), ("total_sales_qty", "total_sales", None), ("commodity_code", "commodity_code", None), ("not_sale", "not_sale", None), ], "billiards_dwd.dim_store_goods_ex": [ ("site_goods_id", "id", None), ("goods_barcode", "goods_bar_code", None), ("stock_qty", "stock", None), ("stock_secondary_qty", "stock_a", None), ("safety_stock_qty", "safe_stock", None), ("site_name", "sitename", None), ("goods_cover_url", "goods_cover", None), ("provisional_total_cost", "total_purchase_cost", None), ("is_discountable", "able_discount", None), ("freeze_status", "freeze", None), ("remark", "remark", None), ("days_on_shelf", "days_available", None), ("sort_order", "sort", None), ], "billiards_dwd.dim_goods_category": [ ("category_id", "id", None), ("tenant_id", "tenant_id", None), ("category_name", "category_name", None), ("alias_name", "alias_name", None), ("parent_category_id", "pid", None), ("business_name", "business_name", None), ("tenant_goods_business_id", "tenant_goods_business_id", None), ("sort_order", "sort", None), ("open_salesman", "open_salesman", None), ("is_warehousing", "is_warehousing", None), ("category_level", "CASE WHEN pid = 0 THEN 1 ELSE 2 END", None), ("is_leaf", "CASE WHEN categoryboxes IS NULL OR jsonb_array_length(categoryboxes)=0 THEN 1 ELSE 0 END", None), ], "billiards_dwd.dim_groupbuy_package": [ ("groupbuy_package_id", "id", None), ("package_template_id", "package_id", None), ("coupon_face_value", "coupon_money", None), ("duration_seconds", "duration", None), ("sort", "sort", None), ("is_first_limit", "is_first_limit", "boolean"), ], "billiards_dwd.dim_groupbuy_package_ex": [ ("groupbuy_package_id", "id", None), ("table_area_id", "table_area_id", None), ("tenant_table_area_id", "tenant_table_area_id", None), ("usable_range", "usable_range", None), ("table_area_id_list", "table_area_id_list", None), ("package_type", "type", None), ("tenant_coupon_sale_order_item_id", "tenantcouponsaleorderitemid", None), ], # 事实表主键及关键差异列 "billiards_dwd.dwd_table_fee_log": [ ("table_fee_log_id", "id", None), ("activity_discount_amount", "activity_discount_amount", None), ("real_service_money", "real_service_money", None), ], "billiards_dwd.dwd_table_fee_log_ex": [ ("table_fee_log_id", "id", None), ("salesman_name", "salesman_name", None), ("order_consumption_type", "order_consumption_type", None), ], "billiards_dwd.dwd_table_fee_adjust": [ ("table_fee_adjust_id", "id", None), ("table_id", "site_table_id", None), ("table_area_id", "tenant_table_area_id", None), ("table_area_name", "tableprofile->>'table_area_name'", None), ("adjust_time", "create_time", None), ("table_name", "table_name", None), ("table_price", "table_price", None), ("charge_free", "charge_free", "boolean"), ], "billiards_dwd.dwd_table_fee_adjust_ex": [ ("table_fee_adjust_id", "id", None), ("ledger_name", "ledger_name", None), ("area_type_id", "area_type_id", None), ("site_table_area_id", "site_table_area_id", None), ("site_table_area_name", "site_table_area_name", None), ("site_name", "sitename", None), ("tenant_name", "tenant_name", None), ], "billiards_dwd.dwd_store_goods_sale": [ ("store_goods_sale_id", "id", None), ("discount_price", "discount_money", None), ("coupon_share_money", "coupon_share_money", None), ], "billiards_dwd.dwd_store_goods_sale_ex": [ ("store_goods_sale_id", "id", None), ("option_value_name", "option_value_name", None), ("open_salesman_flag", "opensalesman", "integer"), ("salesman_name", "salesman_name", None), ("salesman_org_id", "sales_man_org_id", None), ("legacy_order_goods_id", "ordergoodsid", None), ("site_name", "sitename", None), ("legacy_site_id", "siteid", None), ], "billiards_dwd.dwd_assistant_service_log": [ ("assistant_service_id", "id", None), ("assistant_no", "assistantno", None), ("site_assistant_id", "order_assistant_id", None), ("level_name", "levelname", None), ("skill_name", "skillname", None), ("real_service_money", "real_service_money", None), ], "billiards_dwd.dwd_assistant_service_log_ex": [ ("assistant_service_id", "id", None), ("assistant_name", "assistantname", None), ("ledger_group_name", "ledger_group_name", None), ("trash_applicant_name", "trash_applicant_name", None), ("trash_reason", "trash_reason", None), ("salesman_name", "salesman_name", None), ("table_name", "tablename", None), ("assistant_team_name", "assistantteamname", None), ], "billiards_dwd.dwd_assistant_trash_event": [ ("assistant_trash_event_id", "id", None), ("assistant_no", "assistantname", None), ("abolish_amount", "assistantabolishamount", None), ("charge_minutes_raw", "pdchargeminutes", None), ("site_id", "siteid", None), ("table_id", "tableid", None), ("table_area_id", "tableareaid", None), ("assistant_name", "assistantname", None), ("trash_reason", "trashreason", None), ("create_time", "createtime", None), ("tenant_id", "tenant_id", None), ], "billiards_dwd.dwd_assistant_trash_event_ex": [ ("assistant_trash_event_id", "id", None), ("table_area_name", "tablearea", None), ("table_name", "tablename", None), ], "billiards_dwd.dwd_member_balance_change": [ ("balance_change_id", "id", None), ("balance_before", "before", None), ("change_amount", "account_data", None), ("balance_after", "after", None), ("card_type_name", "membercardtypename", None), ("change_time", "create_time", None), ("member_name", "membername", None), ("member_mobile", "membermobile", None), ("principal_before", "principal_before", None), ("principal_after", "principal_after", None), ], "billiards_dwd.dwd_member_balance_change_ex": [ ("balance_change_id", "id", None), ("pay_site_name", "paysitename", None), ("register_site_name", "registersitename", None), ("principal_data", "principal_data", None), ], "billiards_dwd.dwd_groupbuy_redemption": [ ("redemption_id", "id", None), ("member_discount_money", "member_discount_money", None), ("coupon_sale_id", "coupon_sale_id", None), ], "billiards_dwd.dwd_groupbuy_redemption_ex": [ ("redemption_id", "id", None), ("table_area_name", "tableareaname", None), ("site_name", "sitename", None), ("table_name", "tablename", None), ("goods_option_price", "goodsoptionprice", None), ("salesman_name", "salesman_name", None), ("salesman_org_id", "sales_man_org_id", None), ("ledger_group_name", "ledger_group_name", None), ("table_share_money", "table_share_money", None), ("table_service_share_money", "table_service_share_money", None), ("goods_share_money", "goods_share_money", None), ("good_service_share_money", "good_service_share_money", None), ("assistant_share_money", "assistant_share_money", None), ("assistant_service_share_money", "assistant_service_share_money", None), ("recharge_share_money", "recharge_share_money", None), ], "billiards_dwd.dwd_platform_coupon_redemption": [("platform_coupon_redemption_id", "id", None)], "billiards_dwd.dwd_platform_coupon_redemption_ex": [ ("platform_coupon_redemption_id", "id", None), ("coupon_cover", "coupon_cover", None), ], "billiards_dwd.dwd_payment": [ ("payment_id", "id", None), ("pay_date", "pay_time", "date"), ("tenant_id", "tenant_id", None), ], "billiards_dwd.dwd_refund": [("refund_id", "id", None)], "billiards_dwd.dwd_refund_ex": [ ("refund_id", "id", None), ("tenant_name", "tenantname", None), ("channel_payer_id", "channel_payer_id", None), ("channel_pay_no", "channel_pay_no", None), ], # 结算头:settlement_records(源列为小写驼峰/无下划线,需要显式映射) "billiards_dwd.dwd_settlement_head": [ ("order_settle_id", "id", None), ("tenant_id", "tenantid", None), ("site_id", "siteid", None), ("site_name", "sitename", None), ("table_id", "tableid", None), ("settle_name", "settlename", None), ("order_trade_no", "settlerelateid", None), ("create_time", "createtime", None), ("pay_time", "paytime", None), ("settle_type", "settletype", None), ("revoke_order_id", "revokeorderid", None), ("member_id", "memberid", None), ("member_name", "membername", None), ("member_phone", "memberphone", None), ("member_card_account_id", "tenantmembercardid", None), ("member_card_type_name", "membercardtypename", None), ("is_bind_member", "isbindmember", None), ("member_discount_amount", "memberdiscountamount", None), ("consume_money", "consumemoney", None), ("table_charge_money", "tablechargemoney", None), ("goods_money", "goodsmoney", None), ("real_goods_money", "realgoodsmoney", None), ("assistant_pd_money", "assistantpdmoney", None), ("assistant_cx_money", "assistantcxmoney", None), ("adjust_amount", "adjustamount", None), ("pay_amount", "payamount", None), ("balance_amount", "balanceamount", None), ("recharge_card_amount", "rechargecardamount", None), ("gift_card_amount", "giftcardamount", None), ("coupon_amount", "couponamount", None), ("rounding_amount", "roundingamount", None), ("point_amount", "pointamount", None), ("electricity_money", "electricitymoney", None), ("real_electricity_money", "realelectricitymoney", None), ("electricity_adjust_money", "electricityadjustmoney", None), ("pl_coupon_sale_amount", "plcouponsaleamount", None), ("mervou_sales_amount", "mervousalesamount", None), ], "billiards_dwd.dwd_settlement_head_ex": [ ("order_settle_id", "id", None), ("serial_number", "serialnumber", None), ("settle_status", "settlestatus", None), ("can_be_revoked", "canberevoked", "boolean"), ("revoke_order_name", "revokeordername", None), ("revoke_time", "revoketime", None), ("is_first_order", "isfirst", "boolean"), ("service_money", "servicemoney", None), ("cash_amount", "cashamount", None), ("card_amount", "cardamount", None), ("online_amount", "onlineamount", None), ("refund_amount", "refundamount", None), ("prepay_money", "prepaymoney", None), ("payment_method", "paymentmethod", None), ("coupon_sale_amount", "couponsaleamount", None), ("all_coupon_discount", "allcoupondiscount", None), ("goods_promotion_money", "goodspromotionmoney", None), ("assistant_promotion_money", "assistantpromotionmoney", None), ("activity_discount", "activitydiscount", None), ("assistant_manual_discount", "assistantmanualdiscount", None), ("point_discount_price", "pointdiscountprice", None), ("point_discount_cost", "pointdiscountcost", None), ("is_use_coupon", "isusecoupon", "boolean"), ("is_use_discount", "isusediscount", "boolean"), ("is_activity", "isactivity", "boolean"), ("operator_name", "operatorname", None), ("salesman_name", "salesmanname", None), ("order_remark", "orderremark", None), ("operator_id", "operatorid", None), ("salesman_user_id", "salesmanuserid", None), # CHANGE: intent=删除 settle_list 映射,该列已从 DWD 表中移除(与 ODS payload 冗余) # assumptions=结算明细可随时从 ODS payload->'settleList' 按需提取 # prompt=P20260214-040000 ], # 充值结算:recharge_settlements(字段风格同 settlement_records) "billiards_dwd.dwd_recharge_order": [ ("recharge_order_id", "id", None), ("tenant_id", "tenantid", None), ("site_id", "siteid", None), ("member_id", "memberid", None), ("member_name_snapshot", "membername", None), ("member_phone_snapshot", "memberphone", None), ("tenant_member_card_id", "tenantmembercardid", None), ("member_card_type_name", "membercardtypename", None), ("settle_relate_id", "settlerelateid", None), ("settle_type", "settletype", None), ("settle_name", "settlename", None), ("is_first", "isfirst", None), ("pay_amount", "payamount", None), ("refund_amount", "refundamount", None), ("point_amount", "pointamount", None), ("cash_amount", "cashamount", None), ("payment_method", "paymentmethod", None), ("create_time", "createtime", None), ("pay_time", "paytime", None), ], "billiards_dwd.dwd_recharge_order_ex": [ ("recharge_order_id", "id", None), ("site_name_snapshot", "sitename", None), ("salesman_name", "salesmanname", None), ("order_remark", "orderremark", None), ("revoke_order_name", "revokeordername", None), ("settle_status", "settlestatus", None), ("is_bind_member", "isbindmember", "boolean"), ("is_activity", "isactivity", "boolean"), ("is_use_coupon", "isusecoupon", "boolean"), ("is_use_discount", "isusediscount", "boolean"), ("can_be_revoked", "canberevoked", "boolean"), ("online_amount", "onlineamount", None), ("balance_amount", "balanceamount", None), ("card_amount", "cardamount", None), ("coupon_amount", "couponamount", None), ("recharge_card_amount", "rechargecardamount", None), ("gift_card_amount", "giftcardamount", None), ("prepay_money", "prepaymoney", None), ("consume_money", "consumemoney", None), ("goods_money", "goodsmoney", None), ("real_goods_money", "realgoodsmoney", None), ("table_charge_money", "tablechargemoney", None), ("service_money", "servicemoney", None), ("activity_discount", "activitydiscount", None), ("all_coupon_discount", "allcoupondiscount", None), ("goods_promotion_money", "goodspromotionmoney", None), ("assistant_promotion_money", "assistantpromotionmoney", None), ("assistant_pd_money", "assistantpdmoney", None), ("assistant_cx_money", "assistantcxmoney", None), ("assistant_manual_discount", "assistantmanualdiscount", None), ("coupon_sale_amount", "couponsaleamount", None), ("member_discount_amount", "memberdiscountamount", None), ("point_discount_price", "pointdiscountprice", None), ("point_discount_cost", "pointdiscountcost", None), ("adjust_amount", "adjustamount", None), ("rounding_amount", "roundingamount", None), ("operator_id", "operatorid", None), ("operator_name_snapshot", "operatorname", None), ("salesman_user_id", "salesmanuserid", None), ("salesman_name", "salesmanname", None), ("order_remark", "orderremark", None), ("table_id", "tableid", None), ("serial_number", "serialnumber", None), ("revoke_order_id", "revokeorderid", None), ("revoke_order_name", "revokeordername", None), ("revoke_time", "revoketime", None), ], } def get_task_code(self) -> str: """返回任务编码。""" return "DWD_LOAD_FROM_ODS" def extract(self, context: TaskContext) -> dict[str, Any]: """准备运行所需的上下文信息。""" return {"now": datetime.now()} def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]: """ 遍历映射关系,维度执行 SCD2 合并,事实表按时间增量插入。 说明: - 为避免长事务导致锁堆积/中断后遗留 idle-in-tx,本任务按“每张表一次事务”提交; - 单表失败会回滚该表并继续后续表,最终在结果中汇总错误信息。 """ now = extracted["now"] summary: List[Dict[str, Any]] = [] errors: List[Dict[str, Any]] = [] only_tables_cfg = self.config.get("dwd.only_tables") or [] # 也支持通过环境变量 DWD_ONLY_TABLES 传递(GUI 使用此方式) env_only = os.environ.get("DWD_ONLY_TABLES", "").strip() if env_only and not only_tables_cfg: only_tables_cfg = [t.strip() for t in env_only.split(",") if t.strip()] only_tables = {str(t).strip().lower() for t in only_tables_cfg if str(t).strip()} if only_tables_cfg else set() with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur: for dwd_table, ods_table in self.TABLE_MAP.items(): if only_tables and dwd_table.lower() not in only_tables and self._table_base(dwd_table).lower() not in only_tables: continue started = time.monotonic() self.logger.info("DWD 装载开始:%s <= %s", dwd_table, ods_table) try: dwd_cols = self._get_columns(cur, dwd_table) ods_cols = self._get_columns(cur, ods_table) if not dwd_cols: self.logger.warning("跳过 %s:未能获取 DWD 列信息", dwd_table) continue if self._table_base(dwd_table).startswith("dim_"): dim_counts = self._merge_dim(cur, dwd_table, ods_table, dwd_cols, ods_cols, now) self.db.conn.commit() summary.append({"table": dwd_table, "mode": "SCD2", **dim_counts}) else: dwd_types = self._get_column_types(cur, dwd_table, "billiards_dwd") ods_types = self._get_column_types(cur, ods_table, "billiards_ods") use_window = bool( self.config.get("run.window_override.start") and self.config.get("run.window_override.end") ) fact_counts = self._merge_fact_increment( cur, dwd_table, ods_table, dwd_cols, ods_cols, dwd_types, ods_types, window_start=context.window_start if use_window else None, window_end=context.window_end if use_window else None, ) self.db.conn.commit() summary.append({"table": dwd_table, "mode": "INCREMENT", **fact_counts}) elapsed = time.monotonic() - started self.logger.info("DWD 装载完成:%s,用时 %.2fs", dwd_table, elapsed) except Exception as exc: # noqa: BLE001 try: self.db.conn.rollback() except Exception: pass elapsed = time.monotonic() - started self.logger.exception("DWD 装载失败:%s,用时 %.2fs,err=%s", dwd_table, elapsed, exc) errors.append({"table": dwd_table, "error": str(exc)}) continue return {"tables": summary, "errors": errors} # ---------------------- 辅助方法 ---------------------- def _get_columns(self, cur, table: str) -> List[str]: """获取指定表的列名(小写)。""" schema, name = self._split_table_name(table, default_schema="billiards_dwd") cur.execute( """ SELECT column_name FROM information_schema.columns WHERE table_schema = %s AND table_name = %s """, (schema, name), ) return [r["column_name"].lower() for r in cur.fetchall()] def _get_primary_keys(self, cur, table: str) -> List[str]: """获取表的主键列名列表。""" schema, name = self._split_table_name(table, default_schema="billiards_dwd") cur.execute( """ SELECT kcu.column_name FROM information_schema.table_constraints tc JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name WHERE tc.table_schema = %s AND tc.table_name = %s AND tc.constraint_type = 'PRIMARY KEY' ORDER BY kcu.ordinal_position """, (schema, name), ) return [r["column_name"].lower() for r in cur.fetchall()] def _get_column_types(self, cur, table: str, default_schema: str) -> Dict[str, str]: """获取列的数据类型(information_schema.data_type)。""" schema, name = self._split_table_name(table, default_schema=default_schema) cur.execute( """ SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = %s AND table_name = %s """, (schema, name), ) return {r["column_name"].lower(): r["data_type"].lower() for r in cur.fetchall()} def _build_column_mapping( self, dwd_table: str, pk_cols: Sequence[str], ods_cols: Sequence[str] ) -> Dict[str, tuple[str, str | None]]: """合并显式 FACT_MAPPINGS 与主键兜底映射。""" mapping_entries = self.FACT_MAPPINGS.get(dwd_table, []) mapping: Dict[str, tuple[str, str | None]] = { dst.lower(): (src, cast_type) for dst, src, cast_type in mapping_entries } ods_set = {c.lower() for c in ods_cols} if "fetched_at" not in ods_set: self.logger.error("跳过 %s:ODS 表 %s 缺少 fetched_at 列", dwd_table, ods_table) return {"processed": 0, "inserted": 0, "updated": 0, "skipped": 0} self._log_missing_fetched_at(cur, ods_table) for pk in pk_cols: pk_lower = pk.lower() if pk_lower not in mapping and pk_lower not in ods_set and "id" in ods_set: mapping[pk_lower] = ("id", None) return mapping def _fetch_source_rows( self, cur, table: str, columns: Sequence[str], where_sql: str = "", params: Sequence[Any] = None ) -> List[Dict[str, Any]]: """从源表读取指定列,返回小写键的字典列表。""" schema, name = self._split_table_name(table, default_schema="billiards_ods") cols_sql = ", ".join(f'"{c}"' for c in columns) sql = f'SELECT {cols_sql} FROM "{schema}"."{name}" {where_sql}' cur.execute(sql, params or []) rows = [] for r in cur.fetchall(): rows.append({k.lower(): v for k, v in r.items()}) return rows def _expand_goods_category_rows(self, rows: list[Dict[str, Any]]) -> list[Dict[str, Any]]: """将分类表中的 categoryboxes 元素展开为子类记录。""" expanded: list[Dict[str, Any]] = [] for r in rows: expanded.append(r) boxes = r.get("categoryboxes") if isinstance(boxes, list): for child in boxes: if not isinstance(child, dict): continue child_row: Dict[str, Any] = {} # 继承父级的租户与业务大类信息 child_row["tenant_id"] = r.get("tenant_id") child_row["business_name"] = child.get("business_name", r.get("business_name")) child_row["tenant_goods_business_id"] = child.get( "tenant_goods_business_id", r.get("tenant_goods_business_id") ) # 合并子类字段 child_row.update(child) # 默认父子关系 child_row.setdefault("pid", r.get("id")) # 衍生层级/叶子标记 child_boxes = child_row.get("categoryboxes") if not isinstance(child_boxes, list): is_leaf = 1 else: is_leaf = 1 if len(child_boxes) == 0 else 0 child_row.setdefault("category_level", 2) child_row.setdefault("is_leaf", is_leaf) expanded.append(child_row) return expanded def _merge_dim( self, cur, dwd_table: str, ods_table: str, dwd_cols: Sequence[str], ods_cols: Sequence[str], now: datetime, ) -> Dict[str, int]: """ 维表合并策略: - 若主键包含 scd2 列(如 scd2_start_time/scd2_version),执行真正的 SCD2(关闭旧版+插入新版)。 - 否则(多数现有表主键仅为业务主键),执行 Type1 Upsert,避免重复键异常并保证可重复回放。 """ pk_cols = self._get_primary_keys(cur, dwd_table) if not pk_cols: raise ValueError(f"{dwd_table} 未配置主键,无法执行维表合并") scd_cols_present = any(c.lower() in self.SCD_COLS for c in dwd_cols) if scd_cols_present: return self._merge_dim_scd2(cur, dwd_table, ods_table, dwd_cols, ods_cols, now) return self._merge_dim_type1_upsert(cur, dwd_table, ods_table, dwd_cols, ods_cols, pk_cols, now) def _merge_dim_type1_upsert( self, cur, dwd_table: str, ods_table: str, dwd_cols: Sequence[str], ods_cols: Sequence[str], pk_cols: Sequence[str], now: datetime, ) -> Dict[str, int]: """维表 Type1 Upsert(主键冲突则更新),返回真实新增/更新计数。""" mapping = self._build_column_mapping(dwd_table, pk_cols, ods_cols) ods_set = {c.lower() for c in ods_cols} ods_table_sql = self._format_table(ods_table, "billiards_ods") select_exprs: list[str] = [] added: set[str] = set() for col in dwd_cols: lc = col.lower() if lc in self.SCD_COLS: continue if lc in mapping: src, cast_type = mapping[lc] select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"") added.add(lc) elif lc in ods_set: select_exprs.append(f'\"{lc}\" AS \"{lc}\"') added.add(lc) for pk in pk_cols: lc = pk.lower() if lc in added: continue if lc in mapping: src, cast_type = mapping[lc] select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"") elif lc in ods_set: select_exprs.append(f'\"{lc}\" AS \"{lc}\"') added.add(lc) if not select_exprs: return {"processed": 0, "inserted": 0, "updated": 0, "skipped": 0} order_col = self._pick_snapshot_order_column(ods_cols) business_keys = self._strip_scd2_keys(pk_cols) key_exprs: list[str] = [] for key in business_keys: lc = key.lower() if lc in mapping: src, cast_type = mapping[lc] key_exprs.append(self._cast_expr(src, cast_type)) elif lc in ods_set: key_exprs.append(f'"{lc}"') select_cols_sql = ", ".join(select_exprs) where_sql = self._append_where_condition("", '"fetched_at" IS NOT NULL') sql = self._latest_snapshot_select_sql( select_cols_sql, ods_table_sql, key_exprs, order_col, where_sql ) cur.execute(sql) rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()] if dwd_table == "billiards_dwd.dim_goods_category": rows = self._expand_goods_category_rows(rows) # 按主键去重 seen_pk: set[tuple[Any, ...]] = set() src_rows: list[Dict[str, Any]] = [] pk_lower = [c.lower() for c in pk_cols] for row in rows: pk_key = tuple(row.get(pk) for pk in pk_lower) if pk_key in seen_pk: continue if any(v is None for v in pk_key): self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_key))) continue seen_pk.add(pk_key) src_rows.append(row) if not src_rows: return {"processed": 0, "inserted": 0, "updated": 0, "skipped": 0} dwd_table_sql = self._format_table(dwd_table, "billiards_dwd") sorted_cols = [c.lower() for c in sorted(dwd_cols)] insert_cols_sql = ", ".join(f'\"{c}\"' for c in sorted_cols) def build_row(src_row: Dict[str, Any]) -> list[Any]: values: list[Any] = [] for c in sorted_cols: if c == "scd2_start_time": values.append(now) elif c == "scd2_end_time": values.append(datetime(9999, 12, 31, 0, 0, 0)) elif c == "scd2_is_current": values.append(1) elif c == "scd2_version": values.append(1) else: values.append(src_row.get(c)) return values pk_sql = ", ".join(f'\"{c.lower()}\"' for c in pk_cols) pk_lower_set = {c.lower() for c in pk_cols} set_exprs: list[str] = [] for c in sorted_cols: if c in pk_lower_set: continue if c == "scd2_start_time": set_exprs.append(f'\"{c}\" = COALESCE({dwd_table_sql}.\"{c}\", EXCLUDED.\"{c}\")') elif c == "scd2_version": set_exprs.append(f'\"{c}\" = COALESCE({dwd_table_sql}.\"{c}\", EXCLUDED.\"{c}\")') else: set_exprs.append(f'\"{c}\" = EXCLUDED.\"{c}\"') compare_cols = [c for c in sorted_cols if c not in pk_lower_set and c not in self.SCD_COLS] diff_exprs = [f'{dwd_table_sql}."{c}" IS DISTINCT FROM EXCLUDED."{c}"' for c in compare_cols] where_clause = f" WHERE {' OR '.join(diff_exprs)}" if diff_exprs else "" upsert_sql = ( f"INSERT INTO {dwd_table_sql} ({insert_cols_sql}) VALUES %s " f"ON CONFLICT ({pk_sql}) DO UPDATE SET {', '.join(set_exprs)}{where_clause} " f"RETURNING (xmax = 0) AS inserted" ) rows = execute_values(cur, upsert_sql, [build_row(r) for r in src_rows], page_size=500, fetch=True) inserted, updated = self._count_returning_flags(rows or []) processed = len(src_rows) skipped = max(0, processed - inserted - updated) return {"processed": processed, "inserted": inserted, "updated": updated, "skipped": skipped} def _merge_dim_scd2( self, cur, dwd_table: str, ods_table: str, dwd_cols: Sequence[str], ods_cols: Sequence[str], now: datetime, ) -> Dict[str, int]: """对维表执行 SCD2 合并:对比变更关闭旧版并插入新版。""" pk_cols = self._get_primary_keys(cur, dwd_table) if not pk_cols: raise ValueError(f"{dwd_table} 未配置主键,无法执行 SCD2 合并") business_keys = self._strip_scd2_keys(pk_cols) if not business_keys: raise ValueError(f"{dwd_table} primary key only contains SCD2 columns; cannot merge") mapping = self._build_column_mapping(dwd_table, business_keys, ods_cols) ods_set = {c.lower() for c in ods_cols} if "fetched_at" not in ods_set: self.logger.error("跳过 %s:ODS 表 %s 缺少 fetched_at 列", dwd_table, ods_table) return {"processed": 0, "inserted": 0, "updated": 0, "skipped": 0} self._log_missing_fetched_at(cur, ods_table) table_sql = self._format_table(ods_table, "billiards_ods") # 构造 SELECT 表达式,支持 JSON/expression 映射 select_exprs: list[str] = [] added: set[str] = set() for col in dwd_cols: lc = col.lower() if lc in self.SCD_COLS: continue if lc in mapping: src, cast_type = mapping[lc] select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"") added.add(lc) elif lc in ods_set: select_exprs.append(f'"{lc}" AS "{lc}"') added.add(lc) # 分类维度需要额外读取 categoryboxes 以展开子类 if dwd_table == "billiards_dwd.dim_goods_category" and "categoryboxes" not in added and "categoryboxes" in ods_set: select_exprs.append('"categoryboxes" AS "categoryboxes"') added.add("categoryboxes") # 主键兜底确保被选出 for pk in business_keys: lc = pk.lower() if lc not in added: if lc in mapping: src, cast_type = mapping[lc] select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"") elif lc in ods_set: select_exprs.append(f'"{lc}" AS "{lc}"') added.add(lc) if not select_exprs: return {"processed": 0, "inserted": 0, "updated": 0, "skipped": 0} order_col = self._pick_snapshot_order_column(ods_cols) key_exprs: list[str] = [] for key in business_keys: lc = key.lower() if lc in mapping: src, cast_type = mapping[lc] key_exprs.append(self._cast_expr(src, cast_type)) elif lc in ods_set: key_exprs.append(f'"{lc}"') select_cols_sql = ", ".join(select_exprs) where_sql = self._append_where_condition("", '"fetched_at" IS NOT NULL') sql = self._latest_snapshot_select_sql( select_cols_sql, table_sql, key_exprs, order_col, where_sql ) cur.execute(sql) rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()] # 特殊:分类维度展开子类 if dwd_table == "billiards_dwd.dim_goods_category": rows = self._expand_goods_category_rows(rows) # 归一化源行并按主键去重 seen_pk = set() src_rows_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {} for row in rows: mapped_row: Dict[str, Any] = {} for col in dwd_cols: lc = col.lower() if lc in self.SCD_COLS: continue value = row.get(lc) if value is None and lc in mapping: src, _ = mapping[lc] value = row.get(src.lower()) mapped_row[lc] = value pk_key = tuple(mapped_row.get(pk) for pk in business_keys) if pk_key in seen_pk: continue if any(v is None for v in pk_key): self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(business_keys, pk_key))) continue seen_pk.add(pk_key) src_rows_by_pk[pk_key] = mapped_row if not src_rows_by_pk: return {"processed": 0, "inserted": 0, "updated": 0, "skipped": 0} # 预加载当前版本(scd2_is_current=1),避免逐行 SELECT 造成大量 round-trip table_sql_dwd = self._format_table(dwd_table, "billiards_dwd") where_current = " AND ".join([f"COALESCE(scd2_is_current,1)=1"]) cur.execute(f"SELECT * FROM {table_sql_dwd} WHERE {where_current}") current_rows = cur.fetchall() or [] current_by_pk: dict[tuple[Any, ...], Dict[str, Any]] = {} for r in current_rows: rr = {k.lower(): v for k, v in r.items()} pk_key = tuple(rr.get(pk) for pk in business_keys) current_by_pk[pk_key] = rr # 计算需要关闭/插入的主键集合 to_close: list[tuple[Any, ...]] = [] to_insert: list[tuple[Dict[str, Any], int]] = [] for pk_key, incoming in src_rows_by_pk.items(): current = current_by_pk.get(pk_key) if current and not self._is_row_changed(current, incoming, dwd_cols): continue if current: version = (current.get("scd2_version") or 1) + 1 to_close.append(pk_key) else: version = 1 to_insert.append((incoming, version)) # 先关闭旧版本(同一批次统一 end_time) if to_close: self._close_current_dim_bulk(cur, dwd_table, business_keys, to_close, now) # 批量插入新版本 if to_insert: self._insert_dim_rows_bulk(cur, dwd_table, dwd_cols, to_insert, now) processed = len(src_rows_by_pk) updated = len(to_close) inserted = max(0, len(to_insert) - updated) skipped = max(0, processed - inserted - updated) return {"processed": processed, "inserted": inserted, "updated": updated, "skipped": skipped} def _close_current_dim_bulk( self, cur, table: str, pk_cols: Sequence[str], pk_keys: Sequence[tuple[Any, ...]], now: datetime, ) -> None: """批量关闭当前版本(scd2_is_current=0 + 填充结束时间)。""" table_sql = self._format_table(table, "billiards_dwd") if len(pk_cols) == 1: pk = pk_cols[0] ids = [k[0] for k in pk_keys] cur.execute( f'UPDATE {table_sql} SET scd2_end_time=%s, scd2_is_current=0 ' f'WHERE COALESCE(scd2_is_current,1)=1 AND "{pk}" = ANY(%s)', (now, ids), ) return # 复合主键:对“发生变更的键”逐条关闭(数量通常远小于全量行数) where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols) sql = ( f"UPDATE {table_sql} SET scd2_end_time=%s, scd2_is_current=0 " f"WHERE COALESCE(scd2_is_current,1)=1 AND {where_clause}" ) args_list = [(now, *pk_key) for pk_key in pk_keys] execute_batch(cur, sql, args_list, page_size=500) def _insert_dim_rows_bulk( self, cur, table: str, dwd_cols: Sequence[str], rows_with_version: Sequence[tuple[Dict[str, Any], int]], now: datetime, ) -> None: """批量插入新的 SCD2 版本行。""" sorted_cols = [c.lower() for c in sorted(dwd_cols)] insert_cols_sql = ", ".join(f'"{c}"' for c in sorted_cols) table_sql = self._format_table(table, "billiards_dwd") def build_row(src_row: Dict[str, Any], version: int) -> list[Any]: values: list[Any] = [] for c in sorted_cols: if c == "scd2_start_time": values.append(now) elif c == "scd2_end_time": values.append(datetime(9999, 12, 31, 0, 0, 0)) elif c == "scd2_is_current": values.append(1) elif c == "scd2_version": values.append(version) else: values.append(src_row.get(c)) return values values_rows = [build_row(r, ver) for r, ver in rows_with_version] insert_sql = f"INSERT INTO {table_sql} ({insert_cols_sql}) VALUES %s" execute_values(cur, insert_sql, values_rows, page_size=500) def _upsert_scd2_row( self, cur, dwd_table: str, dwd_cols: Sequence[str], pk_cols: Sequence[str], src_row: Dict[str, Any], now: datetime, ) -> bool: """SCD2 合并:若有变更则关闭旧版并插入新版本。""" pk_values = [src_row.get(pk) for pk in pk_cols] if any(v is None for v in pk_values): self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_values))) return False where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols) table_sql = self._format_table(dwd_table, "billiards_dwd") cur.execute( f"SELECT * FROM {table_sql} WHERE {where_clause} AND COALESCE(scd2_is_current,1)=1 LIMIT 1", pk_values, ) current = cur.fetchone() if current: current = {k.lower(): v for k, v in current.items()} if current and not self._is_row_changed(current, src_row, dwd_cols): return False if current: version = (current.get("scd2_version") or 1) + 1 self._close_current_dim(cur, dwd_table, pk_cols, pk_values, now) else: version = 1 self._insert_dim_row(cur, dwd_table, dwd_cols, src_row, now, version) return True def _close_current_dim(self, cur, table: str, pk_cols: Sequence[str], pk_values: Sequence[Any], now: datetime) -> None: """关闭当前版本,标记 scd2_is_current=0 并填充结束时间。""" set_sql = "scd2_end_time = %s, scd2_is_current = 0" where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols) table_sql = self._format_table(table, "billiards_dwd") cur.execute(f"UPDATE {table_sql} SET {set_sql} WHERE {where_clause} AND COALESCE(scd2_is_current,1)=1", [now, *pk_values]) def _insert_dim_row( self, cur, table: str, dwd_cols: Sequence[str], src_row: Dict[str, Any], now: datetime, version: int, ) -> None: """插入新的 SCD2 版本行。""" insert_cols: List[str] = [] placeholders: List[str] = [] values: List[Any] = [] for col in sorted(dwd_cols): lc = col.lower() insert_cols.append(f'"{lc}"') placeholders.append("%s") if lc == "scd2_start_time": values.append(now) elif lc == "scd2_end_time": values.append(datetime(9999, 12, 31, 0, 0, 0)) elif lc == "scd2_is_current": values.append(1) elif lc == "scd2_version": values.append(version) else: values.append(src_row.get(lc)) table_sql = self._format_table(table, "billiards_dwd") sql = f'INSERT INTO {table_sql} ({", ".join(insert_cols)}) VALUES ({", ".join(placeholders)})' cur.execute(sql, values) def _is_row_changed(self, current: Dict[str, Any], incoming: Dict[str, Any], dwd_cols: Sequence[str]) -> bool: """比较非 SCD2 列,判断是否存在变更。""" for col in dwd_cols: lc = col.lower() if lc in self.SCD_COLS: continue if not self._values_equal(current.get(lc), incoming.get(lc)): return True return False def _values_equal(self, current_val: Any, incoming_val: Any) -> bool: """Normalize common type mismatches (numeric/text, naive/aware datetime) before compare.""" current_val = self._normalize_empty(current_val) incoming_val = self._normalize_empty(incoming_val) if current_val is None and incoming_val is None: return True # 日期时间标准化(朴素时间 vs 时区感知时间) if isinstance(current_val, (datetime, date)) or isinstance(incoming_val, (datetime, date)): return self._normalize_datetime(current_val) == self._normalize_datetime(incoming_val) # 布尔值标准化 if self._looks_bool(current_val) or self._looks_bool(incoming_val): cur_bool = self._coerce_bool(current_val) inc_bool = self._coerce_bool(incoming_val) if cur_bool is not None and inc_bool is not None: return cur_bool == inc_bool # 数值标准化(字符串 vs 数值) if self._looks_numeric(current_val) or self._looks_numeric(incoming_val): cur_num = self._coerce_numeric(current_val) inc_num = self._coerce_numeric(incoming_val) if cur_num is not None and inc_num is not None: return cur_num == inc_num return current_val == incoming_val def _normalize_empty(self, value: Any) -> Any: if isinstance(value, str): stripped = value.strip() return None if stripped == "" else stripped return value def _normalize_datetime(self, value: Any) -> Any: if value is None: return None if isinstance(value, date) and not isinstance(value, datetime): value = datetime.combine(value, datetime.min.time()) if not isinstance(value, datetime): return value try: if value.tzinfo is None: return value.replace(tzinfo=self.tz) return value.astimezone(self.tz) except (OverflowError, OSError): # 极端日期值(如 9999-12-31)无法转换时区,直接返回原值 return value def _looks_numeric(self, value: Any) -> bool: if isinstance(value, (int, float, Decimal)) and not isinstance(value, bool): return True if isinstance(value, str): return bool(self._NUMERIC_RE.match(value.strip())) return False def _coerce_numeric(self, value: Any) -> Decimal | None: value = self._normalize_empty(value) if value is None: return None if isinstance(value, bool): return Decimal(int(value)) if isinstance(value, (int, float, Decimal)): try: return Decimal(str(value)) except InvalidOperation: return None if isinstance(value, str): s = value.strip() if not self._NUMERIC_RE.match(s): return None try: return Decimal(s) except InvalidOperation: return None return None def _looks_bool(self, value: Any) -> bool: if isinstance(value, bool): return True if isinstance(value, str): return value.strip().lower() in self._BOOL_STRINGS return False def _coerce_bool(self, value: Any) -> bool | None: value = self._normalize_empty(value) if value is None: return None if isinstance(value, bool): return value if isinstance(value, (int, Decimal)) and not isinstance(value, bool): return bool(int(value)) if isinstance(value, str): s = value.strip().lower() if s in {"true", "1", "yes", "y", "t"}: return True if s in {"false", "0", "no", "n", "f"}: return False return None @staticmethod def _count_returning_flags(rows: Iterable[Any]) -> tuple[int, int]: """Count inserted vs updated from RETURNING (xmax = 0) rows.""" inserted = 0 updated = 0 for row in rows: if isinstance(row, dict): flag = row.get("inserted") else: flag = row[0] if row else None if flag: inserted += 1 else: updated += 1 return inserted, updated def _merge_fact_increment( self, cur, dwd_table: str, ods_table: str, dwd_cols: Sequence[str], ods_cols: Sequence[str], dwd_types: Dict[str, str], ods_types: Dict[str, str], window_start: datetime | None = None, window_end: datetime | None = None, ) -> Dict[str, int]: """事实表按时间增量插入,返回真实新增/更新计数。""" mapping_entries = self.FACT_MAPPINGS.get(dwd_table) or [] mapping: Dict[str, tuple[str, str | None]] = { dst.lower(): (src, cast_type) for dst, src, cast_type in mapping_entries } ods_set = {c.lower() for c in ods_cols} if "fetched_at" not in ods_set: self.logger.error("跳过 %s:ODS 表 %s 缺少 fetched_at 列", dwd_table, ods_table) return {"inserted": 0, "updated": 0, "processed": 0} self._log_missing_fetched_at(cur, ods_table) snapshot_mode = "content_hash" in ods_set fact_upsert = bool(self.config.get("dwd.fact_upsert", True)) mapping_dest = [dst for dst, _, _ in mapping_entries] insert_cols: List[str] = list(mapping_dest) for col in dwd_cols: if col in self.SCD_COLS: continue if col in insert_cols: continue if col in ods_cols: insert_cols.append(col) pk_cols = self._get_primary_keys(cur, dwd_table) existing_lower = [c.lower() for c in insert_cols] for pk in pk_cols: pk_lower = pk.lower() if pk_lower in existing_lower: continue if pk_lower in ods_set: insert_cols.append(pk) existing_lower.append(pk_lower) elif "id" in ods_set: insert_cols.append(pk) existing_lower.append(pk_lower) mapping[pk_lower] = ("id", None) # 保持列顺序同时去重 seen_cols: set[str] = set() ordered_cols: list[str] = [] for col in insert_cols: lc = col.lower() if lc not in seen_cols: seen_cols.add(lc) ordered_cols.append(col) insert_cols = ordered_cols if not insert_cols: self.logger.warning("跳过 %s:未找到可插入的列", dwd_table) return 0 # 事实表统一按 fetched_at 做窗口/水位 order_col = "fetched_at" where_sql = "" params: List[Any] = [] dwd_table_sql = self._format_table(dwd_table, "billiards_dwd") ods_table_sql = self._format_table(ods_table, "billiards_ods") watermark = None if order_col and window_start and window_end: where_sql = f'WHERE "{order_col}" >= %s AND "{order_col}" < %s' params.extend([window_start, window_end]) elif order_col: watermark = self._get_fact_watermark(cur, dwd_table, ods_table, order_col, dwd_cols, ods_cols) where_sql = f'WHERE "{order_col}" > %s' params.append(watermark) where_sql = self._append_where_condition(where_sql, '"fetched_at" IS NOT NULL') default_cols = [c for c in insert_cols if c.lower() not in mapping] default_expr_map: Dict[str, str] = {} if default_cols: default_exprs = self._build_fact_select_exprs(default_cols, dwd_types, ods_types) default_expr_map = dict(zip(default_cols, default_exprs)) select_exprs: List[str] = [] for col in insert_cols: key = col.lower() if key in mapping: src, cast_type = mapping[key] select_exprs.append(self._cast_expr(src, cast_type)) else: select_exprs.append(default_expr_map[col]) select_cols_sql = ", ".join(select_exprs) insert_cols_sql = ", ".join(f'"{c}"' for c in insert_cols) if snapshot_mode and pk_cols: key_exprs: list[str] = [] for pk in pk_cols: pk_lower = pk.lower() if pk_lower in mapping: src, cast_type = mapping[pk_lower] key_exprs.append(self._cast_expr(src, cast_type)) elif pk_lower in ods_set: key_exprs.append(f'"{pk_lower}"') elif "id" in ods_set: key_exprs.append('"id"') select_sql = self._latest_snapshot_select_sql( select_cols_sql, ods_table_sql, key_exprs, order_col, where_sql, ) else: select_sql = ( f'SELECT {select_cols_sql} FROM {ods_table_sql} {where_sql}' ) sql = f'INSERT INTO {dwd_table_sql} ({insert_cols_sql}) {select_sql}' pk_cols = self._get_primary_keys(cur, dwd_table) if pk_cols: pk_sql = ", ".join(f'"{c}"' for c in pk_cols) pk_lower = {c.lower() for c in pk_cols} set_exprs = [f'"{c}" = EXCLUDED."{c}"' for c in insert_cols if c.lower() not in pk_lower] if snapshot_mode or fact_upsert: if set_exprs: compare_cols = [c for c in insert_cols if c.lower() not in pk_lower] diff_exprs = [f'{dwd_table_sql}."{c}" IS DISTINCT FROM EXCLUDED."{c}"' for c in compare_cols] where_clause = f" WHERE {' OR '.join(diff_exprs)}" if diff_exprs else "" sql += f" ON CONFLICT ({pk_sql}) DO UPDATE SET {', '.join(set_exprs)}{where_clause}" else: sql += f" ON CONFLICT ({pk_sql}) DO NOTHING" else: sql += f" ON CONFLICT ({pk_sql}) DO NOTHING" sql += " RETURNING (xmax = 0) AS inserted" cur.execute(sql, params) inserted = 0 updated = 0 while True: rows = cur.fetchmany(10000) if not rows: break ins, upd = self._count_returning_flags(rows) inserted += ins updated += upd # 回补缺失主键记录(基于 fetched_at 窗口/水位,避免全表扫描) missing_inserted = self._insert_missing_by_pk( cur, dwd_table, ods_table, dwd_cols, ods_cols, mapping, insert_cols, dwd_types, ods_types, order_col=order_col, window_start=window_start, window_end=window_end, watermark=watermark, ) inserted += missing_inserted return {"inserted": inserted, "updated": updated, "processed": inserted + updated} def _pick_order_column(self, dwd_table: str, dwd_cols: Iterable[str], ods_cols: Iterable[str]) -> str | None: """Pick an incremental order column that exists in both DWD and ODS.""" lower_cols = {c.lower() for c in dwd_cols} & {c.lower() for c in ods_cols} for candidate in self.FACT_ORDER_CANDIDATES: if candidate.lower() in lower_cols: return candidate.lower() return None def _get_fact_watermark( self, cur, dwd_table: str, ods_table: str, order_col: str, dwd_cols: Iterable[str], ods_cols: Iterable[str], ) -> Any: """Fetch incremental watermark; default from DWD, fallback from ODS join.""" dwd_table_sql = self._format_table(dwd_table, "billiards_dwd") ods_table_sql = self._format_table(ods_table, "billiards_ods") dwd_set = {c.lower() for c in dwd_cols} ods_set = {c.lower() for c in ods_cols} if order_col.lower() in dwd_set: cur.execute( f'SELECT COALESCE(MAX("{order_col}"), %s) FROM {dwd_table_sql}', ("1970-01-01",) ) row = cur.fetchone() or {} return list(row.values())[0] if row else "1970-01-01" pk_cols = self._get_primary_keys(cur, dwd_table) if not pk_cols or order_col.lower() not in ods_set: return "1970-01-01" join_cond = " AND ".join(f'd."{pk}" = o."{pk}"' for pk in pk_cols if pk.lower() in ods_set) if not join_cond: return "1970-01-01" cur.execute( f'SELECT COALESCE(MAX(o."{order_col}"), %s) FROM {ods_table_sql} o JOIN {dwd_table_sql} d ON {join_cond}', ("1970-01-01",), ) row = cur.fetchone() or {} return list(row.values())[0] if row else "1970-01-01" def _insert_missing_by_pk( self, cur, dwd_table: str, ods_table: str, dwd_cols: Sequence[str], ods_cols: Sequence[str], mapping: Dict[str, tuple[str, str | None]], insert_cols: Sequence[str], dwd_types: Dict[str, str], ods_types: Dict[str, str], order_col: str | None = None, window_start: datetime | None = None, window_end: datetime | None = None, watermark: Any | None = None, ) -> int: """Backfill missing PK rows for facts that can receive late data.""" pk_cols = self._get_primary_keys(cur, dwd_table) if not pk_cols: return 0 ods_set = {c.lower() for c in ods_cols} dwd_table_sql = self._format_table(dwd_table, "billiards_dwd") ods_table_sql = self._format_table(ods_table, "billiards_ods") join_pairs = [] for pk in pk_cols: pk_lower = pk.lower() if pk_lower in mapping: src, _ = mapping[pk_lower] elif pk_lower in ods_set: src = pk elif "id" in ods_set: src = "id" else: src = None if not src: return 0 join_pairs.append((pk, src)) join_cond = " AND ".join( f'd."{pk}" = o."{src}"' for pk, src in join_pairs ) null_cond = " AND ".join(f'd."{pk}" IS NULL' for pk, _ in join_pairs) # 类型转换需要的类型集合 numeric_types = {"integer", "bigint", "smallint", "numeric", "double precision", "real", "decimal"} text_types = {"text", "character varying", "varchar"} select_exprs = [] for col in insert_cols: key = col.lower() if key in mapping: src, cast_type = mapping[key] if src.isidentifier(): expr = self._cast_expr(f'o."{src}"', cast_type) else: expr = self._cast_expr(src, cast_type) select_exprs.append(expr) elif key in ods_set: # 检查是否需要类型转换 (ODS text -> DWD numeric) d_type = dwd_types.get(col) o_type = ods_types.get(col) if d_type in numeric_types and o_type in text_types: select_exprs.append(f'CAST(NULLIF(CAST(o."{col}" AS text), \'\') AS {d_type})') else: select_exprs.append(f'o."{col}"') else: select_exprs.append("NULL") select_cols_sql = ", ".join(select_exprs) insert_cols_sql = ", ".join(f'"{c}"' for c in insert_cols) where_filters: list[str] = [] params: list[Any] = [] if order_col and window_start and window_end: where_filters.append(f'o."{order_col}" >= %s AND o."{order_col}" < %s') params.extend([window_start, window_end]) elif order_col and watermark is not None: where_filters.append(f'o."{order_col}" > %s') params.append(watermark) if order_col: where_filters.append(f'o."{order_col}" IS NOT NULL') extra_where = f" AND {' AND '.join(where_filters)}" if where_filters else "" sql = ( f'INSERT INTO {dwd_table_sql} ({insert_cols_sql}) ' f'SELECT {select_cols_sql} ' f'FROM {ods_table_sql} o ' f'LEFT JOIN {dwd_table_sql} d ON {join_cond} ' f'WHERE {null_cond}{extra_where}' ) pk_sql = ", ".join(f'"{c}"' for c in pk_cols) sql += f" ON CONFLICT ({pk_sql}) DO NOTHING" cur.execute(sql, params) return cur.rowcount def _build_fact_select_exprs( self, insert_cols: Sequence[str], dwd_types: Dict[str, str], ods_types: Dict[str, str], ) -> List[str]: """构造事实表 SELECT 列表,需要时做类型转换。""" numeric_types = {"integer", "bigint", "smallint", "numeric", "double precision", "real", "decimal"} text_types = {"text", "character varying", "varchar"} exprs = [] for col in insert_cols: d_type = dwd_types.get(col) o_type = ods_types.get(col) if d_type in numeric_types and o_type in text_types: exprs.append(f"CAST(NULLIF(CAST(\"{col}\" AS text), '') AS numeric):: {d_type}") else: exprs.append(f'"{col}"') return exprs def _split_table_name(self, name: str, default_schema: str) -> tuple[str, str]: """拆分 schema.table,若无 schema 则补默认 schema。""" parts = name.split(".") if len(parts) == 2: return parts[0], parts[1].lower() return default_schema, name.lower() def _table_base(self, name: str) -> str: """获取不含 schema 的表名。""" return name.split(".")[-1] def _format_table(self, name: str, default_schema: str) -> str: """返回带引号的 schema.table 名称。""" schema, table = self._split_table_name(name, default_schema) return f'"{schema}"."{table}"' def _cast_expr(self, col: str, cast_type: str | None) -> str: """构造带可选 CAST 的列表达式。""" if col.upper() == "NULL": base = "NULL" else: is_expr = not col.isidentifier() or "->" in col or "#>>" in col or "::" in col or "'" in col base = col if is_expr else f'"{col}"' if cast_type: cast_lower = cast_type.lower() if cast_lower in {"bigint", "integer", "numeric", "decimal"}: return f"CAST(NULLIF(CAST({base} AS text), '') AS numeric):: {cast_type}" if cast_lower == "timestamptz": return f"({base})::timestamptz" return f"{base}::{cast_type}" return base # AI_CHANGELOG: # - 日期: 2026-02-14 # - Prompt: P20260214-023000 — "settlement_records 的 settlelist 和 payload 数据重复,删掉 ODS 此字段" # - 直接原因: ODS 层 settlelist 列被删除后,DWD 加载映射需改为从 payload 提取 settleList # - 变更摘要: FACT_MAPPINGS 中 dwd_settlement_head_ex 的 settle_list 映射从 ("settlelist", None) 改为 ("payload->'settleList'", None) # - 风险与验证: payload IS NULL 的行 settle_list 将为 NULL;验证:确认 payload->'settleList' 在 settlement_records 中有 54937 行非空 # AI_CHANGELOG: # - 日期: 2026-02-14 # - Prompt: P20260214-040000 — "dwd_settlement_head_ex.settle_list 也没有必要保留了" # - 直接原因: settle_list 列与 ODS payload 中的 settleList 完全冗余,DWD 层无需存储该 jsonb 副本 # - 变更摘要: 删除 FACT_MAPPINGS 中 dwd_settlement_head_ex 的 settle_list 映射行及相关 CHANGE 注释 # - 风险与验证: DWD 装载不再写入 settle_list;验证:information_schema 确认列已删除,DWD_LOAD_FROM_ODS 试运行无报错