# -*- coding: utf-8 -*- """DWD 装载任务:从 ODS 增量写入 DWD(维度 SCD2,事实按时间增量)。""" from __future__ import annotations from datetime import datetime from typing import Any, Dict, Iterable, List, Sequence from psycopg2.extras import RealDictCursor from .base_task import BaseTask, TaskContext class DwdLoadTask(BaseTask): """负责 DWD 装载:维度表做 SCD2 合并,事实表按时间增量写入。""" # DWD -> ODS 表映射(ODS 表名已与示例 JSON 前缀统一) TABLE_MAP: dict[str, str] = { # 维度 # 门店:改用台费流水中的 siteprofile 快照,补齐 org/地址等字段 "billiards_dwd.dim_site": "billiards_ods.table_fee_transactions", "billiards_dwd.dim_site_ex": "billiards_ods.table_fee_transactions", "billiards_dwd.dim_table": "billiards_ods.site_tables_master", "billiards_dwd.dim_table_ex": "billiards_ods.site_tables_master", "billiards_dwd.dim_assistant": "billiards_ods.assistant_accounts_master", "billiards_dwd.dim_assistant_ex": "billiards_ods.assistant_accounts_master", "billiards_dwd.dim_member": "billiards_ods.member_profiles", "billiards_dwd.dim_member_ex": "billiards_ods.member_profiles", "billiards_dwd.dim_member_card_account": "billiards_ods.member_stored_value_cards", "billiards_dwd.dim_member_card_account_ex": "billiards_ods.member_stored_value_cards", "billiards_dwd.dim_tenant_goods": "billiards_ods.tenant_goods_master", "billiards_dwd.dim_tenant_goods_ex": "billiards_ods.tenant_goods_master", "billiards_dwd.dim_store_goods": "billiards_ods.store_goods_master", "billiards_dwd.dim_store_goods_ex": "billiards_ods.store_goods_master", "billiards_dwd.dim_goods_category": "billiards_ods.stock_goods_category_tree", "billiards_dwd.dim_groupbuy_package": "billiards_ods.group_buy_packages", "billiards_dwd.dim_groupbuy_package_ex": "billiards_ods.group_buy_packages", # 事实 "billiards_dwd.dwd_settlement_head": "billiards_ods.settlement_records", "billiards_dwd.dwd_settlement_head_ex": "billiards_ods.settlement_records", "billiards_dwd.dwd_table_fee_log": "billiards_ods.table_fee_transactions", "billiards_dwd.dwd_table_fee_log_ex": "billiards_ods.table_fee_transactions", "billiards_dwd.dwd_table_fee_adjust": "billiards_ods.table_fee_discount_records", "billiards_dwd.dwd_table_fee_adjust_ex": "billiards_ods.table_fee_discount_records", "billiards_dwd.dwd_store_goods_sale": "billiards_ods.store_goods_sales_records", "billiards_dwd.dwd_store_goods_sale_ex": "billiards_ods.store_goods_sales_records", "billiards_dwd.dwd_assistant_service_log": "billiards_ods.assistant_service_records", "billiards_dwd.dwd_assistant_service_log_ex": "billiards_ods.assistant_service_records", "billiards_dwd.dwd_assistant_trash_event": "billiards_ods.assistant_cancellation_records", "billiards_dwd.dwd_assistant_trash_event_ex": "billiards_ods.assistant_cancellation_records", "billiards_dwd.dwd_member_balance_change": "billiards_ods.member_balance_changes", "billiards_dwd.dwd_member_balance_change_ex": "billiards_ods.member_balance_changes", "billiards_dwd.dwd_groupbuy_redemption": "billiards_ods.group_buy_redemption_records", "billiards_dwd.dwd_groupbuy_redemption_ex": "billiards_ods.group_buy_redemption_records", "billiards_dwd.dwd_platform_coupon_redemption": "billiards_ods.platform_coupon_redemption_records", "billiards_dwd.dwd_platform_coupon_redemption_ex": "billiards_ods.platform_coupon_redemption_records", "billiards_dwd.dwd_recharge_order": "billiards_ods.recharge_settlements", "billiards_dwd.dwd_recharge_order_ex": "billiards_ods.recharge_settlements", "billiards_dwd.dwd_payment": "billiards_ods.payment_transactions", "billiards_dwd.dwd_refund": "billiards_ods.refund_transactions", "billiards_dwd.dwd_refund_ex": "billiards_ods.refund_transactions", } SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"} FACT_ORDER_CANDIDATES = [ "fetched_at", "pay_time", "create_time", "update_time", "occur_time", "settle_time", "start_use_time", ] # 特殊列映射:dwd 列名 -> 源列表达式(可选 CAST) FACT_MAPPINGS: dict[str, list[tuple[str, str, str | None]]] = { # 维度表(补齐主键/字段差异) "billiards_dwd.dim_site": [ ("org_id", "siteprofile->>'org_id'", None), ("shop_name", "siteprofile->>'shop_name'", None), ("site_label", "siteprofile->>'site_label'", None), ("full_address", "siteprofile->>'full_address'", None), ("address", "siteprofile->>'address'", None), ("longitude", "siteprofile->>'longitude'", "numeric"), ("latitude", "siteprofile->>'latitude'", "numeric"), ("tenant_site_region_id", "siteprofile->>'tenant_site_region_id'", None), ("business_tel", "siteprofile->>'business_tel'", None), ("site_type", "siteprofile->>'site_type'", None), ("shop_status", "siteprofile->>'shop_status'", None), ("tenant_id", "siteprofile->>'tenant_id'", None), ], "billiards_dwd.dim_site_ex": [ ("auto_light", "siteprofile->>'auto_light'", None), ("attendance_enabled", "siteprofile->>'attendance_enabled'", None), ("attendance_distance", "siteprofile->>'attendance_distance'", None), ("prod_env", "siteprofile->>'prod_env'", None), ("light_status", "siteprofile->>'light_status'", None), ("light_type", "siteprofile->>'light_type'", None), ("light_token", "siteprofile->>'light_token'", None), ("address", "siteprofile->>'address'", None), ("avatar", "siteprofile->>'avatar'", None), ("wifi_name", "siteprofile->>'wifi_name'", None), ("wifi_password", "siteprofile->>'wifi_password'", None), ("customer_service_qrcode", "siteprofile->>'customer_service_qrcode'", None), ("customer_service_wechat", "siteprofile->>'customer_service_wechat'", None), ("fixed_pay_qrcode", "siteprofile->>'fixed_pay_qrCode'", None), ("longitude", "siteprofile->>'longitude'", "numeric"), ("latitude", "siteprofile->>'latitude'", "numeric"), ("tenant_site_region_id", "siteprofile->>'tenant_site_region_id'", None), ("site_type", "siteprofile->>'site_type'", None), ("site_label", "siteprofile->>'site_label'", None), ("shop_status", "siteprofile->>'shop_status'", None), ("create_time", "siteprofile->>'create_time'", "timestamptz"), ("update_time", "siteprofile->>'update_time'", "timestamptz"), ], "billiards_dwd.dim_table": [ ("table_id", "id", None), ("site_table_area_name", "areaname", None), ("tenant_table_area_id", "site_table_area_id", None), ], "billiards_dwd.dim_table_ex": [ ("table_id", "id", None), ("table_cloth_use_time", "table_cloth_use_time", None), ], "billiards_dwd.dim_assistant": [("assistant_id", "id", None), ("user_id", "staff_id", None)], "billiards_dwd.dim_assistant_ex": [ ("assistant_id", "id", None), ("introduce", "introduce", None), ("group_name", "group_name", None), ("light_equipment_id", "light_equipment_id", None), ], "billiards_dwd.dim_member": [("member_id", "id", None)], "billiards_dwd.dim_member_ex": [ ("member_id", "id", None), ("register_site_name", "site_name", None), ], "billiards_dwd.dim_member_card_account": [("member_card_id", "id", None)], "billiards_dwd.dim_member_card_account_ex": [ ("member_card_id", "id", None), ("tenant_name", "tenantname", None), ("tenantavatar", "tenantavatar", None), ("card_no", "card_no", None), ("bind_password", "bind_password", None), ("use_scene", "use_scene", None), ("tableareaid", "tableareaid", None), ("goodscategoryid", "goodscategoryid", None), ], "billiards_dwd.dim_tenant_goods": [ ("tenant_goods_id", "id", None), ("category_name", "categoryname", None), ], "billiards_dwd.dim_tenant_goods_ex": [ ("tenant_goods_id", "id", None), ("remark_name", "remark_name", None), ("goods_bar_code", "goods_bar_code", None), ("commodity_code_list", "commodity_code", None), ("is_in_site", "isinsite", "boolean"), ], "billiards_dwd.dim_store_goods": [ ("site_goods_id", "id", None), ("category_level1_name", "onecategoryname", None), ("category_level2_name", "twocategoryname", None), ("created_at", "create_time", None), ("updated_at", "update_time", None), ("avg_monthly_sales", "average_monthly_sales", None), ("batch_stock_qty", "stock", None), ("sale_qty", "sale_num", None), ("total_sales_qty", "total_sales", None), ], "billiards_dwd.dim_store_goods_ex": [ ("site_goods_id", "id", None), ("goods_barcode", "goods_bar_code", None), ("stock_qty", "stock", None), ("stock_secondary_qty", "stock_a", None), ("safety_stock_qty", "safe_stock", None), ("site_name", "sitename", None), ("goods_cover_url", "goods_cover", None), ("provisional_total_cost", "total_purchase_cost", None), ("is_discountable", "able_discount", None), ("freeze_status", "freeze", None), ("remark", "remark", None), ("days_on_shelf", "days_available", None), ("sort_order", "sort", None), ], "billiards_dwd.dim_goods_category": [ ("category_id", "id", None), ("tenant_id", "tenant_id", None), ("category_name", "category_name", None), ("alias_name", "alias_name", None), ("parent_category_id", "pid", None), ("business_name", "business_name", None), ("tenant_goods_business_id", "tenant_goods_business_id", None), ("sort_order", "sort", None), ("open_salesman", "open_salesman", None), ("is_warehousing", "is_warehousing", None), ("category_level", "CASE WHEN pid = 0 THEN 1 ELSE 2 END", None), ("is_leaf", "CASE WHEN categoryboxes IS NULL OR jsonb_array_length(categoryboxes)=0 THEN 1 ELSE 0 END", None), ], "billiards_dwd.dim_groupbuy_package": [ ("groupbuy_package_id", "id", None), ("package_template_id", "package_id", None), ("coupon_face_value", "coupon_money", None), ("duration_seconds", "duration", None), ], "billiards_dwd.dim_groupbuy_package_ex": [ ("groupbuy_package_id", "id", None), ("table_area_id", "table_area_id", None), ("tenant_table_area_id", "tenant_table_area_id", None), ("usable_range", "usable_range", None), ("table_area_id_list", "table_area_id_list", None), ("package_type", "type", None), ], # 事实表主键及关键差异列 "billiards_dwd.dwd_table_fee_log": [("table_fee_log_id", "id", None)], "billiards_dwd.dwd_table_fee_log_ex": [ ("table_fee_log_id", "id", None), ("salesman_name", "salesman_name", None), ], "billiards_dwd.dwd_table_fee_adjust": [ ("table_fee_adjust_id", "id", None), ("table_id", "site_table_id", None), ("table_area_id", "tenant_table_area_id", None), ("table_area_name", "tableprofile->>'table_area_name'", None), ("adjust_time", "create_time", None), ], "billiards_dwd.dwd_table_fee_adjust_ex": [ ("table_fee_adjust_id", "id", None), ("ledger_name", "ledger_name", None), ], "billiards_dwd.dwd_store_goods_sale": [("store_goods_sale_id", "id", None), ("discount_price", "discount_money", None)], "billiards_dwd.dwd_store_goods_sale_ex": [ ("store_goods_sale_id", "id", None), ("option_value_name", "option_value_name", None), ("open_salesman_flag", "opensalesman", "integer"), ("salesman_name", "salesman_name", None), ("salesman_org_id", "sales_man_org_id", None), ("legacy_order_goods_id", "ordergoodsid", None), ("site_name", "sitename", None), ("legacy_site_id", "siteid", None), ], "billiards_dwd.dwd_assistant_service_log": [ ("assistant_service_id", "id", None), ("assistant_no", "assistantno", None), ("site_assistant_id", "order_assistant_id", None), ("level_name", "levelname", None), ("skill_name", "skillname", None), ], "billiards_dwd.dwd_assistant_service_log_ex": [ ("assistant_service_id", "id", None), ("assistant_name", "assistantname", None), ("ledger_group_name", "ledger_group_name", None), ("trash_applicant_name", "trash_applicant_name", None), ("trash_reason", "trash_reason", None), ("salesman_name", "salesman_name", None), ("table_name", "tablename", None), ], "billiards_dwd.dwd_assistant_trash_event": [ ("assistant_trash_event_id", "id", None), ("assistant_no", "assistantname", None), ("abolish_amount", "assistantabolishamount", None), ("charge_minutes_raw", "pdchargeminutes", None), ("site_id", "siteid", None), ("table_id", "tableid", None), ("table_area_id", "tableareaid", None), ("assistant_name", "assistantname", None), ("trash_reason", "trashreason", None), ("create_time", "createtime", None), ], "billiards_dwd.dwd_assistant_trash_event_ex": [ ("assistant_trash_event_id", "id", None), ("table_area_name", "tablearea", None), ("table_name", "tablename", None), ], "billiards_dwd.dwd_member_balance_change": [ ("balance_change_id", "id", None), ("balance_before", "before", None), ("change_amount", "account_data", None), ("balance_after", "after", None), ("card_type_name", "membercardtypename", None), ("change_time", "create_time", None), ("member_name", "membername", None), ("member_mobile", "membermobile", None), ], "billiards_dwd.dwd_member_balance_change_ex": [ ("balance_change_id", "id", None), ("pay_site_name", "paysitename", None), ("register_site_name", "registersitename", None), ], "billiards_dwd.dwd_groupbuy_redemption": [("redemption_id", "id", None)], "billiards_dwd.dwd_groupbuy_redemption_ex": [ ("redemption_id", "id", None), ("table_area_name", "tableareaname", None), ("site_name", "sitename", None), ("table_name", "tablename", None), ("goods_option_price", "goodsoptionprice", None), ("salesman_name", "salesman_name", None), ("salesman_org_id", "sales_man_org_id", None), ("ledger_group_name", "ledger_group_name", None), ], "billiards_dwd.dwd_platform_coupon_redemption": [("platform_coupon_redemption_id", "id", None)], "billiards_dwd.dwd_platform_coupon_redemption_ex": [ ("platform_coupon_redemption_id", "id", None), ("coupon_cover", "coupon_cover", None), ], "billiards_dwd.dwd_payment": [("payment_id", "id", None), ("pay_date", "pay_time", "date")], "billiards_dwd.dwd_refund": [("refund_id", "id", None)], "billiards_dwd.dwd_refund_ex": [ ("refund_id", "id", None), ("tenant_name", "tenantname", None), ("channel_payer_id", "channel_payer_id", None), ("channel_pay_no", "channel_pay_no", None), ], # 结算头:settlement_records(源列为小写驼峰/无下划线,需要显式映射) "billiards_dwd.dwd_settlement_head": [ ("order_settle_id", "id", None), ("tenant_id", "tenantid", None), ("site_id", "siteid", None), ("site_name", "sitename", None), ("table_id", "tableid", None), ("settle_name", "settlename", None), ("order_trade_no", "settlerelateid", None), ("create_time", "createtime", None), ("pay_time", "paytime", None), ("settle_type", "settletype", None), ("revoke_order_id", "revokeorderid", None), ("member_id", "memberid", None), ("member_name", "membername", None), ("member_phone", "memberphone", None), ("member_card_account_id", "tenantmembercardid", None), ("member_card_type_name", "membercardtypename", None), ("is_bind_member", "isbindmember", None), ("member_discount_amount", "memberdiscountamount", None), ("consume_money", "consumemoney", None), ("table_charge_money", "tablechargemoney", None), ("goods_money", "goodsmoney", None), ("real_goods_money", "realgoodsmoney", None), ("assistant_pd_money", "assistantpdmoney", None), ("assistant_cx_money", "assistantcxmoney", None), ("adjust_amount", "adjustamount", None), ("pay_amount", "payamount", None), ("balance_amount", "balanceamount", None), ("recharge_card_amount", "rechargecardamount", None), ("gift_card_amount", "giftcardamount", None), ("coupon_amount", "couponamount", None), ("rounding_amount", "roundingamount", None), ("point_amount", "pointamount", None), ], "billiards_dwd.dwd_settlement_head_ex": [ ("order_settle_id", "id", None), ("serial_number", "serialnumber", None), ("settle_status", "settlestatus", None), ("can_be_revoked", "canberevoked", "boolean"), ("revoke_order_name", "revokeordername", None), ("revoke_time", "revoketime", None), ("is_first_order", "isfirst", "boolean"), ("service_money", "servicemoney", None), ("cash_amount", "cashamount", None), ("card_amount", "cardamount", None), ("online_amount", "onlineamount", None), ("refund_amount", "refundamount", None), ("prepay_money", "prepaymoney", None), ("payment_method", "paymentmethod", None), ("coupon_sale_amount", "couponsaleamount", None), ("all_coupon_discount", "allcoupondiscount", None), ("goods_promotion_money", "goodspromotionmoney", None), ("assistant_promotion_money", "assistantpromotionmoney", None), ("activity_discount", "activitydiscount", None), ("assistant_manual_discount", "assistantmanualdiscount", None), ("point_discount_price", "pointdiscountprice", None), ("point_discount_cost", "pointdiscountcost", None), ("is_use_coupon", "isusecoupon", "boolean"), ("is_use_discount", "isusediscount", "boolean"), ("is_activity", "isactivity", "boolean"), ("operator_name", "operatorname", None), ("salesman_name", "salesmanname", None), ("order_remark", "orderremark", None), ("operator_id", "operatorid", None), ("salesman_user_id", "salesmanuserid", None), ], # 充值结算:recharge_settlements(字段风格同 settlement_records) "billiards_dwd.dwd_recharge_order": [ ("recharge_order_id", "id", None), ("tenant_id", "tenantid", None), ("site_id", "siteid", None), ("member_id", "memberid", None), ("member_name_snapshot", "membername", None), ("member_phone_snapshot", "memberphone", None), ("tenant_member_card_id", "tenantmembercardid", None), ("member_card_type_name", "membercardtypename", None), ("settle_relate_id", "settlerelateid", None), ("settle_type", "settletype", None), ("settle_name", "settlename", None), ("is_first", "isfirst", None), ("pay_amount", "payamount", None), ("refund_amount", "refundamount", None), ("point_amount", "pointamount", None), ("cash_amount", "cashamount", None), ("payment_method", "paymentmethod", None), ("create_time", "createtime", None), ("pay_time", "paytime", None), ], "billiards_dwd.dwd_recharge_order_ex": [ ("recharge_order_id", "id", None), ("site_name_snapshot", "sitename", None), ("salesman_name", "salesmanname", None), ("order_remark", "orderremark", None), ("revoke_order_name", "revokeordername", None), ("settle_status", "settlestatus", None), ("is_bind_member", "isbindmember", "boolean"), ("is_activity", "isactivity", "boolean"), ("is_use_coupon", "isusecoupon", "boolean"), ("is_use_discount", "isusediscount", "boolean"), ("can_be_revoked", "canberevoked", "boolean"), ("online_amount", "onlineamount", None), ("balance_amount", "balanceamount", None), ("card_amount", "cardamount", None), ("coupon_amount", "couponamount", None), ("recharge_card_amount", "rechargecardamount", None), ("gift_card_amount", "giftcardamount", None), ("prepay_money", "prepaymoney", None), ("consume_money", "consumemoney", None), ("goods_money", "goodsmoney", None), ("real_goods_money", "realgoodsmoney", None), ("table_charge_money", "tablechargemoney", None), ("service_money", "servicemoney", None), ("activity_discount", "activitydiscount", None), ("all_coupon_discount", "allcoupondiscount", None), ("goods_promotion_money", "goodspromotionmoney", None), ("assistant_promotion_money", "assistantpromotionmoney", None), ("assistant_pd_money", "assistantpdmoney", None), ("assistant_cx_money", "assistantcxmoney", None), ("assistant_manual_discount", "assistantmanualdiscount", None), ("coupon_sale_amount", "couponsaleamount", None), ("member_discount_amount", "memberdiscountamount", None), ("point_discount_price", "pointdiscountprice", None), ("point_discount_cost", "pointdiscountcost", None), ("adjust_amount", "adjustamount", None), ("rounding_amount", "roundingamount", None), ("operator_id", "operatorid", None), ("operator_name_snapshot", "operatorname", None), ("salesman_user_id", "salesmanuserid", None), ("salesman_name", "salesmanname", None), ("order_remark", "orderremark", None), ("table_id", "tableid", None), ("serial_number", "serialnumber", None), ("revoke_order_id", "revokeorderid", None), ("revoke_order_name", "revokeordername", None), ("revoke_time", "revoketime", None), ], } def get_task_code(self) -> str: """返回任务编码。""" return "DWD_LOAD_FROM_ODS" def extract(self, context: TaskContext) -> dict[str, Any]: """准备运行所需的上下文信息。""" return {"now": datetime.now()} def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]: """遍历映射关系,维度执行 SCD2 合并,事实表按时间增量插入。""" now = extracted["now"] summary: List[Dict[str, Any]] = [] with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur: for dwd_table, ods_table in self.TABLE_MAP.items(): dwd_cols = self._get_columns(cur, dwd_table) ods_cols = self._get_columns(cur, ods_table) if not dwd_cols: self.logger.warning("跳过 %s,未能获取 DWD 列信息", dwd_table) continue if self._table_base(dwd_table).startswith("dim_"): processed = self._merge_dim_scd2(cur, dwd_table, ods_table, dwd_cols, ods_cols, now) summary.append({"table": dwd_table, "mode": "SCD2", "processed": processed}) else: dwd_types = self._get_column_types(cur, dwd_table, "billiards_dwd") ods_types = self._get_column_types(cur, ods_table, "billiards_ods") inserted = self._merge_fact_increment( cur, dwd_table, ods_table, dwd_cols, ods_cols, dwd_types, ods_types ) summary.append({"table": dwd_table, "mode": "INCREMENT", "inserted": inserted}) self.db.conn.commit() return {"tables": summary} # ---------------------- helpers ---------------------- def _get_columns(self, cur, table: str) -> List[str]: """获取指定表的列名(小写)。""" schema, name = self._split_table_name(table, default_schema="billiards_dwd") cur.execute( """ SELECT column_name FROM information_schema.columns WHERE table_schema = %s AND table_name = %s """, (schema, name), ) return [r["column_name"].lower() for r in cur.fetchall()] def _get_primary_keys(self, cur, table: str) -> List[str]: """获取表的主键列名列表。""" schema, name = self._split_table_name(table, default_schema="billiards_dwd") cur.execute( """ SELECT kcu.column_name FROM information_schema.table_constraints tc JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name AND tc.table_schema = kcu.table_schema AND tc.table_name = kcu.table_name WHERE tc.table_schema = %s AND tc.table_name = %s AND tc.constraint_type = 'PRIMARY KEY' ORDER BY kcu.ordinal_position """, (schema, name), ) return [r["column_name"].lower() for r in cur.fetchall()] def _get_column_types(self, cur, table: str, default_schema: str) -> Dict[str, str]: """获取列的数据类型(information_schema.data_type)。""" schema, name = self._split_table_name(table, default_schema=default_schema) cur.execute( """ SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = %s AND table_name = %s """, (schema, name), ) return {r["column_name"].lower(): r["data_type"].lower() for r in cur.fetchall()} def _build_column_mapping( self, dwd_table: str, pk_cols: Sequence[str], ods_cols: Sequence[str] ) -> Dict[str, tuple[str, str | None]]: """合并显式 FACT_MAPPINGS 与主键兜底映射。""" mapping_entries = self.FACT_MAPPINGS.get(dwd_table, []) mapping: Dict[str, tuple[str, str | None]] = { dst.lower(): (src, cast_type) for dst, src, cast_type in mapping_entries } ods_set = {c.lower() for c in ods_cols} for pk in pk_cols: pk_lower = pk.lower() if pk_lower not in mapping and pk_lower not in ods_set and "id" in ods_set: mapping[pk_lower] = ("id", None) return mapping def _fetch_source_rows( self, cur, table: str, columns: Sequence[str], where_sql: str = "", params: Sequence[Any] = None ) -> List[Dict[str, Any]]: """从源表读取指定列,返回小写键的字典列表。""" schema, name = self._split_table_name(table, default_schema="billiards_ods") cols_sql = ", ".join(f'"{c}"' for c in columns) sql = f'SELECT {cols_sql} FROM "{schema}"."{name}" {where_sql}' cur.execute(sql, params or []) rows = [] for r in cur.fetchall(): rows.append({k.lower(): v for k, v in r.items()}) return rows def _expand_goods_category_rows(self, rows: list[Dict[str, Any]]) -> list[Dict[str, Any]]: """将分类表中的 categoryboxes 元素展开为子类记录。""" expanded: list[Dict[str, Any]] = [] for r in rows: expanded.append(r) boxes = r.get("categoryboxes") if isinstance(boxes, list): for child in boxes: if not isinstance(child, dict): continue child_row: Dict[str, Any] = {} # 继承父级的租户与业务大类信息 child_row["tenant_id"] = r.get("tenant_id") child_row["business_name"] = child.get("business_name", r.get("business_name")) child_row["tenant_goods_business_id"] = child.get( "tenant_goods_business_id", r.get("tenant_goods_business_id") ) # 合并子类字段 child_row.update(child) # 默认父子关系 child_row.setdefault("pid", r.get("id")) # 衍生层级/叶子标记 child_boxes = child_row.get("categoryboxes") if not isinstance(child_boxes, list): is_leaf = 1 else: is_leaf = 1 if len(child_boxes) == 0 else 0 child_row.setdefault("category_level", 2) child_row.setdefault("is_leaf", is_leaf) expanded.append(child_row) return expanded def _merge_dim_scd2( self, cur, dwd_table: str, ods_table: str, dwd_cols: Sequence[str], ods_cols: Sequence[str], now: datetime, ) -> int: """对维表执行 SCD2 合并:对比变更关闭旧版并插入新版。""" pk_cols = self._get_primary_keys(cur, dwd_table) if not pk_cols: raise ValueError(f"{dwd_table} 未配置主键,无法执行 SCD2 合并") mapping = self._build_column_mapping(dwd_table, pk_cols, ods_cols) ods_set = {c.lower() for c in ods_cols} table_sql = self._format_table(ods_table, "billiards_ods") # 构造 SELECT 表达式,支持 JSON/expression 映射 select_exprs: list[str] = [] added: set[str] = set() for col in dwd_cols: lc = col.lower() if lc in self.SCD_COLS: continue if lc in mapping: src, cast_type = mapping[lc] select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"") added.add(lc) elif lc in ods_set: select_exprs.append(f'"{lc}" AS "{lc}"') added.add(lc) # 分类维度需要额外读取 categoryboxes 以展开子类 if dwd_table == "billiards_dwd.dim_goods_category" and "categoryboxes" not in added and "categoryboxes" in ods_set: select_exprs.append('"categoryboxes" AS "categoryboxes"') added.add("categoryboxes") # 主键兜底确保被选出 for pk in pk_cols: lc = pk.lower() if lc not in added: if lc in mapping: src, cast_type = mapping[lc] select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"") elif lc in ods_set: select_exprs.append(f'"{lc}" AS "{lc}"') added.add(lc) if not select_exprs: return 0 sql = f"SELECT {', '.join(select_exprs)} FROM {table_sql}" cur.execute(sql) rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()] # 特殊:分类维度展开子类 if dwd_table == "billiards_dwd.dim_goods_category": rows = self._expand_goods_category_rows(rows) inserted_or_updated = 0 seen_pk = set() for row in rows: mapped_row: Dict[str, Any] = {} for col in dwd_cols: lc = col.lower() if lc in self.SCD_COLS: continue value = row.get(lc) if value is None and lc in mapping: src, _ = mapping[lc] value = row.get(src.lower()) mapped_row[lc] = value pk_key = tuple(mapped_row.get(pk) for pk in pk_cols) if pk_key in seen_pk: continue seen_pk.add(pk_key) if self._upsert_scd2_row(cur, dwd_table, dwd_cols, pk_cols, mapped_row, now): inserted_or_updated += 1 return len(rows) def _upsert_scd2_row( self, cur, dwd_table: str, dwd_cols: Sequence[str], pk_cols: Sequence[str], src_row: Dict[str, Any], now: datetime, ) -> bool: """SCD2 合并:若有变更则关闭旧版并插入新版本。""" pk_values = [src_row.get(pk) for pk in pk_cols] if any(v is None for v in pk_values): self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_values))) return False where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols) table_sql = self._format_table(dwd_table, "billiards_dwd") cur.execute( f"SELECT * FROM {table_sql} WHERE {where_clause} AND COALESCE(scd2_is_current,1)=1 LIMIT 1", pk_values, ) current = cur.fetchone() if current: current = {k.lower(): v for k, v in current.items()} if current and not self._is_row_changed(current, src_row, dwd_cols): return False if current: version = (current.get("scd2_version") or 1) + 1 self._close_current_dim(cur, dwd_table, pk_cols, pk_values, now) else: version = 1 self._insert_dim_row(cur, dwd_table, dwd_cols, src_row, now, version) return True def _close_current_dim(self, cur, table: str, pk_cols: Sequence[str], pk_values: Sequence[Any], now: datetime) -> None: """关闭当前版本,标记 scd2_is_current=0 并填充结束时间。""" set_sql = "scd2_end_time = %s, scd2_is_current = 0" where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols) table_sql = self._format_table(table, "billiards_dwd") cur.execute(f"UPDATE {table_sql} SET {set_sql} WHERE {where_clause} AND COALESCE(scd2_is_current,1)=1", [now, *pk_values]) def _insert_dim_row( self, cur, table: str, dwd_cols: Sequence[str], src_row: Dict[str, Any], now: datetime, version: int, ) -> None: """插入新的 SCD2 版本行。""" insert_cols: List[str] = [] placeholders: List[str] = [] values: List[Any] = [] for col in sorted(dwd_cols): lc = col.lower() insert_cols.append(f'"{lc}"') placeholders.append("%s") if lc == "scd2_start_time": values.append(now) elif lc == "scd2_end_time": values.append(datetime(9999, 12, 31, 0, 0, 0)) elif lc == "scd2_is_current": values.append(1) elif lc == "scd2_version": values.append(version) else: values.append(src_row.get(lc)) table_sql = self._format_table(table, "billiards_dwd") sql = f'INSERT INTO {table_sql} ({", ".join(insert_cols)}) VALUES ({", ".join(placeholders)})' cur.execute(sql, values) def _is_row_changed(self, current: Dict[str, Any], incoming: Dict[str, Any], dwd_cols: Sequence[str]) -> bool: """比较非 SCD2 列,判断是否存在变更。""" for col in dwd_cols: lc = col.lower() if lc in self.SCD_COLS: continue if current.get(lc) != incoming.get(lc): return True return False def _merge_fact_increment( self, cur, dwd_table: str, ods_table: str, dwd_cols: Sequence[str], ods_cols: Sequence[str], dwd_types: Dict[str, str], ods_types: Dict[str, str], ) -> int: """事实表按时间增量插入,默认按列名交集写入。""" mapping_entries = self.FACT_MAPPINGS.get(dwd_table) or [] mapping: Dict[str, tuple[str, str | None]] = { dst.lower(): (src, cast_type) for dst, src, cast_type in mapping_entries } mapping_dest = [dst for dst, _, _ in mapping_entries] insert_cols: List[str] = list(mapping_dest) for col in dwd_cols: if col in self.SCD_COLS: continue if col in insert_cols: continue if col in ods_cols: insert_cols.append(col) pk_cols = self._get_primary_keys(cur, dwd_table) ods_set = {c.lower() for c in ods_cols} existing_lower = [c.lower() for c in insert_cols] for pk in pk_cols: pk_lower = pk.lower() if pk_lower in existing_lower: continue if pk_lower in ods_set: insert_cols.append(pk) existing_lower.append(pk_lower) elif "id" in ods_set: insert_cols.append(pk) existing_lower.append(pk_lower) mapping[pk_lower] = ("id", None) # 保持列顺序同时去重 seen_cols: set[str] = set() ordered_cols: list[str] = [] for col in insert_cols: lc = col.lower() if lc not in seen_cols: seen_cols.add(lc) ordered_cols.append(col) insert_cols = ordered_cols if not insert_cols: self.logger.warning("跳过 %s:未找到可插入的列", dwd_table) return 0 order_col = self._pick_order_column(dwd_cols, ods_cols) where_sql = "" params: List[Any] = [] dwd_table_sql = self._format_table(dwd_table, "billiards_dwd") ods_table_sql = self._format_table(ods_table, "billiards_ods") if order_col: cur.execute(f'SELECT COALESCE(MAX("{order_col}"), %s) FROM {dwd_table_sql}', ("1970-01-01",)) row = cur.fetchone() or {} watermark = list(row.values())[0] if row else "1970-01-01" where_sql = f'WHERE "{order_col}" > %s' params.append(watermark) default_cols = [c for c in insert_cols if c.lower() not in mapping] default_expr_map: Dict[str, str] = {} if default_cols: default_exprs = self._build_fact_select_exprs(default_cols, dwd_types, ods_types) default_expr_map = dict(zip(default_cols, default_exprs)) select_exprs: List[str] = [] for col in insert_cols: key = col.lower() if key in mapping: src, cast_type = mapping[key] select_exprs.append(self._cast_expr(src, cast_type)) else: select_exprs.append(default_expr_map[col]) select_cols_sql = ", ".join(select_exprs) insert_cols_sql = ", ".join(f'"{c}"' for c in insert_cols) sql = f'INSERT INTO {dwd_table_sql} ({insert_cols_sql}) SELECT {select_cols_sql} FROM {ods_table_sql} {where_sql}' pk_cols = self._get_primary_keys(cur, dwd_table) if pk_cols: pk_sql = ", ".join(f'"{c}"' for c in pk_cols) sql += f" ON CONFLICT ({pk_sql}) DO NOTHING" cur.execute(sql, params) return cur.rowcount def _pick_order_column(self, dwd_cols: Iterable[str], ods_cols: Iterable[str]) -> str | None: """选择用于增量的时间列(需同时存在于 DWD 与 ODS)。""" lower_cols = {c.lower() for c in dwd_cols} & {c.lower() for c in ods_cols} for candidate in self.FACT_ORDER_CANDIDATES: if candidate.lower() in lower_cols: return candidate.lower() return None def _build_fact_select_exprs( self, insert_cols: Sequence[str], dwd_types: Dict[str, str], ods_types: Dict[str, str], ) -> List[str]: """构造事实表 SELECT 列表,需要时做类型转换。""" numeric_types = {"integer", "bigint", "smallint", "numeric", "double precision", "real", "decimal"} text_types = {"text", "character varying", "varchar"} exprs = [] for col in insert_cols: d_type = dwd_types.get(col) o_type = ods_types.get(col) if d_type in numeric_types and o_type in text_types: exprs.append(f"CAST(NULLIF(CAST(\"{col}\" AS text), '') AS numeric):: {d_type}") else: exprs.append(f'"{col}"') return exprs def _split_table_name(self, name: str, default_schema: str) -> tuple[str, str]: """拆分 schema.table,若无 schema 则补默认 schema。""" parts = name.split(".") if len(parts) == 2: return parts[0], parts[1].lower() return default_schema, name.lower() def _table_base(self, name: str) -> str: """获取不含 schema 的表名。""" return name.split(".")[-1] def _format_table(self, name: str, default_schema: str) -> str: """返回带引号的 schema.table 名称。""" schema, table = self._split_table_name(name, default_schema) return f'"{schema}"."{table}"' def _cast_expr(self, col: str, cast_type: str | None) -> str: """构造带可选 CAST 的列表达式。""" if col.upper() == "NULL": base = "NULL" else: is_expr = not col.isidentifier() or "->" in col or "#>>" in col or "::" in col or "'" in col base = col if is_expr else f'"{col}"' if cast_type: cast_lower = cast_type.lower() if cast_lower in {"bigint", "integer", "numeric", "decimal"}: return f"CAST(NULLIF(CAST({base} AS text), '') AS numeric):: {cast_type}" if cast_lower == "timestamptz": return f"({base})::timestamptz" return f"{base}::{cast_type}" return base