Files
feiqiu-ETL/etl_billiards/tasks/dwd_load_task.py
2025-12-09 04:57:05 +08:00

908 lines
42 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""DWD 装载任务:从 ODS 增量写入 DWD维度 SCD2事实按时间增量"""
from __future__ import annotations
from datetime import datetime
from typing import Any, Dict, Iterable, List, Sequence
from psycopg2.extras import RealDictCursor
from .base_task import BaseTask, TaskContext
class DwdLoadTask(BaseTask):
"""负责 DWD 装载:维度表做 SCD2 合并,事实表按时间增量写入。"""
# DWD -> ODS 表映射ODS 表名已与示例 JSON 前缀统一)
TABLE_MAP: dict[str, str] = {
# 维度
# 门店:改用台费流水中的 siteprofile 快照,补齐 org/地址等字段
"billiards_dwd.dim_site": "billiards_ods.table_fee_transactions",
"billiards_dwd.dim_site_ex": "billiards_ods.table_fee_transactions",
"billiards_dwd.dim_table": "billiards_ods.site_tables_master",
"billiards_dwd.dim_table_ex": "billiards_ods.site_tables_master",
"billiards_dwd.dim_assistant": "billiards_ods.assistant_accounts_master",
"billiards_dwd.dim_assistant_ex": "billiards_ods.assistant_accounts_master",
"billiards_dwd.dim_member": "billiards_ods.member_profiles",
"billiards_dwd.dim_member_ex": "billiards_ods.member_profiles",
"billiards_dwd.dim_member_card_account": "billiards_ods.member_stored_value_cards",
"billiards_dwd.dim_member_card_account_ex": "billiards_ods.member_stored_value_cards",
"billiards_dwd.dim_tenant_goods": "billiards_ods.tenant_goods_master",
"billiards_dwd.dim_tenant_goods_ex": "billiards_ods.tenant_goods_master",
"billiards_dwd.dim_store_goods": "billiards_ods.store_goods_master",
"billiards_dwd.dim_store_goods_ex": "billiards_ods.store_goods_master",
"billiards_dwd.dim_goods_category": "billiards_ods.stock_goods_category_tree",
"billiards_dwd.dim_groupbuy_package": "billiards_ods.group_buy_packages",
"billiards_dwd.dim_groupbuy_package_ex": "billiards_ods.group_buy_packages",
# 事实
"billiards_dwd.dwd_settlement_head": "billiards_ods.settlement_records",
"billiards_dwd.dwd_settlement_head_ex": "billiards_ods.settlement_records",
"billiards_dwd.dwd_table_fee_log": "billiards_ods.table_fee_transactions",
"billiards_dwd.dwd_table_fee_log_ex": "billiards_ods.table_fee_transactions",
"billiards_dwd.dwd_table_fee_adjust": "billiards_ods.table_fee_discount_records",
"billiards_dwd.dwd_table_fee_adjust_ex": "billiards_ods.table_fee_discount_records",
"billiards_dwd.dwd_store_goods_sale": "billiards_ods.store_goods_sales_records",
"billiards_dwd.dwd_store_goods_sale_ex": "billiards_ods.store_goods_sales_records",
"billiards_dwd.dwd_assistant_service_log": "billiards_ods.assistant_service_records",
"billiards_dwd.dwd_assistant_service_log_ex": "billiards_ods.assistant_service_records",
"billiards_dwd.dwd_assistant_trash_event": "billiards_ods.assistant_cancellation_records",
"billiards_dwd.dwd_assistant_trash_event_ex": "billiards_ods.assistant_cancellation_records",
"billiards_dwd.dwd_member_balance_change": "billiards_ods.member_balance_changes",
"billiards_dwd.dwd_member_balance_change_ex": "billiards_ods.member_balance_changes",
"billiards_dwd.dwd_groupbuy_redemption": "billiards_ods.group_buy_redemption_records",
"billiards_dwd.dwd_groupbuy_redemption_ex": "billiards_ods.group_buy_redemption_records",
"billiards_dwd.dwd_platform_coupon_redemption": "billiards_ods.platform_coupon_redemption_records",
"billiards_dwd.dwd_platform_coupon_redemption_ex": "billiards_ods.platform_coupon_redemption_records",
"billiards_dwd.dwd_recharge_order": "billiards_ods.recharge_settlements",
"billiards_dwd.dwd_recharge_order_ex": "billiards_ods.recharge_settlements",
"billiards_dwd.dwd_payment": "billiards_ods.payment_transactions",
"billiards_dwd.dwd_refund": "billiards_ods.refund_transactions",
"billiards_dwd.dwd_refund_ex": "billiards_ods.refund_transactions",
}
SCD_COLS = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
FACT_ORDER_CANDIDATES = [
"fetched_at",
"pay_time",
"create_time",
"update_time",
"occur_time",
"settle_time",
"start_use_time",
]
# 特殊列映射dwd 列名 -> 源列表达式(可选 CAST
FACT_MAPPINGS: dict[str, list[tuple[str, str, str | None]]] = {
# 维度表(补齐主键/字段差异)
"billiards_dwd.dim_site": [
("org_id", "siteprofile->>'org_id'", None),
("shop_name", "siteprofile->>'shop_name'", None),
("site_label", "siteprofile->>'site_label'", None),
("full_address", "siteprofile->>'full_address'", None),
("address", "siteprofile->>'address'", None),
("longitude", "siteprofile->>'longitude'", "numeric"),
("latitude", "siteprofile->>'latitude'", "numeric"),
("tenant_site_region_id", "siteprofile->>'tenant_site_region_id'", None),
("business_tel", "siteprofile->>'business_tel'", None),
("site_type", "siteprofile->>'site_type'", None),
("shop_status", "siteprofile->>'shop_status'", None),
("tenant_id", "siteprofile->>'tenant_id'", None),
],
"billiards_dwd.dim_site_ex": [
("auto_light", "siteprofile->>'auto_light'", None),
("attendance_enabled", "siteprofile->>'attendance_enabled'", None),
("attendance_distance", "siteprofile->>'attendance_distance'", None),
("prod_env", "siteprofile->>'prod_env'", None),
("light_status", "siteprofile->>'light_status'", None),
("light_type", "siteprofile->>'light_type'", None),
("light_token", "siteprofile->>'light_token'", None),
("address", "siteprofile->>'address'", None),
("avatar", "siteprofile->>'avatar'", None),
("wifi_name", "siteprofile->>'wifi_name'", None),
("wifi_password", "siteprofile->>'wifi_password'", None),
("customer_service_qrcode", "siteprofile->>'customer_service_qrcode'", None),
("customer_service_wechat", "siteprofile->>'customer_service_wechat'", None),
("fixed_pay_qrcode", "siteprofile->>'fixed_pay_qrCode'", None),
("longitude", "siteprofile->>'longitude'", "numeric"),
("latitude", "siteprofile->>'latitude'", "numeric"),
("tenant_site_region_id", "siteprofile->>'tenant_site_region_id'", None),
("site_type", "siteprofile->>'site_type'", None),
("site_label", "siteprofile->>'site_label'", None),
("shop_status", "siteprofile->>'shop_status'", None),
("create_time", "siteprofile->>'create_time'", "timestamptz"),
("update_time", "siteprofile->>'update_time'", "timestamptz"),
],
"billiards_dwd.dim_table": [
("table_id", "id", None),
("site_table_area_name", "areaname", None),
("tenant_table_area_id", "site_table_area_id", None),
],
"billiards_dwd.dim_table_ex": [
("table_id", "id", None),
("table_cloth_use_time", "table_cloth_use_time", None),
],
"billiards_dwd.dim_assistant": [("assistant_id", "id", None), ("user_id", "staff_id", None)],
"billiards_dwd.dim_assistant_ex": [
("assistant_id", "id", None),
("introduce", "introduce", None),
("group_name", "group_name", None),
("light_equipment_id", "light_equipment_id", None),
],
"billiards_dwd.dim_member": [("member_id", "id", None)],
"billiards_dwd.dim_member_ex": [
("member_id", "id", None),
("register_site_name", "site_name", None),
],
"billiards_dwd.dim_member_card_account": [("member_card_id", "id", None)],
"billiards_dwd.dim_member_card_account_ex": [
("member_card_id", "id", None),
("tenant_name", "tenantname", None),
("tenantavatar", "tenantavatar", None),
("card_no", "card_no", None),
("bind_password", "bind_password", None),
("use_scene", "use_scene", None),
("tableareaid", "tableareaid", None),
("goodscategoryid", "goodscategoryid", None),
],
"billiards_dwd.dim_tenant_goods": [
("tenant_goods_id", "id", None),
("category_name", "categoryname", None),
],
"billiards_dwd.dim_tenant_goods_ex": [
("tenant_goods_id", "id", None),
("remark_name", "remark_name", None),
("goods_bar_code", "goods_bar_code", None),
("commodity_code_list", "commodity_code", None),
("is_in_site", "isinsite", "boolean"),
],
"billiards_dwd.dim_store_goods": [
("site_goods_id", "id", None),
("category_level1_name", "onecategoryname", None),
("category_level2_name", "twocategoryname", None),
("created_at", "create_time", None),
("updated_at", "update_time", None),
("avg_monthly_sales", "average_monthly_sales", None),
("batch_stock_qty", "stock", None),
("sale_qty", "sale_num", None),
("total_sales_qty", "total_sales", None),
],
"billiards_dwd.dim_store_goods_ex": [
("site_goods_id", "id", None),
("goods_barcode", "goods_bar_code", None),
("stock_qty", "stock", None),
("stock_secondary_qty", "stock_a", None),
("safety_stock_qty", "safe_stock", None),
("site_name", "sitename", None),
("goods_cover_url", "goods_cover", None),
("provisional_total_cost", "total_purchase_cost", None),
("is_discountable", "able_discount", None),
("freeze_status", "freeze", None),
("remark", "remark", None),
("days_on_shelf", "days_available", None),
("sort_order", "sort", None),
],
"billiards_dwd.dim_goods_category": [
("category_id", "id", None),
("tenant_id", "tenant_id", None),
("category_name", "category_name", None),
("alias_name", "alias_name", None),
("parent_category_id", "pid", None),
("business_name", "business_name", None),
("tenant_goods_business_id", "tenant_goods_business_id", None),
("sort_order", "sort", None),
("open_salesman", "open_salesman", None),
("is_warehousing", "is_warehousing", None),
("category_level", "CASE WHEN pid = 0 THEN 1 ELSE 2 END", None),
("is_leaf", "CASE WHEN categoryboxes IS NULL OR jsonb_array_length(categoryboxes)=0 THEN 1 ELSE 0 END", None),
],
"billiards_dwd.dim_groupbuy_package": [
("groupbuy_package_id", "id", None),
("package_template_id", "package_id", None),
("coupon_face_value", "coupon_money", None),
("duration_seconds", "duration", None),
],
"billiards_dwd.dim_groupbuy_package_ex": [
("groupbuy_package_id", "id", None),
("table_area_id", "table_area_id", None),
("tenant_table_area_id", "tenant_table_area_id", None),
("usable_range", "usable_range", None),
("table_area_id_list", "table_area_id_list", None),
("package_type", "type", None),
],
# 事实表主键及关键差异列
"billiards_dwd.dwd_table_fee_log": [("table_fee_log_id", "id", None)],
"billiards_dwd.dwd_table_fee_log_ex": [
("table_fee_log_id", "id", None),
("salesman_name", "salesman_name", None),
],
"billiards_dwd.dwd_table_fee_adjust": [
("table_fee_adjust_id", "id", None),
("table_id", "site_table_id", None),
("table_area_id", "tenant_table_area_id", None),
("table_area_name", "tableprofile->>'table_area_name'", None),
("adjust_time", "create_time", None),
],
"billiards_dwd.dwd_table_fee_adjust_ex": [
("table_fee_adjust_id", "id", None),
("ledger_name", "ledger_name", None),
],
"billiards_dwd.dwd_store_goods_sale": [("store_goods_sale_id", "id", None), ("discount_price", "discount_money", None)],
"billiards_dwd.dwd_store_goods_sale_ex": [
("store_goods_sale_id", "id", None),
("option_value_name", "option_value_name", None),
("open_salesman_flag", "opensalesman", "integer"),
("salesman_name", "salesman_name", None),
("salesman_org_id", "sales_man_org_id", None),
("legacy_order_goods_id", "ordergoodsid", None),
("site_name", "sitename", None),
("legacy_site_id", "siteid", None),
],
"billiards_dwd.dwd_assistant_service_log": [
("assistant_service_id", "id", None),
("assistant_no", "assistantno", None),
("site_assistant_id", "order_assistant_id", None),
("level_name", "levelname", None),
("skill_name", "skillname", None),
],
"billiards_dwd.dwd_assistant_service_log_ex": [
("assistant_service_id", "id", None),
("assistant_name", "assistantname", None),
("ledger_group_name", "ledger_group_name", None),
("trash_applicant_name", "trash_applicant_name", None),
("trash_reason", "trash_reason", None),
("salesman_name", "salesman_name", None),
("table_name", "tablename", None),
],
"billiards_dwd.dwd_assistant_trash_event": [
("assistant_trash_event_id", "id", None),
("assistant_no", "assistantname", None),
("abolish_amount", "assistantabolishamount", None),
("charge_minutes_raw", "pdchargeminutes", None),
("site_id", "siteid", None),
("table_id", "tableid", None),
("table_area_id", "tableareaid", None),
("assistant_name", "assistantname", None),
("trash_reason", "trashreason", None),
("create_time", "createtime", None),
],
"billiards_dwd.dwd_assistant_trash_event_ex": [
("assistant_trash_event_id", "id", None),
("table_area_name", "tablearea", None),
("table_name", "tablename", None),
],
"billiards_dwd.dwd_member_balance_change": [
("balance_change_id", "id", None),
("balance_before", "before", None),
("change_amount", "account_data", None),
("balance_after", "after", None),
("card_type_name", "membercardtypename", None),
("change_time", "create_time", None),
("member_name", "membername", None),
("member_mobile", "membermobile", None),
],
"billiards_dwd.dwd_member_balance_change_ex": [
("balance_change_id", "id", None),
("pay_site_name", "paysitename", None),
("register_site_name", "registersitename", None),
],
"billiards_dwd.dwd_groupbuy_redemption": [("redemption_id", "id", None)],
"billiards_dwd.dwd_groupbuy_redemption_ex": [
("redemption_id", "id", None),
("table_area_name", "tableareaname", None),
("site_name", "sitename", None),
("table_name", "tablename", None),
("goods_option_price", "goodsoptionprice", None),
("salesman_name", "salesman_name", None),
("salesman_org_id", "sales_man_org_id", None),
("ledger_group_name", "ledger_group_name", None),
],
"billiards_dwd.dwd_platform_coupon_redemption": [("platform_coupon_redemption_id", "id", None)],
"billiards_dwd.dwd_platform_coupon_redemption_ex": [
("platform_coupon_redemption_id", "id", None),
("coupon_cover", "coupon_cover", None),
],
"billiards_dwd.dwd_payment": [("payment_id", "id", None), ("pay_date", "pay_time", "date")],
"billiards_dwd.dwd_refund": [("refund_id", "id", None)],
"billiards_dwd.dwd_refund_ex": [
("refund_id", "id", None),
("tenant_name", "tenantname", None),
("channel_payer_id", "channel_payer_id", None),
("channel_pay_no", "channel_pay_no", None),
],
# 结算头settlement_records源列为小写驼峰/无下划线,需要显式映射)
"billiards_dwd.dwd_settlement_head": [
("order_settle_id", "id", None),
("tenant_id", "tenantid", None),
("site_id", "siteid", None),
("site_name", "sitename", None),
("table_id", "tableid", None),
("settle_name", "settlename", None),
("order_trade_no", "settlerelateid", None),
("create_time", "createtime", None),
("pay_time", "paytime", None),
("settle_type", "settletype", None),
("revoke_order_id", "revokeorderid", None),
("member_id", "memberid", None),
("member_name", "membername", None),
("member_phone", "memberphone", None),
("member_card_account_id", "tenantmembercardid", None),
("member_card_type_name", "membercardtypename", None),
("is_bind_member", "isbindmember", None),
("member_discount_amount", "memberdiscountamount", None),
("consume_money", "consumemoney", None),
("table_charge_money", "tablechargemoney", None),
("goods_money", "goodsmoney", None),
("real_goods_money", "realgoodsmoney", None),
("assistant_pd_money", "assistantpdmoney", None),
("assistant_cx_money", "assistantcxmoney", None),
("adjust_amount", "adjustamount", None),
("pay_amount", "payamount", None),
("balance_amount", "balanceamount", None),
("recharge_card_amount", "rechargecardamount", None),
("gift_card_amount", "giftcardamount", None),
("coupon_amount", "couponamount", None),
("rounding_amount", "roundingamount", None),
("point_amount", "pointamount", None),
],
"billiards_dwd.dwd_settlement_head_ex": [
("order_settle_id", "id", None),
("serial_number", "serialnumber", None),
("settle_status", "settlestatus", None),
("can_be_revoked", "canberevoked", "boolean"),
("revoke_order_name", "revokeordername", None),
("revoke_time", "revoketime", None),
("is_first_order", "isfirst", "boolean"),
("service_money", "servicemoney", None),
("cash_amount", "cashamount", None),
("card_amount", "cardamount", None),
("online_amount", "onlineamount", None),
("refund_amount", "refundamount", None),
("prepay_money", "prepaymoney", None),
("payment_method", "paymentmethod", None),
("coupon_sale_amount", "couponsaleamount", None),
("all_coupon_discount", "allcoupondiscount", None),
("goods_promotion_money", "goodspromotionmoney", None),
("assistant_promotion_money", "assistantpromotionmoney", None),
("activity_discount", "activitydiscount", None),
("assistant_manual_discount", "assistantmanualdiscount", None),
("point_discount_price", "pointdiscountprice", None),
("point_discount_cost", "pointdiscountcost", None),
("is_use_coupon", "isusecoupon", "boolean"),
("is_use_discount", "isusediscount", "boolean"),
("is_activity", "isactivity", "boolean"),
("operator_name", "operatorname", None),
("salesman_name", "salesmanname", None),
("order_remark", "orderremark", None),
("operator_id", "operatorid", None),
("salesman_user_id", "salesmanuserid", None),
],
# 充值结算recharge_settlements字段风格同 settlement_records
"billiards_dwd.dwd_recharge_order": [
("recharge_order_id", "id", None),
("tenant_id", "tenantid", None),
("site_id", "siteid", None),
("member_id", "memberid", None),
("member_name_snapshot", "membername", None),
("member_phone_snapshot", "memberphone", None),
("tenant_member_card_id", "tenantmembercardid", None),
("member_card_type_name", "membercardtypename", None),
("settle_relate_id", "settlerelateid", None),
("settle_type", "settletype", None),
("settle_name", "settlename", None),
("is_first", "isfirst", None),
("pay_amount", "payamount", None),
("refund_amount", "refundamount", None),
("point_amount", "pointamount", None),
("cash_amount", "cashamount", None),
("payment_method", "paymentmethod", None),
("create_time", "createtime", None),
("pay_time", "paytime", None),
],
"billiards_dwd.dwd_recharge_order_ex": [
("recharge_order_id", "id", None),
("site_name_snapshot", "sitename", None),
("salesman_name", "salesmanname", None),
("order_remark", "orderremark", None),
("revoke_order_name", "revokeordername", None),
("settle_status", "settlestatus", None),
("is_bind_member", "isbindmember", "boolean"),
("is_activity", "isactivity", "boolean"),
("is_use_coupon", "isusecoupon", "boolean"),
("is_use_discount", "isusediscount", "boolean"),
("can_be_revoked", "canberevoked", "boolean"),
("online_amount", "onlineamount", None),
("balance_amount", "balanceamount", None),
("card_amount", "cardamount", None),
("coupon_amount", "couponamount", None),
("recharge_card_amount", "rechargecardamount", None),
("gift_card_amount", "giftcardamount", None),
("prepay_money", "prepaymoney", None),
("consume_money", "consumemoney", None),
("goods_money", "goodsmoney", None),
("real_goods_money", "realgoodsmoney", None),
("table_charge_money", "tablechargemoney", None),
("service_money", "servicemoney", None),
("activity_discount", "activitydiscount", None),
("all_coupon_discount", "allcoupondiscount", None),
("goods_promotion_money", "goodspromotionmoney", None),
("assistant_promotion_money", "assistantpromotionmoney", None),
("assistant_pd_money", "assistantpdmoney", None),
("assistant_cx_money", "assistantcxmoney", None),
("assistant_manual_discount", "assistantmanualdiscount", None),
("coupon_sale_amount", "couponsaleamount", None),
("member_discount_amount", "memberdiscountamount", None),
("point_discount_price", "pointdiscountprice", None),
("point_discount_cost", "pointdiscountcost", None),
("adjust_amount", "adjustamount", None),
("rounding_amount", "roundingamount", None),
("operator_id", "operatorid", None),
("operator_name_snapshot", "operatorname", None),
("salesman_user_id", "salesmanuserid", None),
("salesman_name", "salesmanname", None),
("order_remark", "orderremark", None),
("table_id", "tableid", None),
("serial_number", "serialnumber", None),
("revoke_order_id", "revokeorderid", None),
("revoke_order_name", "revokeordername", None),
("revoke_time", "revoketime", None),
],
}
def get_task_code(self) -> str:
"""返回任务编码。"""
return "DWD_LOAD_FROM_ODS"
def extract(self, context: TaskContext) -> dict[str, Any]:
"""准备运行所需的上下文信息。"""
return {"now": datetime.now()}
def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
"""遍历映射关系,维度执行 SCD2 合并,事实表按时间增量插入。"""
now = extracted["now"]
summary: List[Dict[str, Any]] = []
with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
for dwd_table, ods_table in self.TABLE_MAP.items():
dwd_cols = self._get_columns(cur, dwd_table)
ods_cols = self._get_columns(cur, ods_table)
if not dwd_cols:
self.logger.warning("跳过 %s,未能获取 DWD 列信息", dwd_table)
continue
if self._table_base(dwd_table).startswith("dim_"):
processed = self._merge_dim_scd2(cur, dwd_table, ods_table, dwd_cols, ods_cols, now)
summary.append({"table": dwd_table, "mode": "SCD2", "processed": processed})
else:
dwd_types = self._get_column_types(cur, dwd_table, "billiards_dwd")
ods_types = self._get_column_types(cur, ods_table, "billiards_ods")
inserted = self._merge_fact_increment(
cur, dwd_table, ods_table, dwd_cols, ods_cols, dwd_types, ods_types
)
summary.append({"table": dwd_table, "mode": "INCREMENT", "inserted": inserted})
self.db.conn.commit()
return {"tables": summary}
# ---------------------- helpers ----------------------
def _get_columns(self, cur, table: str) -> List[str]:
"""获取指定表的列名(小写)。"""
schema, name = self._split_table_name(table, default_schema="billiards_dwd")
cur.execute(
"""
SELECT column_name
FROM information_schema.columns
WHERE table_schema = %s AND table_name = %s
""",
(schema, name),
)
return [r["column_name"].lower() for r in cur.fetchall()]
def _get_primary_keys(self, cur, table: str) -> List[str]:
"""获取表的主键列名列表。"""
schema, name = self._split_table_name(table, default_schema="billiards_dwd")
cur.execute(
"""
SELECT kcu.column_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu
ON tc.constraint_name = kcu.constraint_name
AND tc.table_schema = kcu.table_schema
AND tc.table_name = kcu.table_name
WHERE tc.table_schema = %s
AND tc.table_name = %s
AND tc.constraint_type = 'PRIMARY KEY'
ORDER BY kcu.ordinal_position
""",
(schema, name),
)
return [r["column_name"].lower() for r in cur.fetchall()]
def _get_column_types(self, cur, table: str, default_schema: str) -> Dict[str, str]:
"""获取列的数据类型information_schema.data_type"""
schema, name = self._split_table_name(table, default_schema=default_schema)
cur.execute(
"""
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_schema = %s AND table_name = %s
""",
(schema, name),
)
return {r["column_name"].lower(): r["data_type"].lower() for r in cur.fetchall()}
def _build_column_mapping(
self, dwd_table: str, pk_cols: Sequence[str], ods_cols: Sequence[str]
) -> Dict[str, tuple[str, str | None]]:
"""合并显式 FACT_MAPPINGS 与主键兜底映射。"""
mapping_entries = self.FACT_MAPPINGS.get(dwd_table, [])
mapping: Dict[str, tuple[str, str | None]] = {
dst.lower(): (src, cast_type) for dst, src, cast_type in mapping_entries
}
ods_set = {c.lower() for c in ods_cols}
for pk in pk_cols:
pk_lower = pk.lower()
if pk_lower not in mapping and pk_lower not in ods_set and "id" in ods_set:
mapping[pk_lower] = ("id", None)
return mapping
def _fetch_source_rows(
self, cur, table: str, columns: Sequence[str], where_sql: str = "", params: Sequence[Any] = None
) -> List[Dict[str, Any]]:
"""从源表读取指定列,返回小写键的字典列表。"""
schema, name = self._split_table_name(table, default_schema="billiards_ods")
cols_sql = ", ".join(f'"{c}"' for c in columns)
sql = f'SELECT {cols_sql} FROM "{schema}"."{name}" {where_sql}'
cur.execute(sql, params or [])
rows = []
for r in cur.fetchall():
rows.append({k.lower(): v for k, v in r.items()})
return rows
def _expand_goods_category_rows(self, rows: list[Dict[str, Any]]) -> list[Dict[str, Any]]:
"""将分类表中的 categoryboxes 元素展开为子类记录。"""
expanded: list[Dict[str, Any]] = []
for r in rows:
expanded.append(r)
boxes = r.get("categoryboxes")
if isinstance(boxes, list):
for child in boxes:
if not isinstance(child, dict):
continue
child_row: Dict[str, Any] = {}
# 继承父级的租户与业务大类信息
child_row["tenant_id"] = r.get("tenant_id")
child_row["business_name"] = child.get("business_name", r.get("business_name"))
child_row["tenant_goods_business_id"] = child.get(
"tenant_goods_business_id", r.get("tenant_goods_business_id")
)
# 合并子类字段
child_row.update(child)
# 默认父子关系
child_row.setdefault("pid", r.get("id"))
# 衍生层级/叶子标记
child_boxes = child_row.get("categoryboxes")
if not isinstance(child_boxes, list):
is_leaf = 1
else:
is_leaf = 1 if len(child_boxes) == 0 else 0
child_row.setdefault("category_level", 2)
child_row.setdefault("is_leaf", is_leaf)
expanded.append(child_row)
return expanded
def _merge_dim_scd2(
self,
cur,
dwd_table: str,
ods_table: str,
dwd_cols: Sequence[str],
ods_cols: Sequence[str],
now: datetime,
) -> int:
"""对维表执行 SCD2 合并:对比变更关闭旧版并插入新版。"""
pk_cols = self._get_primary_keys(cur, dwd_table)
if not pk_cols:
raise ValueError(f"{dwd_table} 未配置主键,无法执行 SCD2 合并")
mapping = self._build_column_mapping(dwd_table, pk_cols, ods_cols)
ods_set = {c.lower() for c in ods_cols}
table_sql = self._format_table(ods_table, "billiards_ods")
# 构造 SELECT 表达式,支持 JSON/expression 映射
select_exprs: list[str] = []
added: set[str] = set()
for col in dwd_cols:
lc = col.lower()
if lc in self.SCD_COLS:
continue
if lc in mapping:
src, cast_type = mapping[lc]
select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
added.add(lc)
elif lc in ods_set:
select_exprs.append(f'"{lc}" AS "{lc}"')
added.add(lc)
# 分类维度需要额外读取 categoryboxes 以展开子类
if dwd_table == "billiards_dwd.dim_goods_category" and "categoryboxes" not in added and "categoryboxes" in ods_set:
select_exprs.append('"categoryboxes" AS "categoryboxes"')
added.add("categoryboxes")
# 主键兜底确保被选出
for pk in pk_cols:
lc = pk.lower()
if lc not in added:
if lc in mapping:
src, cast_type = mapping[lc]
select_exprs.append(f"{self._cast_expr(src, cast_type)} AS \"{lc}\"")
elif lc in ods_set:
select_exprs.append(f'"{lc}" AS "{lc}"')
added.add(lc)
if not select_exprs:
return 0
sql = f"SELECT {', '.join(select_exprs)} FROM {table_sql}"
cur.execute(sql)
rows = [{k.lower(): v for k, v in r.items()} for r in cur.fetchall()]
# 特殊:分类维度展开子类
if dwd_table == "billiards_dwd.dim_goods_category":
rows = self._expand_goods_category_rows(rows)
inserted_or_updated = 0
seen_pk = set()
for row in rows:
mapped_row: Dict[str, Any] = {}
for col in dwd_cols:
lc = col.lower()
if lc in self.SCD_COLS:
continue
value = row.get(lc)
if value is None and lc in mapping:
src, _ = mapping[lc]
value = row.get(src.lower())
mapped_row[lc] = value
pk_key = tuple(mapped_row.get(pk) for pk in pk_cols)
if pk_key in seen_pk:
continue
seen_pk.add(pk_key)
if self._upsert_scd2_row(cur, dwd_table, dwd_cols, pk_cols, mapped_row, now):
inserted_or_updated += 1
return len(rows)
def _upsert_scd2_row(
self,
cur,
dwd_table: str,
dwd_cols: Sequence[str],
pk_cols: Sequence[str],
src_row: Dict[str, Any],
now: datetime,
) -> bool:
"""SCD2 合并:若有变更则关闭旧版并插入新版本。"""
pk_values = [src_row.get(pk) for pk in pk_cols]
if any(v is None for v in pk_values):
self.logger.warning("跳过 %s:主键缺失 %s", dwd_table, dict(zip(pk_cols, pk_values)))
return False
where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols)
table_sql = self._format_table(dwd_table, "billiards_dwd")
cur.execute(
f"SELECT * FROM {table_sql} WHERE {where_clause} AND COALESCE(scd2_is_current,1)=1 LIMIT 1",
pk_values,
)
current = cur.fetchone()
if current:
current = {k.lower(): v for k, v in current.items()}
if current and not self._is_row_changed(current, src_row, dwd_cols):
return False
if current:
version = (current.get("scd2_version") or 1) + 1
self._close_current_dim(cur, dwd_table, pk_cols, pk_values, now)
else:
version = 1
self._insert_dim_row(cur, dwd_table, dwd_cols, src_row, now, version)
return True
def _close_current_dim(self, cur, table: str, pk_cols: Sequence[str], pk_values: Sequence[Any], now: datetime) -> None:
"""关闭当前版本,标记 scd2_is_current=0 并填充结束时间。"""
set_sql = "scd2_end_time = %s, scd2_is_current = 0"
where_clause = " AND ".join(f'"{pk}" = %s' for pk in pk_cols)
table_sql = self._format_table(table, "billiards_dwd")
cur.execute(f"UPDATE {table_sql} SET {set_sql} WHERE {where_clause} AND COALESCE(scd2_is_current,1)=1", [now, *pk_values])
def _insert_dim_row(
self,
cur,
table: str,
dwd_cols: Sequence[str],
src_row: Dict[str, Any],
now: datetime,
version: int,
) -> None:
"""插入新的 SCD2 版本行。"""
insert_cols: List[str] = []
placeholders: List[str] = []
values: List[Any] = []
for col in sorted(dwd_cols):
lc = col.lower()
insert_cols.append(f'"{lc}"')
placeholders.append("%s")
if lc == "scd2_start_time":
values.append(now)
elif lc == "scd2_end_time":
values.append(datetime(9999, 12, 31, 0, 0, 0))
elif lc == "scd2_is_current":
values.append(1)
elif lc == "scd2_version":
values.append(version)
else:
values.append(src_row.get(lc))
table_sql = self._format_table(table, "billiards_dwd")
sql = f'INSERT INTO {table_sql} ({", ".join(insert_cols)}) VALUES ({", ".join(placeholders)})'
cur.execute(sql, values)
def _is_row_changed(self, current: Dict[str, Any], incoming: Dict[str, Any], dwd_cols: Sequence[str]) -> bool:
"""比较非 SCD2 列,判断是否存在变更。"""
for col in dwd_cols:
lc = col.lower()
if lc in self.SCD_COLS:
continue
if current.get(lc) != incoming.get(lc):
return True
return False
def _merge_fact_increment(
self,
cur,
dwd_table: str,
ods_table: str,
dwd_cols: Sequence[str],
ods_cols: Sequence[str],
dwd_types: Dict[str, str],
ods_types: Dict[str, str],
) -> int:
"""事实表按时间增量插入,默认按列名交集写入。"""
mapping_entries = self.FACT_MAPPINGS.get(dwd_table) or []
mapping: Dict[str, tuple[str, str | None]] = {
dst.lower(): (src, cast_type) for dst, src, cast_type in mapping_entries
}
mapping_dest = [dst for dst, _, _ in mapping_entries]
insert_cols: List[str] = list(mapping_dest)
for col in dwd_cols:
if col in self.SCD_COLS:
continue
if col in insert_cols:
continue
if col in ods_cols:
insert_cols.append(col)
pk_cols = self._get_primary_keys(cur, dwd_table)
ods_set = {c.lower() for c in ods_cols}
existing_lower = [c.lower() for c in insert_cols]
for pk in pk_cols:
pk_lower = pk.lower()
if pk_lower in existing_lower:
continue
if pk_lower in ods_set:
insert_cols.append(pk)
existing_lower.append(pk_lower)
elif "id" in ods_set:
insert_cols.append(pk)
existing_lower.append(pk_lower)
mapping[pk_lower] = ("id", None)
# 保持列顺序同时去重
seen_cols: set[str] = set()
ordered_cols: list[str] = []
for col in insert_cols:
lc = col.lower()
if lc not in seen_cols:
seen_cols.add(lc)
ordered_cols.append(col)
insert_cols = ordered_cols
if not insert_cols:
self.logger.warning("跳过 %s:未找到可插入的列", dwd_table)
return 0
order_col = self._pick_order_column(dwd_cols, ods_cols)
where_sql = ""
params: List[Any] = []
dwd_table_sql = self._format_table(dwd_table, "billiards_dwd")
ods_table_sql = self._format_table(ods_table, "billiards_ods")
if order_col:
cur.execute(f'SELECT COALESCE(MAX("{order_col}"), %s) FROM {dwd_table_sql}', ("1970-01-01",))
row = cur.fetchone() or {}
watermark = list(row.values())[0] if row else "1970-01-01"
where_sql = f'WHERE "{order_col}" > %s'
params.append(watermark)
default_cols = [c for c in insert_cols if c.lower() not in mapping]
default_expr_map: Dict[str, str] = {}
if default_cols:
default_exprs = self._build_fact_select_exprs(default_cols, dwd_types, ods_types)
default_expr_map = dict(zip(default_cols, default_exprs))
select_exprs: List[str] = []
for col in insert_cols:
key = col.lower()
if key in mapping:
src, cast_type = mapping[key]
select_exprs.append(self._cast_expr(src, cast_type))
else:
select_exprs.append(default_expr_map[col])
select_cols_sql = ", ".join(select_exprs)
insert_cols_sql = ", ".join(f'"{c}"' for c in insert_cols)
sql = f'INSERT INTO {dwd_table_sql} ({insert_cols_sql}) SELECT {select_cols_sql} FROM {ods_table_sql} {where_sql}'
pk_cols = self._get_primary_keys(cur, dwd_table)
if pk_cols:
pk_sql = ", ".join(f'"{c}"' for c in pk_cols)
sql += f" ON CONFLICT ({pk_sql}) DO NOTHING"
cur.execute(sql, params)
return cur.rowcount
def _pick_order_column(self, dwd_cols: Iterable[str], ods_cols: Iterable[str]) -> str | None:
"""选择用于增量的时间列(需同时存在于 DWD 与 ODS"""
lower_cols = {c.lower() for c in dwd_cols} & {c.lower() for c in ods_cols}
for candidate in self.FACT_ORDER_CANDIDATES:
if candidate.lower() in lower_cols:
return candidate.lower()
return None
def _build_fact_select_exprs(
self,
insert_cols: Sequence[str],
dwd_types: Dict[str, str],
ods_types: Dict[str, str],
) -> List[str]:
"""构造事实表 SELECT 列表,需要时做类型转换。"""
numeric_types = {"integer", "bigint", "smallint", "numeric", "double precision", "real", "decimal"}
text_types = {"text", "character varying", "varchar"}
exprs = []
for col in insert_cols:
d_type = dwd_types.get(col)
o_type = ods_types.get(col)
if d_type in numeric_types and o_type in text_types:
exprs.append(f"CAST(NULLIF(CAST(\"{col}\" AS text), '') AS numeric):: {d_type}")
else:
exprs.append(f'"{col}"')
return exprs
def _split_table_name(self, name: str, default_schema: str) -> tuple[str, str]:
"""拆分 schema.table若无 schema 则补默认 schema。"""
parts = name.split(".")
if len(parts) == 2:
return parts[0], parts[1].lower()
return default_schema, name.lower()
def _table_base(self, name: str) -> str:
"""获取不含 schema 的表名。"""
return name.split(".")[-1]
def _format_table(self, name: str, default_schema: str) -> str:
"""返回带引号的 schema.table 名称。"""
schema, table = self._split_table_name(name, default_schema)
return f'"{schema}"."{table}"'
def _cast_expr(self, col: str, cast_type: str | None) -> str:
"""构造带可选 CAST 的列表达式。"""
if col.upper() == "NULL":
base = "NULL"
else:
is_expr = not col.isidentifier() or "->" in col or "#>>" in col or "::" in col or "'" in col
base = col if is_expr else f'"{col}"'
if cast_type:
cast_lower = cast_type.lower()
if cast_lower in {"bigint", "integer", "numeric", "decimal"}:
return f"CAST(NULLIF(CAST({base} AS text), '') AS numeric):: {cast_type}"
if cast_lower == "timestamptz":
return f"({base})::timestamptz"
return f"{base}::{cast_type}"
return base