# -*- coding: utf-8 -*- """ 客户召回指数计算任务 功能说明: - 衡量客户召回的必要性和紧急程度 - 尊重客户个人到店周期(μ=中位数, σ=MAD) - 对新客户、刚充值客户增加召回倾向 - 检测"热了又断"的情况 算法公式: Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop 其中: - overdue = 1 - exp(-max(0, (t-μ)/σ)) # 超期紧急性 - new_bonus = decay(d_first, h_new) # 新客户加分 - re_bonus = decay(d_recharge, h_re) # 刚充值加分 - hot_drop = max(0, ln(1 + (r14/r60 - 1))) # 热度断档加分 数据来源: - dwd_settlement_head: 会员到店记录 - dwd_recharge_order: 充值记录 - dim_member: 首访时间 更新频率:每2小时 作者:ETL团队 创建日期:2026-02-03 """ from __future__ import annotations import math from dataclasses import dataclass from datetime import date, datetime, timedelta from decimal import Decimal from typing import Any, Dict, List, Optional, Tuple from .base_index_task import BaseIndexTask, PercentileHistory from ..base_dws_task import TaskContext # ============================================================================= # 数据类定义 # ============================================================================= @dataclass class MemberRecallData: """会员召回数据""" member_id: int site_id: int tenant_id: int # 计算输入特征 days_since_last_visit: Optional[int] = None visit_interval_median: Optional[float] = None visit_interval_mad: Optional[float] = None days_since_first_visit: Optional[int] = None days_since_last_recharge: Optional[int] = None visits_last_14_days: int = 0 visits_last_60_days: int = 0 # 分项得分 score_overdue: float = 0.0 score_new_bonus: float = 0.0 score_recharge_bonus: float = 0.0 score_hot_drop: float = 0.0 # 最终分数 raw_score: float = 0.0 display_score: float = 0.0 # ============================================================================= # 召回指数任务 # ============================================================================= class RecallIndexTask(BaseIndexTask): """ 客户召回指数计算任务 计算流程: 1. 提取近60天有到店记录的会员 2. 计算每个会员的到店间隔特征(中位数、MAD) 3. 计算4项分数(超期、新客、充值、热度断档) 4. 汇总Raw Score 5. 分位截断 + MinMax映射到0-10 6. 写入DWS表 """ INDEX_TYPE = "RECALL" # 默认参数 DEFAULT_PARAMS = { 'lookback_days': 60, 'sigma_min': 2.0, 'halflife_new': 7.0, 'halflife_recharge': 10.0, 'weight_overdue': 3.0, 'weight_new': 1.0, 'weight_recharge': 1.0, 'weight_hot': 1.0, 'percentile_lower': 5, 'percentile_upper': 95, } # ========================================================================== # 抽象方法实现 # ========================================================================== def get_task_code(self) -> str: return "DWS_RECALL_INDEX" def get_target_table(self) -> str: return "dws_member_recall_index" def get_primary_keys(self) -> List[str]: return ['site_id', 'member_id'] def get_index_type(self) -> str: return self.INDEX_TYPE # ========================================================================== # 任务执行 # ========================================================================== def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]: """执行召回指数计算""" self.logger.info("开始计算客户召回指数") # 获取门店ID site_id = self._get_site_id(context) tenant_id = self._get_tenant_id() # 加载参数 params = self._load_params() lookback_days = int(params['lookback_days']) # 计算基准日期 base_date = date.today() start_date = base_date - timedelta(days=lookback_days) self.logger.info( "参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f", lookback_days, params['sigma_min'], params['halflife_new'], params['halflife_recharge'] ) # 1. 提取会员到店数据 member_visits = self._extract_member_visits(site_id, start_date, base_date) self.logger.info("提取到 %d 个会员的到店记录", len(member_visits)) if not member_visits: self.logger.warning("没有会员到店记录,跳过计算") return {'status': 'skipped', 'reason': 'no_data'} # 2. 提取充值记录 recharge_data = self._extract_recharge_data(site_id, start_date, base_date) self.logger.info("提取到 %d 个会员的充值记录", len(recharge_data)) # 3. 提取首访时间 first_visit_data = self._extract_first_visit_data(site_id, list(member_visits.keys())) self.logger.info("提取到 %d 个会员的首访时间", len(first_visit_data)) # 4. 计算每个会员的召回数据 recall_data_list: List[MemberRecallData] = [] for member_id, visit_dates in member_visits.items(): data = MemberRecallData( member_id=member_id, site_id=site_id, tenant_id=tenant_id ) # 计算特征 self._calculate_visit_features(data, visit_dates, base_date, params) # 补充充值特征 if member_id in recharge_data: last_recharge_date = recharge_data[member_id] data.days_since_last_recharge = (base_date - last_recharge_date).days # 补充首访特征 if member_id in first_visit_data: first_visit_date = first_visit_data[member_id] data.days_since_first_visit = (base_date - first_visit_date).days # 计算分项得分 self._calculate_component_scores(data, params) # 汇总Raw Score data.raw_score = ( params['weight_overdue'] * data.score_overdue + params['weight_new'] * data.score_new_bonus + params['weight_recharge'] * data.score_recharge_bonus + params['weight_hot'] * data.score_hot_drop ) recall_data_list.append(data) self.logger.info("计算完成 %d 个会员的Raw Score", len(recall_data_list)) # 5. 归一化到Display Score raw_scores = [(d.member_id, d.raw_score) for d in recall_data_list] normalized = self.batch_normalize_to_display( raw_scores, use_log=False, percentile_lower=int(params['percentile_lower']), percentile_upper=int(params['percentile_upper']), use_smoothing=True, site_id=site_id ) # 更新display_score score_map = {member_id: (raw, display) for member_id, raw, display in normalized} for data in recall_data_list: if data.member_id in score_map: _, data.display_score = score_map[data.member_id] # 6. 保存分位点历史 if recall_data_list: all_raw = [d.raw_score for d in recall_data_list] q_l, q_u = self.calculate_percentiles( all_raw, int(params['percentile_lower']), int(params['percentile_upper']) ) smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u) self.save_percentile_history( site_id=site_id, percentile_5=q_l, percentile_95=q_u, percentile_5_smoothed=smoothed_l, percentile_95_smoothed=smoothed_u, record_count=len(all_raw), min_raw=min(all_raw), max_raw=max(all_raw), avg_raw=sum(all_raw) / len(all_raw) ) # 7. 写入DWS表 inserted = self._save_recall_data(recall_data_list) self.logger.info("召回指数计算完成,写入 %d 条记录", inserted) return { 'status': 'success', 'member_count': len(recall_data_list), 'records_inserted': inserted } # ========================================================================== # 数据提取方法 # ========================================================================== def _extract_member_visits( self, site_id: int, start_date: date, end_date: date ) -> Dict[int, List[date]]: """ 提取会员到店记录 Returns: {member_id: [visit_date1, visit_date2, ...]} """ sql = """ SELECT member_id, DATE(pay_time) AS visit_date FROM billiards_dwd.dwd_settlement_head WHERE site_id = %s AND member_id > 0 -- 排除散客 AND settle_type = 1 -- 台桌结账 AND pay_time >= %s AND pay_time < %s + INTERVAL '1 day' GROUP BY member_id, DATE(pay_time) ORDER BY member_id, visit_date """ rows = self.db.query(sql, (site_id, start_date, end_date)) result: Dict[int, List[date]] = {} for row in (rows or []): row_dict = dict(row) member_id = int(row_dict['member_id']) visit_date = row_dict['visit_date'] if member_id not in result: result[member_id] = [] result[member_id].append(visit_date) return result def _extract_recharge_data( self, site_id: int, start_date: date, end_date: date ) -> Dict[int, date]: """ 提取最近充值记录 Returns: {member_id: last_recharge_date} """ sql = """ SELECT member_id, MAX(DATE(pay_time)) AS last_recharge_date FROM billiards_dwd.dwd_recharge_order WHERE site_id = %s AND member_id > 0 AND settle_type = 5 -- 充值订单 AND pay_time >= %s AND pay_time < %s + INTERVAL '1 day' GROUP BY member_id """ rows = self.db.query(sql, (site_id, start_date, end_date)) result: Dict[int, date] = {} for row in (rows or []): row_dict = dict(row) result[int(row_dict['member_id'])] = row_dict['last_recharge_date'] return result def _extract_first_visit_data( self, site_id: int, member_ids: List[int] ) -> Dict[int, date]: """ 提取首访时间 优先使用dim_member.create_time,如果没有则使用dwd_settlement_head中的首次消费时间 Returns: {member_id: first_visit_date} """ if not member_ids: return {} # 使用dim_member的create_time作为首访时间 member_ids_str = ','.join(str(m) for m in member_ids) sql = f""" SELECT member_id, DATE(create_time) AS first_visit_date FROM billiards_dwd.dim_member WHERE member_id IN ({member_ids_str}) AND scd2_is_current = 1 """ rows = self.db.query(sql) result: Dict[int, date] = {} for row in (rows or []): row_dict = dict(row) member_id = int(row_dict['member_id']) first_date = row_dict['first_visit_date'] if first_date: result[member_id] = first_date return result # ========================================================================== # 特征计算方法 # ========================================================================== def _calculate_visit_features( self, data: MemberRecallData, visit_dates: List[date], base_date: date, params: Dict[str, float] ) -> None: """计算到店特征""" if not visit_dates: return # 最近一次到店 last_visit = max(visit_dates) data.days_since_last_visit = (base_date - last_visit).days # 到店间隔 sorted_dates = sorted(visit_dates) intervals = [] for i in range(1, len(sorted_dates)): interval = (sorted_dates[i] - sorted_dates[i-1]).days intervals.append(float(interval)) if intervals: # 中位数(μ) data.visit_interval_median = self.calculate_median(intervals) # MAD(σ),下限为sigma_min mad = self.calculate_mad(intervals) data.visit_interval_mad = max(mad, params['sigma_min']) else: # 只有一次到店,使用默认值 data.visit_interval_median = 7.0 # 默认周期7天 data.visit_interval_mad = params['sigma_min'] # 近14天/60天到店次数 days_14_ago = base_date - timedelta(days=14) days_60_ago = base_date - timedelta(days=60) data.visits_last_14_days = sum(1 for d in visit_dates if d >= days_14_ago) data.visits_last_60_days = sum(1 for d in visit_dates if d >= days_60_ago) def _calculate_component_scores( self, data: MemberRecallData, params: Dict[str, float] ) -> None: """计算4项分数""" # 1. 超期紧急性 if data.days_since_last_visit is not None and data.visit_interval_median is not None: t = data.days_since_last_visit mu = data.visit_interval_median sigma = data.visit_interval_mad or params['sigma_min'] # z = max(0, (t - μ) / σ) z = max(0.0, (t - mu) / sigma) # overdue = 1 - exp(-z) data.score_overdue = 1.0 - math.exp(-z) # 2. 新客户加分 lookback_days = int(params['lookback_days']) if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days: data.score_new_bonus = self.decay( data.days_since_first_visit, params['halflife_new'] ) # 3. 刚充值加分 if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days: data.score_recharge_bonus = self.decay( data.days_since_last_recharge, params['halflife_recharge'] ) # 4. 热度断档加分 epsilon = 1e-6 n14 = data.visits_last_14_days n60 = data.visits_last_60_days r14 = n14 / 14.0 r60 = (n60 + 1) / 60.0 # +1 平滑 hot_ratio = r14 / (r60 + epsilon) # hot_drop = max(0, ln(1 + (hot_ratio - 1))) if hot_ratio > 1: data.score_hot_drop = self.safe_ln1p(hot_ratio - 1) else: data.score_hot_drop = 0.0 # ========================================================================== # 数据保存方法 # ========================================================================== def _save_recall_data(self, data_list: List[MemberRecallData]) -> int: """保存召回数据到DWS表""" if not data_list: return 0 # 先删除已存在的记录 site_id = data_list[0].site_id member_ids = [d.member_id for d in data_list] member_ids_str = ','.join(str(m) for m in member_ids) delete_sql = f""" DELETE FROM billiards_dws.dws_member_recall_index WHERE site_id = %s AND member_id IN ({member_ids_str}) """ with self.db.conn.cursor() as cur: cur.execute(delete_sql, (site_id,)) # 插入新记录 insert_sql = """ INSERT INTO billiards_dws.dws_member_recall_index ( site_id, tenant_id, member_id, days_since_last_visit, visit_interval_median, visit_interval_mad, days_since_first_visit, days_since_last_recharge, visits_last_14_days, visits_last_60_days, score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop, raw_score, display_score, calc_time, created_at, updated_at ) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW(), NOW(), NOW() ) """ inserted = 0 with self.db.conn.cursor() as cur: for data in data_list: cur.execute(insert_sql, ( data.site_id, data.tenant_id, data.member_id, data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad, data.days_since_first_visit, data.days_since_last_recharge, data.visits_last_14_days, data.visits_last_60_days, data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop, data.raw_score, data.display_score )) inserted += cur.rowcount # 提交事务 self.db.conn.commit() return inserted # ========================================================================== # 辅助方法 # ========================================================================== def _load_params(self) -> Dict[str, float]: """加载参数,缺失时使用默认值""" params = self.load_index_parameters() result = dict(self.DEFAULT_PARAMS) result.update(params) return result def _get_site_id(self, context: Optional[TaskContext]) -> int: """获取门店ID""" if context and hasattr(context, 'store_id') and context.store_id: return context.store_id # 从配置获取默认门店ID site_id = self.config.get('app.default_site_id') if site_id: return int(site_id) # 查询数据库获取第一个门店 sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head LIMIT 1" rows = self.db.query(sql) if rows: return int(dict(rows[0])['site_id']) raise ValueError("无法确定门店ID") def _get_tenant_id(self) -> int: """获取租户ID""" tenant_id = self.config.get('app.tenant_id') if tenant_id: return int(tenant_id) sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head LIMIT 1" rows = self.db.query(sql) if rows: return int(dict(rows[0])['tenant_id']) return 0