588 lines
20 KiB
Python
588 lines
20 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
客户召回指数计算任务
|
||
|
||
功能说明:
|
||
- 衡量客户召回的必要性和紧急程度
|
||
- 尊重客户个人到店周期(μ=中位数, σ=MAD)
|
||
- 对新客户、刚充值客户增加召回倾向
|
||
- 检测"热了又断"的情况
|
||
|
||
算法公式:
|
||
Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop
|
||
|
||
其中:
|
||
- overdue = 1 - exp(-max(0, (t-μ)/σ)) # 超期紧急性
|
||
- new_bonus = decay(d_first, h_new) # 新客户加分
|
||
- re_bonus = decay(d_recharge, h_re) # 刚充值加分
|
||
- hot_drop = max(0, ln(1 + (r14/r60 - 1))) # 热度断档加分
|
||
|
||
数据来源:
|
||
- dwd_settlement_head: 会员到店记录
|
||
- dwd_recharge_order: 充值记录
|
||
- dim_member: 首访时间
|
||
|
||
更新频率:每2小时
|
||
|
||
作者:ETL团队
|
||
创建日期:2026-02-03
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import math
|
||
from dataclasses import dataclass
|
||
from datetime import date, datetime, timedelta
|
||
from decimal import Decimal
|
||
from typing import Any, Dict, List, Optional, Tuple
|
||
|
||
from .base_index_task import BaseIndexTask, PercentileHistory
|
||
from ..base_dws_task import TaskContext
|
||
|
||
|
||
# =============================================================================
|
||
# 数据类定义
|
||
# =============================================================================
|
||
|
||
@dataclass
class MemberRecallData:
    """Per-member working record for the recall index.

    Carries the identifying keys, the input features, the four component
    scores and the final raw / display scores for one member.
    """

    # --- identity ---------------------------------------------------------
    member_id: int
    site_id: int
    tenant_id: int

    # --- input features (None means "not computed / no data") -------------
    days_since_last_visit: Optional[int] = None
    visit_interval_median: Optional[float] = None
    visit_interval_mad: Optional[float] = None
    days_since_first_visit: Optional[int] = None
    days_since_last_recharge: Optional[int] = None
    visits_last_14_days: int = 0
    visits_last_60_days: int = 0

    # --- component scores -------------------------------------------------
    score_overdue: float = 0.0
    score_new_bonus: float = 0.0
    score_recharge_bonus: float = 0.0
    score_hot_drop: float = 0.0

    # --- final scores -----------------------------------------------------
    raw_score: float = 0.0
    display_score: float = 0.0
|
||
|
||
|
||
# =============================================================================
|
||
# 召回指数任务
|
||
# =============================================================================
|
||
|
||
class RecallIndexTask(BaseIndexTask):
    """Customer recall index task.

    Processing steps:
        1. Load the members that have visit records in the last 60 days.
        2. Derive each member's visit-interval features (median and MAD).
        3. Compute the four component scores (overdue, new customer,
           recent recharge, hot-drop).
        4. Combine them into the raw score.
        5. Clip by percentiles and min-max map onto 0-10.
        6. Write the rows to the DWS table.
    """

    INDEX_TYPE = "RECALL"

    # Default parameter values; _load_params overlays loaded values on these.
    DEFAULT_PARAMS = {
        "lookback_days": 60,
        "sigma_min": 2.0,
        "halflife_new": 7.0,
        "halflife_recharge": 10.0,
        "weight_overdue": 3.0,
        "weight_new": 1.0,
        "weight_recharge": 1.0,
        "weight_hot": 1.0,
        "percentile_lower": 5,
        "percentile_upper": 95,
    }
|
||
|
||
# ==========================================================================
|
||
# 抽象方法实现
|
||
# ==========================================================================
|
||
|
||
def get_task_code(self) -> str:
    """Return this task's code, ``DWS_RECALL_INDEX``."""
    task_code = "DWS_RECALL_INDEX"
    return task_code
|
||
|
||
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    target_table = "dws_member_recall_index"
    return target_table
|
||
|
||
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names as a fresh list."""
    key_columns = ["site_id", "member_id"]
    return key_columns
|
||
|
||
def get_index_type(self) -> str:
    """Return the index type, read from the ``INDEX_TYPE`` attribute."""
    index_type = self.INDEX_TYPE
    return index_type
|
||
|
||
# ==========================================================================
|
||
# 任务执行
|
||
# ==========================================================================
|
||
|
||
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run one recall-index calculation for a single site.

    Resolves the site / tenant ids and parameters, extracts visit, recharge
    and first-visit data, builds one ``MemberRecallData`` per member with its
    component scores and weighted raw score, normalizes the raw scores to
    display scores, records the percentile history and saves the rows.

    Args:
        context: Optional task context, passed on to ``_get_site_id``.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no member visits
        were extracted; otherwise a dict with ``status`` ``'success'``,
        ``member_count`` and ``records_inserted``.
    """
    log = self.logger
    log.info("开始计算客户召回指数")

    # Resolve identifiers and parameters.
    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # The extraction window ends today and starts lookback_days earlier.
    base_date = date.today()
    start_date = base_date - timedelta(days=lookback_days)

    log.info(
        "参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f",
        lookback_days,
        params['sigma_min'],
        params['halflife_new'],
        params['halflife_recharge'],
    )

    # 1. Member visits; nothing to do when there are none.
    member_visits = self._extract_member_visits(site_id, start_date, base_date)
    log.info("提取到 %d 个会员的到店记录", len(member_visits))
    if not member_visits:
        log.warning("没有会员到店记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Most recent recharge per member.
    recharge_data = self._extract_recharge_data(site_id, start_date, base_date)
    log.info("提取到 %d 个会员的充值记录", len(recharge_data))

    # 3. First-visit date per member.
    first_visit_data = self._extract_first_visit_data(site_id, list(member_visits.keys()))
    log.info("提取到 %d 个会员的首访时间", len(first_visit_data))

    # 4. Build one record per member: features, component scores, raw score.
    recall_data_list: List[MemberRecallData] = []
    for member_id, visit_dates in member_visits.items():
        record = MemberRecallData(
            member_id=member_id,
            site_id=site_id,
            tenant_id=tenant_id,
        )
        self._calculate_visit_features(record, visit_dates, base_date, params)

        if member_id in recharge_data:
            record.days_since_last_recharge = (base_date - recharge_data[member_id]).days
        if member_id in first_visit_data:
            record.days_since_first_visit = (base_date - first_visit_data[member_id]).days

        self._calculate_component_scores(record, params)

        # Weighted sum of the four component scores.
        record.raw_score = (
            params['weight_overdue'] * record.score_overdue
            + params['weight_new'] * record.score_new_bonus
            + params['weight_recharge'] * record.score_recharge_bonus
            + params['weight_hot'] * record.score_hot_drop
        )
        recall_data_list.append(record)

    log.info("计算完成 %d 个会员的Raw Score", len(recall_data_list))

    # 5. Normalize raw scores into display scores.
    pct_lower = int(params['percentile_lower'])
    pct_upper = int(params['percentile_upper'])
    normalized = self.batch_normalize_to_display(
        [(rec.member_id, rec.raw_score) for rec in recall_data_list],
        use_log=False,
        percentile_lower=pct_lower,
        percentile_upper=pct_upper,
        use_smoothing=True,
        site_id=site_id,
    )
    display_by_member = {mid: display for mid, _raw, display in normalized}
    for rec in recall_data_list:
        if rec.member_id in display_by_member:
            rec.display_score = display_by_member[rec.member_id]

    # 6. Record this run's percentile cut points (raw and smoothed).
    if recall_data_list:
        all_raw = [rec.raw_score for rec in recall_data_list]
        q_l, q_u = self.calculate_percentiles(all_raw, pct_lower, pct_upper)
        smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw),
        )

    # 7. Persist the rows.
    inserted = self._save_recall_data(recall_data_list)
    log.info("召回指数计算完成,写入 %d 条记录", inserted)

    return {
        'status': 'success',
        'member_count': len(recall_data_list),
        'records_inserted': inserted,
    }
|
||
|
||
# ==========================================================================
|
||
# 数据提取方法
|
||
# ==========================================================================
|
||
|
||
def _extract_member_visits(
|
||
self,
|
||
site_id: int,
|
||
start_date: date,
|
||
end_date: date
|
||
) -> Dict[int, List[date]]:
|
||
"""
|
||
提取会员到店记录
|
||
|
||
Returns:
|
||
{member_id: [visit_date1, visit_date2, ...]}
|
||
"""
|
||
sql = """
|
||
SELECT
|
||
member_id,
|
||
DATE(pay_time) AS visit_date
|
||
FROM billiards_dwd.dwd_settlement_head s
|
||
WHERE s.site_id = %s
|
||
AND s.member_id > 0 -- 排除散客
|
||
AND s.pay_time >= %s
|
||
AND s.pay_time < %s + INTERVAL '1 day'
|
||
AND (
|
||
s.settle_type = 1
|
||
OR (
|
||
s.settle_type = 3
|
||
AND EXISTS (
|
||
SELECT 1
|
||
FROM billiards_dwd.dwd_assistant_service_log asl
|
||
JOIN billiards_dws.cfg_skill_type st
|
||
ON asl.skill_id = st.skill_id
|
||
AND st.course_type_code = 'BONUS'
|
||
AND st.is_active = TRUE
|
||
WHERE asl.order_settle_id = s.order_settle_id
|
||
AND asl.site_id = s.site_id
|
||
AND asl.tenant_member_id = s.member_id
|
||
AND asl.is_delete = 0
|
||
)
|
||
)
|
||
)
|
||
GROUP BY member_id, DATE(pay_time)
|
||
ORDER BY member_id, visit_date
|
||
"""
|
||
|
||
rows = self.db.query(sql, (site_id, start_date, end_date))
|
||
|
||
result: Dict[int, List[date]] = {}
|
||
for row in (rows or []):
|
||
row_dict = dict(row)
|
||
member_id = int(row_dict['member_id'])
|
||
visit_date = row_dict['visit_date']
|
||
|
||
if member_id not in result:
|
||
result[member_id] = []
|
||
result[member_id].append(visit_date)
|
||
|
||
return result
|
||
|
||
def _extract_recharge_data(
|
||
self,
|
||
site_id: int,
|
||
start_date: date,
|
||
end_date: date
|
||
) -> Dict[int, date]:
|
||
"""
|
||
提取最近充值记录
|
||
|
||
Returns:
|
||
{member_id: last_recharge_date}
|
||
"""
|
||
sql = """
|
||
SELECT
|
||
member_id,
|
||
MAX(DATE(pay_time)) AS last_recharge_date
|
||
FROM billiards_dwd.dwd_recharge_order
|
||
WHERE site_id = %s
|
||
AND member_id > 0
|
||
AND settle_type = 5 -- 充值订单
|
||
AND pay_time >= %s
|
||
AND pay_time < %s + INTERVAL '1 day'
|
||
GROUP BY member_id
|
||
"""
|
||
|
||
rows = self.db.query(sql, (site_id, start_date, end_date))
|
||
|
||
result: Dict[int, date] = {}
|
||
for row in (rows or []):
|
||
row_dict = dict(row)
|
||
result[int(row_dict['member_id'])] = row_dict['last_recharge_date']
|
||
|
||
return result
|
||
|
||
def _extract_first_visit_data(
|
||
self,
|
||
site_id: int,
|
||
member_ids: List[int]
|
||
) -> Dict[int, date]:
|
||
"""
|
||
提取首访时间
|
||
|
||
优先使用dim_member.create_time,如果没有则使用dwd_settlement_head中的首次消费时间
|
||
|
||
Returns:
|
||
{member_id: first_visit_date}
|
||
"""
|
||
if not member_ids:
|
||
return {}
|
||
|
||
# 使用dim_member的create_time作为首访时间
|
||
member_ids_str = ','.join(str(m) for m in member_ids)
|
||
sql = f"""
|
||
SELECT
|
||
member_id,
|
||
DATE(create_time) AS first_visit_date
|
||
FROM billiards_dwd.dim_member
|
||
WHERE member_id IN ({member_ids_str})
|
||
AND scd2_is_current = 1
|
||
"""
|
||
|
||
rows = self.db.query(sql)
|
||
|
||
result: Dict[int, date] = {}
|
||
for row in (rows or []):
|
||
row_dict = dict(row)
|
||
member_id = int(row_dict['member_id'])
|
||
first_date = row_dict['first_visit_date']
|
||
if first_date:
|
||
result[member_id] = first_date
|
||
|
||
return result
|
||
|
||
# ==========================================================================
|
||
# 特征计算方法
|
||
# ==========================================================================
|
||
|
||
def _calculate_visit_features(
|
||
self,
|
||
data: MemberRecallData,
|
||
visit_dates: List[date],
|
||
base_date: date,
|
||
params: Dict[str, float]
|
||
) -> None:
|
||
"""计算到店特征"""
|
||
if not visit_dates:
|
||
return
|
||
|
||
# 最近一次到店
|
||
last_visit = max(visit_dates)
|
||
data.days_since_last_visit = (base_date - last_visit).days
|
||
|
||
# 到店间隔
|
||
sorted_dates = sorted(visit_dates)
|
||
intervals = []
|
||
for i in range(1, len(sorted_dates)):
|
||
interval = (sorted_dates[i] - sorted_dates[i-1]).days
|
||
intervals.append(float(interval))
|
||
|
||
if intervals:
|
||
# 中位数(μ)
|
||
data.visit_interval_median = self.calculate_median(intervals)
|
||
|
||
# MAD(σ),下限为sigma_min
|
||
mad = self.calculate_mad(intervals)
|
||
data.visit_interval_mad = max(mad, params['sigma_min'])
|
||
else:
|
||
# 只有一次到店,使用默认值
|
||
data.visit_interval_median = 7.0 # 默认周期7天
|
||
data.visit_interval_mad = params['sigma_min']
|
||
|
||
# 近14天/60天到店次数
|
||
days_14_ago = base_date - timedelta(days=14)
|
||
days_60_ago = base_date - timedelta(days=60)
|
||
|
||
data.visits_last_14_days = sum(1 for d in visit_dates if d >= days_14_ago)
|
||
data.visits_last_60_days = sum(1 for d in visit_dates if d >= days_60_ago)
|
||
|
||
def _calculate_component_scores(
|
||
self,
|
||
data: MemberRecallData,
|
||
params: Dict[str, float]
|
||
) -> None:
|
||
"""计算4项分数"""
|
||
|
||
# 1. 超期紧急性
|
||
if data.days_since_last_visit is not None and data.visit_interval_median is not None:
|
||
t = data.days_since_last_visit
|
||
mu = data.visit_interval_median
|
||
sigma = data.visit_interval_mad or params['sigma_min']
|
||
|
||
# z = max(0, (t - μ) / σ)
|
||
z = max(0.0, (t - mu) / sigma)
|
||
# overdue = 1 - exp(-z)
|
||
data.score_overdue = 1.0 - math.exp(-z)
|
||
|
||
# 2. 新客户加分
|
||
lookback_days = int(params['lookback_days'])
|
||
if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days:
|
||
data.score_new_bonus = self.decay(
|
||
data.days_since_first_visit,
|
||
params['halflife_new']
|
||
)
|
||
|
||
# 3. 刚充值加分
|
||
if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days:
|
||
data.score_recharge_bonus = self.decay(
|
||
data.days_since_last_recharge,
|
||
params['halflife_recharge']
|
||
)
|
||
|
||
# 4. 热度断档加分
|
||
epsilon = 1e-6
|
||
n14 = data.visits_last_14_days
|
||
n60 = data.visits_last_60_days
|
||
|
||
r14 = n14 / 14.0
|
||
r60 = (n60 + 1) / 60.0 # +1 平滑
|
||
|
||
hot_ratio = r14 / (r60 + epsilon)
|
||
|
||
# hot_drop = max(0, ln(1 + (hot_ratio - 1)))
|
||
if hot_ratio > 1:
|
||
data.score_hot_drop = self.safe_ln1p(hot_ratio - 1)
|
||
else:
|
||
data.score_hot_drop = 0.0
|
||
|
||
# ==========================================================================
|
||
# 数据保存方法
|
||
# ==========================================================================
|
||
|
||
def _save_recall_data(self, data_list: List[MemberRecallData]) -> int:
|
||
"""保存召回数据到DWS表"""
|
||
if not data_list:
|
||
return 0
|
||
|
||
# 先删除已存在的记录
|
||
site_id = data_list[0].site_id
|
||
member_ids = [d.member_id for d in data_list]
|
||
|
||
member_ids_str = ','.join(str(m) for m in member_ids)
|
||
delete_sql = f"""
|
||
DELETE FROM billiards_dws.dws_member_recall_index
|
||
WHERE site_id = %s AND member_id IN ({member_ids_str})
|
||
"""
|
||
|
||
with self.db.conn.cursor() as cur:
|
||
cur.execute(delete_sql, (site_id,))
|
||
|
||
# 插入新记录
|
||
insert_sql = """
|
||
INSERT INTO billiards_dws.dws_member_recall_index (
|
||
site_id, tenant_id, member_id,
|
||
days_since_last_visit, visit_interval_median, visit_interval_mad,
|
||
days_since_first_visit, days_since_last_recharge,
|
||
visits_last_14_days, visits_last_60_days,
|
||
score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop,
|
||
raw_score, display_score,
|
||
calc_time, created_at, updated_at
|
||
) VALUES (
|
||
%s, %s, %s,
|
||
%s, %s, %s,
|
||
%s, %s,
|
||
%s, %s,
|
||
%s, %s, %s, %s,
|
||
%s, %s,
|
||
NOW(), NOW(), NOW()
|
||
)
|
||
"""
|
||
|
||
inserted = 0
|
||
with self.db.conn.cursor() as cur:
|
||
for data in data_list:
|
||
cur.execute(insert_sql, (
|
||
data.site_id, data.tenant_id, data.member_id,
|
||
data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad,
|
||
data.days_since_first_visit, data.days_since_last_recharge,
|
||
data.visits_last_14_days, data.visits_last_60_days,
|
||
data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop,
|
||
data.raw_score, data.display_score
|
||
))
|
||
inserted += cur.rowcount
|
||
|
||
# 提交事务
|
||
self.db.conn.commit()
|
||
|
||
return inserted
|
||
|
||
# ==========================================================================
|
||
# 辅助方法
|
||
# ==========================================================================
|
||
|
||
def _load_params(self) -> Dict[str, float]:
|
||
"""加载参数,缺失时使用默认值"""
|
||
params = self.load_index_parameters()
|
||
result = dict(self.DEFAULT_PARAMS)
|
||
result.update(params)
|
||
return result
|
||
|
||
def _get_site_id(self, context: Optional[TaskContext]) -> int:
|
||
"""获取门店ID"""
|
||
if context and hasattr(context, 'store_id') and context.store_id:
|
||
return context.store_id
|
||
|
||
# 从配置获取默认门店ID
|
||
site_id = self.config.get('app.default_site_id') or self.config.get('app.store_id')
|
||
if site_id is not None:
|
||
return int(site_id)
|
||
|
||
# 查询数据库获取第一个门店
|
||
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head WHERE site_id IS NOT NULL LIMIT 1"
|
||
rows = self.db.query(sql)
|
||
if rows:
|
||
value = dict(rows[0]).get('site_id')
|
||
if value is not None:
|
||
return int(value)
|
||
|
||
self.logger.warning("无法确定门店ID,使用 0 继续执行")
|
||
return 0
|
||
|
||
def _get_tenant_id(self) -> int:
|
||
"""获取租户ID"""
|
||
tenant_id = self.config.get('app.tenant_id')
|
||
if tenant_id is not None:
|
||
return int(tenant_id)
|
||
|
||
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head WHERE tenant_id IS NOT NULL LIMIT 1"
|
||
rows = self.db.query(sql)
|
||
if rows:
|
||
value = dict(rows[0]).get('tenant_id')
|
||
if value is not None:
|
||
return int(value)
|
||
|
||
self.logger.warning("无法确定租户ID,使用 0 继续执行")
|
||
return 0
|