Files
ZQYY.FQ-ETL/tasks/dws/index/recall_index_task.py

588 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
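Customer recall index calculation task.

Purpose:
- Measure how necessary and how urgent it is to win a customer back.
- Respect each customer's personal visit cycle (μ = median, σ = MAD).
- Raise the recall tendency for new customers and customers who just recharged.
- Detect the "heated up, then dropped off" pattern.

Formula:
Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop
where:
- overdue = 1 - exp(-max(0, (t-μ)/σ))       # overdue urgency
- new_bonus = decay(d_first, h_new)          # new-customer bonus
- re_bonus = decay(d_recharge, h_re)         # recent-recharge bonus
- hot_drop = max(0, ln(1 + (r14/r60 - 1)))   # heat drop-off bonus

Data sources:
- dwd_settlement_head: member visit records
- dwd_recharge_order: recharge records
- dim_member: first-visit time

Update frequency: every 2 hours
Author: ETL team
Created: 2026-02-03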
"""
from __future__ import annotations
import math
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class MemberRecallData:
    """Per-member recall record: identifiers, input features, component scores and final scores."""
    member_id: int
    site_id: int
    tenant_id: int
    # Input features for the calculation (None means "not available")
    days_since_last_visit: Optional[int] = None
    visit_interval_median: Optional[float] = None
    visit_interval_mad: Optional[float] = None
    days_since_first_visit: Optional[int] = None
    days_since_last_recharge: Optional[int] = None
    visits_last_14_days: int = 0
    visits_last_60_days: int = 0
    # Component scores
    score_overdue: float = 0.0
    score_new_bonus: float = 0.0
    score_recharge_bonus: float = 0.0
    score_hot_drop: float = 0.0
    # Final scores
    raw_score: float = 0.0
    display_score: float = 0.0
# =============================================================================
# 召回指数任务
# =============================================================================
class RecallIndexTask(BaseIndexTask):
"""
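Customer recall index calculation task.

Workflow:
1. Extract the members that have visit records in the last 60 days.
2. Compute each member's visit-interval features (median, MAD).
3. Compute the four component scores (overdue, new customer, recharge, heat drop-off).
4. Aggregate the Raw Score.
5. Percentile clipping + MinMax mapping to 0-10.
6. Write to the DWS table.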
"""
INDEX_TYPE = "RECALL"
# Default parameters; _load_params() overlays the values returned by
# load_index_parameters() on top of a copy of this dict.
DEFAULT_PARAMS = {
    'lookback_days': 60,        # size of the extraction window; also the cut-off for the new/recharge bonuses
    'sigma_min': 2.0,           # lower bound applied to the visit-interval MAD (sigma)
    'halflife_new': 7.0,        # passed to decay() for the new-customer bonus
    'halflife_recharge': 10.0,  # passed to decay() for the recent-recharge bonus
    'weight_overdue': 3.0,      # weight of score_overdue in the raw score
    'weight_new': 1.0,          # weight of score_new_bonus in the raw score
    'weight_recharge': 1.0,     # weight of score_recharge_bonus in the raw score
    'weight_hot': 1.0,          # weight of score_hot_drop in the raw score
    'percentile_lower': 5,      # lower percentile used for normalisation / history
    'percentile_upper': 95,     # upper percentile used for normalisation / history
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
    """Return the task code string of the recall index task."""
    task_code = "DWS_RECALL_INDEX"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    target_table = "dws_member_recall_index"
    return target_table
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names as a fresh list on every call."""
    key_columns = ('site_id', 'member_id')
    return list(key_columns)
def get_index_type(self) -> str:
    """Return the index type, read from the ``INDEX_TYPE`` attribute."""
    index_type = self.INDEX_TYPE
    return index_type
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run the recall-index calculation end to end for one site.

    Loads the parameters, extracts visit / recharge / first-visit data for
    the look-back window ending today, scores every member, normalises the
    raw scores into display scores, records the percentile history and
    finally persists the rows.

    Args:
        context: Optional task context; only used to resolve the site id.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no member visit
        is found; otherwise a dict with ``status='success'``,
        ``member_count`` and ``records_inserted``.
    """
    logger = self.logger
    logger.info("开始计算客户召回指数")

    # Resolve scope (site / tenant) and parameters.
    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # The extraction window runs from (today - lookback_days) to today.
    base_date = date.today()
    start_date = base_date - timedelta(days=lookback_days)
    logger.info(
        "参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f",
        lookback_days,
        params['sigma_min'],
        params['halflife_new'],
        params['halflife_recharge'],
    )

    # 1. Member visits; nothing to do when there are none.
    member_visits = self._extract_member_visits(site_id, start_date, base_date)
    logger.info("提取到 %d 个会员的到店记录", len(member_visits))
    if not member_visits:
        logger.warning("没有会员到店记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Latest recharge per member.
    recharge_data = self._extract_recharge_data(site_id, start_date, base_date)
    logger.info("提取到 %d 个会员的充值记录", len(recharge_data))

    # 3. First-visit date for the members that have visits.
    first_visit_data = self._extract_first_visit_data(site_id, list(member_visits))
    logger.info("提取到 %d 个会员的首访时间", len(first_visit_data))

    # 4. Build and score one record per member.
    scored_members: List[MemberRecallData] = []
    for member_id, visit_dates in member_visits.items():
        record = MemberRecallData(
            member_id=member_id,
            site_id=site_id,
            tenant_id=tenant_id,
        )
        self._calculate_visit_features(record, visit_dates, base_date, params)

        if member_id in recharge_data:
            record.days_since_last_recharge = (base_date - recharge_data[member_id]).days
        if member_id in first_visit_data:
            record.days_since_first_visit = (base_date - first_visit_data[member_id]).days

        self._calculate_component_scores(record, params)

        # Weighted sum of the four components.
        record.raw_score = (
            params['weight_overdue'] * record.score_overdue
            + params['weight_new'] * record.score_new_bonus
            + params['weight_recharge'] * record.score_recharge_bonus
            + params['weight_hot'] * record.score_hot_drop
        )
        scored_members.append(record)
    logger.info("计算完成 %d 个会员的Raw Score", len(scored_members))

    # 5. Normalise raw scores into display scores.
    raw_pairs = [(record.member_id, record.raw_score) for record in scored_members]
    pct_lower = int(params['percentile_lower'])
    pct_upper = int(params['percentile_upper'])
    normalized = self.batch_normalize_to_display(
        raw_pairs,
        use_log=False,
        percentile_lower=pct_lower,
        percentile_upper=pct_upper,
        use_smoothing=True,
        site_id=site_id,
    )
    display_by_member = {
        member_id: display for member_id, _raw, display in normalized
    }
    for record in scored_members:
        if record.member_id in display_by_member:
            record.display_score = display_by_member[record.member_id]

    # 6. Record the percentile history of this run.
    if scored_members:
        all_raw = [record.raw_score for record in scored_members]
        q_l, q_u = self.calculate_percentiles(all_raw, pct_lower, pct_upper)
        smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw),
        )

    # 7. Persist the rows.
    inserted = self._save_recall_data(scored_members)
    logger.info("召回指数计算完成,写入 %d 条记录", inserted)
    return {
        'status': 'success',
        'member_count': len(scored_members),
        'records_inserted': inserted,
    }
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_member_visits(
self,
site_id: int,
start_date: date,
end_date: date
) -> Dict[int, List[date]]:
"""
提取会员到店记录
Returns:
{member_id: [visit_date1, visit_date2, ...]}
"""
sql = """
SELECT
member_id,
DATE(pay_time) AS visit_date
FROM billiards_dwd.dwd_settlement_head s
WHERE s.site_id = %s
AND s.member_id > 0 -- 排除散客
AND s.pay_time >= %s
AND s.pay_time < %s + INTERVAL '1 day'
AND (
s.settle_type = 1
OR (
s.settle_type = 3
AND EXISTS (
SELECT 1
FROM billiards_dwd.dwd_assistant_service_log asl
JOIN billiards_dws.cfg_skill_type st
ON asl.skill_id = st.skill_id
AND st.course_type_code = 'BONUS'
AND st.is_active = TRUE
WHERE asl.order_settle_id = s.order_settle_id
AND asl.site_id = s.site_id
AND asl.tenant_member_id = s.member_id
AND asl.is_delete = 0
)
)
)
GROUP BY member_id, DATE(pay_time)
ORDER BY member_id, visit_date
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
result: Dict[int, List[date]] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
visit_date = row_dict['visit_date']
if member_id not in result:
result[member_id] = []
result[member_id].append(visit_date)
return result
def _extract_recharge_data(
self,
site_id: int,
start_date: date,
end_date: date
) -> Dict[int, date]:
"""
提取最近充值记录
Returns:
{member_id: last_recharge_date}
"""
sql = """
SELECT
member_id,
MAX(DATE(pay_time)) AS last_recharge_date
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND member_id > 0
AND settle_type = 5 -- 充值订单
AND pay_time >= %s
AND pay_time < %s + INTERVAL '1 day'
GROUP BY member_id
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
result: Dict[int, date] = {}
for row in (rows or []):
row_dict = dict(row)
result[int(row_dict['member_id'])] = row_dict['last_recharge_date']
return result
def _extract_first_visit_data(
self,
site_id: int,
member_ids: List[int]
) -> Dict[int, date]:
"""
提取首访时间
优先使用dim_member.create_time如果没有则使用dwd_settlement_head中的首次消费时间
Returns:
{member_id: first_visit_date}
"""
if not member_ids:
return {}
# 使用dim_member的create_time作为首访时间
member_ids_str = ','.join(str(m) for m in member_ids)
sql = f"""
SELECT
member_id,
DATE(create_time) AS first_visit_date
FROM billiards_dwd.dim_member
WHERE member_id IN ({member_ids_str})
AND scd2_is_current = 1
"""
rows = self.db.query(sql)
result: Dict[int, date] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
first_date = row_dict['first_visit_date']
if first_date:
result[member_id] = first_date
return result
# ==========================================================================
# 特征计算方法
# ==========================================================================
def _calculate_visit_features(
self,
data: MemberRecallData,
visit_dates: List[date],
base_date: date,
params: Dict[str, float]
) -> None:
"""计算到店特征"""
if not visit_dates:
return
# 最近一次到店
last_visit = max(visit_dates)
data.days_since_last_visit = (base_date - last_visit).days
# 到店间隔
sorted_dates = sorted(visit_dates)
intervals = []
for i in range(1, len(sorted_dates)):
interval = (sorted_dates[i] - sorted_dates[i-1]).days
intervals.append(float(interval))
if intervals:
# 中位数(μ)
data.visit_interval_median = self.calculate_median(intervals)
# MADσ下限为sigma_min
mad = self.calculate_mad(intervals)
data.visit_interval_mad = max(mad, params['sigma_min'])
else:
# 只有一次到店,使用默认值
data.visit_interval_median = 7.0 # 默认周期7天
data.visit_interval_mad = params['sigma_min']
# 近14天/60天到店次数
days_14_ago = base_date - timedelta(days=14)
days_60_ago = base_date - timedelta(days=60)
data.visits_last_14_days = sum(1 for d in visit_dates if d >= days_14_ago)
data.visits_last_60_days = sum(1 for d in visit_dates if d >= days_60_ago)
def _calculate_component_scores(
self,
data: MemberRecallData,
params: Dict[str, float]
) -> None:
"""计算4项分数"""
# 1. 超期紧急性
if data.days_since_last_visit is not None and data.visit_interval_median is not None:
t = data.days_since_last_visit
mu = data.visit_interval_median
sigma = data.visit_interval_mad or params['sigma_min']
# z = max(0, (t - μ) / σ)
z = max(0.0, (t - mu) / sigma)
# overdue = 1 - exp(-z)
data.score_overdue = 1.0 - math.exp(-z)
# 2. 新客户加分
lookback_days = int(params['lookback_days'])
if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days:
data.score_new_bonus = self.decay(
data.days_since_first_visit,
params['halflife_new']
)
# 3. 刚充值加分
if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days:
data.score_recharge_bonus = self.decay(
data.days_since_last_recharge,
params['halflife_recharge']
)
# 4. 热度断档加分
epsilon = 1e-6
n14 = data.visits_last_14_days
n60 = data.visits_last_60_days
r14 = n14 / 14.0
r60 = (n60 + 1) / 60.0 # +1 平滑
hot_ratio = r14 / (r60 + epsilon)
# hot_drop = max(0, ln(1 + (hot_ratio - 1)))
if hot_ratio > 1:
data.score_hot_drop = self.safe_ln1p(hot_ratio - 1)
else:
data.score_hot_drop = 0.0
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_recall_data(self, data_list: List[MemberRecallData]) -> int:
"""保存召回数据到DWS表"""
if not data_list:
return 0
# 先删除已存在的记录
site_id = data_list[0].site_id
member_ids = [d.member_id for d in data_list]
member_ids_str = ','.join(str(m) for m in member_ids)
delete_sql = f"""
DELETE FROM billiards_dws.dws_member_recall_index
WHERE site_id = %s AND member_id IN ({member_ids_str})
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
# 插入新记录
insert_sql = """
INSERT INTO billiards_dws.dws_member_recall_index (
site_id, tenant_id, member_id,
days_since_last_visit, visit_interval_median, visit_interval_mad,
days_since_first_visit, days_since_last_recharge,
visits_last_14_days, visits_last_60_days,
score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop,
raw_score, display_score,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s,
%s, %s, %s,
%s, %s,
%s, %s,
%s, %s, %s, %s,
%s, %s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
cur.execute(insert_sql, (
data.site_id, data.tenant_id, data.member_id,
data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad,
data.days_since_first_visit, data.days_since_last_recharge,
data.visits_last_14_days, data.visits_last_60_days,
data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop,
data.raw_score, data.display_score
))
inserted += cur.rowcount
# 提交事务
self.db.conn.commit()
return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
"""加载参数,缺失时使用默认值"""
params = self.load_index_parameters()
result = dict(self.DEFAULT_PARAMS)
result.update(params)
return result
def _get_site_id(self, context: Optional[TaskContext]) -> int:
"""获取门店ID"""
if context and hasattr(context, 'store_id') and context.store_id:
return context.store_id
# 从配置获取默认门店ID
site_id = self.config.get('app.default_site_id') or self.config.get('app.store_id')
if site_id is not None:
return int(site_id)
# 查询数据库获取第一个门店
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head WHERE site_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
value = dict(rows[0]).get('site_id')
if value is not None:
return int(value)
self.logger.warning("无法确定门店ID使用 0 继续执行")
return 0
def _get_tenant_id(self) -> int:
"""获取租户ID"""
tenant_id = self.config.get('app.tenant_id')
if tenant_id is not None:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head WHERE tenant_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
value = dict(rows[0]).get('tenant_id')
if value is not None:
return int(value)
self.logger.warning("无法确定租户ID使用 0 继续执行")
return 0