This commit is contained in:
Neo
2026-02-04 21:39:01 +08:00
parent ee773a9b52
commit a3f4d04335
148 changed files with 31455 additions and 182 deletions

View File

@@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
"""
Index-algorithm task package.

Exports:
- RecallIndexTask: customer recall index calculation task
- IntimacyIndexTask: customer-assistant intimacy index calculation task
"""
from .recall_index_task import RecallIndexTask
from .intimacy_index_task import IntimacyIndexTask
# Public names re-exported by this package.
__all__ = [
'RecallIndexTask',
'IntimacyIndexTask',
]

View File

@@ -0,0 +1,518 @@
# -*- coding: utf-8 -*-
"""
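Base class for index-algorithm tasks.

Features:
- Half-life time-decay function
- Percentile calculation and percentile clipping (winsorizing)
- Mapping of raw scores onto a 0-10 scale
- Loading of algorithm parameters
- Percentile history records used for EWMA smoothing

Algorithm:
1. Time decay (half-life model): decay(d; h) = exp(-ln(2) * d / h)
   The weight drops to 0.5 when d = h; more recent events weigh more.
2. 0-10 mapping pipeline:
   Raw Score -> Winsorize(P5, P95) -> [optional log compression] -> MinMax(0, 10)

Author: ETL team
Created: 2026-02-03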
"""
from __future__ import annotations
import math
from abc import abstractmethod
from dataclasses import dataclass
from datetime import date, datetime
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from ..base_dws_task import BaseDwsTask, TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class IndexParameters:
    """Snapshot of the algorithm parameters loaded for an index."""

    # Parameter name -> numeric parameter value.
    params: Dict[str, float]
    # Moment at which this snapshot was loaded.
    loaded_at: datetime
@dataclass
class PercentileHistory:
    """One stored record of percentile cut points."""

    # Raw (unsmoothed) 5th / 95th percentile values.
    percentile_5: float
    percentile_95: float
    # Smoothed 5th / 95th percentile values.
    percentile_5_smoothed: float
    percentile_95_smoothed: float
    # Number of records the percentiles were computed from.
    record_count: int
    # Time of the calculation that produced this record.
    calc_time: datetime
# =============================================================================
# 指数任务基类
# =============================================================================
class BaseIndexTask(BaseDwsTask):
"""
Base class for index-algorithm tasks.

Shared building blocks for index calculation:
1. Half-life time-decay function
2. Percentile calculation and clipping
3. Normalization onto a 0-10 scale
4. Algorithm parameter loading
5. Percentile history management (EWMA smoothing)
"""
# Index type; subclasses are expected to define it.
INDEX_TYPE: str = ""
# Parameter cache
_index_params_cache: Optional[IndexParameters] = None
_index_params_ttl: int = 300 # cache lifetime in seconds
# Default parameters
DEFAULT_LOOKBACK_DAYS = 60
DEFAULT_PERCENTILE_LOWER = 5
DEFAULT_PERCENTILE_UPPER = 95
DEFAULT_EWMA_ALPHA = 0.2
# ==========================================================================
# 抽象方法(子类需实现)
# ==========================================================================
@abstractmethod
def get_index_type(self) -> str:
    """Return the index type handled by the task (RECALL / INTIMACY).

    Subclasses must override this method; the base implementation
    only raises.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
# ==========================================================================
# 时间衰减函数
# ==========================================================================
def decay(self, days: float, halflife: float) -> float:
    """Return the half-life decay weight of an event.

    Formula: decay(d; h) = exp(-ln(2) * d / h). The weight is 0.5 when
    d equals h, and grows towards 1.0 the more recent the event is.

    Args:
        days: Age of the event in days; negative values are treated as 0.
        halflife: Half-life in days, must be greater than 0.

    Returns:
        The decayed weight.

    Raises:
        ValueError: if ``halflife`` is less than or equal to 0.

    Examples:
        >>> task.decay(0, 7)    # today
        1.0
        >>> task.decay(7, 7)    # one half-life ago
        0.5
        >>> task.decay(14, 7)   # two half-lives ago
        0.25
    """
    if halflife <= 0:
        raise ValueError("半衰期必须大于0")
    # Events dated in the future are clamped to "now".
    elapsed = max(days, 0)
    ln2 = math.log(2)
    return math.exp(-ln2 * elapsed / halflife)
# ==========================================================================
# 分位数计算
# ==========================================================================
def calculate_percentiles(
    self,
    scores: List[float],
    lower: int = 5,
    upper: int = 95
) -> Tuple[float, float]:
    """Pick the lower and upper percentile values from a list of scores.

    The scores are sorted and the values at the rank positions
    ``int(n * lower / 100) - 1`` and ``int(n * upper / 100)`` are
    returned, with both positions clamped into the valid index range.

    Args:
        scores: Scores to analyse.
        lower: Lower percentile, in percent (default 5).
        upper: Upper percentile, in percent (default 95).

    Returns:
        ``(lower_value, upper_value)``; ``(0.0, 0.0)`` for empty input.
    """
    if not scores:
        return 0.0, 0.0

    ordered = list(scores)
    ordered.sort()
    count = len(ordered)

    low_pos = int(count * lower / 100) - 1
    if low_pos < 0:
        low_pos = 0

    high_pos = int(count * upper / 100)
    if high_pos > count - 1:
        high_pos = count - 1

    return ordered[low_pos], ordered[high_pos]
def winsorize(self, value: float, lower: float, upper: float) -> float:
    """Clip a value into the range [lower, upper] (winsorizing).

    Args:
        value: Original value.
        lower: Lower bound (e.g. the P5 cut point).
        upper: Upper bound (e.g. the P95 cut point).

    Returns:
        The clipped value.
    """
    clipped = value
    if lower > clipped:
        clipped = lower
    if upper < clipped:
        clipped = upper
    return clipped
# ==========================================================================
# 0-10映射
# ==========================================================================
def normalize_to_display(
    self,
    value: float,
    min_val: float,
    max_val: float,
    use_log: bool = False,
    epsilon: float = 1e-6
) -> float:
    """Map a value onto the 0-10 scale.

    Steps:
    1. Optional log compression, y = ln(1 + x), applied to the value and
       to both bounds.
    2. MinMax mapping, score = 10 * (y - min) / (max - min).

    Args:
        value: Value to map.
        min_val: Lower bound of the input range.
        max_val: Upper bound of the input range.
        use_log: Apply log compression first.
        epsilon: Smallest range treated as non-degenerate.

    Returns:
        A score in [0, 10]; 5.0 when the range is smaller than
        ``epsilon``.
    """
    point, low, high = value, min_val, max_val
    if use_log:
        point, low, high = (math.log1p(item) for item in (value, min_val, max_val))

    span = high - low
    if span < epsilon:
        # Degenerate range: return the midpoint instead of dividing by ~0.
        return 5.0

    scaled = 10.0 * (point - low) / span
    capped = scaled if scaled < 10.0 else 10.0
    return capped if capped > 0.0 else 0.0
def batch_normalize_to_display(
    self,
    raw_scores: List[Tuple[Any, float]],  # [(entity_id, raw_score), ...]
    use_log: bool = False,
    percentile_lower: int = 5,
    percentile_upper: int = 95,
    use_smoothing: bool = False,
    site_id: Optional[int] = None
) -> List[Tuple[Any, float, float]]:
    """Convert a batch of raw scores into display scores.

    Steps:
    1. Compute the percentile cut points over all raw scores.
    2. Optionally EWMA-smooth the cut points (needs ``site_id``).
    3. Winsorize every raw score to the cut points.
    4. Map the clipped score onto the 0-10 scale, rounded to 2 decimals.

    Args:
        raw_scores: ``(entity_id, raw_score)`` tuples.
        use_log: Apply log compression during the mapping.
        percentile_lower: Lower percentile, in percent.
        percentile_upper: Upper percentile, in percent.
        use_smoothing: Smooth the cut points with EWMA.
        site_id: Site ID; smoothing is skipped when it is None.

    Returns:
        ``(entity_id, raw_score, display_score)`` tuples, in input order.
    """
    if not raw_scores:
        return []

    low_cut, high_cut = self.calculate_percentiles(
        [score for _, score in raw_scores], percentile_lower, percentile_upper
    )
    if use_smoothing and site_id is not None:
        low_cut, high_cut = self._apply_ewma_smoothing(site_id, low_cut, high_cut)

    def to_display(raw_score: float) -> float:
        clipped = self.winsorize(raw_score, low_cut, high_cut)
        return round(self.normalize_to_display(clipped, low_cut, high_cut, use_log), 2)

    return [
        (entity_id, raw_score, to_display(raw_score))
        for entity_id, raw_score in raw_scores
    ]
# ==========================================================================
# 算法参数加载
# ==========================================================================
def load_index_parameters(
    self,
    index_type: Optional[str] = None,
    force_reload: bool = False
) -> Dict[str, float]:
    """Load the algorithm parameters of an index type.

    Results are cached on the instance for ``_index_params_ttl`` seconds.
    The cache remembers which index type it holds, so asking for a
    different type always triggers a fresh load instead of returning the
    other type's parameters.

    Args:
        index_type: Index type; defaults to ``self.get_index_type()``.
        force_reload: Bypass the cache and query again.

    Returns:
        Mapping of parameter name to parameter value. When the query
        returns several rows for one name, the first row wins (rows are
        ordered by ``effective_from`` descending).
    """
    if index_type is None:
        index_type = self.get_index_type()

    now = datetime.now(self.tz)

    cached = self._index_params_cache
    cached_type = getattr(self, '_index_params_cache_type', None)
    if (
        not force_reload
        and cached is not None
        and cached_type == index_type
        and (now - cached.loaded_at).total_seconds() < self._index_params_ttl
    ):
        return cached.params

    self.logger.debug("加载指数算法参数: %s", index_type)
    sql = """
SELECT param_name, param_value
FROM billiards_dws.cfg_index_parameters
WHERE index_type = %s
AND effective_from <= CURRENT_DATE
AND (effective_to IS NULL OR effective_to >= CURRENT_DATE)
ORDER BY effective_from DESC
"""
    rows = self.db.query(sql, (index_type,))

    params: Dict[str, float] = {}
    for row in rows or []:
        row_dict = dict(row)
        name = row_dict['param_name']
        # Keep only the first (most recent) row per parameter name.
        if name not in params:
            params[name] = float(row_dict['param_value'])

    self._index_params_cache = IndexParameters(params=params, loaded_at=now)
    # Remember which index type the cached snapshot belongs to.
    self._index_params_cache_type = index_type
    return params
def get_param(self, name: str, default: float = 0.0) -> float:
    """Return a single algorithm parameter.

    Args:
        name: Parameter name.
        default: Value returned when the parameter is not configured.

    Returns:
        The parameter value, or ``default``.
    """
    loaded = self.load_index_parameters()
    if name in loaded:
        return loaded[name]
    return default
# ==========================================================================
# 分位点历史管理EWMA平滑
# ==========================================================================
def get_last_percentile_history(
    self,
    site_id: int,
    index_type: Optional[str] = None
) -> Optional[PercentileHistory]:
    """Fetch the most recent percentile history record.

    Args:
        site_id: Site ID.
        index_type: Index type; defaults to ``self.get_index_type()``.

    Returns:
        The latest ``PercentileHistory`` (by ``calc_time``), or None when
        no record exists. Empty numeric columns are read as 0.
    """
    resolved_type = self.get_index_type() if index_type is None else index_type

    sql = """
SELECT
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, calc_time
FROM billiards_dws.dws_index_percentile_history
WHERE site_id = %s AND index_type = %s
ORDER BY calc_time DESC
LIMIT 1
"""
    rows = self.db.query(sql, (site_id, resolved_type))
    if not rows:
        return None

    latest = dict(rows[0])

    def as_float(column: str) -> float:
        return float(latest[column] or 0)

    return PercentileHistory(
        percentile_5=as_float('percentile_5'),
        percentile_95=as_float('percentile_95'),
        percentile_5_smoothed=as_float('percentile_5_smoothed'),
        percentile_95_smoothed=as_float('percentile_95_smoothed'),
        record_count=int(latest['record_count'] or 0),
        calc_time=latest['calc_time'],
    )
def save_percentile_history(
    self,
    site_id: int,
    percentile_5: float,
    percentile_95: float,
    percentile_5_smoothed: float,
    percentile_95_smoothed: float,
    record_count: int,
    min_raw: float,
    max_raw: float,
    avg_raw: float,
    index_type: Optional[str] = None
) -> None:
    """Insert a percentile history record and commit it.

    Args:
        site_id: Site ID.
        percentile_5: Raw 5th percentile.
        percentile_95: Raw 95th percentile.
        percentile_5_smoothed: Smoothed 5th percentile.
        percentile_95_smoothed: Smoothed 95th percentile.
        record_count: Number of records.
        min_raw: Smallest raw score.
        max_raw: Largest raw score.
        avg_raw: Average raw score.
        index_type: Index type; defaults to ``self.get_index_type()``.
    """
    resolved_type = index_type if index_type is not None else self.get_index_type()

    sql = """
INSERT INTO billiards_dws.dws_index_percentile_history (
site_id, index_type, calc_time,
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, min_raw_score, max_raw_score, avg_raw_score
) VALUES (%s, %s, NOW(), %s, %s, %s, %s, %s, %s, %s, %s)
"""
    # Values in placeholder order; calc_time is filled by NOW() in SQL.
    values = (
        site_id, resolved_type,
        percentile_5, percentile_95,
        percentile_5_smoothed, percentile_95_smoothed,
        record_count, min_raw, max_raw, avg_raw,
    )
    with self.db.conn.cursor() as cursor:
        cursor.execute(sql, values)
    self.db.conn.commit()
def _apply_ewma_smoothing(
    self,
    site_id: int,
    current_p5: float,
    current_p95: float,
    alpha: Optional[float] = None
) -> Tuple[float, float]:
    """Smooth the current percentile cut points with EWMA.

    Formula: Q_t = (1 - alpha) * Q_{t-1} + alpha * Q_now, where Q_{t-1}
    is the smoothed value of the latest history record.

    Args:
        site_id: Site ID.
        current_p5: Current 5th percentile.
        current_p95: Current 95th percentile.
        alpha: Smoothing factor; when None the ``ewma_alpha`` parameter
            is used (falling back to ``DEFAULT_EWMA_ALPHA``).

    Returns:
        ``(smoothed_p5, smoothed_p95)``; the inputs are returned unchanged
        when there is no history record yet.
    """
    weight = alpha
    if weight is None:
        weight = self.get_param('ewma_alpha', self.DEFAULT_EWMA_ALPHA)

    previous = self.get_last_percentile_history(site_id)
    if previous is None:
        # First calculation: nothing to smooth against.
        return current_p5, current_p95

    carry = 1 - weight
    return (
        carry * previous.percentile_5_smoothed + weight * current_p5,
        carry * previous.percentile_95_smoothed + weight * current_p95,
    )
# ==========================================================================
# 统计工具方法
# ==========================================================================
def calculate_median(self, values: List[float]) -> float:
    """Return the median of ``values`` (0.0 for empty input).

    For an even number of values the mean of the two middle values is
    returned.
    """
    if not values:
        return 0.0

    ordered = sorted(values)
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[half]
    return (ordered[half - 1] + ordered[half]) / 2
def calculate_mad(self, values: List[float]) -> float:
    """Return the median absolute deviation (MAD) of ``values``.

    MAD = median(|x - median(x)|). It is a dispersion measure that is
    more robust to extreme values than the standard deviation.
    Returns 0.0 for empty input.
    """
    if not values:
        return 0.0

    center = self.calculate_median(values)
    return self.calculate_median([abs(item - center) for item in values])
def safe_log(self, value: float, default: float = 0.0) -> float:
    """Return ln(value), or ``default`` when ``value`` is not positive."""
    return default if value <= 0 else math.log(value)
def safe_ln1p(self, value: float) -> float:
    """Return ln(1 + value), or 0.0 where the logarithm is undefined.

    ``math.log1p`` raises ``ValueError`` for every ``value <= -1``
    (including exactly -1), so the whole undefined range falls back
    to 0.0.

    Args:
        value: Input x of ln(1 + x).

    Returns:
        ln(1 + value) for ``value > -1``, otherwise 0.0.
    """
    if value <= -1:
        return 0.0
    return math.log1p(value)

View File

@@ -0,0 +1,688 @@
# -*- coding: utf-8 -*-
"""
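Customer-assistant intimacy index calculation task.

Purpose:
- Measures the strength and recent "temperature" of the relationship
  between a customer and an assistant
- Used to allocate an assistant's booking effort and to estimate the
  booking success rate
- An add-on (incentive) course weighs 1.5x a basic course
- Detects a surge in visit frequency and amplifies the weight

Formula:
Raw Score = (w_F × F + w_R × R + w_M × M + w_D × D) × mult
where:
- F = Σ(τ_i × decay(d_i, h_sess))             # frequency strength
- R = decay(d_last, h_last)                   # recency temperature
- M = Σ(ln(1+amt/A0) × decay(d_r, h_pay))     # attributed recharge strength
- D = Σ(sqrt(dur/60) × τ × decay(d, h))       # duration contribution
- mult = 1 + γ × burst                        # surge amplification
- burst = max(0, ln(1 + (F_short/F_long - 1)))

Special rules:
- Session merging: services of the same customer with the same assistant
  that are less than 4 hours apart count as one session
- Recharge attribution: a recharge made within 1 hour after a service ends
  is credited to that assistant

Data sources:
- dwd_assistant_service_log: service records
- dwd_recharge_order: recharge records

Update frequency: every 4 hours
Author: ETL team
Created: 2026-02-03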
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class ServiceSession:
    """A service session after merging adjacent service records."""

    # Start / end time of the merged session.
    session_start: datetime
    session_end: datetime
    # Summed duration of the merged records, in minutes.
    total_duration_minutes: int = 0
    # Course weight: 1.0 = basic course, 1.5 = add-on course.
    course_weight: float = 1.0
    # Whether the session counts as an add-on (incentive) course.
    is_incentive: bool = False
@dataclass
class AttributedRecharge:
    """A recharge credited to an assistant."""

    # Payment time of the recharge.
    pay_time: datetime
    # Recharge amount.
    pay_amount: float
    # Age of the recharge in (fractional) days.
    days_ago: float
@dataclass
class MemberAssistantIntimacyData:
    """Intimacy data of one customer-assistant pair."""

    member_id: int
    # Assistant staff number, kept as a string (e.g. "1", "2", "15").
    assistant_no: str
    # Assistant nickname.
    assistant_nickname: str
    site_id: int
    tenant_id: int

    # --- Input features ---
    session_count: int = 0
    total_duration_minutes: int = 0
    basic_session_count: int = 0
    incentive_session_count: int = 0
    days_since_last_session: Optional[int] = None
    attributed_recharge_count: int = 0
    attributed_recharge_amount: float = 0.0

    # --- Component scores ---
    score_frequency: float = 0.0
    score_recency: float = 0.0
    score_recharge: float = 0.0
    score_duration: float = 0.0
    burst_multiplier: float = 1.0

    # --- Final scores ---
    raw_score: float = 0.0
    display_score: float = 0.0

    # --- Intermediate data (a fresh list per instance) ---
    sessions: List[ServiceSession] = field(default_factory=list)
    recharges: List[AttributedRecharge] = field(default_factory=list)
# =============================================================================
# 亲密指数任务
# =============================================================================
class IntimacyIndexTask(BaseIndexTask):
"""
Customer-assistant intimacy index calculation task.

Pipeline:
1. Extract assistant service records of the last 60 days
2. Group by (member_id, assistant_id) and merge services within 4 hours
3. Extract attributed recharges (within 1 hour after a service ends)
4. Compute the 5 component scores: frequency, recency, recharge,
   duration, burst
5. Aggregate the raw score
6. Percentile clipping + log compression + MinMax mapping to 0-10
7. Write to the DWS table
"""
INDEX_TYPE = "INTIMACY"
# Skill ID mapping
SKILL_ID_BASIC = 2790683529513797 # basic course
SKILL_ID_INCENTIVE = 2790683529513798 # add-on / incentive course
SKILL_ID_BOX = 3039912271463941 # private-room course
# Default parameters, used for any parameter missing from the loaded ones
DEFAULT_PARAMS = {
'lookback_days': 60,
'halflife_session': 14.0,
'halflife_last': 10.0,
'halflife_recharge': 21.0,
'halflife_short': 7.0,
'halflife_long': 30.0,
'amount_base': 500.0,
'incentive_weight': 1.5,
'session_merge_hours': 4,
'recharge_attribute_hours': 1,
'weight_frequency': 2.0,
'weight_recency': 1.5,
'weight_recharge': 2.0,
'weight_duration': 0.5,
'burst_gamma': 0.6,
'percentile_lower': 5,
'percentile_upper': 95,
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
    """Return the task code of the intimacy index task."""
    task_code = "DWS_INTIMACY_INDEX"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    table_name = "dws_member_assistant_intimacy"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    key_columns = ['site_id', 'member_id', 'assistant_id']
    return key_columns
def get_index_type(self) -> str:
    """Return the index type declared in ``INDEX_TYPE``."""
    index_type = self.INDEX_TYPE
    return index_type
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run the customer-assistant intimacy index calculation.

    Steps:
    1. Extract service records of the lookback window.
    2. Group them per (member, assistant) pair and merge sessions.
    3. Attach attributed recharges.
    4. Compute component scores and the raw score of every pair.
    5. Normalize raw scores to display scores (log compression, EWMA
       smoothed percentiles).
    6. Save the percentile history.
    7. Write the results to the DWS table.

    Args:
        context: Task context, passed on to ``_get_site_id``.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when there are no
        service records, otherwise a dict with ``status='success'``,
        ``pair_count`` and ``records_inserted``.
    """
    self.logger.info("开始计算客户-助教亲密指数")

    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()

    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # Time window: [now - lookback_days, now)
    now = datetime.now(self.tz)
    start_datetime = now - timedelta(days=lookback_days)

    self.logger.info(
        "参数: lookback=%d天, h_sess=%.1f, h_last=%.1f, h_pay=%.1f, γ=%.2f",
        lookback_days, params['halflife_session'], params['halflife_last'],
        params['halflife_recharge'], params['burst_gamma']
    )

    # 1. Extract service records
    raw_services = self._extract_service_records(site_id, start_datetime, now)
    self.logger.info("提取到 %d 条原始服务记录", len(raw_services))
    if not raw_services:
        self.logger.warning("没有服务记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Group per pair and merge sessions
    pair_data = self._group_and_merge_sessions(raw_services, params, now)
    self.logger.info("合并为 %d 个客户-助教对", len(pair_data))

    # 3. Attach attributed recharges
    self._extract_attributed_recharges(site_id, pair_data, params, now)

    # 4. Component scores and raw score per pair
    intimacy_data_list: List[MemberAssistantIntimacyData] = []
    for data in pair_data.values():
        data.site_id = site_id
        data.tenant_id = tenant_id
        self._calculate_component_scores(data, params, now)
        base_score = (
            params['weight_frequency'] * data.score_frequency +
            params['weight_recency'] * data.score_recency +
            params['weight_recharge'] * data.score_recharge +
            params['weight_duration'] * data.score_duration
        )
        data.raw_score = base_score * data.burst_multiplier
        intimacy_data_list.append(data)
    self.logger.info("计算完成 %d 个pair的Raw Score", len(intimacy_data_list))

    percentile_lower = int(params['percentile_lower'])
    percentile_upper = int(params['percentile_upper'])

    # 5. Normalize to display scores (with log compression)
    normalized = self.batch_normalize_to_display(
        [((d.member_id, d.assistant_no), d.raw_score) for d in intimacy_data_list],
        use_log=True,
        percentile_lower=percentile_lower,
        percentile_upper=percentile_upper,
        use_smoothing=True,
        site_id=site_id
    )
    display_by_key = {key: display for key, _raw, display in normalized}
    for data in intimacy_data_list:
        key = (data.member_id, data.assistant_no)
        if key in display_by_key:
            data.display_score = display_by_key[key]

    # 6. Save the percentile history
    if intimacy_data_list:
        all_raw = [d.raw_score for d in intimacy_data_list]
        q_l, q_u = self.calculate_percentiles(all_raw, percentile_lower, percentile_upper)
        smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw)
        )

    # 7. Write to the DWS table
    inserted = self._save_intimacy_data(intimacy_data_list)
    self.logger.info("亲密指数计算完成,写入 %d 条记录", inserted)
    return {
        'status': 'success',
        'pair_count': len(intimacy_data_list),
        'records_inserted': inserted
    }
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_records(
    self,
    site_id: int,
    start_datetime: datetime,
    end_datetime: datetime
) -> List[Dict[str, Any]]:
    """Extract assistant service records of a site within a time window.

    The assistant is identified by ``assistant_no`` (staff number), not
    by ``site_assistant_id``: according to the original author,
    ``site_assistant_id`` is unique per service rather than per
    assistant.

    Args:
        site_id: Site ID.
        start_datetime: Inclusive lower bound on ``last_use_time``.
        end_datetime: Exclusive upper bound on ``last_use_time``.

    Returns:
        One dict per record with the keys ``member_id``, ``assistant_no``,
        ``assistant_nickname``, ``start_time``, ``end_time``,
        ``duration_minutes`` and ``skill_id``. Rows with an empty
        ``assistant_no`` are dropped.
    """
    sql = """
SELECT
tenant_member_id AS member_id,
assistant_no,
nickname AS assistant_nickname,
start_use_time,
last_use_time,
COALESCE(income_seconds, 0) / 60 AS duration_minutes,
skill_id
FROM billiards_dwd.dwd_assistant_service_log
WHERE site_id = %s
AND tenant_member_id > 0 -- 排除散客
AND is_delete = 0
AND assistant_no IS NOT NULL -- 确保有助教工号
AND last_use_time >= %s
AND last_use_time < %s
ORDER BY tenant_member_id, assistant_no, start_use_time
"""
    rows = self.db.query(sql, (site_id, start_datetime, end_datetime))

    records: List[Dict[str, Any]] = []
    for row in rows or []:
        fields = dict(row)
        assistant_no = fields['assistant_no']
        if not assistant_no:
            continue
        records.append({
            'member_id': int(fields['member_id']),
            'assistant_no': str(assistant_no),  # staff number as a string
            'assistant_nickname': fields['assistant_nickname'] or '',
            'start_time': fields['start_use_time'],
            'end_time': fields['last_use_time'],
            'duration_minutes': int(fields['duration_minutes'] or 0),
            'skill_id': int(fields['skill_id'] or 0),
        })
    return records
def _group_and_merge_sessions(
    self,
    raw_services: List[Dict[str, Any]],
    params: Dict[str, float],
    now: datetime
) -> Dict[Tuple[int, str], MemberAssistantIntimacyData]:
    """Group service records per (member_id, assistant_no) and merge them
    into sessions.

    Within a pair, records are processed in start-time order. A record
    whose start is at most ``session_merge_hours`` after the end of the
    current session is merged into it: durations add up, the end time
    and the course weight take the maximum, and the session becomes an
    incentive session if any merged record is one. Otherwise the record
    opens a new session.

    Args:
        raw_services: Records as produced by ``_extract_service_records``.
        params: Algorithm parameters (``session_merge_hours``,
            ``incentive_weight``).
        now: Reference time for ``days_since_last_session``.

    Returns:
        Pair key -> intimacy data with sessions and session statistics
        filled in; ``site_id`` / ``tenant_id`` are left at 0.
    """
    merge_threshold = timedelta(hours=int(params['session_merge_hours']))
    incentive_weight = params['incentive_weight']

    # Bucket the records per pair, keeping first-seen order.
    grouped: Dict[Tuple[int, str], List[Dict[str, Any]]] = {}
    for record in raw_services:
        grouped.setdefault((record['member_id'], record['assistant_no']), []).append(record)

    def open_session(record: Dict[str, Any], weight: float, is_incentive: bool) -> ServiceSession:
        return ServiceSession(
            session_start=record['start_time'],
            session_end=record['end_time'],
            total_duration_minutes=record['duration_minutes'],
            course_weight=weight,
            is_incentive=is_incentive
        )

    pair_data: Dict[Tuple[int, str], MemberAssistantIntimacyData] = {}
    for pair_key, records in grouped.items():
        member_id, assistant_no = pair_key
        data = MemberAssistantIntimacyData(
            member_id=member_id,
            assistant_no=assistant_no,
            # Nickname of the pair's first record.
            assistant_nickname=records[0]['assistant_nickname'],
            site_id=0,  # filled in by the caller
            tenant_id=0
        )

        active: Optional[ServiceSession] = None
        for record in sorted(records, key=lambda item: item['start_time']):
            is_incentive = record['skill_id'] == self.SKILL_ID_INCENTIVE
            weight = incentive_weight if is_incentive else 1.0

            if active is None:
                active = open_session(record, weight, is_incentive)
                continue

            if record['start_time'] - active.session_end > merge_threshold:
                # Gap too large: close the current session, open a new one.
                data.sessions.append(active)
                active = open_session(record, weight, is_incentive)
                continue

            # Same visit: fold the record into the current session.
            active.session_end = max(active.session_end, record['end_time'])
            active.total_duration_minutes += record['duration_minutes']
            active.course_weight = max(active.course_weight, weight)
            active.is_incentive = active.is_incentive or is_incentive

        if active is not None:
            data.sessions.append(active)

        # Session statistics
        data.session_count = len(data.sessions)
        data.total_duration_minutes = sum(s.total_duration_minutes for s in data.sessions)
        data.incentive_session_count = sum(1 for s in data.sessions if s.is_incentive)
        data.basic_session_count = sum(1 for s in data.sessions if not s.is_incentive)

        if data.sessions:
            latest = max(data.sessions, key=lambda s: s.session_end)
            data.days_since_last_session = (now - latest.session_end).days

        pair_data[pair_key] = data

    return pair_data
def _extract_attributed_recharges(
    self,
    site_id: int,
    pair_data: Dict[Tuple[int, str], MemberAssistantIntimacyData],
    params: Dict[str, float],
    now: datetime
) -> None:
    """Attach attributed recharges to the member-assistant pairs in place.

    A recharge is credited to an assistant when it was paid within
    ``recharge_attribute_hours`` after one of that assistant's sessions
    with the member ended. Each recharge is credited to exactly one
    assistant: the one whose qualifying session ended most recently
    before the payment (the first one found wins a tie).

    Args:
        site_id: Site ID.
        pair_data: Pair key ``(member_id, assistant_no)`` -> pair data;
            the matching entries are updated.
        params: Algorithm parameters (``recharge_attribute_hours``,
            ``lookback_days``).
        now: Reference time for the lookback window and ``days_ago``.
    """
    attribution_window = timedelta(hours=int(params['recharge_attribute_hours']))

    # Index the pairs per member so a recharge only scans its own member.
    pairs_by_member: Dict[int, List[MemberAssistantIntimacyData]] = {}
    for (pair_member_id, _assistant_no), data in pair_data.items():
        pairs_by_member.setdefault(pair_member_id, []).append(data)
    if not pairs_by_member:
        return

    member_ids = sorted(pairs_by_member)
    # Member IDs are bound as parameters, never interpolated into the SQL.
    placeholders = ','.join(['%s'] * len(member_ids))
    sql = f"""
SELECT
member_id,
pay_time,
pay_amount
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND member_id IN ({placeholders})
AND settle_type = 5 -- 充值订单
AND pay_time >= %s
"""
    start_datetime = now - timedelta(days=int(params['lookback_days']))
    rows = self.db.query(sql, (site_id, *member_ids, start_datetime))

    no_gap = timedelta(0)
    for row in rows or []:
        row_dict = dict(row)
        member_id = int(row_dict['member_id'])
        pay_time = row_dict['pay_time']
        pay_amount = float(row_dict['pay_amount'] or 0)
        if pay_amount <= 0:
            continue

        # Pick the single best candidate over ALL of the member's
        # assistants: the session that ended closest before the payment.
        best_data: Optional[MemberAssistantIntimacyData] = None
        best_end: Optional[datetime] = None
        for data in pairs_by_member.get(member_id, ()):
            for session in data.sessions:
                gap = pay_time - session.session_end
                if gap < no_gap or gap > attribution_window:
                    continue
                if best_end is None or session.session_end > best_end:
                    best_data, best_end = data, session.session_end

        if best_data is None:
            continue

        best_data.attributed_recharge_count += 1
        best_data.attributed_recharge_amount += pay_amount
        best_data.recharges.append(AttributedRecharge(
            pay_time=pay_time,
            pay_amount=pay_amount,
            days_ago=(now - pay_time).total_seconds() / 86400
        ))
# ==========================================================================
# 分数计算方法
# ==========================================================================
def _calculate_component_scores(
    self,
    data: MemberAssistantIntimacyData,
    params: Dict[str, float],
    now: datetime
) -> None:
    """Compute the five component scores of one pair, in place.

    Components (tau = course weight, d = age in days):
    1. Frequency   F = sum(tau * decay(d, h_sess))
    2. Recency     R = decay(days_since_last_session, h_last)
    3. Recharge    M = sum(ln(1 + amount / A0) * decay(d_r, h_pay))
    4. Duration    D = sum(sqrt(hours) * tau * decay(d, h_sess))
    5. Burst       mult = 1 + gamma * max(0, ln(rate_short / rate_long))

    Burst compares visit *intensities*, not raw decayed sums. A raw sum
    with the shorter half-life can never exceed the one with the longer
    half-life (each term decays faster), so their ratio is always <= 1
    and no burst would ever be detected. Each sum is therefore divided
    by the total decay weight available inside the lookback window,
    h * (1 - decay(lookback, h)) (the common 1/ln 2 factor cancels), so
    an evenly spread visit history yields a ratio of about 1 and a
    recent surge a ratio above 1.

    Args:
        data: Pair data; reads ``sessions``, ``recharges`` and
            ``days_since_last_session``, writes the ``score_*`` fields
            and ``burst_multiplier``.
        params: Algorithm parameters (half-lives, ``amount_base``,
            ``burst_gamma``, optional ``lookback_days``).
        now: Reference time for session ages.
    """
    epsilon = 1e-6
    h_sess = params['halflife_session']
    h_last = params['halflife_last']
    h_pay = params['halflife_recharge']
    h_short = params['halflife_short']
    h_long = params['halflife_long']
    amount_base = params['amount_base']
    gamma = params['burst_gamma']
    lookback_days = params.get('lookback_days', 0)

    def window_mass(halflife: float) -> float:
        # Decay weight obtainable within the lookback window (up to the
        # constant 1/ln 2); falls back to the untruncated mass when the
        # window length is unknown.
        if lookback_days and lookback_days > 0:
            return halflife * (1.0 - self.decay(lookback_days, halflife))
        return halflife

    # 1 + 4 + 5: one pass over the sessions feeds all session-based sums.
    frequency = 0.0
    duration = 0.0
    short_sum = 0.0
    long_sum = 0.0
    for session in data.sessions:
        days_ago = (now - session.session_end).total_seconds() / 86400
        tau = session.course_weight
        session_decay = self.decay(days_ago, h_sess)
        frequency += tau * session_decay
        # Guard sqrt against a negative duration coming from bad data.
        dur_hours = max(0.0, session.total_duration_minutes / 60.0)
        duration += math.sqrt(dur_hours) * tau * session_decay
        short_sum += tau * self.decay(days_ago, h_short)
        long_sum += tau * self.decay(days_ago, h_long)
    data.score_frequency = frequency
    data.score_duration = duration

    # 2. Recency
    if data.days_since_last_session is not None:
        data.score_recency = self.decay(data.days_since_last_session, h_last)
    else:
        data.score_recency = 0.0

    # 3. Attributed recharge strength
    recharge_score = 0.0
    for recharge in data.recharges:
        recharge_score += (
            math.log1p(recharge.pay_amount / amount_base)
            * self.decay(recharge.days_ago, h_pay)
        )
    data.score_recharge = recharge_score

    # 5. Burst multiplier from window-normalised intensities
    burst = 0.0
    if short_sum > 0 and long_sum > 0:
        rate_short = short_sum / window_mass(h_short)
        rate_long = long_sum / window_mass(h_long)
        ratio = rate_short / (rate_long + epsilon)
        if ratio > 1:
            burst = self.safe_ln1p(ratio - 1)
    data.burst_multiplier = 1 + gamma * burst
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_intimacy_data(self, data_list: List[MemberAssistantIntimacyData]) -> int:
    """Replace the stored intimacy rows of the given pairs.

    For every pair the existing row is deleted and a fresh one inserted,
    all in one transaction: it is committed once at the end and rolled
    back if any statement fails.

    The table's ``assistant_id`` column holds the integer form of
    ``assistant_no``; a staff number that is not a plain decimal number
    is stored as 0. The same conversion is used for the DELETE and the
    INSERT, so a non-numeric staff number no longer aborts the save.

    Args:
        data_list: Pair data to persist.

    Returns:
        Number of inserted rows (sum of the cursor row counts).
    """
    if not data_list:
        return 0

    def to_assistant_id(assistant_no: Any) -> int:
        text = str(assistant_no)
        # isdecimal() only accepts characters that int() can parse.
        return int(text) if text.isdecimal() else 0

    rows = [(data, to_assistant_id(data.assistant_no)) for data in data_list]

    delete_sql = """
DELETE FROM billiards_dws.dws_member_assistant_intimacy
WHERE site_id = %s AND member_id = %s AND assistant_id = %s
"""
    insert_sql = """
INSERT INTO billiards_dws.dws_member_assistant_intimacy (
site_id, tenant_id, member_id, assistant_id,
session_count, total_duration_minutes,
basic_session_count, incentive_session_count,
days_since_last_session,
attributed_recharge_count, attributed_recharge_amount,
score_frequency, score_recency, score_recharge, score_duration,
burst_multiplier, raw_score, display_score,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s, %s,
%s, %s,
%s, %s,
%s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
NOW(), NOW(), NOW()
)
"""
    conn = self.db.conn
    inserted = 0
    try:
        with conn.cursor() as cur:
            # Remove the rows that are about to be re-inserted.
            cur.executemany(
                delete_sql,
                [(data.site_id, data.member_id, assistant_id) for data, assistant_id in rows]
            )
            for data, assistant_id in rows:
                cur.execute(insert_sql, (
                    data.site_id, data.tenant_id, data.member_id, assistant_id,
                    data.session_count, data.total_duration_minutes,
                    data.basic_session_count, data.incentive_session_count,
                    data.days_since_last_session,
                    data.attributed_recharge_count, data.attributed_recharge_amount,
                    data.score_frequency, data.score_recency,
                    data.score_recharge, data.score_duration,
                    data.burst_multiplier, data.raw_score, data.display_score
                ))
                inserted += cur.rowcount
        conn.commit()
    except Exception:
        # Keep delete + insert atomic and leave the connection usable.
        conn.rollback()
        raise
    return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
    """Return the loaded parameters layered over ``DEFAULT_PARAMS``.

    A loaded value overrides the default of the same name; defaults fill
    in whatever is missing. A new dict is returned on every call.
    """
    overrides = self.load_index_parameters()
    return {**self.DEFAULT_PARAMS, **overrides}
def _get_site_id(self, context: Optional[TaskContext]) -> int:
    """Resolve the site ID to calculate for.

    Lookup order:
    1. ``context.store_id`` when present and truthy (returned as is),
    2. the ``app.default_site_id`` configuration value,
    3. a site ID found in the assistant service log table.

    Raises:
        ValueError: if none of the sources yields a site ID.
    """
    store_id = getattr(context, 'store_id', None) if context else None
    if store_id:
        return store_id

    configured = self.config.get('app.default_site_id')
    if configured:
        return int(configured)

    rows = self.db.query(
        "SELECT DISTINCT site_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
    )
    if not rows:
        raise ValueError("无法确定门店ID")
    return int(dict(rows[0])['site_id'])
def _get_tenant_id(self) -> int:
    """Resolve the tenant ID.

    Uses the ``app.tenant_id`` configuration value when set, otherwise a
    tenant ID found in the assistant service log table, otherwise 0.
    """
    configured = self.config.get('app.tenant_id')
    if configured:
        return int(configured)

    rows = self.db.query(
        "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
    )
    if not rows:
        return 0
    return int(dict(rows[0])['tenant_id'])

View File

@@ -0,0 +1,564 @@
# -*- coding: utf-8 -*-
"""
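Customer recall index calculation task.

Purpose:
- Measures how necessary and how urgent it is to win a customer back
- Respects each customer's personal visit cycle (μ = median, σ = MAD)
- Raises the recall tendency for new customers and customers who have
  just recharged
- Detects customers who "heated up and then dropped off"

Formula:
Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop
where:
- overdue = 1 - exp(-max(0, (t-μ)/σ))           # overdue urgency
- new_bonus = decay(d_first, h_new)             # new-customer bonus
- re_bonus = decay(d_recharge, h_re)            # recent-recharge bonus
- hot_drop = max(0, ln(1 + (r14/r60 - 1)))      # heat drop-off bonus

Data sources:
- dwd_settlement_head: member visit records
- dwd_recharge_order: recharge records
- dim_member: first-visit time

Update frequency: every 2 hours
Author: ETL team
Created: 2026-02-03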
"""
from __future__ import annotations
import math
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class MemberRecallData:
    """Recall data of one member."""

    member_id: int
    site_id: int
    tenant_id: int

    # --- Input features ---
    days_since_last_visit: Optional[int] = None
    visit_interval_median: Optional[float] = None
    visit_interval_mad: Optional[float] = None
    days_since_first_visit: Optional[int] = None
    days_since_last_recharge: Optional[int] = None
    visits_last_14_days: int = 0
    visits_last_60_days: int = 0

    # --- Component scores ---
    score_overdue: float = 0.0
    score_new_bonus: float = 0.0
    score_recharge_bonus: float = 0.0
    score_hot_drop: float = 0.0

    # --- Final scores ---
    raw_score: float = 0.0
    display_score: float = 0.0
# =============================================================================
# 召回指数任务
# =============================================================================
class RecallIndexTask(BaseIndexTask):
"""
Customer recall index calculation task.

Pipeline:
1. Extract the members with visit records in the last 60 days
2. Compute each member's visit-interval features (median, MAD)
3. Compute the 4 component scores: overdue, new customer, recharge,
   heat drop-off
4. Aggregate the raw score
5. Percentile clipping + MinMax mapping to 0-10
6. Write to the DWS table
"""
INDEX_TYPE = "RECALL"
# Default parameters
DEFAULT_PARAMS = {
'lookback_days': 60,
'sigma_min': 2.0,
'halflife_new': 7.0,
'halflife_recharge': 10.0,
'weight_overdue': 3.0,
'weight_new': 1.0,
'weight_recharge': 1.0,
'weight_hot': 1.0,
'percentile_lower': 5,
'percentile_upper': 95,
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
    """Return the task code of the recall index task."""
    task_code = "DWS_RECALL_INDEX"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    table_name = "dws_member_recall_index"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    key_columns = ['site_id', 'member_id']
    return key_columns
def get_index_type(self) -> str:
    """Return the index type declared in ``INDEX_TYPE``."""
    index_type = self.INDEX_TYPE
    return index_type
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
"""执行召回指数计算"""
self.logger.info("开始计算客户召回指数")
# 获取门店ID
site_id = self._get_site_id(context)
tenant_id = self._get_tenant_id()
# 加载参数
params = self._load_params()
lookback_days = int(params['lookback_days'])
# 计算基准日期
base_date = date.today()
start_date = base_date - timedelta(days=lookback_days)
self.logger.info(
"参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f",
lookback_days, params['sigma_min'], params['halflife_new'], params['halflife_recharge']
)
# 1. 提取会员到店数据
member_visits = self._extract_member_visits(site_id, start_date, base_date)
self.logger.info("提取到 %d 个会员的到店记录", len(member_visits))
if not member_visits:
self.logger.warning("没有会员到店记录,跳过计算")
return {'status': 'skipped', 'reason': 'no_data'}
# 2. 提取充值记录
recharge_data = self._extract_recharge_data(site_id, start_date, base_date)
self.logger.info("提取到 %d 个会员的充值记录", len(recharge_data))
# 3. 提取首访时间
first_visit_data = self._extract_first_visit_data(site_id, list(member_visits.keys()))
self.logger.info("提取到 %d 个会员的首访时间", len(first_visit_data))
# 4. 计算每个会员的召回数据
recall_data_list: List[MemberRecallData] = []
for member_id, visit_dates in member_visits.items():
data = MemberRecallData(
member_id=member_id,
site_id=site_id,
tenant_id=tenant_id
)
# 计算特征
self._calculate_visit_features(data, visit_dates, base_date, params)
# 补充充值特征
if member_id in recharge_data:
last_recharge_date = recharge_data[member_id]
data.days_since_last_recharge = (base_date - last_recharge_date).days
# 补充首访特征
if member_id in first_visit_data:
first_visit_date = first_visit_data[member_id]
data.days_since_first_visit = (base_date - first_visit_date).days
# 计算分项得分
self._calculate_component_scores(data, params)
# 汇总Raw Score
data.raw_score = (
params['weight_overdue'] * data.score_overdue +
params['weight_new'] * data.score_new_bonus +
params['weight_recharge'] * data.score_recharge_bonus +
params['weight_hot'] * data.score_hot_drop
)
recall_data_list.append(data)
self.logger.info("计算完成 %d 个会员的Raw Score", len(recall_data_list))
# 5. 归一化到Display Score
raw_scores = [(d.member_id, d.raw_score) for d in recall_data_list]
normalized = self.batch_normalize_to_display(
raw_scores,
use_log=False,
percentile_lower=int(params['percentile_lower']),
percentile_upper=int(params['percentile_upper']),
use_smoothing=True,
site_id=site_id
)
# 更新display_score
score_map = {member_id: (raw, display) for member_id, raw, display in normalized}
for data in recall_data_list:
if data.member_id in score_map:
_, data.display_score = score_map[data.member_id]
# 6. 保存分位点历史
if recall_data_list:
all_raw = [d.raw_score for d in recall_data_list]
q_l, q_u = self.calculate_percentiles(
all_raw,
int(params['percentile_lower']),
int(params['percentile_upper'])
)
smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
self.save_percentile_history(
site_id=site_id,
percentile_5=q_l,
percentile_95=q_u,
percentile_5_smoothed=smoothed_l,
percentile_95_smoothed=smoothed_u,
record_count=len(all_raw),
min_raw=min(all_raw),
max_raw=max(all_raw),
avg_raw=sum(all_raw) / len(all_raw)
)
# 7. 写入DWS表
inserted = self._save_recall_data(recall_data_list)
self.logger.info("召回指数计算完成,写入 %d 条记录", inserted)
return {
'status': 'success',
'member_count': len(recall_data_list),
'records_inserted': inserted
}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_member_visits(
self,
site_id: int,
start_date: date,
end_date: date
) -> Dict[int, List[date]]:
"""
提取会员到店记录
Returns:
{member_id: [visit_date1, visit_date2, ...]}
"""
sql = """
SELECT
member_id,
DATE(pay_time) AS visit_date
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %s
AND member_id > 0 -- 排除散客
AND settle_type = 1 -- 台桌结账
AND pay_time >= %s
AND pay_time < %s + INTERVAL '1 day'
GROUP BY member_id, DATE(pay_time)
ORDER BY member_id, visit_date
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
result: Dict[int, List[date]] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
visit_date = row_dict['visit_date']
if member_id not in result:
result[member_id] = []
result[member_id].append(visit_date)
return result
def _extract_recharge_data(
self,
site_id: int,
start_date: date,
end_date: date
) -> Dict[int, date]:
"""
提取最近充值记录
Returns:
{member_id: last_recharge_date}
"""
sql = """
SELECT
member_id,
MAX(DATE(pay_time)) AS last_recharge_date
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND member_id > 0
AND settle_type = 5 -- 充值订单
AND pay_time >= %s
AND pay_time < %s + INTERVAL '1 day'
GROUP BY member_id
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
result: Dict[int, date] = {}
for row in (rows or []):
row_dict = dict(row)
result[int(row_dict['member_id'])] = row_dict['last_recharge_date']
return result
def _extract_first_visit_data(
self,
site_id: int,
member_ids: List[int]
) -> Dict[int, date]:
"""
提取首访时间
优先使用dim_member.create_time如果没有则使用dwd_settlement_head中的首次消费时间
Returns:
{member_id: first_visit_date}
"""
if not member_ids:
return {}
# 使用dim_member的create_time作为首访时间
member_ids_str = ','.join(str(m) for m in member_ids)
sql = f"""
SELECT
member_id,
DATE(create_time) AS first_visit_date
FROM billiards_dwd.dim_member
WHERE member_id IN ({member_ids_str})
AND scd2_is_current = 1
"""
rows = self.db.query(sql)
result: Dict[int, date] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
first_date = row_dict['first_visit_date']
if first_date:
result[member_id] = first_date
return result
# ==========================================================================
# 特征计算方法
# ==========================================================================
def _calculate_visit_features(
self,
data: MemberRecallData,
visit_dates: List[date],
base_date: date,
params: Dict[str, float]
) -> None:
"""计算到店特征"""
if not visit_dates:
return
# 最近一次到店
last_visit = max(visit_dates)
data.days_since_last_visit = (base_date - last_visit).days
# 到店间隔
sorted_dates = sorted(visit_dates)
intervals = []
for i in range(1, len(sorted_dates)):
interval = (sorted_dates[i] - sorted_dates[i-1]).days
intervals.append(float(interval))
if intervals:
# 中位数(μ)
data.visit_interval_median = self.calculate_median(intervals)
# MADσ下限为sigma_min
mad = self.calculate_mad(intervals)
data.visit_interval_mad = max(mad, params['sigma_min'])
else:
# 只有一次到店,使用默认值
data.visit_interval_median = 7.0 # 默认周期7天
data.visit_interval_mad = params['sigma_min']
# 近14天/60天到店次数
days_14_ago = base_date - timedelta(days=14)
days_60_ago = base_date - timedelta(days=60)
data.visits_last_14_days = sum(1 for d in visit_dates if d >= days_14_ago)
data.visits_last_60_days = sum(1 for d in visit_dates if d >= days_60_ago)
def _calculate_component_scores(
self,
data: MemberRecallData,
params: Dict[str, float]
) -> None:
"""计算4项分数"""
# 1. 超期紧急性
if data.days_since_last_visit is not None and data.visit_interval_median is not None:
t = data.days_since_last_visit
mu = data.visit_interval_median
sigma = data.visit_interval_mad or params['sigma_min']
# z = max(0, (t - μ) / σ)
z = max(0.0, (t - mu) / sigma)
# overdue = 1 - exp(-z)
data.score_overdue = 1.0 - math.exp(-z)
# 2. 新客户加分
lookback_days = int(params['lookback_days'])
if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days:
data.score_new_bonus = self.decay(
data.days_since_first_visit,
params['halflife_new']
)
# 3. 刚充值加分
if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days:
data.score_recharge_bonus = self.decay(
data.days_since_last_recharge,
params['halflife_recharge']
)
# 4. 热度断档加分
epsilon = 1e-6
n14 = data.visits_last_14_days
n60 = data.visits_last_60_days
r14 = n14 / 14.0
r60 = (n60 + 1) / 60.0 # +1 平滑
hot_ratio = r14 / (r60 + epsilon)
# hot_drop = max(0, ln(1 + (hot_ratio - 1)))
if hot_ratio > 1:
data.score_hot_drop = self.safe_ln1p(hot_ratio - 1)
else:
data.score_hot_drop = 0.0
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_recall_data(self, data_list: List[MemberRecallData]) -> int:
"""保存召回数据到DWS表"""
if not data_list:
return 0
# 先删除已存在的记录
site_id = data_list[0].site_id
member_ids = [d.member_id for d in data_list]
member_ids_str = ','.join(str(m) for m in member_ids)
delete_sql = f"""
DELETE FROM billiards_dws.dws_member_recall_index
WHERE site_id = %s AND member_id IN ({member_ids_str})
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
# 插入新记录
insert_sql = """
INSERT INTO billiards_dws.dws_member_recall_index (
site_id, tenant_id, member_id,
days_since_last_visit, visit_interval_median, visit_interval_mad,
days_since_first_visit, days_since_last_recharge,
visits_last_14_days, visits_last_60_days,
score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop,
raw_score, display_score,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s,
%s, %s, %s,
%s, %s,
%s, %s,
%s, %s, %s, %s,
%s, %s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
cur.execute(insert_sql, (
data.site_id, data.tenant_id, data.member_id,
data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad,
data.days_since_first_visit, data.days_since_last_recharge,
data.visits_last_14_days, data.visits_last_60_days,
data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop,
data.raw_score, data.display_score
))
inserted += cur.rowcount
# 提交事务
self.db.conn.commit()
return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
"""加载参数,缺失时使用默认值"""
params = self.load_index_parameters()
result = dict(self.DEFAULT_PARAMS)
result.update(params)
return result
def _get_site_id(self, context: Optional[TaskContext]) -> int:
"""获取门店ID"""
if context and hasattr(context, 'store_id') and context.store_id:
return context.store_id
# 从配置获取默认门店ID
site_id = self.config.get('app.default_site_id')
if site_id:
return int(site_id)
# 查询数据库获取第一个门店
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0])['site_id'])
raise ValueError("无法确定门店ID")
def _get_tenant_id(self) -> int:
"""获取租户ID"""
tenant_id = self.config.get('app.tenant_id')
if tenant_id:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0])['tenant_id'])
return 0