Updata2
This commit is contained in:
16
etl_billiards/tasks/dws/index/__init__.py
Normal file
16
etl_billiards/tasks/dws/index/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
指数算法任务模块
|
||||
|
||||
包含:
|
||||
- RecallIndexTask: 客户召回指数计算任务
|
||||
- IntimacyIndexTask: 客户-助教亲密指数计算任务
|
||||
"""
|
||||
|
||||
from .recall_index_task import RecallIndexTask
|
||||
from .intimacy_index_task import IntimacyIndexTask
|
||||
|
||||
__all__ = [
|
||||
'RecallIndexTask',
|
||||
'IntimacyIndexTask',
|
||||
]
|
||||
518
etl_billiards/tasks/dws/index/base_index_task.py
Normal file
518
etl_billiards/tasks/dws/index/base_index_task.py
Normal file
@@ -0,0 +1,518 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
指数算法任务基类
|
||||
|
||||
功能说明:
|
||||
- 提供半衰期时间衰减函数
|
||||
- 提供分位数计算和分位截断
|
||||
- 提供0-10映射方法
|
||||
- 提供算法参数加载
|
||||
- 提供分位点历史记录(用于EWMA平滑)
|
||||
|
||||
算法原理:
|
||||
1. 时间衰减函数(半衰期模型):decay(d; h) = exp(-ln(2) * d / h)
|
||||
当 d=h 时权重衰减到 0.5,越近权重越大
|
||||
|
||||
2. 0-10映射流程:
|
||||
Raw Score → Winsorize(P5, P95) → [可选Log压缩] → MinMax(0, 10)
|
||||
|
||||
作者:ETL团队
|
||||
创建日期:2026-02-03
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from abc import abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from ..base_dws_task import BaseDwsTask, TaskContext
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 数据类定义
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class IndexParameters:
    """Snapshot of the algorithm parameters loaded for an index."""
    # Mapping of parameter name -> numeric parameter value.
    params: Dict[str, float]
    # Moment the snapshot was loaded; the loader compares it against a TTL.
    loaded_at: datetime
|
||||
|
||||
|
||||
@dataclass
class PercentileHistory:
    """One stored record of percentile cut points for an index."""
    # Raw lower / upper percentile values as stored in the history table.
    percentile_5: float
    percentile_95: float
    # The same cut points after EWMA smoothing.
    percentile_5_smoothed: float
    percentile_95_smoothed: float
    # Number of records stored alongside the percentiles.
    record_count: int
    # Timestamp of the calculation, as returned by the database.
    calc_time: datetime
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 指数任务基类
|
||||
# =============================================================================
|
||||
|
||||
class BaseIndexTask(BaseDwsTask):
|
||||
"""
|
||||
指数算法任务基类
|
||||
|
||||
提供指数计算通用功能:
|
||||
1. 半衰期时间衰减函数
|
||||
2. 分位数计算与截断
|
||||
3. 0-10归一化映射
|
||||
4. 算法参数加载
|
||||
5. 分位点历史管理(EWMA平滑)
|
||||
"""
|
||||
|
||||
# 子类需要定义的指数类型
|
||||
INDEX_TYPE: str = ""
|
||||
|
||||
# 参数缓存
|
||||
_index_params_cache: Optional[IndexParameters] = None
|
||||
_index_params_ttl: int = 300 # 缓存有效期(秒)
|
||||
|
||||
# 默认参数
|
||||
DEFAULT_LOOKBACK_DAYS = 60
|
||||
DEFAULT_PERCENTILE_LOWER = 5
|
||||
DEFAULT_PERCENTILE_UPPER = 95
|
||||
DEFAULT_EWMA_ALPHA = 0.2
|
||||
|
||||
# ==========================================================================
|
||||
# 抽象方法(子类需实现)
|
||||
# ==========================================================================
|
||||
|
||||
@abstractmethod
def get_index_type(self) -> str:
    """Return the index type identifier (e.g. RECALL or INTIMACY).

    Subclasses are expected to override this method; the base
    implementation has no meaningful answer.

    Raises:
        NotImplementedError: Always, when called on the base implementation.
    """
    raise NotImplementedError
|
||||
|
||||
# ==========================================================================
|
||||
# 时间衰减函数
|
||||
# ==========================================================================
|
||||
|
||||
def decay(self, days: float, halflife: float) -> float:
    """Compute the half-life exponential decay weight of an event.

    Formula: decay(d; h) = exp(-ln(2) * d / h). The weight is 1.0 for an
    event happening now and halves every ``halflife`` days, so more recent
    events weigh more.

    Args:
        days: Age of the event in days. Negative ages are treated as 0.
        halflife: Half-life in days; must be strictly positive.

    Returns:
        The decay weight: 1.0 at age 0, 0.5 at one half-life, 0.25 at two.

    Raises:
        ValueError: If ``halflife`` is zero or negative.
    """
    if halflife <= 0:
        raise ValueError("半衰期必须大于0")
    # Events "from the future" are clamped to age zero (weight 1.0).
    age = 0 if days < 0 else days
    exponent = -math.log(2) * age / halflife
    return math.exp(exponent)
|
||||
|
||||
# ==========================================================================
|
||||
# 分位数计算
|
||||
# ==========================================================================
|
||||
|
||||
def calculate_percentiles(
    self,
    scores: List[float],
    lower: int = 5,
    upper: int = 95
) -> Tuple[float, float]:
    """Pick the lower and upper percentile values from a list of scores.

    The scores are sorted and two elements are selected by index:
    ``int(n * lower / 100) - 1`` (floored at 0) for the lower cut point and
    ``int(n * upper / 100)`` (capped at ``n - 1``) for the upper one. No
    interpolation is performed.

    Args:
        scores: Scores to analyse.
        lower: Lower percentile, as a percentage (default 5).
        upper: Upper percentile, as a percentage (default 95).

    Returns:
        A ``(lower_value, upper_value)`` tuple; ``(0.0, 0.0)`` when
        ``scores`` is empty.
    """
    if not scores:
        return 0.0, 0.0

    ordered = list(scores)
    ordered.sort()
    count = len(ordered)

    # Lower cut point: one position below the truncated rank, never below 0.
    low_pos = int(count * lower / 100) - 1
    if low_pos < 0:
        low_pos = 0

    # Upper cut point: the truncated rank, never past the last element.
    high_pos = int(count * upper / 100)
    if high_pos > count - 1:
        high_pos = count - 1

    return ordered[low_pos], ordered[high_pos]
|
||||
|
||||
def winsorize(self, value: float, lower: float, upper: float) -> float:
    """Clamp a value into the ``[lower, upper]`` range (winsorization).

    Args:
        value: The raw value.
        lower: Lower bound (typically the P5 cut point).
        upper: Upper bound (typically the P95 cut point).

    Returns:
        ``value`` limited to the given bounds.
    """
    # Raise to the lower bound first, then cap at the upper bound.
    floored = lower if lower > value else value
    return upper if upper < floored else floored
|
||||
|
||||
# ==========================================================================
|
||||
# 0-10映射
|
||||
# ==========================================================================
|
||||
|
||||
def normalize_to_display(
    self,
    value: float,
    min_val: float,
    max_val: float,
    use_log: bool = False,
    epsilon: float = 1e-6
) -> float:
    """Map a value onto the 0-10 display scale.

    Steps:
        1. Optional log compression: y = ln(1 + x), applied to the value and
           to both bounds.
        2. Min-max mapping: score = 10 * (y - min) / (max - min).
        3. Clamp the result into [0, 10].

    Args:
        value: The value to map (normally already winsorized).
        min_val: Lower end of the scale (usually P5).
        max_val: Upper end of the scale (usually P95).
        use_log: Whether to apply log compression first.
        epsilon: Smallest range treated as non-degenerate.

    Returns:
        A score between 0 and 10. When the range is smaller than
        ``epsilon`` the midpoint 5.0 is returned.
    """
    if use_log:
        value, min_val, max_val = (
            math.log1p(item) for item in (value, min_val, max_val)
        )

    # A near-zero range means almost everyone has the same score.
    span = max_val - min_val
    if span < epsilon:
        return 5.0

    scaled = 10.0 * (value - min_val) / span

    # Keep the result inside the display range.
    capped = scaled if scaled < 10.0 else 10.0
    return capped if capped > 0.0 else 0.0
|
||||
|
||||
def batch_normalize_to_display(
    self,
    raw_scores: List[Tuple[Any, float]],  # [(entity_id, raw_score), ...]
    use_log: bool = False,
    percentile_lower: int = 5,
    percentile_upper: int = 95,
    use_smoothing: bool = False,
    site_id: Optional[int] = None
) -> List[Tuple[Any, float, float]]:
    """Convert a batch of raw scores into 0-10 display scores.

    Pipeline:
        1. Collect every raw score.
        2. Compute the percentile cut points (optionally EWMA-smoothed).
        3. Winsorize each score to the cut points.
        4. Min-max map onto 0-10 and round to two decimals.

    Args:
        raw_scores: ``(entity_id, raw_score)`` tuples.
        use_log: Whether to apply log compression during the mapping.
        percentile_lower: Lower percentile, as a percentage.
        percentile_upper: Upper percentile, as a percentage.
        use_smoothing: Whether to EWMA-smooth the cut points.
        site_id: Site ID; smoothing is only applied when it is provided.

    Returns:
        ``(entity_id, raw_score, display_score)`` tuples in input order;
        an empty list when ``raw_scores`` is empty.
    """
    if not raw_scores:
        return []

    values = [score for _, score in raw_scores]
    low_cut, high_cut = self.calculate_percentiles(
        values, percentile_lower, percentile_upper
    )

    # Smoothing needs a site to look history up for; otherwise it is skipped.
    if use_smoothing and site_id is not None:
        low_cut, high_cut = self._apply_ewma_smoothing(site_id, low_cut, high_cut)

    def _score_row(entity_id: Any, raw_score: float) -> Tuple[Any, float, float]:
        clipped = self.winsorize(raw_score, low_cut, high_cut)
        display = self.normalize_to_display(clipped, low_cut, high_cut, use_log)
        return entity_id, raw_score, round(display, 2)

    return [_score_row(entity_id, raw_score) for entity_id, raw_score in raw_scores]
|
||||
|
||||
# ==========================================================================
|
||||
# 算法参数加载
|
||||
# ==========================================================================
|
||||
|
||||
def load_index_parameters(
    self,
    index_type: Optional[str] = None,
    force_reload: bool = False
) -> Dict[str, float]:
    """Load the algorithm parameters for an index, with a short-lived cache.

    Parameters are read from ``billiards_dws.cfg_index_parameters`` for rows
    effective today. When a parameter name appears more than once, the row
    with the most recent ``effective_from`` wins.

    The cache is only reused when it was populated for the same
    ``index_type``; previously a cached result for one index type could be
    returned for another within the TTL.

    Args:
        index_type: Index type to load; defaults to ``get_index_type()``.
        force_reload: Bypass the cache and query the database again.

    Returns:
        Mapping of parameter name to parameter value.
    """
    if index_type is None:
        index_type = self.get_index_type()

    now = datetime.now(self.tz)

    # Reuse the cache only if it is fresh AND belongs to the requested type.
    cache = self._index_params_cache
    cached_type = getattr(self, '_index_params_cache_type', None)
    if (
        not force_reload
        and cache is not None
        and cached_type == index_type
        and (now - cache.loaded_at).total_seconds() < self._index_params_ttl
    ):
        return cache.params

    self.logger.debug("加载指数算法参数: %s", index_type)

    sql = """
        SELECT param_name, param_value
        FROM billiards_dws.cfg_index_parameters
        WHERE index_type = %s
          AND effective_from <= CURRENT_DATE
          AND (effective_to IS NULL OR effective_to >= CURRENT_DATE)
        ORDER BY effective_from DESC
    """

    rows = self.db.query(sql, (index_type,))

    # Rows arrive newest-first, so the first value seen per name is kept.
    params: Dict[str, float] = {}
    for row in (rows or []):
        row_dict = dict(row)
        name = row_dict['param_name']
        if name not in params:
            params[name] = float(row_dict['param_value'])

    self._index_params_cache = IndexParameters(
        params=params,
        loaded_at=now
    )
    # Remember which index type the cached parameters belong to.
    self._index_params_cache_type = index_type

    return params
|
||||
|
||||
def get_param(self, name: str, default: float = 0.0) -> float:
    """Look up a single algorithm parameter.

    Args:
        name: Parameter name.
        default: Value returned when the parameter is not configured.

    Returns:
        The configured value, or ``default`` if the name is absent.
    """
    loaded = self.load_index_parameters()
    if name in loaded:
        return loaded[name]
    return default
|
||||
|
||||
# ==========================================================================
|
||||
# 分位点历史管理(EWMA平滑)
|
||||
# ==========================================================================
|
||||
|
||||
def get_last_percentile_history(
    self,
    site_id: int,
    index_type: Optional[str] = None
) -> Optional[PercentileHistory]:
    """Fetch the most recent percentile history record for a site.

    Args:
        site_id: Site (store) ID.
        index_type: Index type; defaults to ``get_index_type()``.

    Returns:
        The latest ``PercentileHistory`` by ``calc_time``, or ``None`` when
        no record exists. NULL numeric columns are read as 0.
    """
    if index_type is None:
        index_type = self.get_index_type()

    sql = """
        SELECT
            percentile_5, percentile_95,
            percentile_5_smoothed, percentile_95_smoothed,
            record_count, calc_time
        FROM billiards_dws.dws_index_percentile_history
        WHERE site_id = %s AND index_type = %s
        ORDER BY calc_time DESC
        LIMIT 1
    """

    rows = self.db.query(sql, (site_id, index_type))
    if not rows:
        return None

    latest = dict(rows[0])

    def _as_float(column: str) -> float:
        # NULL columns fall back to 0.
        return float(latest[column] or 0)

    return PercentileHistory(
        percentile_5=_as_float('percentile_5'),
        percentile_95=_as_float('percentile_95'),
        percentile_5_smoothed=_as_float('percentile_5_smoothed'),
        percentile_95_smoothed=_as_float('percentile_95_smoothed'),
        record_count=int(latest['record_count'] or 0),
        calc_time=latest['calc_time']
    )
|
||||
|
||||
def save_percentile_history(
    self,
    site_id: int,
    percentile_5: float,
    percentile_95: float,
    percentile_5_smoothed: float,
    percentile_95_smoothed: float,
    record_count: int,
    min_raw: float,
    max_raw: float,
    avg_raw: float,
    index_type: Optional[str] = None
) -> None:
    """Insert one percentile history row and commit it.

    The row's ``calc_time`` is set by the database (``NOW()``).

    Args:
        site_id: Site (store) ID.
        percentile_5: Raw lower percentile.
        percentile_95: Raw upper percentile.
        percentile_5_smoothed: Smoothed lower percentile.
        percentile_95_smoothed: Smoothed upper percentile.
        record_count: Number of records behind the percentiles.
        min_raw: Minimum raw score.
        max_raw: Maximum raw score.
        avg_raw: Average raw score.
        index_type: Index type; defaults to ``get_index_type()``.
    """
    if index_type is None:
        index_type = self.get_index_type()

    sql = """
        INSERT INTO billiards_dws.dws_index_percentile_history (
            site_id, index_type, calc_time,
            percentile_5, percentile_95,
            percentile_5_smoothed, percentile_95_smoothed,
            record_count, min_raw_score, max_raw_score, avg_raw_score
        ) VALUES (%s, %s, NOW(), %s, %s, %s, %s, %s, %s, %s, %s)
    """

    # Values in the same order as the placeholders above.
    row_values = (
        site_id, index_type,
        percentile_5, percentile_95,
        percentile_5_smoothed, percentile_95_smoothed,
        record_count, min_raw, max_raw, avg_raw
    )

    connection = self.db.conn
    with connection.cursor() as cur:
        cur.execute(sql, row_values)
    self.db.conn.commit()
|
||||
|
||||
def _apply_ewma_smoothing(
|
||||
self,
|
||||
site_id: int,
|
||||
current_p5: float,
|
||||
current_p95: float,
|
||||
alpha: Optional[float] = None
|
||||
) -> Tuple[float, float]:
|
||||
"""
|
||||
应用EWMA平滑到分位点
|
||||
|
||||
公式: Q_t = (1 - α) * Q_{t-1} + α * Q_now
|
||||
|
||||
Args:
|
||||
site_id: 门店ID
|
||||
current_p5: 当前5分位
|
||||
current_p95: 当前95分位
|
||||
alpha: 平滑系数(默认0.2)
|
||||
|
||||
Returns:
|
||||
(平滑后的P5, 平滑后的P95)
|
||||
"""
|
||||
if alpha is None:
|
||||
alpha = self.get_param('ewma_alpha', self.DEFAULT_EWMA_ALPHA)
|
||||
|
||||
history = self.get_last_percentile_history(site_id)
|
||||
|
||||
if history is None:
|
||||
# 首次计算,不平滑
|
||||
return current_p5, current_p95
|
||||
|
||||
smoothed_p5 = (1 - alpha) * history.percentile_5_smoothed + alpha * current_p5
|
||||
smoothed_p95 = (1 - alpha) * history.percentile_95_smoothed + alpha * current_p95
|
||||
|
||||
return smoothed_p5, smoothed_p95
|
||||
|
||||
# ==========================================================================
|
||||
# 统计工具方法
|
||||
# ==========================================================================
|
||||
|
||||
def calculate_median(self, values: List[float]) -> float:
    """Return the median of ``values``, or 0.0 for an empty list.

    For an even number of items the two middle values are averaged.
    """
    if not values:
        return 0.0

    ordered = sorted(values)
    middle, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2
|
||||
|
||||
def calculate_mad(self, values: List[float]) -> float:
    """Compute the median absolute deviation (MAD).

    MAD = median(|x - median(x)|)

    MAD is a dispersion measure that is more robust to extreme values than
    the standard deviation. Returns 0.0 for an empty list.
    """
    if not values:
        return 0.0

    center = self.calculate_median(values)
    distances = [abs(item - center) for item in values]
    return self.calculate_median(distances)
|
||||
|
||||
def safe_log(self, value: float, default: float = 0.0) -> float:
    """Natural logarithm that returns ``default`` for non-positive input."""
    return default if value <= 0 else math.log(value)
|
||||
|
||||
def safe_ln1p(self, value: float) -> float:
    """Compute ln(1 + x), returning 0.0 where the logarithm is undefined.

    ``math.log1p`` is only defined for x > -1. The boundary ``x == -1`` used
    to slip past the guard and raise ``ValueError``; it now returns 0.0 like
    every other out-of-domain input.

    Args:
        value: The x in ln(1 + x).

    Returns:
        ``math.log1p(value)`` for ``value > -1``, otherwise 0.0.
    """
    if value <= -1:
        return 0.0
    return math.log1p(value)
|
||||
688
etl_billiards/tasks/dws/index/intimacy_index_task.py
Normal file
688
etl_billiards/tasks/dws/index/intimacy_index_task.py
Normal file
@@ -0,0 +1,688 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
客户-助教亲密指数计算任务
|
||||
|
||||
功能说明:
|
||||
- 衡量客户与助教的关系强度和近期温度
|
||||
- 用于助教约课精力分配和约课成功率预估
|
||||
- 附加课权重 = 基础课的1.5倍
|
||||
- 检测频率激增并放大权重
|
||||
|
||||
算法公式:
|
||||
Raw Score = (w_F × F + w_R × R + w_M × M + w_D × D) × mult
|
||||
|
||||
其中:
|
||||
- F = Σ(τ_i × decay(d_i, h_sess)) # 频次强度
|
||||
- R = decay(d_last, h_last) # 最近温度
|
||||
- M = Σ(ln(1+amt/A0) × decay(d_r, h_pay)) # 归因充值强度
|
||||
- D = Σ(sqrt(dur/60) × τ × decay(d, h)) # 时长贡献
|
||||
- mult = 1 + γ × burst # 激增放大
|
||||
- burst = max(0, ln(1 + (F_short/F_long - 1)))
|
||||
|
||||
特殊逻辑:
|
||||
- 会话合并:同一客人对同一助教,间隔<4小时算同次服务
|
||||
- 充值归因:服务结束后1小时内的充值算做该助教贡献
|
||||
|
||||
数据来源:
|
||||
- dwd_assistant_service_log: 服务记录
|
||||
- dwd_recharge_order: 充值记录
|
||||
|
||||
更新频率:每4小时
|
||||
|
||||
作者:ETL团队
|
||||
创建日期:2026-02-03
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date, datetime, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from .base_index_task import BaseIndexTask, PercentileHistory
|
||||
from ..base_dws_task import TaskContext
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 数据类定义
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class ServiceSession:
    """A service session obtained by merging nearby service records."""
    # Start of the first merged record and end of the latest merged record.
    session_start: datetime
    session_end: datetime
    # Sum of the durations (minutes) of all merged records.
    total_duration_minutes: int = 0
    course_weight: float = 1.0  # 1.0 = basic course, 1.5 = incentive (add-on) course
    is_incentive: bool = False  # True when the session includes an incentive course
|
||||
|
||||
|
||||
@dataclass
class AttributedRecharge:
    """A recharge payment attributed to an assistant."""
    # Payment timestamp and amount as read from the recharge order.
    pay_time: datetime
    pay_amount: float
    # Age of the payment in (fractional) days relative to the calculation time.
    days_ago: float
|
||||
|
||||
|
||||
@dataclass
class MemberAssistantIntimacyData:
    """Intimacy features and scores for one (member, assistant) pair."""
    member_id: int
    assistant_no: str  # Assistant staff number (string, e.g. "1", "2", "15")
    assistant_nickname: str  # Assistant nickname
    site_id: int
    tenant_id: int

    # Input features for the calculation
    session_count: int = 0
    total_duration_minutes: int = 0
    basic_session_count: int = 0
    incentive_session_count: int = 0
    # Whole days since the end of the latest session; None when there is none.
    days_since_last_session: Optional[int] = None
    attributed_recharge_count: int = 0
    attributed_recharge_amount: float = 0.0

    # Component scores
    score_frequency: float = 0.0
    score_recency: float = 0.0
    score_recharge: float = 0.0
    score_duration: float = 0.0
    burst_multiplier: float = 1.0

    # Final scores
    raw_score: float = 0.0
    display_score: float = 0.0

    # Intermediate data
    sessions: List[ServiceSession] = field(default_factory=list)
    recharges: List[AttributedRecharge] = field(default_factory=list)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 亲密指数任务
|
||||
# =============================================================================
|
||||
|
||||
class IntimacyIndexTask(BaseIndexTask):
|
||||
"""
|
||||
客户-助教亲密指数计算任务
|
||||
|
||||
计算流程:
|
||||
1. 提取近60天的助教服务记录
|
||||
2. 按(member_id, assistant_id)分组,合并4小时内的服务
|
||||
3. 提取归因充值(服务结束后1小时内)
|
||||
4. 计算5项分数(频次、最近、充值、时长、激增)
|
||||
5. 汇总Raw Score
|
||||
6. 分位截断 + Log压缩 + MinMax映射到0-10
|
||||
7. 写入DWS表
|
||||
"""
|
||||
|
||||
INDEX_TYPE = "INTIMACY"
|
||||
|
||||
# 技能ID映射
|
||||
SKILL_ID_BASIC = 2790683529513797 # 基础课
|
||||
SKILL_ID_INCENTIVE = 2790683529513798 # 附加课/激励课
|
||||
SKILL_ID_BOX = 3039912271463941 # 包厢课
|
||||
|
||||
# 默认参数
|
||||
DEFAULT_PARAMS = {
|
||||
'lookback_days': 60,
|
||||
'halflife_session': 14.0,
|
||||
'halflife_last': 10.0,
|
||||
'halflife_recharge': 21.0,
|
||||
'halflife_short': 7.0,
|
||||
'halflife_long': 30.0,
|
||||
'amount_base': 500.0,
|
||||
'incentive_weight': 1.5,
|
||||
'session_merge_hours': 4,
|
||||
'recharge_attribute_hours': 1,
|
||||
'weight_frequency': 2.0,
|
||||
'weight_recency': 1.5,
|
||||
'weight_recharge': 2.0,
|
||||
'weight_duration': 0.5,
|
||||
'burst_gamma': 0.6,
|
||||
'percentile_lower': 5,
|
||||
'percentile_upper': 95,
|
||||
}
|
||||
|
||||
# ==========================================================================
|
||||
# 抽象方法实现
|
||||
# ==========================================================================
|
||||
|
||||
def get_task_code(self) -> str:
    """Return the task code identifying the intimacy index task."""
    task_code = "DWS_INTIMACY_INDEX"
    return task_code
|
||||
|
||||
def get_target_table(self) -> str:
    """Return the name of the DWS table this task writes to."""
    table_name = "dws_member_assistant_intimacy"
    return table_name
|
||||
|
||||
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    # NOTE(review): pairs are keyed by assistant_no elsewhere in this task,
    # while this key list names assistant_id - confirm against the table DDL.
    key_columns = ['site_id', 'member_id', 'assistant_id']
    return key_columns
|
||||
|
||||
def get_index_type(self) -> str:
    """Return the index type, taken from the ``INDEX_TYPE`` attribute."""
    index_type = self.INDEX_TYPE
    return index_type
|
||||
|
||||
# ==========================================================================
|
||||
# 任务执行
|
||||
# ==========================================================================
|
||||
|
||||
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run the member-assistant intimacy index calculation end to end.

    Steps: extract service records for the lookback window, merge them into
    sessions per (member, assistant) pair, attribute recharges, compute the
    component scores and raw score per pair, normalize to a 0-10 display
    score (log compression, EWMA-smoothed percentiles), store the percentile
    history, and write the results to the DWS table.

    Args:
        context: Task context, passed to ``_get_site_id``.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no service
        records are found; otherwise a dict with ``status`` ``'success'``,
        ``pair_count`` and ``records_inserted``.
    """
    self.logger.info("开始计算客户-助教亲密指数")

    # Resolve the site (store) ID and tenant ID for this run
    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()

    # Load algorithm parameters
    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # Reference time and start of the lookback window
    now = datetime.now(self.tz)
    # NOTE(review): base_date is assigned but not used anywhere in this method.
    base_date = now.date()
    start_datetime = now - timedelta(days=lookback_days)

    self.logger.info(
        "参数: lookback=%d天, h_sess=%.1f, h_last=%.1f, h_pay=%.1f, γ=%.2f",
        lookback_days, params['halflife_session'], params['halflife_last'],
        params['halflife_recharge'], params['burst_gamma']
    )

    # 1. Extract service records
    raw_services = self._extract_service_records(site_id, start_datetime, now)
    self.logger.info("提取到 %d 条原始服务记录", len(raw_services))

    if not raw_services:
        self.logger.warning("没有服务记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Group by (member_id, assistant_no) and merge records into sessions
    pair_data = self._group_and_merge_sessions(raw_services, params, now)
    self.logger.info("合并为 %d 个客户-助教对", len(pair_data))

    # 3. Extract attributed recharges (mutates the entries of pair_data)
    self._extract_attributed_recharges(site_id, pair_data, params, now)

    # 4. Compute features and scores for each pair
    intimacy_data_list: List[MemberAssistantIntimacyData] = []

    for key, data in pair_data.items():
        # site_id / tenant_id were left as placeholders when the pair was built
        data.site_id = site_id
        data.tenant_id = tenant_id

        # Compute the component scores
        self._calculate_component_scores(data, params, now)

        # Aggregate the raw score: weighted sum of components times the burst multiplier
        base_score = (
            params['weight_frequency'] * data.score_frequency +
            params['weight_recency'] * data.score_recency +
            params['weight_recharge'] * data.score_recharge +
            params['weight_duration'] * data.score_duration
        )
        data.raw_score = base_score * data.burst_multiplier

        intimacy_data_list.append(data)

    self.logger.info("计算完成 %d 个pair的Raw Score", len(intimacy_data_list))

    # 5. Normalize to the display score (with log compression)
    raw_scores = [((d.member_id, d.assistant_no), d.raw_score) for d in intimacy_data_list]
    normalized = self.batch_normalize_to_display(
        raw_scores,
        use_log=True,  # log compression is recommended for the intimacy index
        percentile_lower=int(params['percentile_lower']),
        percentile_upper=int(params['percentile_upper']),
        use_smoothing=True,
        site_id=site_id
    )

    # Copy the display scores back onto the data objects
    score_map = {key: (raw, display) for key, raw, display in normalized}
    for data in intimacy_data_list:
        key = (data.member_id, data.assistant_no)
        if key in score_map:
            _, data.display_score = score_map[key]

    # 6. Save the percentile history
    if intimacy_data_list:
        all_raw = [d.raw_score for d in intimacy_data_list]
        # Percentiles are recomputed here because the batch normalization
        # only returns the per-entity scores, not the cut points it used.
        q_l, q_u = self.calculate_percentiles(
            all_raw,
            int(params['percentile_lower']),
            int(params['percentile_upper'])
        )
        smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)

        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw)
        )

    # 7. Write to the DWS table
    inserted = self._save_intimacy_data(intimacy_data_list)

    self.logger.info("亲密指数计算完成,写入 %d 条记录", inserted)

    return {
        'status': 'success',
        'pair_count': len(intimacy_data_list),
        'records_inserted': inserted
    }
|
||||
|
||||
# ==========================================================================
|
||||
# 数据提取方法
|
||||
# ==========================================================================
|
||||
|
||||
def _extract_service_records(
|
||||
self,
|
||||
site_id: int,
|
||||
start_datetime: datetime,
|
||||
end_datetime: datetime
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
提取服务记录
|
||||
|
||||
注意: 使用 assistant_no (助教工号) 作为助教标识,而不是 site_assistant_id
|
||||
因为 site_assistant_id 在数据中是每次服务的唯一ID,不是助教的唯一标识
|
||||
|
||||
Returns:
|
||||
[{'member_id', 'assistant_no', 'assistant_nickname', 'start_time', 'end_time', 'duration_minutes', 'skill_id'}, ...]
|
||||
"""
|
||||
sql = """
|
||||
SELECT
|
||||
tenant_member_id AS member_id,
|
||||
assistant_no,
|
||||
nickname AS assistant_nickname,
|
||||
start_use_time,
|
||||
last_use_time,
|
||||
COALESCE(income_seconds, 0) / 60 AS duration_minutes,
|
||||
skill_id
|
||||
FROM billiards_dwd.dwd_assistant_service_log
|
||||
WHERE site_id = %s
|
||||
AND tenant_member_id > 0 -- 排除散客
|
||||
AND is_delete = 0
|
||||
AND assistant_no IS NOT NULL -- 确保有助教工号
|
||||
AND last_use_time >= %s
|
||||
AND last_use_time < %s
|
||||
ORDER BY tenant_member_id, assistant_no, start_use_time
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql, (site_id, start_datetime, end_datetime))
|
||||
|
||||
result = []
|
||||
for row in (rows or []):
|
||||
row_dict = dict(row)
|
||||
# 使用 assistant_no 作为助教标识
|
||||
assistant_no = row_dict['assistant_no']
|
||||
if assistant_no:
|
||||
result.append({
|
||||
'member_id': int(row_dict['member_id']),
|
||||
'assistant_no': str(assistant_no), # 助教工号(字符串)
|
||||
'assistant_nickname': row_dict['assistant_nickname'] or '',
|
||||
'start_time': row_dict['start_use_time'],
|
||||
'end_time': row_dict['last_use_time'],
|
||||
'duration_minutes': int(row_dict['duration_minutes'] or 0),
|
||||
'skill_id': int(row_dict['skill_id'] or 0)
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
def _group_and_merge_sessions(
    self,
    raw_services: List[Dict[str, Any]],
    params: Dict[str, float],
    now: datetime
) -> Dict[Tuple[int, str], MemberAssistantIntimacyData]:
    """Group service records per (member_id, assistant_no) and merge sessions.

    Within a pair, records are ordered by start time; a record whose start
    is no more than ``session_merge_hours`` after the end of the current
    session is merged into it, otherwise it opens a new session. A merged
    session keeps the highest course weight of its records and is flagged
    as incentive if any of its records is an incentive course.

    Args:
        raw_services: Service record dicts with ``member_id``,
            ``assistant_no``, ``assistant_nickname``, ``start_time``,
            ``end_time``, ``duration_minutes`` and ``skill_id``.
        params: Algorithm parameters (``session_merge_hours``,
            ``incentive_weight``).
        now: Reference time used for ``days_since_last_session``.

    Returns:
        Mapping of ``(member_id, assistant_no)`` to the populated pair data
        (``site_id`` / ``tenant_id`` are left as 0 placeholders).
    """
    merge_threshold = timedelta(hours=int(params['session_merge_hours']))
    incentive_weight = params['incentive_weight']

    # Bucket the records per pair (keyed by assistant_no).
    grouped: Dict[Tuple[int, str], List[Dict[str, Any]]] = {}
    for record in raw_services:
        pair_key = (record['member_id'], record['assistant_no'])
        grouped.setdefault(pair_key, []).append(record)

    pair_data: Dict[Tuple[int, str], MemberAssistantIntimacyData] = {}

    for pair_key, records in grouped.items():
        member_id, assistant_no = pair_key
        entry = MemberAssistantIntimacyData(
            member_id=member_id,
            assistant_no=assistant_no,
            # Nickname comes from the first record of the pair.
            assistant_nickname=records[0]['assistant_nickname'] if records else '',
            site_id=0,  # filled in later by the caller
            tenant_id=0
        )

        open_session: Optional[ServiceSession] = None

        for record in sorted(records, key=lambda item: item['start_time']):
            began = record['start_time']
            ended = record['end_time']
            minutes = record['duration_minutes']

            # Determine the course type and its weight.
            incentive = (record['skill_id'] == self.SKILL_ID_INCENTIVE)
            weight = incentive_weight if incentive else 1.0

            if (
                open_session is not None
                and began - open_session.session_end <= merge_threshold
            ):
                # Close enough to the running session: fold the record in.
                open_session.session_end = max(open_session.session_end, ended)
                open_session.total_duration_minutes += minutes
                # A merged session takes the highest weight of its records.
                open_session.course_weight = max(open_session.course_weight, weight)
                open_session.is_incentive = open_session.is_incentive or incentive
                continue

            # Either the first record or a gap beyond the threshold:
            # close the running session (if any) and open a new one.
            if open_session is not None:
                entry.sessions.append(open_session)
            open_session = ServiceSession(
                session_start=began,
                session_end=ended,
                total_duration_minutes=minutes,
                course_weight=weight,
                is_incentive=incentive
            )

        # Flush the last running session.
        if open_session is not None:
            entry.sessions.append(open_session)

        # Aggregate features over the merged sessions.
        merged = entry.sessions
        entry.session_count = len(merged)
        entry.total_duration_minutes = sum(s.total_duration_minutes for s in merged)
        entry.basic_session_count = sum(1 for s in merged if not s.is_incentive)
        entry.incentive_session_count = sum(1 for s in merged if s.is_incentive)

        # Whole days since the most recently ended session.
        if merged:
            latest = max(merged, key=lambda s: s.session_end)
            entry.days_since_last_session = (now - latest.session_end).days

        pair_data[pair_key] = entry

    return pair_data
|
||||
|
||||
def _extract_attributed_recharges(
    self,
    site_id: int,
    pair_data: Dict[Tuple[int, str], MemberAssistantIntimacyData],
    params: Dict[str, float],
    now: datetime
) -> None:
    """Attribute recharge payments to assistants, mutating ``pair_data``.

    Attribution rule: a recharge paid within ``recharge_attribute_hours``
    after the end of a service session is credited to that session's
    assistant.

    Each recharge is credited to exactly one assistant. When sessions with
    several assistants qualify, the one whose session ended closest before
    the payment wins (ties go to the pair seen first). Previously the
    ``break`` only left the inner session loop, so the same recharge could
    be credited to every assistant with a qualifying session.

    Args:
        site_id: Site (store) ID.
        pair_data: Pair data keyed by ``(member_id, assistant_no)``; the
            ``attributed_recharge_count``, ``attributed_recharge_amount``
            and ``recharges`` fields of matching entries are updated.
        params: Algorithm parameters (``recharge_attribute_hours``,
            ``lookback_days``).
        now: Reference time for the lookback window and ``days_ago``.
    """
    attribution_hours = int(params['recharge_attribute_hours'])
    attribution_window = timedelta(hours=attribution_hours)

    # Index the pairs by member so each recharge only scans its own member.
    pairs_by_member: Dict[int, List[MemberAssistantIntimacyData]] = {}
    for (pair_member_id, _assistant_no), data in pair_data.items():
        pairs_by_member.setdefault(pair_member_id, []).append(data)
    if not pairs_by_member:
        return

    # Bind the member IDs as query parameters instead of formatting them in.
    member_ids = sorted(pairs_by_member)
    placeholders = ', '.join(['%s'] * len(member_ids))

    # Query recharge orders inside the lookback window.
    sql = f"""
        SELECT
            member_id,
            pay_time,
            pay_amount
        FROM billiards_dwd.dwd_recharge_order
        WHERE site_id = %s
          AND member_id IN ({placeholders})
          AND settle_type = 5  -- 充值订单
          AND pay_time >= %s
    """

    lookback_days = int(params['lookback_days'])
    start_datetime = now - timedelta(days=lookback_days)

    rows = self.db.query(sql, (site_id, *member_ids, start_datetime))

    for row in (rows or []):
        row_dict = dict(row)
        member_id = int(row_dict['member_id'])
        pay_time = row_dict['pay_time']
        pay_amount = float(row_dict['pay_amount'] or 0)

        if pay_amount <= 0:
            continue

        # Find the qualifying session that ended closest before the payment.
        best_data: Optional[MemberAssistantIntimacyData] = None
        best_end: Optional[datetime] = None
        for data in pairs_by_member.get(member_id, ()):
            for session in data.sessions:
                session_end = session.session_end
                if (session_end <= pay_time and
                        pay_time - session_end <= attribution_window):
                    if best_end is None or session_end > best_end:
                        best_data, best_end = data, session_end

        if best_data is None:
            continue

        # Credit the recharge to the single winning assistant.
        best_data.attributed_recharge_count += 1
        best_data.attributed_recharge_amount += pay_amount
        best_data.recharges.append(AttributedRecharge(
            pay_time=pay_time,
            pay_amount=pay_amount,
            days_ago=(now - pay_time).total_seconds() / 86400
        ))
|
||||
|
||||
# ==========================================================================
|
||||
# 分数计算方法
|
||||
# ==========================================================================
|
||||
|
||||
def _calculate_component_scores(
|
||||
self,
|
||||
data: MemberAssistantIntimacyData,
|
||||
params: Dict[str, float],
|
||||
now: datetime
|
||||
) -> None:
|
||||
"""计算5项分数"""
|
||||
epsilon = 1e-6
|
||||
|
||||
h_sess = params['halflife_session']
|
||||
h_last = params['halflife_last']
|
||||
h_pay = params['halflife_recharge']
|
||||
h_short = params['halflife_short']
|
||||
h_long = params['halflife_long']
|
||||
A0 = params['amount_base']
|
||||
gamma = params['burst_gamma']
|
||||
|
||||
# 1. 频次强度 F = Σ(τ_i × decay(d_i, h_sess))
|
||||
F = 0.0
|
||||
for session in data.sessions:
|
||||
days_ago = (now - session.session_end).total_seconds() / 86400
|
||||
F += session.course_weight * self.decay(days_ago, h_sess)
|
||||
data.score_frequency = F
|
||||
|
||||
# 2. 最近温度 R = decay(d_last, h_last)
|
||||
if data.days_since_last_session is not None:
|
||||
data.score_recency = self.decay(data.days_since_last_session, h_last)
|
||||
else:
|
||||
data.score_recency = 0.0
|
||||
|
||||
# 3. 归因充值强度 M = Σ(ln(1+amt/A0) × decay(d_r, h_pay))
|
||||
M = 0.0
|
||||
for recharge in data.recharges:
|
||||
m_amt = math.log1p(recharge.pay_amount / A0)
|
||||
M += m_amt * self.decay(recharge.days_ago, h_pay)
|
||||
data.score_recharge = M
|
||||
|
||||
# 4. 时长贡献 D = Σ(sqrt(dur/60) × τ × decay(d, h_sess))
|
||||
D = 0.0
|
||||
for session in data.sessions:
|
||||
days_ago = (now - session.session_end).total_seconds() / 86400
|
||||
dur_hours = session.total_duration_minutes / 60.0
|
||||
D += math.sqrt(dur_hours) * session.course_weight * self.decay(days_ago, h_sess)
|
||||
data.score_duration = D
|
||||
|
||||
# 5. 频率激增放大 mult = 1 + γ × burst
|
||||
# F_short = Σ(τ × decay(d, h_short))
|
||||
# F_long = Σ(τ × decay(d, h_long))
|
||||
F_short = 0.0
|
||||
F_long = 0.0
|
||||
for session in data.sessions:
|
||||
days_ago = (now - session.session_end).total_seconds() / 86400
|
||||
F_short += session.course_weight * self.decay(days_ago, h_short)
|
||||
F_long += session.course_weight * self.decay(days_ago, h_long)
|
||||
|
||||
# burst = max(0, ln(1 + (F_short/F_long - 1)))
|
||||
ratio = F_short / (F_long + epsilon)
|
||||
if ratio > 1:
|
||||
burst = self.safe_ln1p(ratio - 1)
|
||||
else:
|
||||
burst = 0.0
|
||||
|
||||
data.burst_multiplier = 1 + gamma * burst
|
||||
|
||||
# ==========================================================================
|
||||
# 数据保存方法
|
||||
# ==========================================================================
|
||||
|
||||
def _save_intimacy_data(self, data_list: List[MemberAssistantIntimacyData]) -> int:
|
||||
"""保存亲密数据到DWS表"""
|
||||
if not data_list:
|
||||
return 0
|
||||
|
||||
# 先删除已存在的记录
|
||||
site_id = data_list[0].site_id
|
||||
|
||||
# 构建删除条件(使用assistant_no)
|
||||
# 注意:assistant_id字段在数据库中存储assistant_no的整数形式
|
||||
keys = [(d.member_id, d.assistant_no) for d in data_list]
|
||||
conditions = " OR ".join(
|
||||
f"(member_id = {m} AND assistant_id = {int(a)})" for m, a in keys
|
||||
)
|
||||
|
||||
delete_sql = f"""
|
||||
DELETE FROM billiards_dws.dws_member_assistant_intimacy
|
||||
WHERE site_id = %s AND ({conditions})
|
||||
"""
|
||||
|
||||
with self.db.conn.cursor() as cur:
|
||||
cur.execute(delete_sql, (site_id,))
|
||||
|
||||
# 插入新记录
|
||||
# 使用assistant_no的整数值作为assistant_id
|
||||
insert_sql = """
|
||||
INSERT INTO billiards_dws.dws_member_assistant_intimacy (
|
||||
site_id, tenant_id, member_id, assistant_id,
|
||||
session_count, total_duration_minutes,
|
||||
basic_session_count, incentive_session_count,
|
||||
days_since_last_session,
|
||||
attributed_recharge_count, attributed_recharge_amount,
|
||||
score_frequency, score_recency, score_recharge, score_duration,
|
||||
burst_multiplier, raw_score, display_score,
|
||||
calc_time, created_at, updated_at
|
||||
) VALUES (
|
||||
%s, %s, %s, %s,
|
||||
%s, %s,
|
||||
%s, %s,
|
||||
%s,
|
||||
%s, %s,
|
||||
%s, %s, %s, %s,
|
||||
%s, %s, %s,
|
||||
NOW(), NOW(), NOW()
|
||||
)
|
||||
"""
|
||||
|
||||
inserted = 0
|
||||
with self.db.conn.cursor() as cur:
|
||||
for data in data_list:
|
||||
# 将assistant_no转为整数作为assistant_id
|
||||
assistant_id = int(data.assistant_no) if data.assistant_no.isdigit() else 0
|
||||
cur.execute(insert_sql, (
|
||||
data.site_id, data.tenant_id, data.member_id, assistant_id,
|
||||
data.session_count, data.total_duration_minutes,
|
||||
data.basic_session_count, data.incentive_session_count,
|
||||
data.days_since_last_session,
|
||||
data.attributed_recharge_count, data.attributed_recharge_amount,
|
||||
data.score_frequency, data.score_recency, data.score_recharge, data.score_duration,
|
||||
data.burst_multiplier, data.raw_score, data.display_score
|
||||
))
|
||||
inserted += cur.rowcount
|
||||
|
||||
# 提交事务
|
||||
self.db.conn.commit()
|
||||
|
||||
return inserted
|
||||
|
||||
# ==========================================================================
|
||||
# 辅助方法
|
||||
# ==========================================================================
|
||||
|
||||
def _load_params(self) -> Dict[str, float]:
|
||||
"""加载参数,缺失时使用默认值"""
|
||||
params = self.load_index_parameters()
|
||||
result = dict(self.DEFAULT_PARAMS)
|
||||
result.update(params)
|
||||
return result
|
||||
|
||||
def _get_site_id(self, context: Optional[TaskContext]) -> int:
|
||||
"""获取门店ID"""
|
||||
if context and hasattr(context, 'store_id') and context.store_id:
|
||||
return context.store_id
|
||||
|
||||
site_id = self.config.get('app.default_site_id')
|
||||
if site_id:
|
||||
return int(site_id)
|
||||
|
||||
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
|
||||
rows = self.db.query(sql)
|
||||
if rows:
|
||||
return int(dict(rows[0])['site_id'])
|
||||
|
||||
raise ValueError("无法确定门店ID")
|
||||
|
||||
def _get_tenant_id(self) -> int:
|
||||
"""获取租户ID"""
|
||||
tenant_id = self.config.get('app.tenant_id')
|
||||
if tenant_id:
|
||||
return int(tenant_id)
|
||||
|
||||
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
|
||||
rows = self.db.query(sql)
|
||||
if rows:
|
||||
return int(dict(rows[0])['tenant_id'])
|
||||
|
||||
return 0
|
||||
564
etl_billiards/tasks/dws/index/recall_index_task.py
Normal file
564
etl_billiards/tasks/dws/index/recall_index_task.py
Normal file
@@ -0,0 +1,564 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
客户召回指数计算任务
|
||||
|
||||
功能说明:
|
||||
- 衡量客户召回的必要性和紧急程度
|
||||
- 尊重客户个人到店周期(μ=中位数, σ=MAD)
|
||||
- 对新客户、刚充值客户增加召回倾向
|
||||
- 检测"热了又断"的情况
|
||||
|
||||
算法公式:
|
||||
Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop
|
||||
|
||||
其中:
|
||||
- overdue = 1 - exp(-max(0, (t-μ)/σ)) # 超期紧急性
|
||||
- new_bonus = decay(d_first, h_new) # 新客户加分
|
||||
- re_bonus = decay(d_recharge, h_re) # 刚充值加分
|
||||
- hot_drop = max(0, ln(1 + (r14/r60 - 1))) # 热度断档加分
|
||||
|
||||
数据来源:
|
||||
- dwd_settlement_head: 会员到店记录
|
||||
- dwd_recharge_order: 充值记录
|
||||
- dim_member: 首访时间
|
||||
|
||||
更新频率:每2小时
|
||||
|
||||
作者:ETL团队
|
||||
创建日期:2026-02-03
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from .base_index_task import BaseIndexTask, PercentileHistory
|
||||
from ..base_dws_task import TaskContext
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 数据类定义
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class MemberRecallData:
    """Per-member record of the recall index: inputs, component scores, final scores."""

    # Identity of the record.
    member_id: int
    site_id: int
    tenant_id: int

    # Input features; None until they have been computed or looked up.
    days_since_last_visit: Optional[int] = None
    visit_interval_median: Optional[float] = None   # mu of the visit cycle
    visit_interval_mad: Optional[float] = None      # sigma of the visit cycle
    days_since_first_visit: Optional[int] = None
    days_since_last_recharge: Optional[int] = None
    visits_last_14_days: int = 0
    visits_last_60_days: int = 0

    # Component scores.
    score_overdue: float = 0.0
    score_new_bonus: float = 0.0
    score_recharge_bonus: float = 0.0
    score_hot_drop: float = 0.0

    # Final scores: weighted raw sum and its 0-10 display mapping.
    raw_score: float = 0.0
    display_score: float = 0.0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 召回指数任务
|
||||
# =============================================================================
|
||||
|
||||
class RecallIndexTask(BaseIndexTask):
|
||||
"""
|
||||
客户召回指数计算任务
|
||||
|
||||
计算流程:
|
||||
1. 提取近60天有到店记录的会员
|
||||
2. 计算每个会员的到店间隔特征(中位数、MAD)
|
||||
3. 计算4项分数(超期、新客、充值、热度断档)
|
||||
4. 汇总Raw Score
|
||||
5. 分位截断 + MinMax映射到0-10
|
||||
6. 写入DWS表
|
||||
"""
|
||||
|
||||
INDEX_TYPE = "RECALL"
|
||||
|
||||
# 默认参数
|
||||
DEFAULT_PARAMS = {
|
||||
'lookback_days': 60,
|
||||
'sigma_min': 2.0,
|
||||
'halflife_new': 7.0,
|
||||
'halflife_recharge': 10.0,
|
||||
'weight_overdue': 3.0,
|
||||
'weight_new': 1.0,
|
||||
'weight_recharge': 1.0,
|
||||
'weight_hot': 1.0,
|
||||
'percentile_lower': 5,
|
||||
'percentile_upper': 95,
|
||||
}
|
||||
|
||||
# ==========================================================================
|
||||
# 抽象方法实现
|
||||
# ==========================================================================
|
||||
|
||||
def get_task_code(self) -> str:
    """Return the task code of the recall index task."""
    task_code = "DWS_RECALL_INDEX"
    return task_code
|
||||
|
||||
def get_target_table(self) -> str:
    """Return the name of the DWS table this task writes to."""
    table_name = "dws_member_recall_index"
    return table_name
|
||||
|
||||
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table (a new list per call)."""
    key_columns = ['site_id', 'member_id']
    return key_columns
|
||||
|
||||
def get_index_type(self) -> str:
    """Return the index type identifier stored in ``INDEX_TYPE``."""
    index_type = self.INDEX_TYPE
    return index_type
|
||||
|
||||
# ==========================================================================
|
||||
# 任务执行
|
||||
# ==========================================================================
|
||||
|
||||
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run one full recall-index computation for a single site.

    Steps:
    1. Load visits, latest recharge dates and first-visit dates for the
       members seen inside the lookback window ending today.
    2. Build a ``MemberRecallData`` per member, compute its features and
       component scores, and combine them into the weighted raw score.
    3. Normalize raw scores to display scores.
    4. Record the percentile history of this run.
    5. Persist the per-member rows.

    Args:
        context: Optional task context; used only to resolve the site ID.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no member has a
        visit in the window; otherwise a dict with ``status`` ``'success'``,
        ``member_count`` and ``records_inserted``.
    """
    log = self.logger
    log.info("开始计算客户召回指数")

    # Resolve scope and parameters.
    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # The window is [today - lookback_days, today].
    base_date = date.today()
    start_date = base_date - timedelta(days=lookback_days)

    log.info(
        "参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f",
        lookback_days, params['sigma_min'], params['halflife_new'], params['halflife_recharge']
    )

    # 1. Visits per member; nothing to do without them.
    member_visits = self._extract_member_visits(site_id, start_date, base_date)
    log.info("提取到 %d 个会员的到店记录", len(member_visits))
    if not member_visits:
        log.warning("没有会员到店记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Latest recharge date per member.
    recharge_data = self._extract_recharge_data(site_id, start_date, base_date)
    log.info("提取到 %d 个会员的充值记录", len(recharge_data))

    # 3. First-visit date per member.
    first_visit_data = self._extract_first_visit_data(site_id, list(member_visits.keys()))
    log.info("提取到 %d 个会员的首访时间", len(first_visit_data))

    # 4. Features, component scores and raw score per member.
    recall_data_list: List[MemberRecallData] = []
    for member_id, visit_dates in member_visits.items():
        record = MemberRecallData(
            member_id=member_id,
            site_id=site_id,
            tenant_id=tenant_id
        )
        self._calculate_visit_features(record, visit_dates, base_date, params)

        if member_id in recharge_data:
            record.days_since_last_recharge = (base_date - recharge_data[member_id]).days
        if member_id in first_visit_data:
            record.days_since_first_visit = (base_date - first_visit_data[member_id]).days

        self._calculate_component_scores(record, params)

        record.raw_score = (
            params['weight_overdue'] * record.score_overdue +
            params['weight_new'] * record.score_new_bonus +
            params['weight_recharge'] * record.score_recharge_bonus +
            params['weight_hot'] * record.score_hot_drop
        )
        recall_data_list.append(record)

    log.info("计算完成 %d 个会员的Raw Score", len(recall_data_list))

    # 5. Map raw scores onto the display scale.
    normalized = self.batch_normalize_to_display(
        [(record.member_id, record.raw_score) for record in recall_data_list],
        use_log=False,
        percentile_lower=int(params['percentile_lower']),
        percentile_upper=int(params['percentile_upper']),
        use_smoothing=True,
        site_id=site_id
    )
    display_by_member = {
        member_id: display for member_id, _raw, display in normalized
    }
    for record in recall_data_list:
        if record.member_id in display_by_member:
            record.display_score = display_by_member[record.member_id]

    # 6. Store this run's percentile boundaries (raw and smoothed).
    if recall_data_list:
        all_raw = [record.raw_score for record in recall_data_list]
        q_l, q_u = self.calculate_percentiles(
            all_raw,
            int(params['percentile_lower']),
            int(params['percentile_upper'])
        )
        smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw)
        )

    # 7. Persist.
    inserted = self._save_recall_data(recall_data_list)
    log.info("召回指数计算完成,写入 %d 条记录", inserted)

    return {
        'status': 'success',
        'member_count': len(recall_data_list),
        'records_inserted': inserted
    }
|
||||
|
||||
# ==========================================================================
|
||||
# 数据提取方法
|
||||
# ==========================================================================
|
||||
|
||||
def _extract_member_visits(
|
||||
self,
|
||||
site_id: int,
|
||||
start_date: date,
|
||||
end_date: date
|
||||
) -> Dict[int, List[date]]:
|
||||
"""
|
||||
提取会员到店记录
|
||||
|
||||
Returns:
|
||||
{member_id: [visit_date1, visit_date2, ...]}
|
||||
"""
|
||||
sql = """
|
||||
SELECT
|
||||
member_id,
|
||||
DATE(pay_time) AS visit_date
|
||||
FROM billiards_dwd.dwd_settlement_head
|
||||
WHERE site_id = %s
|
||||
AND member_id > 0 -- 排除散客
|
||||
AND settle_type = 1 -- 台桌结账
|
||||
AND pay_time >= %s
|
||||
AND pay_time < %s + INTERVAL '1 day'
|
||||
GROUP BY member_id, DATE(pay_time)
|
||||
ORDER BY member_id, visit_date
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql, (site_id, start_date, end_date))
|
||||
|
||||
result: Dict[int, List[date]] = {}
|
||||
for row in (rows or []):
|
||||
row_dict = dict(row)
|
||||
member_id = int(row_dict['member_id'])
|
||||
visit_date = row_dict['visit_date']
|
||||
|
||||
if member_id not in result:
|
||||
result[member_id] = []
|
||||
result[member_id].append(visit_date)
|
||||
|
||||
return result
|
||||
|
||||
def _extract_recharge_data(
|
||||
self,
|
||||
site_id: int,
|
||||
start_date: date,
|
||||
end_date: date
|
||||
) -> Dict[int, date]:
|
||||
"""
|
||||
提取最近充值记录
|
||||
|
||||
Returns:
|
||||
{member_id: last_recharge_date}
|
||||
"""
|
||||
sql = """
|
||||
SELECT
|
||||
member_id,
|
||||
MAX(DATE(pay_time)) AS last_recharge_date
|
||||
FROM billiards_dwd.dwd_recharge_order
|
||||
WHERE site_id = %s
|
||||
AND member_id > 0
|
||||
AND settle_type = 5 -- 充值订单
|
||||
AND pay_time >= %s
|
||||
AND pay_time < %s + INTERVAL '1 day'
|
||||
GROUP BY member_id
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql, (site_id, start_date, end_date))
|
||||
|
||||
result: Dict[int, date] = {}
|
||||
for row in (rows or []):
|
||||
row_dict = dict(row)
|
||||
result[int(row_dict['member_id'])] = row_dict['last_recharge_date']
|
||||
|
||||
return result
|
||||
|
||||
def _extract_first_visit_data(
|
||||
self,
|
||||
site_id: int,
|
||||
member_ids: List[int]
|
||||
) -> Dict[int, date]:
|
||||
"""
|
||||
提取首访时间
|
||||
|
||||
优先使用dim_member.create_time,如果没有则使用dwd_settlement_head中的首次消费时间
|
||||
|
||||
Returns:
|
||||
{member_id: first_visit_date}
|
||||
"""
|
||||
if not member_ids:
|
||||
return {}
|
||||
|
||||
# 使用dim_member的create_time作为首访时间
|
||||
member_ids_str = ','.join(str(m) for m in member_ids)
|
||||
sql = f"""
|
||||
SELECT
|
||||
member_id,
|
||||
DATE(create_time) AS first_visit_date
|
||||
FROM billiards_dwd.dim_member
|
||||
WHERE member_id IN ({member_ids_str})
|
||||
AND scd2_is_current = 1
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql)
|
||||
|
||||
result: Dict[int, date] = {}
|
||||
for row in (rows or []):
|
||||
row_dict = dict(row)
|
||||
member_id = int(row_dict['member_id'])
|
||||
first_date = row_dict['first_visit_date']
|
||||
if first_date:
|
||||
result[member_id] = first_date
|
||||
|
||||
return result
|
||||
|
||||
# ==========================================================================
|
||||
# 特征计算方法
|
||||
# ==========================================================================
|
||||
|
||||
def _calculate_visit_features(
|
||||
self,
|
||||
data: MemberRecallData,
|
||||
visit_dates: List[date],
|
||||
base_date: date,
|
||||
params: Dict[str, float]
|
||||
) -> None:
|
||||
"""计算到店特征"""
|
||||
if not visit_dates:
|
||||
return
|
||||
|
||||
# 最近一次到店
|
||||
last_visit = max(visit_dates)
|
||||
data.days_since_last_visit = (base_date - last_visit).days
|
||||
|
||||
# 到店间隔
|
||||
sorted_dates = sorted(visit_dates)
|
||||
intervals = []
|
||||
for i in range(1, len(sorted_dates)):
|
||||
interval = (sorted_dates[i] - sorted_dates[i-1]).days
|
||||
intervals.append(float(interval))
|
||||
|
||||
if intervals:
|
||||
# 中位数(μ)
|
||||
data.visit_interval_median = self.calculate_median(intervals)
|
||||
|
||||
# MAD(σ),下限为sigma_min
|
||||
mad = self.calculate_mad(intervals)
|
||||
data.visit_interval_mad = max(mad, params['sigma_min'])
|
||||
else:
|
||||
# 只有一次到店,使用默认值
|
||||
data.visit_interval_median = 7.0 # 默认周期7天
|
||||
data.visit_interval_mad = params['sigma_min']
|
||||
|
||||
# 近14天/60天到店次数
|
||||
days_14_ago = base_date - timedelta(days=14)
|
||||
days_60_ago = base_date - timedelta(days=60)
|
||||
|
||||
data.visits_last_14_days = sum(1 for d in visit_dates if d >= days_14_ago)
|
||||
data.visits_last_60_days = sum(1 for d in visit_dates if d >= days_60_ago)
|
||||
|
||||
def _calculate_component_scores(
|
||||
self,
|
||||
data: MemberRecallData,
|
||||
params: Dict[str, float]
|
||||
) -> None:
|
||||
"""计算4项分数"""
|
||||
|
||||
# 1. 超期紧急性
|
||||
if data.days_since_last_visit is not None and data.visit_interval_median is not None:
|
||||
t = data.days_since_last_visit
|
||||
mu = data.visit_interval_median
|
||||
sigma = data.visit_interval_mad or params['sigma_min']
|
||||
|
||||
# z = max(0, (t - μ) / σ)
|
||||
z = max(0.0, (t - mu) / sigma)
|
||||
# overdue = 1 - exp(-z)
|
||||
data.score_overdue = 1.0 - math.exp(-z)
|
||||
|
||||
# 2. 新客户加分
|
||||
lookback_days = int(params['lookback_days'])
|
||||
if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days:
|
||||
data.score_new_bonus = self.decay(
|
||||
data.days_since_first_visit,
|
||||
params['halflife_new']
|
||||
)
|
||||
|
||||
# 3. 刚充值加分
|
||||
if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days:
|
||||
data.score_recharge_bonus = self.decay(
|
||||
data.days_since_last_recharge,
|
||||
params['halflife_recharge']
|
||||
)
|
||||
|
||||
# 4. 热度断档加分
|
||||
epsilon = 1e-6
|
||||
n14 = data.visits_last_14_days
|
||||
n60 = data.visits_last_60_days
|
||||
|
||||
r14 = n14 / 14.0
|
||||
r60 = (n60 + 1) / 60.0 # +1 平滑
|
||||
|
||||
hot_ratio = r14 / (r60 + epsilon)
|
||||
|
||||
# hot_drop = max(0, ln(1 + (hot_ratio - 1)))
|
||||
if hot_ratio > 1:
|
||||
data.score_hot_drop = self.safe_ln1p(hot_ratio - 1)
|
||||
else:
|
||||
data.score_hot_drop = 0.0
|
||||
|
||||
# ==========================================================================
|
||||
# 数据保存方法
|
||||
# ==========================================================================
|
||||
|
||||
def _save_recall_data(self, data_list: List[MemberRecallData]) -> int:
|
||||
"""保存召回数据到DWS表"""
|
||||
if not data_list:
|
||||
return 0
|
||||
|
||||
# 先删除已存在的记录
|
||||
site_id = data_list[0].site_id
|
||||
member_ids = [d.member_id for d in data_list]
|
||||
|
||||
member_ids_str = ','.join(str(m) for m in member_ids)
|
||||
delete_sql = f"""
|
||||
DELETE FROM billiards_dws.dws_member_recall_index
|
||||
WHERE site_id = %s AND member_id IN ({member_ids_str})
|
||||
"""
|
||||
|
||||
with self.db.conn.cursor() as cur:
|
||||
cur.execute(delete_sql, (site_id,))
|
||||
|
||||
# 插入新记录
|
||||
insert_sql = """
|
||||
INSERT INTO billiards_dws.dws_member_recall_index (
|
||||
site_id, tenant_id, member_id,
|
||||
days_since_last_visit, visit_interval_median, visit_interval_mad,
|
||||
days_since_first_visit, days_since_last_recharge,
|
||||
visits_last_14_days, visits_last_60_days,
|
||||
score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop,
|
||||
raw_score, display_score,
|
||||
calc_time, created_at, updated_at
|
||||
) VALUES (
|
||||
%s, %s, %s,
|
||||
%s, %s, %s,
|
||||
%s, %s,
|
||||
%s, %s,
|
||||
%s, %s, %s, %s,
|
||||
%s, %s,
|
||||
NOW(), NOW(), NOW()
|
||||
)
|
||||
"""
|
||||
|
||||
inserted = 0
|
||||
with self.db.conn.cursor() as cur:
|
||||
for data in data_list:
|
||||
cur.execute(insert_sql, (
|
||||
data.site_id, data.tenant_id, data.member_id,
|
||||
data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad,
|
||||
data.days_since_first_visit, data.days_since_last_recharge,
|
||||
data.visits_last_14_days, data.visits_last_60_days,
|
||||
data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop,
|
||||
data.raw_score, data.display_score
|
||||
))
|
||||
inserted += cur.rowcount
|
||||
|
||||
# 提交事务
|
||||
self.db.conn.commit()
|
||||
|
||||
return inserted
|
||||
|
||||
# ==========================================================================
|
||||
# 辅助方法
|
||||
# ==========================================================================
|
||||
|
||||
def _load_params(self) -> Dict[str, float]:
|
||||
"""加载参数,缺失时使用默认值"""
|
||||
params = self.load_index_parameters()
|
||||
result = dict(self.DEFAULT_PARAMS)
|
||||
result.update(params)
|
||||
return result
|
||||
|
||||
def _get_site_id(self, context: Optional[TaskContext]) -> int:
|
||||
"""获取门店ID"""
|
||||
if context and hasattr(context, 'store_id') and context.store_id:
|
||||
return context.store_id
|
||||
|
||||
# 从配置获取默认门店ID
|
||||
site_id = self.config.get('app.default_site_id')
|
||||
if site_id:
|
||||
return int(site_id)
|
||||
|
||||
# 查询数据库获取第一个门店
|
||||
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head LIMIT 1"
|
||||
rows = self.db.query(sql)
|
||||
if rows:
|
||||
return int(dict(rows[0])['site_id'])
|
||||
|
||||
raise ValueError("无法确定门店ID")
|
||||
|
||||
def _get_tenant_id(self) -> int:
|
||||
"""获取租户ID"""
|
||||
tenant_id = self.config.get('app.tenant_id')
|
||||
if tenant_id:
|
||||
return int(tenant_id)
|
||||
|
||||
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head LIMIT 1"
|
||||
rows = self.db.query(sql)
|
||||
if rows:
|
||||
return int(dict(rows[0])['tenant_id'])
|
||||
|
||||
return 0
|
||||
Reference in New Issue
Block a user