Files
ZQYY.FQ-ETL/tasks/dws/index/intimacy_index_task.py

695 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
客户-助教亲密指数计算任务
功能说明:
- 衡量客户与助教的关系强度和近期温度
- 用于助教约课精力分配和约课成功率预估
- 附加课权重 = 基础课的1.5倍
- 检测频率激增并放大权重
算法公式:
Raw Score = (w_F × F + w_R × R + w_M × M + w_D × D) × mult
其中:
- F = Σ(τ_i × decay(d_i, h_sess)) # 频次强度
- R = decay(d_last, h_last) # 最近温度
- M = Σ(ln(1+amt/A0) × decay(d_r, h_pay)) # 归因充值强度
- D = Σ(sqrt(dur/60) × τ × decay(d, h)) # 时长贡献
- mult = 1 + γ × burst # 激增放大
- burst = max(0, ln(1 + (F_short/F_long - 1)))
特殊逻辑:
- 会话合并:同一客人对同一助教,间隔<4小时算同次服务
- 充值归因服务结束后1小时内的充值算做该助教贡献
数据来源:
- dwd_assistant_service_log: 服务记录
- dwd_recharge_order: 充值记录
更新频率每4小时
作者ETL团队
创建日期2026-02-03
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import CourseType, TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class ServiceSession:
"""合并后的服务会话"""
session_start: datetime
session_end: datetime
total_duration_minutes: int = 0
course_weight: float = 1.0 # 1.0=基础课, 1.5=附加课
is_incentive: bool = False # 是否为附加课
@dataclass
class AttributedRecharge:
"""归因充值"""
pay_time: datetime
pay_amount: float
days_ago: float
@dataclass
class MemberAssistantIntimacyData:
"""客户-助教亲密数据"""
member_id: int
assistant_id: int # 助教IDdim_assistant.assistant_id通过user_id关联获取
assistant_user_id: int # 助教user_id来自服务日志用于中间关联
site_id: int
tenant_id: int
# 计算输入特征
session_count: int = 0
total_duration_minutes: int = 0
basic_session_count: int = 0
incentive_session_count: int = 0
days_since_last_session: Optional[int] = None
attributed_recharge_count: int = 0
attributed_recharge_amount: float = 0.0
# 分项得分
score_frequency: float = 0.0
score_recency: float = 0.0
score_recharge: float = 0.0
score_duration: float = 0.0
burst_multiplier: float = 1.0
# 最终分数
raw_score: float = 0.0
display_score: float = 0.0
# 中间数据
sessions: List[ServiceSession] = field(default_factory=list)
recharges: List[AttributedRecharge] = field(default_factory=list)
# =============================================================================
# 亲密指数任务
# =============================================================================
class IntimacyIndexTask(BaseIndexTask):
"""
客户-助教亲密指数计算任务
计算流程:
1. 提取近60天的助教服务记录
2. 按(member_id, assistant_id)分组合并4小时内的服务
3. 提取归因充值服务结束后1小时内
4. 计算5项分数频次、最近、充值、时长、激增
5. 汇总Raw Score
6. 分位截断 + Log压缩 + MinMax映射到0-10
7. 写入DWS表
"""
INDEX_TYPE = "INTIMACY"
# 默认参数
DEFAULT_PARAMS = {
'lookback_days': 60,
'halflife_session': 14.0,
'halflife_last': 10.0,
'halflife_recharge': 21.0,
'halflife_short': 7.0,
'halflife_long': 30.0,
'amount_base': 500.0,
'incentive_weight': 1.5,
'session_merge_hours': 4,
'recharge_attribute_hours': 1,
'weight_frequency': 2.0,
'weight_recency': 1.5,
'weight_recharge': 2.0,
'weight_duration': 0.5,
'burst_gamma': 0.6,
'compression_mode': 1, # 0=none, 1=log1p, 2=asinh
'use_smoothing': 1, # 1=启用EWMA平滑, 0=关闭
'percentile_lower': 5,
'percentile_upper': 95,
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
return "DWS_INTIMACY_INDEX"
def get_target_table(self) -> str:
return "dws_member_assistant_intimacy"
def get_primary_keys(self) -> List[str]:
return ['site_id', 'member_id', 'assistant_id']
def get_index_type(self) -> str:
return self.INDEX_TYPE
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
"""执行亲密指数计算"""
self.logger.info("开始计算客户-助教亲密指数")
# 获取门店ID
site_id = self._get_site_id(context)
tenant_id = self._get_tenant_id()
# 加载参数
params = self._load_params()
lookback_days = int(params['lookback_days'])
# 计算基准日期和时间
now = datetime.now(self.tz)
base_date = now.date()
start_datetime = now - timedelta(days=lookback_days)
self.logger.info(
"参数: lookback=%d天, h_sess=%.1f, h_last=%.1f, h_pay=%.1f, γ=%.2f",
lookback_days, params['halflife_session'], params['halflife_last'],
params['halflife_recharge'], params['burst_gamma']
)
# 1. 提取服务记录
raw_services = self._extract_service_records(site_id, start_datetime, now)
self.logger.info("提取到 %d 条原始服务记录", len(raw_services))
if not raw_services:
self.logger.warning("没有服务记录,跳过计算")
return {'status': 'skipped', 'reason': 'no_data'}
# 2. 按(member_id, assistant_id)分组并合并会话
pair_data = self._group_and_merge_sessions(raw_services, params, now)
self.logger.info("合并为 %d 个客户-助教对", len(pair_data))
# 3. 提取归因充值
self._extract_attributed_recharges(site_id, pair_data, params, now)
# 4. 计算每个pair的特征和分数
intimacy_data_list: List[MemberAssistantIntimacyData] = []
for key, data in pair_data.items():
data.site_id = site_id
data.tenant_id = tenant_id
# 计算分项得分
self._calculate_component_scores(data, params, now)
# 汇总Raw Score
base_score = (
params['weight_frequency'] * data.score_frequency +
params['weight_recency'] * data.score_recency +
params['weight_recharge'] * data.score_recharge +
params['weight_duration'] * data.score_duration
)
data.raw_score = base_score * data.burst_multiplier
intimacy_data_list.append(data)
self.logger.info("计算完成 %d 个pair的Raw Score", len(intimacy_data_list))
# 5. 归一化到Display Score支持log1p/asinh压缩
compression_mode = int(params.get('compression_mode', 1))
compression = {1: "log1p", 2: "asinh"}.get(compression_mode, "none")
use_smoothing = bool(int(params.get('use_smoothing', 1)))
raw_scores = [((d.member_id, d.assistant_id), d.raw_score) for d in intimacy_data_list]
normalized = self.batch_normalize_to_display(
raw_scores,
compression=compression,
percentile_lower=int(params['percentile_lower']),
percentile_upper=int(params['percentile_upper']),
use_smoothing=use_smoothing,
site_id=site_id
)
# 更新display_score
score_map = {key: (raw, display) for key, raw, display in normalized}
for data in intimacy_data_list:
key = (data.member_id, data.assistant_id)
if key in score_map:
_, data.display_score = score_map[key]
# 6. 保存分位点历史
if intimacy_data_list:
all_raw = [d.raw_score for d in intimacy_data_list]
q_l, q_u = self.calculate_percentiles(
all_raw,
int(params['percentile_lower']),
int(params['percentile_upper'])
)
if use_smoothing:
smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
else:
smoothed_l, smoothed_u = q_l, q_u
self.save_percentile_history(
site_id=site_id,
percentile_5=q_l,
percentile_95=q_u,
percentile_5_smoothed=smoothed_l,
percentile_95_smoothed=smoothed_u,
record_count=len(all_raw),
min_raw=min(all_raw),
max_raw=max(all_raw),
avg_raw=sum(all_raw) / len(all_raw)
)
# 7. 写入DWS表
inserted = self._save_intimacy_data(intimacy_data_list)
self.logger.info("亲密指数计算完成,写入 %d 条记录", inserted)
return {
'status': 'success',
'pair_count': len(intimacy_data_list),
'records_inserted': inserted
}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_records(
self,
site_id: int,
start_datetime: datetime,
end_datetime: datetime
) -> List[Dict[str, Any]]:
"""
提取服务记录
注意: 使用 assistant_no (助教工号) 作为助教标识,而不是 site_assistant_id
因为 site_assistant_id 在数据中是每次服务的唯一ID不是助教的唯一标识
Returns:
[{'member_id', 'assistant_no', 'assistant_nickname', 'start_time', 'end_time', 'duration_minutes', 'skill_id'}, ...]
"""
# 通过 user_id 关联 dim_assistant 获取 assistant_id
sql = """
SELECT
s.tenant_member_id AS member_id,
s.user_id AS assistant_user_id,
d.assistant_id,
s.start_use_time,
s.last_use_time,
COALESCE(s.income_seconds, 0) / 60 AS duration_minutes,
s.skill_id
FROM billiards_dwd.dwd_assistant_service_log s
JOIN billiards_dwd.dim_assistant d
ON s.user_id = d.user_id AND d.scd2_is_current = 1
WHERE s.site_id = %s
AND s.tenant_member_id > 0 -- 排除散客
AND s.is_delete = 0
AND s.user_id > 0 -- 确保有助教user_id
AND s.last_use_time >= %s
AND s.last_use_time < %s
ORDER BY s.tenant_member_id, d.assistant_id, s.start_use_time
"""
rows = self.db.query(sql, (site_id, start_datetime, end_datetime))
result = []
for row in (rows or []):
row_dict = dict(row)
assistant_id = row_dict['assistant_id']
if assistant_id:
result.append({
'member_id': int(row_dict['member_id']),
'assistant_id': int(assistant_id), # 助教IDdim_assistant主键
'assistant_user_id': int(row_dict['assistant_user_id']), # user_id用于中间处理
'start_time': row_dict['start_use_time'],
'end_time': row_dict['last_use_time'],
'duration_minutes': int(row_dict['duration_minutes'] or 0),
'skill_id': int(row_dict['skill_id'] or 0)
})
return result
def _group_and_merge_sessions(
self,
raw_services: List[Dict[str, Any]],
params: Dict[str, float],
now: datetime
) -> Dict[Tuple[int, int], MemberAssistantIntimacyData]:
"""
按(member_id, assistant_id)分组并合并会话
合并逻辑:同一客人对同一助教,间隔<4小时算同次服务
"""
merge_threshold_hours = int(params['session_merge_hours'])
merge_threshold = timedelta(hours=merge_threshold_hours)
incentive_weight = params['incentive_weight']
pair_data: Dict[Tuple[int, int], MemberAssistantIntimacyData] = {}
# 按pair分组使用assistant_id
pair_services: Dict[Tuple[int, int], List[Dict[str, Any]]] = {}
for svc in raw_services:
key = (svc['member_id'], svc['assistant_id'])
if key not in pair_services:
pair_services[key] = []
pair_services[key].append(svc)
# 对每个pair合并会话
for key, services in pair_services.items():
member_id, assistant_id = key
# 取第一个服务记录的user_id
assistant_user_id = services[0]['assistant_user_id'] if services else 0
data = MemberAssistantIntimacyData(
member_id=member_id,
assistant_id=assistant_id,
assistant_user_id=assistant_user_id,
site_id=0, # 稍后填充
tenant_id=0
)
# 按开始时间排序
sorted_services = sorted(services, key=lambda x: x['start_time'])
# 合并会话
current_session: Optional[ServiceSession] = None
for svc in sorted_services:
start_time = svc['start_time']
end_time = svc['end_time']
duration = svc['duration_minutes']
skill_id = svc['skill_id']
# 判断课型(附加课权重更高,包厢课按基础课处理)
course_type = self.get_course_type(skill_id)
is_incentive = course_type == CourseType.BONUS
weight = incentive_weight if is_incentive else 1.0
if current_session is None:
# 开始新会话
current_session = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive
)
elif start_time - current_session.session_end <= merge_threshold:
# 合并到当前会话
current_session.session_end = max(current_session.session_end, end_time)
current_session.total_duration_minutes += duration
# 同次服务取最高权重
current_session.course_weight = max(current_session.course_weight, weight)
current_session.is_incentive = current_session.is_incentive or is_incentive
else:
# 保存当前会话,开始新会话
data.sessions.append(current_session)
current_session = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive
)
# 保存最后一个会话
if current_session is not None:
data.sessions.append(current_session)
# 统计特征
data.session_count = len(data.sessions)
data.total_duration_minutes = sum(s.total_duration_minutes for s in data.sessions)
data.basic_session_count = sum(1 for s in data.sessions if not s.is_incentive)
data.incentive_session_count = sum(1 for s in data.sessions if s.is_incentive)
# 最近一次服务
if data.sessions:
last_session = max(data.sessions, key=lambda s: s.session_end)
data.days_since_last_session = (now - last_session.session_end).days
pair_data[key] = data
return pair_data
def _extract_attributed_recharges(
self,
site_id: int,
pair_data: Dict[Tuple[int, int], MemberAssistantIntimacyData],
params: Dict[str, float],
now: datetime
) -> None:
"""
提取归因充值
归因逻辑服务结束后1小时内的充值算做该助教贡献
"""
attribution_hours = int(params['recharge_attribute_hours'])
attribution_window = timedelta(hours=attribution_hours)
# 获取所有相关会员ID
member_ids = set(key[0] for key in pair_data.keys())
if not member_ids:
return
member_ids_str = ','.join(str(m) for m in member_ids)
# 查询充值记录
sql = f"""
SELECT
member_id,
pay_time,
pay_amount
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND member_id IN ({member_ids_str})
AND settle_type = 5 -- 充值订单
AND pay_time >= %s
"""
lookback_days = int(params['lookback_days'])
start_datetime = now - timedelta(days=lookback_days)
rows = self.db.query(sql, (site_id, start_datetime))
# 为每个充值找到归因助教
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
pay_time = row_dict['pay_time']
pay_amount = float(row_dict['pay_amount'] or 0)
if pay_amount <= 0:
continue
# 查找该会员在pay_time前1小时内结束服务的助教
for key, data in pair_data.items():
if key[0] != member_id:
continue
for session in data.sessions:
# 服务结束后1小时内的充值
if (session.session_end <= pay_time and
pay_time - session.session_end <= attribution_window):
# 归因给这个助教
data.attributed_recharge_count += 1
data.attributed_recharge_amount += pay_amount
data.recharges.append(AttributedRecharge(
pay_time=pay_time,
pay_amount=pay_amount,
days_ago=(now - pay_time).total_seconds() / 86400
))
break # 一笔充值只归因给一个助教
# ==========================================================================
# 分数计算方法
# ==========================================================================
def _calculate_component_scores(
self,
data: MemberAssistantIntimacyData,
params: Dict[str, float],
now: datetime
) -> None:
"""计算5项分数"""
epsilon = 1e-6
lookback_days = int(params['lookback_days'])
h_sess = params['halflife_session']
h_last = params['halflife_last']
h_pay = params['halflife_recharge']
h_short = params['halflife_short']
h_long = params['halflife_long']
A0 = params['amount_base']
gamma = params['burst_gamma']
# 1. 频次强度 F = Σ(τ_i × decay(d_i, h_sess))
F = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
days_ago = min(days_ago, lookback_days)
F += session.course_weight * self.decay(days_ago, h_sess)
data.score_frequency = F
# 2. 最近温度 R = decay(d_last, h_last)
if data.days_since_last_session is not None:
data.score_recency = self.decay(min(data.days_since_last_session, lookback_days), h_last)
else:
data.score_recency = 0.0
# 3. 归因充值强度 M = Σ(ln(1+amt/A0) × decay(d_r, h_pay))
M = 0.0
for recharge in data.recharges:
m_amt = math.log1p(recharge.pay_amount / A0)
M += m_amt * self.decay(min(recharge.days_ago, lookback_days), h_pay)
data.score_recharge = M
# 4. 时长贡献 D = Σ(sqrt(dur/60) × τ × decay(d, h_sess))
D = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
dur_hours = session.total_duration_minutes / 60.0
days_ago = min(days_ago, lookback_days)
D += math.sqrt(dur_hours) * session.course_weight * self.decay(days_ago, h_sess)
data.score_duration = D
# 5. 频率激增放大 mult = 1 + γ × burst
# F_short = Σ(τ × decay(d, h_short))
# F_long = Σ(τ × decay(d, h_long))
F_short = 0.0
F_long = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
days_ago = min(days_ago, lookback_days)
F_short += session.course_weight * self.decay(days_ago, h_short)
F_long += session.course_weight * self.decay(days_ago, h_long)
# burst = max(0, ln(1 + (F_short/F_long - 1)))
ratio = F_short / (F_long + epsilon)
if ratio > 1:
burst = self.safe_ln1p(ratio - 1)
else:
burst = 0.0
data.burst_multiplier = 1 + gamma * burst
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_intimacy_data(self, data_list: List[MemberAssistantIntimacyData]) -> int:
"""保存亲密数据到DWS表"""
if not data_list:
return 0
# 先删除已存在的记录
site_id = data_list[0].site_id
# 构建删除条件使用assistant_id
keys = [(d.member_id, d.assistant_id) for d in data_list]
conditions = " OR ".join(
f"(member_id = {m} AND assistant_id = {a})" for m, a in keys
)
delete_sql = f"""
DELETE FROM billiards_dws.dws_member_assistant_intimacy
WHERE site_id = %s AND ({conditions})
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
# 插入新记录
insert_sql = """
INSERT INTO billiards_dws.dws_member_assistant_intimacy (
site_id, tenant_id, member_id, assistant_id,
session_count, total_duration_minutes,
basic_session_count, incentive_session_count,
days_since_last_session,
attributed_recharge_count, attributed_recharge_amount,
score_frequency, score_recency, score_recharge, score_duration,
burst_multiplier, raw_score, display_score,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s, %s,
%s, %s,
%s, %s,
%s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
cur.execute(insert_sql, (
data.site_id, data.tenant_id, data.member_id, data.assistant_id,
data.session_count, data.total_duration_minutes,
data.basic_session_count, data.incentive_session_count,
data.days_since_last_session,
data.attributed_recharge_count, data.attributed_recharge_amount,
data.score_frequency, data.score_recency, data.score_recharge, data.score_duration,
data.burst_multiplier, data.raw_score, data.display_score
))
inserted += cur.rowcount
# 提交事务
self.db.conn.commit()
return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
"""加载参数,缺失时使用默认值"""
params = self.load_index_parameters()
result = dict(self.DEFAULT_PARAMS)
result.update(params)
return result
def _get_site_id(self, context: Optional[TaskContext]) -> int:
"""获取门店ID"""
if context and hasattr(context, 'store_id') and context.store_id:
return context.store_id
site_id = self.config.get('app.default_site_id')
if site_id:
return int(site_id)
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0])['site_id'])
raise ValueError("无法确定门店ID")
def _get_tenant_id(self) -> int:
"""获取租户ID"""
tenant_id = self.config.get('app.tenant_id')
if tenant_id:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0])['tenant_id'])
return 0