初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

View File

@@ -0,0 +1,694 @@
# -*- coding: utf-8 -*-
"""
客户-助教亲密指数计算任务
功能说明:
- 衡量客户与助教的关系强度和近期温度
- 用于助教约课精力分配和约课成功率预估
- 附加课权重 = 基础课的1.5倍
- 检测频率激增并放大权重
算法公式:
Raw Score = (w_F × F + w_R × R + w_M × M + w_D × D) × mult
其中:
- F = Σ(τ_i × decay(d_i, h_sess)) # 频次强度
- R = decay(d_last, h_last) # 最近温度
- M = Σ(ln(1+amt/A0) × decay(d_r, h_pay)) # 归因充值强度
- D = Σ(sqrt(dur/60) × τ × decay(d, h)) # 时长贡献
- mult = 1 + γ × burst # 激增放大
- burst = max(0, ln(1 + (F_short/F_long - 1)))
特殊逻辑:
- 会话合并:同一客人对同一助教,间隔<4小时算同次服务
- 充值归因服务结束后1小时内的充值算做该助教贡献
数据来源:
- dwd_assistant_service_log: 服务记录
- dwd_recharge_order: 充值记录
更新频率每4小时
作者ETL团队
创建日期2026-02-03
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import CourseType, TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class ServiceSession:
"""合并后的服务会话"""
session_start: datetime
session_end: datetime
total_duration_minutes: int = 0
course_weight: float = 1.0 # 1.0=基础课, 1.5=附加课
is_incentive: bool = False # 是否为附加课
@dataclass
class AttributedRecharge:
"""归因充值"""
pay_time: datetime
pay_amount: float
days_ago: float
@dataclass
class MemberAssistantIntimacyData:
"""客户-助教亲密数据"""
member_id: int
assistant_id: int # 助教IDdim_assistant.assistant_id通过user_id关联获取
assistant_user_id: int # 助教user_id来自服务日志用于中间关联
site_id: int
tenant_id: int
# 计算输入特征
session_count: int = 0
total_duration_minutes: int = 0
basic_session_count: int = 0
incentive_session_count: int = 0
days_since_last_session: Optional[int] = None
attributed_recharge_count: int = 0
attributed_recharge_amount: float = 0.0
# 分项得分
score_frequency: float = 0.0
score_recency: float = 0.0
score_recharge: float = 0.0
score_duration: float = 0.0
burst_multiplier: float = 1.0
# 最终分数
raw_score: float = 0.0
display_score: float = 0.0
# 中间数据
sessions: List[ServiceSession] = field(default_factory=list)
recharges: List[AttributedRecharge] = field(default_factory=list)
# =============================================================================
# 亲密指数任务
# =============================================================================
class IntimacyIndexTask(BaseIndexTask):
"""
客户-助教亲密指数计算任务
计算流程:
1. 提取近60天的助教服务记录
2. 按(member_id, assistant_id)分组合并4小时内的服务
3. 提取归因充值服务结束后1小时内
4. 计算5项分数频次、最近、充值、时长、激增
5. 汇总Raw Score
6. 分位截断 + Log压缩 + MinMax映射到0-10
7. 写入DWS表
"""
INDEX_TYPE = "INTIMACY"
# 默认参数
DEFAULT_PARAMS = {
'lookback_days': 60,
'halflife_session': 14.0,
'halflife_last': 10.0,
'halflife_recharge': 21.0,
'halflife_short': 7.0,
'halflife_long': 30.0,
'amount_base': 500.0,
'incentive_weight': 1.5,
'session_merge_hours': 4,
'recharge_attribute_hours': 1,
'weight_frequency': 2.0,
'weight_recency': 1.5,
'weight_recharge': 2.0,
'weight_duration': 0.5,
'burst_gamma': 0.6,
'compression_mode': 1, # 0=none, 1=log1p, 2=asinh
'use_smoothing': 1, # 1=启用EWMA平滑, 0=关闭
'percentile_lower': 5,
'percentile_upper': 95,
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
return "DWS_INTIMACY_INDEX"
def get_target_table(self) -> str:
return "dws_member_assistant_intimacy"
def get_primary_keys(self) -> List[str]:
return ['site_id', 'member_id', 'assistant_id']
def get_index_type(self) -> str:
return self.INDEX_TYPE
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
"""执行亲密指数计算"""
self.logger.info("开始计算客户-助教亲密指数")
# 获取门店ID
site_id = self._get_site_id(context)
tenant_id = self._get_tenant_id()
# 加载参数
params = self._load_params()
lookback_days = int(params['lookback_days'])
# 计算基准日期和时间
now = datetime.now(self.tz)
base_date = now.date()
start_datetime = now - timedelta(days=lookback_days)
self.logger.info(
"参数: lookback=%d天, h_sess=%.1f, h_last=%.1f, h_pay=%.1f, γ=%.2f",
lookback_days, params['halflife_session'], params['halflife_last'],
params['halflife_recharge'], params['burst_gamma']
)
# 1. 提取服务记录
raw_services = self._extract_service_records(site_id, start_datetime, now)
self.logger.info("提取到 %d 条原始服务记录", len(raw_services))
if not raw_services:
self.logger.warning("没有服务记录,跳过计算")
return {'status': 'skipped', 'reason': 'no_data'}
# 2. 按(member_id, assistant_id)分组并合并会话
pair_data = self._group_and_merge_sessions(raw_services, params, now)
self.logger.info("合并为 %d 个客户-助教对", len(pair_data))
# 3. 提取归因充值
self._extract_attributed_recharges(site_id, pair_data, params, now)
# 4. 计算每个pair的特征和分数
intimacy_data_list: List[MemberAssistantIntimacyData] = []
for key, data in pair_data.items():
data.site_id = site_id
data.tenant_id = tenant_id
# 计算分项得分
self._calculate_component_scores(data, params, now)
# 汇总Raw Score
base_score = (
params['weight_frequency'] * data.score_frequency +
params['weight_recency'] * data.score_recency +
params['weight_recharge'] * data.score_recharge +
params['weight_duration'] * data.score_duration
)
data.raw_score = base_score * data.burst_multiplier
intimacy_data_list.append(data)
self.logger.info("计算完成 %d 个pair的Raw Score", len(intimacy_data_list))
# 5. 归一化到Display Score支持log1p/asinh压缩
compression_mode = int(params.get('compression_mode', 1))
compression = {1: "log1p", 2: "asinh"}.get(compression_mode, "none")
use_smoothing = bool(int(params.get('use_smoothing', 1)))
raw_scores = [((d.member_id, d.assistant_id), d.raw_score) for d in intimacy_data_list]
normalized = self.batch_normalize_to_display(
raw_scores,
compression=compression,
percentile_lower=int(params['percentile_lower']),
percentile_upper=int(params['percentile_upper']),
use_smoothing=use_smoothing,
site_id=site_id
)
# 更新display_score
score_map = {key: (raw, display) for key, raw, display in normalized}
for data in intimacy_data_list:
key = (data.member_id, data.assistant_id)
if key in score_map:
_, data.display_score = score_map[key]
# 6. 保存分位点历史
if intimacy_data_list:
all_raw = [d.raw_score for d in intimacy_data_list]
q_l, q_u = self.calculate_percentiles(
all_raw,
int(params['percentile_lower']),
int(params['percentile_upper'])
)
if use_smoothing:
smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
else:
smoothed_l, smoothed_u = q_l, q_u
self.save_percentile_history(
site_id=site_id,
percentile_5=q_l,
percentile_95=q_u,
percentile_5_smoothed=smoothed_l,
percentile_95_smoothed=smoothed_u,
record_count=len(all_raw),
min_raw=min(all_raw),
max_raw=max(all_raw),
avg_raw=sum(all_raw) / len(all_raw)
)
# 7. 写入DWS表
inserted = self._save_intimacy_data(intimacy_data_list)
self.logger.info("亲密指数计算完成,写入 %d 条记录", inserted)
return {
'status': 'success',
'pair_count': len(intimacy_data_list),
'records_inserted': inserted
}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_records(
self,
site_id: int,
start_datetime: datetime,
end_datetime: datetime
) -> List[Dict[str, Any]]:
"""
提取服务记录
注意: 使用 assistant_no (助教工号) 作为助教标识,而不是 site_assistant_id
因为 site_assistant_id 在数据中是每次服务的唯一ID不是助教的唯一标识
Returns:
[{'member_id', 'assistant_no', 'assistant_nickname', 'start_time', 'end_time', 'duration_minutes', 'skill_id'}, ...]
"""
# 通过 user_id 关联 dim_assistant 获取 assistant_id
sql = """
SELECT
s.tenant_member_id AS member_id,
s.user_id AS assistant_user_id,
d.assistant_id,
s.start_use_time,
s.last_use_time,
COALESCE(s.income_seconds, 0) / 60 AS duration_minutes,
s.skill_id
FROM billiards_dwd.dwd_assistant_service_log s
JOIN billiards_dwd.dim_assistant d
ON s.user_id = d.user_id AND d.scd2_is_current = 1
WHERE s.site_id = %s
AND s.tenant_member_id > 0 -- 排除散客
AND s.is_delete = 0
AND s.user_id > 0 -- 确保有助教user_id
AND s.last_use_time >= %s
AND s.last_use_time < %s
ORDER BY s.tenant_member_id, d.assistant_id, s.start_use_time
"""
rows = self.db.query(sql, (site_id, start_datetime, end_datetime))
result = []
for row in (rows or []):
row_dict = dict(row)
assistant_id = row_dict['assistant_id']
if assistant_id:
result.append({
'member_id': int(row_dict['member_id']),
'assistant_id': int(assistant_id), # 助教IDdim_assistant主键
'assistant_user_id': int(row_dict['assistant_user_id']), # user_id用于中间处理
'start_time': row_dict['start_use_time'],
'end_time': row_dict['last_use_time'],
'duration_minutes': int(row_dict['duration_minutes'] or 0),
'skill_id': int(row_dict['skill_id'] or 0)
})
return result
def _group_and_merge_sessions(
self,
raw_services: List[Dict[str, Any]],
params: Dict[str, float],
now: datetime
) -> Dict[Tuple[int, int], MemberAssistantIntimacyData]:
"""
按(member_id, assistant_id)分组并合并会话
合并逻辑:同一客人对同一助教,间隔<4小时算同次服务
"""
merge_threshold_hours = int(params['session_merge_hours'])
merge_threshold = timedelta(hours=merge_threshold_hours)
incentive_weight = params['incentive_weight']
pair_data: Dict[Tuple[int, int], MemberAssistantIntimacyData] = {}
# 按pair分组使用assistant_id
pair_services: Dict[Tuple[int, int], List[Dict[str, Any]]] = {}
for svc in raw_services:
key = (svc['member_id'], svc['assistant_id'])
if key not in pair_services:
pair_services[key] = []
pair_services[key].append(svc)
# 对每个pair合并会话
for key, services in pair_services.items():
member_id, assistant_id = key
# 取第一个服务记录的user_id
assistant_user_id = services[0]['assistant_user_id'] if services else 0
data = MemberAssistantIntimacyData(
member_id=member_id,
assistant_id=assistant_id,
assistant_user_id=assistant_user_id,
site_id=0, # 稍后填充
tenant_id=0
)
# 按开始时间排序
sorted_services = sorted(services, key=lambda x: x['start_time'])
# 合并会话
current_session: Optional[ServiceSession] = None
for svc in sorted_services:
start_time = svc['start_time']
end_time = svc['end_time']
duration = svc['duration_minutes']
skill_id = svc['skill_id']
# 判断课型(附加课权重更高,包厢课按基础课处理)
course_type = self.get_course_type(skill_id)
is_incentive = course_type == CourseType.BONUS
weight = incentive_weight if is_incentive else 1.0
if current_session is None:
# 开始新会话
current_session = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive
)
elif start_time - current_session.session_end <= merge_threshold:
# 合并到当前会话
current_session.session_end = max(current_session.session_end, end_time)
current_session.total_duration_minutes += duration
# 同次服务取最高权重
current_session.course_weight = max(current_session.course_weight, weight)
current_session.is_incentive = current_session.is_incentive or is_incentive
else:
# 保存当前会话,开始新会话
data.sessions.append(current_session)
current_session = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive
)
# 保存最后一个会话
if current_session is not None:
data.sessions.append(current_session)
# 统计特征
data.session_count = len(data.sessions)
data.total_duration_minutes = sum(s.total_duration_minutes for s in data.sessions)
data.basic_session_count = sum(1 for s in data.sessions if not s.is_incentive)
data.incentive_session_count = sum(1 for s in data.sessions if s.is_incentive)
# 最近一次服务
if data.sessions:
last_session = max(data.sessions, key=lambda s: s.session_end)
data.days_since_last_session = (now - last_session.session_end).days
pair_data[key] = data
return pair_data
def _extract_attributed_recharges(
self,
site_id: int,
pair_data: Dict[Tuple[int, int], MemberAssistantIntimacyData],
params: Dict[str, float],
now: datetime
) -> None:
"""
提取归因充值
归因逻辑服务结束后1小时内的充值算做该助教贡献
"""
attribution_hours = int(params['recharge_attribute_hours'])
attribution_window = timedelta(hours=attribution_hours)
# 获取所有相关会员ID
member_ids = set(key[0] for key in pair_data.keys())
if not member_ids:
return
member_ids_str = ','.join(str(m) for m in member_ids)
# 查询充值记录
sql = f"""
SELECT
member_id,
pay_time,
pay_amount
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND member_id IN ({member_ids_str})
AND settle_type = 5 -- 充值订单
AND pay_time >= %s
"""
lookback_days = int(params['lookback_days'])
start_datetime = now - timedelta(days=lookback_days)
rows = self.db.query(sql, (site_id, start_datetime))
# 为每个充值找到归因助教
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
pay_time = row_dict['pay_time']
pay_amount = float(row_dict['pay_amount'] or 0)
if pay_amount <= 0:
continue
# 查找该会员在pay_time前1小时内结束服务的助教
for key, data in pair_data.items():
if key[0] != member_id:
continue
for session in data.sessions:
# 服务结束后1小时内的充值
if (session.session_end <= pay_time and
pay_time - session.session_end <= attribution_window):
# 归因给这个助教
data.attributed_recharge_count += 1
data.attributed_recharge_amount += pay_amount
data.recharges.append(AttributedRecharge(
pay_time=pay_time,
pay_amount=pay_amount,
days_ago=(now - pay_time).total_seconds() / 86400
))
break # 一笔充值只归因给一个助教
# ==========================================================================
# 分数计算方法
# ==========================================================================
def _calculate_component_scores(
self,
data: MemberAssistantIntimacyData,
params: Dict[str, float],
now: datetime
) -> None:
"""计算5项分数"""
epsilon = 1e-6
lookback_days = int(params['lookback_days'])
h_sess = params['halflife_session']
h_last = params['halflife_last']
h_pay = params['halflife_recharge']
h_short = params['halflife_short']
h_long = params['halflife_long']
A0 = params['amount_base']
gamma = params['burst_gamma']
# 1. 频次强度 F = Σ(τ_i × decay(d_i, h_sess))
F = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
days_ago = min(days_ago, lookback_days)
F += session.course_weight * self.decay(days_ago, h_sess)
data.score_frequency = F
# 2. 最近温度 R = decay(d_last, h_last)
if data.days_since_last_session is not None:
data.score_recency = self.decay(min(data.days_since_last_session, lookback_days), h_last)
else:
data.score_recency = 0.0
# 3. 归因充值强度 M = Σ(ln(1+amt/A0) × decay(d_r, h_pay))
M = 0.0
for recharge in data.recharges:
m_amt = math.log1p(recharge.pay_amount / A0)
M += m_amt * self.decay(min(recharge.days_ago, lookback_days), h_pay)
data.score_recharge = M
# 4. 时长贡献 D = Σ(sqrt(dur/60) × τ × decay(d, h_sess))
D = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
dur_hours = session.total_duration_minutes / 60.0
days_ago = min(days_ago, lookback_days)
D += math.sqrt(dur_hours) * session.course_weight * self.decay(days_ago, h_sess)
data.score_duration = D
# 5. 频率激增放大 mult = 1 + γ × burst
# F_short = Σ(τ × decay(d, h_short))
# F_long = Σ(τ × decay(d, h_long))
F_short = 0.0
F_long = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
days_ago = min(days_ago, lookback_days)
F_short += session.course_weight * self.decay(days_ago, h_short)
F_long += session.course_weight * self.decay(days_ago, h_long)
# burst = max(0, ln(1 + (F_short/F_long - 1)))
ratio = F_short / (F_long + epsilon)
if ratio > 1:
burst = self.safe_ln1p(ratio - 1)
else:
burst = 0.0
data.burst_multiplier = 1 + gamma * burst
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_intimacy_data(self, data_list: List[MemberAssistantIntimacyData]) -> int:
"""保存亲密数据到DWS表"""
if not data_list:
return 0
# 先删除已存在的记录
site_id = data_list[0].site_id
# 构建删除条件使用assistant_id
keys = [(d.member_id, d.assistant_id) for d in data_list]
conditions = " OR ".join(
f"(member_id = {m} AND assistant_id = {a})" for m, a in keys
)
delete_sql = f"""
DELETE FROM billiards_dws.dws_member_assistant_intimacy
WHERE site_id = %s AND ({conditions})
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
# 插入新记录
insert_sql = """
INSERT INTO billiards_dws.dws_member_assistant_intimacy (
site_id, tenant_id, member_id, assistant_id,
session_count, total_duration_minutes,
basic_session_count, incentive_session_count,
days_since_last_session,
attributed_recharge_count, attributed_recharge_amount,
score_frequency, score_recency, score_recharge, score_duration,
burst_multiplier, raw_score, display_score,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s, %s,
%s, %s,
%s, %s,
%s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
cur.execute(insert_sql, (
data.site_id, data.tenant_id, data.member_id, data.assistant_id,
data.session_count, data.total_duration_minutes,
data.basic_session_count, data.incentive_session_count,
data.days_since_last_session,
data.attributed_recharge_count, data.attributed_recharge_amount,
data.score_frequency, data.score_recency, data.score_recharge, data.score_duration,
data.burst_multiplier, data.raw_score, data.display_score
))
inserted += cur.rowcount
# 提交事务
self.db.conn.commit()
return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
"""加载参数,缺失时使用默认值"""
params = self.load_index_parameters()
result = dict(self.DEFAULT_PARAMS)
result.update(params)
return result
def _get_site_id(self, context: Optional[TaskContext]) -> int:
"""获取门店ID"""
if context and hasattr(context, 'store_id') and context.store_id:
return context.store_id
site_id = self.config.get('app.default_site_id')
if site_id:
return int(site_id)
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0])['site_id'])
raise ValueError("无法确定门店ID")
def _get_tenant_id(self) -> int:
"""获取租户ID"""
tenant_id = self.config.get('app.tenant_id')
if tenant_id:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0])['tenant_id'])
return 0