初始提交:飞球 ETL 系统全量代码
This commit is contained in:
587
tasks/dws/index/recall_index_task.py
Normal file
587
tasks/dws/index/recall_index_task.py
Normal file
@@ -0,0 +1,587 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
客户召回指数计算任务
|
||||
|
||||
功能说明:
|
||||
- 衡量客户召回的必要性和紧急程度
|
||||
- 尊重客户个人到店周期(μ=中位数, σ=MAD)
|
||||
- 对新客户、刚充值客户增加召回倾向
|
||||
- 检测"热了又断"的情况
|
||||
|
||||
算法公式:
|
||||
Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop
|
||||
|
||||
其中:
|
||||
- overdue = 1 - exp(-max(0, (t-μ)/σ)) # 超期紧急性
|
||||
- new_bonus = decay(d_first, h_new) # 新客户加分
|
||||
- re_bonus = decay(d_recharge, h_re) # 刚充值加分
|
||||
- hot_drop = max(0, ln(1 + (r14/r60 - 1))) # 热度断档加分
|
||||
|
||||
数据来源:
|
||||
- dwd_settlement_head: 会员到店记录
|
||||
- dwd_recharge_order: 充值记录
|
||||
- dim_member: 首访时间
|
||||
|
||||
更新频率:每2小时
|
||||
|
||||
作者:ETL团队
|
||||
创建日期:2026-02-03
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from .base_index_task import BaseIndexTask, PercentileHistory
|
||||
from ..base_dws_task import TaskContext
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 数据类定义
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class MemberRecallData:
    """Per-member record of recall-index inputs, component scores and results.

    One instance is created per member by ``RecallIndexTask.execute``, filled
    in by the feature/score helpers, and written as one row by
    ``_save_recall_data``.
    """
    member_id: int
    site_id: int
    tenant_id: int

    # --- Input features -----------------------------------------------------
    # Days between the calculation date and the member's most recent visit.
    days_since_last_visit: Optional[int] = None
    # Median of the day gaps between consecutive visits (7.0 is used when the
    # member has a single visit in the window).
    visit_interval_median: Optional[float] = None
    # MAD of the day gaps, floored at the ``sigma_min`` parameter.
    visit_interval_mad: Optional[float] = None
    # Days since the member's first-visit date; None when no date was found.
    days_since_first_visit: Optional[int] = None
    # Days since the latest recharge in the window; None when there is none.
    days_since_last_recharge: Optional[int] = None
    # Number of distinct visit dates in the last 14 / 60 days.
    visits_last_14_days: int = 0
    visits_last_60_days: int = 0

    # --- Component scores ---------------------------------------------------
    score_overdue: float = 0.0
    score_new_bonus: float = 0.0
    score_recharge_bonus: float = 0.0
    score_hot_drop: float = 0.0

    # --- Final scores -------------------------------------------------------
    # Weighted sum of the four component scores.
    raw_score: float = 0.0
    # Normalized score produced by ``batch_normalize_to_display``.
    display_score: float = 0.0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# 召回指数任务
|
||||
# =============================================================================
|
||||
|
||||
class RecallIndexTask(BaseIndexTask):
|
||||
"""
|
||||
客户召回指数计算任务
|
||||
|
||||
计算流程:
|
||||
1. 提取近60天有到店记录的会员
|
||||
2. 计算每个会员的到店间隔特征(中位数、MAD)
|
||||
3. 计算4项分数(超期、新客、充值、热度断档)
|
||||
4. 汇总Raw Score
|
||||
5. 分位截断 + MinMax映射到0-10
|
||||
6. 写入DWS表
|
||||
"""
|
||||
|
||||
# Index type identifier returned by get_index_type().
INDEX_TYPE = "RECALL"

# Default parameters; values returned by load_index_parameters() override
# these key by key (see _load_params).
DEFAULT_PARAMS = {
    # Size in days of the visit/recharge extraction window; also the cut-off
    # beyond which the new-customer and recharge bonuses are not applied.
    'lookback_days': 60,
    # Lower bound applied to the visit-interval MAD (sigma).
    'sigma_min': 2.0,
    # Half-life arguments passed to decay() for the new-customer and
    # recharge bonuses.
    'halflife_new': 7.0,
    'halflife_recharge': 10.0,
    # Weights of the four component scores in the raw score.
    'weight_overdue': 3.0,
    'weight_new': 1.0,
    'weight_recharge': 1.0,
    'weight_hot': 1.0,
    # Percentiles used for clipping when normalizing to the display score.
    'percentile_lower': 5,
    'percentile_upper': 95,
}
|
||||
|
||||
# ==========================================================================
|
||||
# 抽象方法实现
|
||||
# ==========================================================================
|
||||
|
||||
def get_task_code(self) -> str:
    """Return the code that identifies this task: ``"DWS_RECALL_INDEX"``."""
    return "DWS_RECALL_INDEX"
|
||||
|
||||
def get_target_table(self) -> str:
    """Return the name of the DWS table this task writes to."""
    return "dws_member_recall_index"
|
||||
|
||||
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    return ['site_id', 'member_id']
|
||||
|
||||
def get_index_type(self) -> str:
    """Return the index type identifier (the class-level ``INDEX_TYPE``)."""
    return self.INDEX_TYPE
|
||||
|
||||
# ==========================================================================
|
||||
# 任务执行
|
||||
# ==========================================================================
|
||||
|
||||
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run one complete recall-index calculation for a single site.

    Pipeline: resolve site/tenant, load parameters, extract visits,
    recharges and first-visit dates, compute per-member features and the
    four component scores, combine them into a weighted raw score,
    normalize to a display score, store percentile history, persist rows.

    Args:
        context: Optional task context; only forwarded to ``_get_site_id``.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no member has a
        visit in the window; otherwise a dict with ``status='success'``,
        ``member_count`` and ``records_inserted``.
    """
    log = self.logger
    log.info("开始计算客户召回指数")

    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # The window is [today - lookback_days, today].
    today = date.today()
    window_start = today - timedelta(days=lookback_days)

    log.info(
        "参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f",
        lookback_days,
        params['sigma_min'],
        params['halflife_new'],
        params['halflife_recharge'],
    )

    # 1. Visits per member; nothing to do when the window is empty.
    visits_by_member = self._extract_member_visits(site_id, window_start, today)
    log.info("提取到 %d 个会员的到店记录", len(visits_by_member))
    if not visits_by_member:
        log.warning("没有会员到店记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Latest recharge date per member.
    last_recharge_by_member = self._extract_recharge_data(site_id, window_start, today)
    log.info("提取到 %d 个会员的充值记录", len(last_recharge_by_member))

    # 3. First-visit date per member.
    first_visit_by_member = self._extract_first_visit_data(
        site_id, list(visits_by_member.keys())
    )
    log.info("提取到 %d 个会员的首访时间", len(first_visit_by_member))

    # 4. Features, component scores and raw score for every member.
    records: List[MemberRecallData] = []
    for member_id, visit_dates in visits_by_member.items():
        record = MemberRecallData(
            member_id=member_id, site_id=site_id, tenant_id=tenant_id
        )
        self._calculate_visit_features(record, visit_dates, today, params)

        if member_id in last_recharge_by_member:
            elapsed = today - last_recharge_by_member[member_id]
            record.days_since_last_recharge = elapsed.days

        if member_id in first_visit_by_member:
            elapsed = today - first_visit_by_member[member_id]
            record.days_since_first_visit = elapsed.days

        self._calculate_component_scores(record, params)

        # Weighted sum, accumulated in the same left-to-right order.
        total = params['weight_overdue'] * record.score_overdue
        total = total + params['weight_new'] * record.score_new_bonus
        total = total + params['weight_recharge'] * record.score_recharge_bonus
        total = total + params['weight_hot'] * record.score_hot_drop
        record.raw_score = total

        records.append(record)

    log.info("计算完成 %d 个会员的Raw Score", len(records))

    # 5. Normalize raw scores; the helper yields (member_id, raw, display).
    normalized = self.batch_normalize_to_display(
        [(rec.member_id, rec.raw_score) for rec in records],
        use_log=False,
        percentile_lower=int(params['percentile_lower']),
        percentile_upper=int(params['percentile_upper']),
        use_smoothing=True,
        site_id=site_id,
    )
    display_by_member = {mid: display for mid, _raw, display in normalized}
    for rec in records:
        if rec.member_id in display_by_member:
            rec.display_score = display_by_member[rec.member_id]

    # 6. Record this run's raw and smoothed percentiles.
    if records:
        raw_values = [rec.raw_score for rec in records]
        lower_q, upper_q = self.calculate_percentiles(
            raw_values,
            int(params['percentile_lower']),
            int(params['percentile_upper']),
        )
        lower_smooth, upper_smooth = self._apply_ewma_smoothing(site_id, lower_q, upper_q)
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=lower_q,
            percentile_95=upper_q,
            percentile_5_smoothed=lower_smooth,
            percentile_95_smoothed=upper_smooth,
            record_count=len(raw_values),
            min_raw=min(raw_values),
            max_raw=max(raw_values),
            avg_raw=sum(raw_values) / len(raw_values),
        )

    # 7. Persist.
    inserted = self._save_recall_data(records)
    log.info("召回指数计算完成,写入 %d 条记录", inserted)

    return {
        'status': 'success',
        'member_count': len(records),
        'records_inserted': inserted,
    }
|
||||
|
||||
# ==========================================================================
|
||||
# 数据提取方法
|
||||
# ==========================================================================
|
||||
|
||||
def _extract_member_visits(
|
||||
self,
|
||||
site_id: int,
|
||||
start_date: date,
|
||||
end_date: date
|
||||
) -> Dict[int, List[date]]:
|
||||
"""
|
||||
提取会员到店记录
|
||||
|
||||
Returns:
|
||||
{member_id: [visit_date1, visit_date2, ...]}
|
||||
"""
|
||||
sql = """
|
||||
SELECT
|
||||
member_id,
|
||||
DATE(pay_time) AS visit_date
|
||||
FROM billiards_dwd.dwd_settlement_head s
|
||||
WHERE s.site_id = %s
|
||||
AND s.member_id > 0 -- 排除散客
|
||||
AND s.pay_time >= %s
|
||||
AND s.pay_time < %s + INTERVAL '1 day'
|
||||
AND (
|
||||
s.settle_type = 1
|
||||
OR (
|
||||
s.settle_type = 3
|
||||
AND EXISTS (
|
||||
SELECT 1
|
||||
FROM billiards_dwd.dwd_assistant_service_log asl
|
||||
JOIN billiards_dws.cfg_skill_type st
|
||||
ON asl.skill_id = st.skill_id
|
||||
AND st.course_type_code = 'BONUS'
|
||||
AND st.is_active = TRUE
|
||||
WHERE asl.order_settle_id = s.order_settle_id
|
||||
AND asl.site_id = s.site_id
|
||||
AND asl.tenant_member_id = s.member_id
|
||||
AND asl.is_delete = 0
|
||||
)
|
||||
)
|
||||
)
|
||||
GROUP BY member_id, DATE(pay_time)
|
||||
ORDER BY member_id, visit_date
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql, (site_id, start_date, end_date))
|
||||
|
||||
result: Dict[int, List[date]] = {}
|
||||
for row in (rows or []):
|
||||
row_dict = dict(row)
|
||||
member_id = int(row_dict['member_id'])
|
||||
visit_date = row_dict['visit_date']
|
||||
|
||||
if member_id not in result:
|
||||
result[member_id] = []
|
||||
result[member_id].append(visit_date)
|
||||
|
||||
return result
|
||||
|
||||
def _extract_recharge_data(
|
||||
self,
|
||||
site_id: int,
|
||||
start_date: date,
|
||||
end_date: date
|
||||
) -> Dict[int, date]:
|
||||
"""
|
||||
提取最近充值记录
|
||||
|
||||
Returns:
|
||||
{member_id: last_recharge_date}
|
||||
"""
|
||||
sql = """
|
||||
SELECT
|
||||
member_id,
|
||||
MAX(DATE(pay_time)) AS last_recharge_date
|
||||
FROM billiards_dwd.dwd_recharge_order
|
||||
WHERE site_id = %s
|
||||
AND member_id > 0
|
||||
AND settle_type = 5 -- 充值订单
|
||||
AND pay_time >= %s
|
||||
AND pay_time < %s + INTERVAL '1 day'
|
||||
GROUP BY member_id
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql, (site_id, start_date, end_date))
|
||||
|
||||
result: Dict[int, date] = {}
|
||||
for row in (rows or []):
|
||||
row_dict = dict(row)
|
||||
result[int(row_dict['member_id'])] = row_dict['last_recharge_date']
|
||||
|
||||
return result
|
||||
|
||||
def _extract_first_visit_data(
|
||||
self,
|
||||
site_id: int,
|
||||
member_ids: List[int]
|
||||
) -> Dict[int, date]:
|
||||
"""
|
||||
提取首访时间
|
||||
|
||||
优先使用dim_member.create_time,如果没有则使用dwd_settlement_head中的首次消费时间
|
||||
|
||||
Returns:
|
||||
{member_id: first_visit_date}
|
||||
"""
|
||||
if not member_ids:
|
||||
return {}
|
||||
|
||||
# 使用dim_member的create_time作为首访时间
|
||||
member_ids_str = ','.join(str(m) for m in member_ids)
|
||||
sql = f"""
|
||||
SELECT
|
||||
member_id,
|
||||
DATE(create_time) AS first_visit_date
|
||||
FROM billiards_dwd.dim_member
|
||||
WHERE member_id IN ({member_ids_str})
|
||||
AND scd2_is_current = 1
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql)
|
||||
|
||||
result: Dict[int, date] = {}
|
||||
for row in (rows or []):
|
||||
row_dict = dict(row)
|
||||
member_id = int(row_dict['member_id'])
|
||||
first_date = row_dict['first_visit_date']
|
||||
if first_date:
|
||||
result[member_id] = first_date
|
||||
|
||||
return result
|
||||
|
||||
# ==========================================================================
|
||||
# 特征计算方法
|
||||
# ==========================================================================
|
||||
|
||||
def _calculate_visit_features(
|
||||
self,
|
||||
data: MemberRecallData,
|
||||
visit_dates: List[date],
|
||||
base_date: date,
|
||||
params: Dict[str, float]
|
||||
) -> None:
|
||||
"""计算到店特征"""
|
||||
if not visit_dates:
|
||||
return
|
||||
|
||||
# 最近一次到店
|
||||
last_visit = max(visit_dates)
|
||||
data.days_since_last_visit = (base_date - last_visit).days
|
||||
|
||||
# 到店间隔
|
||||
sorted_dates = sorted(visit_dates)
|
||||
intervals = []
|
||||
for i in range(1, len(sorted_dates)):
|
||||
interval = (sorted_dates[i] - sorted_dates[i-1]).days
|
||||
intervals.append(float(interval))
|
||||
|
||||
if intervals:
|
||||
# 中位数(μ)
|
||||
data.visit_interval_median = self.calculate_median(intervals)
|
||||
|
||||
# MAD(σ),下限为sigma_min
|
||||
mad = self.calculate_mad(intervals)
|
||||
data.visit_interval_mad = max(mad, params['sigma_min'])
|
||||
else:
|
||||
# 只有一次到店,使用默认值
|
||||
data.visit_interval_median = 7.0 # 默认周期7天
|
||||
data.visit_interval_mad = params['sigma_min']
|
||||
|
||||
# 近14天/60天到店次数
|
||||
days_14_ago = base_date - timedelta(days=14)
|
||||
days_60_ago = base_date - timedelta(days=60)
|
||||
|
||||
data.visits_last_14_days = sum(1 for d in visit_dates if d >= days_14_ago)
|
||||
data.visits_last_60_days = sum(1 for d in visit_dates if d >= days_60_ago)
|
||||
|
||||
def _calculate_component_scores(
|
||||
self,
|
||||
data: MemberRecallData,
|
||||
params: Dict[str, float]
|
||||
) -> None:
|
||||
"""计算4项分数"""
|
||||
|
||||
# 1. 超期紧急性
|
||||
if data.days_since_last_visit is not None and data.visit_interval_median is not None:
|
||||
t = data.days_since_last_visit
|
||||
mu = data.visit_interval_median
|
||||
sigma = data.visit_interval_mad or params['sigma_min']
|
||||
|
||||
# z = max(0, (t - μ) / σ)
|
||||
z = max(0.0, (t - mu) / sigma)
|
||||
# overdue = 1 - exp(-z)
|
||||
data.score_overdue = 1.0 - math.exp(-z)
|
||||
|
||||
# 2. 新客户加分
|
||||
lookback_days = int(params['lookback_days'])
|
||||
if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days:
|
||||
data.score_new_bonus = self.decay(
|
||||
data.days_since_first_visit,
|
||||
params['halflife_new']
|
||||
)
|
||||
|
||||
# 3. 刚充值加分
|
||||
if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days:
|
||||
data.score_recharge_bonus = self.decay(
|
||||
data.days_since_last_recharge,
|
||||
params['halflife_recharge']
|
||||
)
|
||||
|
||||
# 4. 热度断档加分
|
||||
epsilon = 1e-6
|
||||
n14 = data.visits_last_14_days
|
||||
n60 = data.visits_last_60_days
|
||||
|
||||
r14 = n14 / 14.0
|
||||
r60 = (n60 + 1) / 60.0 # +1 平滑
|
||||
|
||||
hot_ratio = r14 / (r60 + epsilon)
|
||||
|
||||
# hot_drop = max(0, ln(1 + (hot_ratio - 1)))
|
||||
if hot_ratio > 1:
|
||||
data.score_hot_drop = self.safe_ln1p(hot_ratio - 1)
|
||||
else:
|
||||
data.score_hot_drop = 0.0
|
||||
|
||||
# ==========================================================================
|
||||
# 数据保存方法
|
||||
# ==========================================================================
|
||||
|
||||
def _save_recall_data(self, data_list: List[MemberRecallData]) -> int:
|
||||
"""保存召回数据到DWS表"""
|
||||
if not data_list:
|
||||
return 0
|
||||
|
||||
# 先删除已存在的记录
|
||||
site_id = data_list[0].site_id
|
||||
member_ids = [d.member_id for d in data_list]
|
||||
|
||||
member_ids_str = ','.join(str(m) for m in member_ids)
|
||||
delete_sql = f"""
|
||||
DELETE FROM billiards_dws.dws_member_recall_index
|
||||
WHERE site_id = %s AND member_id IN ({member_ids_str})
|
||||
"""
|
||||
|
||||
with self.db.conn.cursor() as cur:
|
||||
cur.execute(delete_sql, (site_id,))
|
||||
|
||||
# 插入新记录
|
||||
insert_sql = """
|
||||
INSERT INTO billiards_dws.dws_member_recall_index (
|
||||
site_id, tenant_id, member_id,
|
||||
days_since_last_visit, visit_interval_median, visit_interval_mad,
|
||||
days_since_first_visit, days_since_last_recharge,
|
||||
visits_last_14_days, visits_last_60_days,
|
||||
score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop,
|
||||
raw_score, display_score,
|
||||
calc_time, created_at, updated_at
|
||||
) VALUES (
|
||||
%s, %s, %s,
|
||||
%s, %s, %s,
|
||||
%s, %s,
|
||||
%s, %s,
|
||||
%s, %s, %s, %s,
|
||||
%s, %s,
|
||||
NOW(), NOW(), NOW()
|
||||
)
|
||||
"""
|
||||
|
||||
inserted = 0
|
||||
with self.db.conn.cursor() as cur:
|
||||
for data in data_list:
|
||||
cur.execute(insert_sql, (
|
||||
data.site_id, data.tenant_id, data.member_id,
|
||||
data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad,
|
||||
data.days_since_first_visit, data.days_since_last_recharge,
|
||||
data.visits_last_14_days, data.visits_last_60_days,
|
||||
data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop,
|
||||
data.raw_score, data.display_score
|
||||
))
|
||||
inserted += cur.rowcount
|
||||
|
||||
# 提交事务
|
||||
self.db.conn.commit()
|
||||
|
||||
return inserted
|
||||
|
||||
# ==========================================================================
|
||||
# 辅助方法
|
||||
# ==========================================================================
|
||||
|
||||
def _load_params(self) -> Dict[str, float]:
|
||||
"""加载参数,缺失时使用默认值"""
|
||||
params = self.load_index_parameters()
|
||||
result = dict(self.DEFAULT_PARAMS)
|
||||
result.update(params)
|
||||
return result
|
||||
|
||||
def _get_site_id(self, context: Optional[TaskContext]) -> int:
|
||||
"""获取门店ID"""
|
||||
if context and hasattr(context, 'store_id') and context.store_id:
|
||||
return context.store_id
|
||||
|
||||
# 从配置获取默认门店ID
|
||||
site_id = self.config.get('app.default_site_id') or self.config.get('app.store_id')
|
||||
if site_id is not None:
|
||||
return int(site_id)
|
||||
|
||||
# 查询数据库获取第一个门店
|
||||
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head WHERE site_id IS NOT NULL LIMIT 1"
|
||||
rows = self.db.query(sql)
|
||||
if rows:
|
||||
value = dict(rows[0]).get('site_id')
|
||||
if value is not None:
|
||||
return int(value)
|
||||
|
||||
self.logger.warning("无法确定门店ID,使用 0 继续执行")
|
||||
return 0
|
||||
|
||||
def _get_tenant_id(self) -> int:
|
||||
"""获取租户ID"""
|
||||
tenant_id = self.config.get('app.tenant_id')
|
||||
if tenant_id is not None:
|
||||
return int(tenant_id)
|
||||
|
||||
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head WHERE tenant_id IS NOT NULL LIMIT 1"
|
||||
rows = self.db.query(sql)
|
||||
if rows:
|
||||
value = dict(rows[0]).get('tenant_id')
|
||||
if value is not None:
|
||||
return int(value)
|
||||
|
||||
self.logger.warning("无法确定租户ID,使用 0 继续执行")
|
||||
return 0
|
||||
Reference in New Issue
Block a user