初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

View File

@@ -0,0 +1,587 @@
# -*- coding: utf-8 -*-
"""
客户召回指数计算任务
功能说明:
- 衡量客户召回的必要性和紧急程度
- 尊重客户个人到店周期(μ=中位数, σ=MAD)
- 对新客户、刚充值客户增加召回倾向
- 检测"热了又断"的情况
算法公式:
Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop
其中:
- overdue = 1 - exp(-max(0, (t-μ)/σ)) # 超期紧急性
- new_bonus = decay(d_first, h_new) # 新客户加分
- re_bonus = decay(d_recharge, h_re) # 刚充值加分
- hot_drop = max(0, ln(1 + (r14/r60 - 1))) # 热度断档加分
数据来源:
- dwd_settlement_head: 会员到店记录
- dwd_recharge_order: 充值记录
- dim_member: 首访时间
更新频率:每2小时
作者:ETL团队
创建日期:2026-02-03
"""
from __future__ import annotations
import math
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class MemberRecallData:
    """Working record holding one member's recall-index inputs and scores.

    The three identity fields are required; every feature and score field
    starts at a neutral default (``None`` or zero) and is filled in while
    the task runs.
    """

    # --- identity -----------------------------------------------------------
    member_id: int
    site_id: int
    tenant_id: int

    # --- input features -----------------------------------------------------
    # Days between the most recent visit date and the calculation base date.
    days_since_last_visit: Optional[int] = None
    # Median of the day gaps between consecutive visit dates (the "mu").
    visit_interval_median: Optional[float] = None
    # MAD of those gaps, floored at the configured sigma_min (the "sigma").
    visit_interval_mad: Optional[float] = None
    # Days between the member's first-visit date and the base date.
    days_since_first_visit: Optional[int] = None
    # Days between the latest recharge date and the base date.
    days_since_last_recharge: Optional[int] = None
    # Number of distinct visit dates inside the recent 14-day / 60-day windows.
    visits_last_14_days: int = 0
    visits_last_60_days: int = 0

    # --- component scores ---------------------------------------------------
    score_overdue: float = 0.0
    score_new_bonus: float = 0.0
    score_recharge_bonus: float = 0.0
    score_hot_drop: float = 0.0

    # --- final scores -------------------------------------------------------
    # Weighted sum of the four component scores.
    raw_score: float = 0.0
    # Normalized score produced from raw_score by the batch normalization step.
    display_score: float = 0.0
# =============================================================================
# 召回指数任务
# =============================================================================
class RecallIndexTask(BaseIndexTask):
"""
客户召回指数计算任务
计算流程:
1. 提取近60天有到店记录的会员
2. 计算每个会员的到店间隔特征(中位数、MAD)
3. 计算4项分数(超期、新客、充值、热度断档)
4. 汇总Raw Score
5. 分位截断 + MinMax映射到0-10
6. 写入DWS表
"""
# Index type identifier returned by get_index_type().
INDEX_TYPE = "RECALL"
# Default parameters; values returned by load_index_parameters() override these key by key.
DEFAULT_PARAMS = {
# Days back from today used to fetch visits and recharges; also the cut-off
# beyond which the new-customer and recharge bonuses are not applied.
'lookback_days': 60,
# Lower bound applied to the visit-interval MAD (sigma) before dividing by it.
'sigma_min': 2.0,
# Second argument passed to decay() for the new-customer bonus
# (a half-life in days, judging by the name - confirm against decay()).
'halflife_new': 7.0,
# Second argument passed to decay() for the recent-recharge bonus.
'halflife_recharge': 10.0,
# Weights of the four component scores in the raw-score sum.
'weight_overdue': 3.0,
'weight_new': 1.0,
'weight_recharge': 1.0,
'weight_hot': 1.0,
# Percentile bounds handed to the normalization and percentile-history steps.
'percentile_lower': 5,
'percentile_upper': 95,
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
    """Return this task's code, ``DWS_RECALL_INDEX``."""
    return "DWS_RECALL_INDEX"
def get_target_table(self) -> str:
    """Return the name of the DWS table this task writes to."""
    return "dws_member_recall_index"
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    return ["site_id", "member_id"]
def get_index_type(self) -> str:
    """Return the index type identifier stored in ``INDEX_TYPE``."""
    return self.INDEX_TYPE
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Compute and persist the recall index for one site.

    The run resolves the site, tenant and parameters, loads visit, recharge
    and first-visit data for the look-back window ending today, scores every
    member that has at least one visit in that window, normalizes the raw
    scores to display scores, records the percentile history, and finally
    writes the rows to the DWS table.

    Args:
        context: Optional task context; only used to resolve the site id.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no member has a
        visit in the window; otherwise a dict with ``status`` set to
        ``'success'`` plus ``member_count`` and ``records_inserted``.
    """
    log = self.logger
    log.info("开始计算客户召回指数")

    # Scope and configuration.
    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # The data window runs from (today - lookback_days) through today.
    base_date = date.today()
    start_date = base_date - timedelta(days=lookback_days)
    log.info(
        "参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f",
        lookback_days, params['sigma_min'], params['halflife_new'], params['halflife_recharge']
    )

    # 1. Visits per member - without any there is nothing to score.
    member_visits = self._extract_member_visits(site_id, start_date, base_date)
    log.info("提取到 %d 个会员的到店记录", len(member_visits))
    if not member_visits:
        log.warning("没有会员到店记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Latest recharge date per member.
    recharge_data = self._extract_recharge_data(site_id, start_date, base_date)
    log.info("提取到 %d 个会员的充值记录", len(recharge_data))

    # 3. First-visit date for the members that have visits.
    first_visit_data = self._extract_first_visit_data(site_id, list(member_visits.keys()))
    log.info("提取到 %d 个会员的首访时间", len(first_visit_data))

    # 4. Build one scored record per member.
    recall_data_list: List[MemberRecallData] = []
    for member_id, visit_dates in member_visits.items():
        data = MemberRecallData(member_id=member_id, site_id=site_id, tenant_id=tenant_id)
        self._calculate_visit_features(data, visit_dates, base_date, params)

        # Recharge / first-visit ages stay None when no date is known.
        if member_id in recharge_data:
            data.days_since_last_recharge = (base_date - recharge_data[member_id]).days
        if member_id in first_visit_data:
            data.days_since_first_visit = (base_date - first_visit_data[member_id]).days

        self._calculate_component_scores(data, params)
        data.raw_score = (
            params['weight_overdue'] * data.score_overdue
            + params['weight_new'] * data.score_new_bonus
            + params['weight_recharge'] * data.score_recharge_bonus
            + params['weight_hot'] * data.score_hot_drop
        )
        recall_data_list.append(data)
    log.info("计算完成 %d 个会员的Raw Score", len(recall_data_list))

    # 5. Normalize raw scores to display scores.
    pct_lower = int(params['percentile_lower'])
    pct_upper = int(params['percentile_upper'])
    normalized = self.batch_normalize_to_display(
        [(d.member_id, d.raw_score) for d in recall_data_list],
        use_log=False,
        percentile_lower=pct_lower,
        percentile_upper=pct_upper,
        use_smoothing=True,
        site_id=site_id
    )
    display_by_member = {mid: display for mid, _raw, display in normalized}
    for data in recall_data_list:
        if data.member_id in display_by_member:
            data.display_score = display_by_member[data.member_id]

    # 6. Record this run's percentile bounds (raw and smoothed) and summary stats.
    if recall_data_list:
        all_raw = [d.raw_score for d in recall_data_list]
        q_l, q_u = self.calculate_percentiles(all_raw, pct_lower, pct_upper)
        smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw)
        )

    # 7. Persist the per-member rows.
    inserted = self._save_recall_data(recall_data_list)
    log.info("召回指数计算完成,写入 %d 条记录", inserted)
    return {
        'status': 'success',
        'member_count': len(recall_data_list),
        'records_inserted': inserted
    }
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_member_visits(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> Dict[int, List[date]]:
    """Load the distinct visit dates of each member for one site.

    A settlement row counts as a visit when its ``settle_type`` is 1, or
    when it is 3 and a non-deleted assistant service log exists for the same
    settlement whose skill maps to an active ``'BONUS'`` course type. Rows
    with ``member_id <= 0`` are excluded, and several settlements on the
    same day collapse into a single visit date.

    Args:
        site_id: Site whose settlements are read.
        start_date: First day of the window (inclusive).
        end_date: Last day of the window (inclusive; the query compares
            against ``end_date + 1 day`` exclusively).

    Returns:
        ``{member_id: [visit_date, ...]}`` in the order returned by the
        query (ascending date per member).
    """
    sql = """
        SELECT
            member_id,
            DATE(pay_time) AS visit_date
        FROM billiards_dwd.dwd_settlement_head s
        WHERE s.site_id = %s
          AND s.member_id > 0 -- 排除散客
          AND s.pay_time >= %s
          AND s.pay_time < %s + INTERVAL '1 day'
          AND (
            s.settle_type = 1
            OR (
              s.settle_type = 3
              AND EXISTS (
                SELECT 1
                FROM billiards_dwd.dwd_assistant_service_log asl
                JOIN billiards_dws.cfg_skill_type st
                  ON asl.skill_id = st.skill_id
                 AND st.course_type_code = 'BONUS'
                 AND st.is_active = TRUE
                WHERE asl.order_settle_id = s.order_settle_id
                  AND asl.site_id = s.site_id
                  AND asl.tenant_member_id = s.member_id
                  AND asl.is_delete = 0
              )
            )
          )
        GROUP BY member_id, DATE(pay_time)
        ORDER BY member_id, visit_date
    """
    rows = self.db.query(sql, (site_id, start_date, end_date))

    # Group the flat (member, date) rows into one list per member.
    visits_by_member: Dict[int, List[date]] = {}
    for record in rows or ():
        fields = dict(record)
        member_key = int(fields['member_id'])
        visited_on = fields['visit_date']
        visits_by_member.setdefault(member_key, []).append(visited_on)
    return visits_by_member
def _extract_recharge_data(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> Dict[int, date]:
    """Load each member's most recent recharge date within the window.

    Only rows of ``dwd_recharge_order`` with ``settle_type = 5`` and a
    positive ``member_id`` are considered.

    Args:
        site_id: Site whose recharge orders are read.
        start_date: First day of the window (inclusive).
        end_date: Last day of the window (inclusive).

    Returns:
        ``{member_id: last_recharge_date}``; members without a recharge in
        the window are absent.
    """
    sql = """
        SELECT
            member_id,
            MAX(DATE(pay_time)) AS last_recharge_date
        FROM billiards_dwd.dwd_recharge_order
        WHERE site_id = %s
          AND member_id > 0
          AND settle_type = 5 -- 充值订单
          AND pay_time >= %s
          AND pay_time < %s + INTERVAL '1 day'
        GROUP BY member_id
    """
    rows = self.db.query(sql, (site_id, start_date, end_date))
    return {
        int(fields['member_id']): fields['last_recharge_date']
        for fields in map(dict, rows or ())
    }
def _extract_first_visit_data(
    self,
    site_id: int,
    member_ids: List[int]
) -> Dict[int, date]:
    """Load the first-visit date of the given members.

    The date is taken from ``dim_member.create_time`` of the member's
    current SCD2 row. No fallback to the settlement table is performed:
    members that are missing from ``dim_member`` or have a NULL
    ``create_time`` are simply absent from the result.

    Args:
        site_id: Accepted for signature compatibility; the query does not
            filter on it.
            NOTE(review): confirm whether dim_member should be scoped by site.
        member_ids: Members to look up; an empty list short-circuits
            without querying.

    Returns:
        ``{member_id: first_visit_date}`` for the members that have one.
    """
    if not member_ids:
        return {}

    # Bind the ids as query parameters instead of splicing them into the
    # SQL text; only the "%s" placeholders are generated dynamically.
    placeholders = ','.join(['%s'] * len(member_ids))
    sql = f"""
        SELECT
            member_id,
            DATE(create_time) AS first_visit_date
        FROM billiards_dwd.dim_member
        WHERE member_id IN ({placeholders})
          AND scd2_is_current = 1
    """
    rows = self.db.query(sql, tuple(member_ids))

    result: Dict[int, date] = {}
    for row in rows or ():
        fields = dict(row)
        first_date = fields['first_visit_date']
        if first_date:  # skip members whose create_time is NULL
            result[int(fields['member_id'])] = first_date
    return result
# ==========================================================================
# 特征计算方法
# ==========================================================================
def _calculate_visit_features(
    self,
    data: MemberRecallData,
    visit_dates: List[date],
    base_date: date,
    params: Dict[str, float]
) -> None:
    """Fill the visit-derived features of ``data`` in place.

    Sets the days since the last visit, the median and MAD of the gaps
    between consecutive visit dates, and the visit counts of the recent
    14-day and 60-day windows. Does nothing when ``visit_dates`` is empty.

    Args:
        data: Record to update.
        visit_dates: The member's visit dates (any order).
        base_date: Date the ages and windows are measured from.
        params: Parameter dict; only ``sigma_min`` is read here.
    """
    if not visit_dates:
        return

    # Recency of the latest visit.
    data.days_since_last_visit = (base_date - max(visit_dates)).days

    # Day gaps between consecutive visits, in chronological order.
    ordered = sorted(visit_dates)
    gaps = [
        float((later - earlier).days)
        for earlier, later in zip(ordered, ordered[1:])
    ]

    if not gaps:
        # A single visit gives no interval: assume a 7-day cycle and use
        # the minimum sigma.
        data.visit_interval_median = 7.0
        data.visit_interval_mad = params['sigma_min']
    else:
        data.visit_interval_median = self.calculate_median(gaps)
        # sigma is the MAD, but never smaller than sigma_min.
        data.visit_interval_mad = max(self.calculate_mad(gaps), params['sigma_min'])

    # Visit counts for the recent windows. The cut-off day itself is
    # included, so the windows cover 15 and 61 calendar dates respectively.
    cutoff_14 = base_date - timedelta(days=14)
    cutoff_60 = base_date - timedelta(days=60)
    data.visits_last_14_days = len([d for d in visit_dates if d >= cutoff_14])
    data.visits_last_60_days = len([d for d in visit_dates if d >= cutoff_60])
def _calculate_component_scores(
    self,
    data: MemberRecallData,
    params: Dict[str, float]
) -> None:
    """Compute the four component scores of ``data`` in place.

    * ``score_overdue`` = ``1 - exp(-z)`` with ``z = max(0, (t - mu) / sigma)``,
      where ``t`` is the days since the last visit, ``mu`` the interval
      median and ``sigma`` the interval MAD (or ``sigma_min`` when the MAD
      is missing or zero).
    * ``score_new_bonus`` / ``score_recharge_bonus`` = ``decay(age, halflife)``
      when the respective age is known and within ``lookback_days``.
    * ``score_hot_drop`` = ``safe_ln1p(ratio - 1)`` when the 14-day visit
      rate exceeds the smoothed 60-day rate, otherwise 0.

    Scores whose inputs are missing are left untouched.

    Args:
        data: Record whose features are read and whose scores are written.
        params: Parameter dict; reads ``sigma_min``, ``lookback_days``,
            ``halflife_new`` and ``halflife_recharge``.
    """
    # 1. Overdue urgency.
    if data.days_since_last_visit is not None and data.visit_interval_median is not None:
        t = data.days_since_last_visit
        mu = data.visit_interval_median
        sigma = data.visit_interval_mad or params['sigma_min']
        if sigma > 0:
            z = max(0.0, (t - mu) / sigma)
            data.score_overdue = 1.0 - math.exp(-z)
        else:
            # Degenerate sigma (e.g. sigma_min configured as 0 and a MAD of
            # 0): dividing would raise ZeroDivisionError and abort the whole
            # run. Use the limit of the formula as sigma -> 0+ instead:
            # fully overdue once past the median, otherwise not overdue.
            data.score_overdue = 1.0 if t > mu else 0.0

    # 2. New-customer bonus, only inside the look-back window.
    lookback_days = int(params['lookback_days'])
    if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days:
        data.score_new_bonus = self.decay(
            data.days_since_first_visit,
            params['halflife_new']
        )

    # 3. Recent-recharge bonus, only inside the look-back window.
    if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days:
        data.score_recharge_bonus = self.decay(
            data.days_since_last_recharge,
            params['halflife_recharge']
        )

    # 4. Heat bonus: compare the per-day visit rate of the last 14 days with
    # that of the last 60 days.
    epsilon = 1e-6
    n14 = data.visits_last_14_days
    n60 = data.visits_last_60_days
    r14 = n14 / 14.0
    r60 = (n60 + 1) / 60.0  # +1 smoothing keeps the denominator positive
    hot_ratio = r14 / (r60 + epsilon)
    # Equivalent to max(0, ln(1 + (hot_ratio - 1))), assuming safe_ln1p(x)
    # computes ln(1 + x) - its definition is not visible here.
    if hot_ratio > 1:
        data.score_hot_drop = self.safe_ln1p(hot_ratio - 1)
    else:
        data.score_hot_drop = 0.0
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_recall_data(self, data_list: List[MemberRecallData]) -> int:
    """Replace the stored recall rows of the given members.

    Existing rows for the members in ``data_list`` are deleted and fresh
    rows inserted inside one transaction. If any statement fails the
    transaction is rolled back and the exception re-raised, so the table is
    never left with the old rows deleted but the new ones missing.

    The site id of the first record is used for the delete, i.e. all
    records are assumed to belong to the same site (as produced by
    ``execute``). Rows of members that are not in ``data_list`` are left
    untouched.

    Args:
        data_list: Scored member records to persist.

    Returns:
        Number of inserted rows (sum of the cursor row counts); 0 for an
        empty list.
    """
    if not data_list:
        return 0

    site_id = data_list[0].site_id
    member_ids = [d.member_id for d in data_list]

    # Ids are bound as parameters; only the "%s" placeholders are generated.
    placeholders = ','.join(['%s'] * len(member_ids))
    delete_sql = f"""
        DELETE FROM billiards_dws.dws_member_recall_index
        WHERE site_id = %s AND member_id IN ({placeholders})
    """
    insert_sql = """
        INSERT INTO billiards_dws.dws_member_recall_index (
            site_id, tenant_id, member_id,
            days_since_last_visit, visit_interval_median, visit_interval_mad,
            days_since_first_visit, days_since_last_recharge,
            visits_last_14_days, visits_last_60_days,
            score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop,
            raw_score, display_score,
            calc_time, created_at, updated_at
        ) VALUES (
            %s, %s, %s,
            %s, %s, %s,
            %s, %s,
            %s, %s,
            %s, %s, %s, %s,
            %s, %s,
            NOW(), NOW(), NOW()
        )
    """

    conn = self.db.conn
    inserted = 0
    try:
        with conn.cursor() as cur:
            # Remove the members' previous rows, then insert the new ones.
            cur.execute(delete_sql, (site_id, *member_ids))
            for data in data_list:
                cur.execute(insert_sql, (
                    data.site_id, data.tenant_id, data.member_id,
                    data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad,
                    data.days_since_first_visit, data.days_since_last_recharge,
                    data.visits_last_14_days, data.visits_last_60_days,
                    data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop,
                    data.raw_score, data.display_score
                ))
                inserted += cur.rowcount
        conn.commit()
    except Exception:
        # Undo the delete and any partial inserts, then let the caller see
        # the original error.
        conn.rollback()
        raise
    return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
    """Return the effective parameters for this run.

    Starts from a copy of ``DEFAULT_PARAMS`` and overlays whatever
    ``load_index_parameters()`` returns, so loaded values win and missing
    keys keep their defaults.
    """
    loaded = self.load_index_parameters()
    effective = dict(self.DEFAULT_PARAMS)
    for key, value in dict(loaded).items():
        effective[key] = value
    return effective
def _get_site_id(self, context: Optional[TaskContext]) -> int:
    """Resolve the site id this run applies to.

    Sources are tried in order: a truthy ``store_id`` on the task context,
    the config keys ``app.default_site_id`` / ``app.store_id``, and finally
    one site id read from the settlement table. If all fail a warning is
    logged and 0 is returned.

    NOTE(review): the database fallback uses ``LIMIT 1`` without an
    ``ORDER BY``, so which site it picks is not defined by this query.
    """
    # 1. Task context.
    ctx_store_id = getattr(context, 'store_id', None) if context else None
    if ctx_store_id:
        return ctx_store_id

    # 2. Configuration.
    cfg = self.config
    configured = cfg.get('app.default_site_id') or cfg.get('app.store_id')
    if configured is not None:
        return int(configured)

    # 3. Any site present in the settlement data.
    rows = self.db.query(
        "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head WHERE site_id IS NOT NULL LIMIT 1"
    )
    found = dict(rows[0]).get('site_id') if rows else None
    if found is not None:
        return int(found)

    self.logger.warning("无法确定门店ID使用 0 继续执行")
    return 0
def _get_tenant_id(self) -> int:
    """Resolve the tenant id this run applies to.

    Uses the config key ``app.tenant_id`` when set; otherwise reads one
    tenant id from the settlement table. If neither yields a value a
    warning is logged and 0 is returned.
    """
    configured = self.config.get('app.tenant_id')
    if configured is not None:
        return int(configured)

    # Fall back to any tenant present in the settlement data.
    rows = self.db.query(
        "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head WHERE tenant_id IS NOT NULL LIMIT 1"
    )
    found = dict(rows[0]).get('tenant_id') if rows else None
    if found is not None:
        return int(found)

    self.logger.warning("无法确定租户ID使用 0 继续执行")
    return 0