初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

69
tasks/dws/__init__.py Normal file
View File

@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
"""
DWS层ETL任务模块
包含:
- BaseDwsTask: DWS任务基类
- 助教维度任务
- 客户维度任务
- 财务维度任务
- 指数算法任务
"""
from .base_dws_task import BaseDwsTask, TimeLayer, TimeWindow, CourseType, DiscountType
from .assistant_daily_task import AssistantDailyTask
from .assistant_monthly_task import AssistantMonthlyTask
from .assistant_customer_task import AssistantCustomerTask
from .assistant_salary_task import AssistantSalaryTask
from .assistant_finance_task import AssistantFinanceTask
from .member_consumption_task import MemberConsumptionTask
from .member_visit_task import MemberVisitTask
from .finance_daily_task import FinanceDailyTask
from .finance_recharge_task import FinanceRechargeTask
from .finance_income_task import FinanceIncomeStructureTask
from .finance_discount_task import FinanceDiscountDetailTask
from .retention_cleanup_task import DwsRetentionCleanupTask
from .mv_refresh_task import DwsMvRefreshFinanceDailyTask, DwsMvRefreshAssistantDailyTask
# 指数算法任务
from .index import (
RecallIndexTask,
IntimacyIndexTask,
WinbackIndexTask,
NewconvIndexTask,
MlManualImportTask,
RelationIndexTask,
)
# Public names re-exported by the package.
__all__ = [
# Base class and the helper types imported alongside it
"BaseDwsTask",
"TimeLayer",
"TimeWindow",
"CourseType",
"DiscountType",
# Assistant dimension
"AssistantDailyTask",
"AssistantMonthlyTask",
"AssistantCustomerTask",
"AssistantSalaryTask",
"AssistantFinanceTask",
# Customer dimension
"MemberConsumptionTask",
"MemberVisitTask",
# Finance dimension
"FinanceDailyTask",
"FinanceRechargeTask",
"FinanceIncomeStructureTask",
"FinanceDiscountDetailTask",
"DwsRetentionCleanupTask",
"DwsMvRefreshFinanceDailyTask",
"DwsMvRefreshAssistantDailyTask",
# Index algorithms
"WinbackIndexTask",
"NewconvIndexTask",
"RecallIndexTask",
"IntimacyIndexTask",
"MlManualImportTask",
"RelationIndexTask",
]

View File

@@ -0,0 +1,334 @@
# -*- coding: utf-8 -*-
"""
助教服务客户统计任务
功能说明:
"助教+客户"为粒度,统计服务关系和滚动窗口指标
数据来源:
- dwd_assistant_service_log: 助教服务流水
- dim_member: 会员维度
目标表:
billiards_dws.dws_assistant_customer_stats
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按统计日期
业务规则:
- 散客处理member_id=0 不进入此表统计
- 滚动窗口7/10/15/30/60/90天
- 活跃度近7天/30天是否有服务
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class AssistantCustomerTask(BaseDwsTask):
"""
Assistant-to-customer service statistics task.

Builds one record per assistant/member pair carrying:
- first / most recent service date
- cumulative service totals
- rolling-window statistics (7/10/15/30/60/90 days)
- activity indicators
"""
def get_task_code(self) -> str:
    """Return the code that identifies this task."""
    task_code = "DWS_ASSISTANT_CUSTOMER"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    target_table = "dws_assistant_customer_stats"
    return target_table
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table (a fresh list per call)."""
    key_columns = ("site_id", "assistant_id", "member_id", "stat_date")
    return list(key_columns)
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Collect the inputs needed to build the assistant/customer statistics.

    The statistics date is ``context.window_end``, reduced to a ``date`` when
    the value exposes a ``date()`` method; the site is ``context.store_id``.

    Args:
        context: Task context providing ``window_end`` and ``store_id``.

    Returns:
        Dict with the keys ``service_pairs``, ``member_info``,
        ``assistant_info``, ``stat_date`` and ``site_id``.
    """
    window_end = context.window_end
    stat_date = window_end.date() if hasattr(window_end, 'date') else window_end
    site_id = context.store_id
    self.logger.info(
        "%s: 提取数据,统计日期 %s",
        self.get_task_code(), stat_date
    )
    # The previously computed 90-day look-back date was never used; the
    # window boundaries are applied inside _extract_service_pairs instead.
    return {
        # 1. Per assistant/member service statistics
        'service_pairs': self._extract_service_pairs(site_id, stat_date),
        # 2. Member attributes keyed by member id
        'member_info': self._extract_member_info(site_id),
        # 3. Assistant attributes keyed by assistant id
        'assistant_info': self._extract_assistant_info(site_id),
        'stat_date': stat_date,
        'site_id': site_id
    }
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Turn the extracted assistant/member rows into target-table records.

    Pairs whose member id is reported as a guest by ``self.is_guest`` are
    dropped. Every other pair becomes one record combining identity fields,
    cumulative totals, the rolling-window figures and the activity flags.

    Args:
        extracted: Dict produced by ``extract``.
        context: Task context (not used here).

    Returns:
        List of record dicts, one per retained pair.
    """
    pairs = extracted['service_pairs']
    members = extracted['member_info']
    assistants = extracted['assistant_info']
    stat_date = extracted['stat_date']
    site_id = extracted['site_id']
    self.logger.info(
        "%s: 转换数据,%d 条服务关系记录",
        self.get_task_code(), len(pairs)
    )

    records = []
    for pair in pairs:
        assistant_id = pair.get('assistant_id')
        member_id = pair.get('member_id')
        if self.is_guest(member_id):
            # Guests are not part of this table.
            continue

        assistant_row = assistants.get(assistant_id, {})
        member_row = members.get(member_id, {})

        def as_int(field):
            # Integer view of a pair field, missing keys read as 0.
            return self.safe_int(pair.get(field, 0))

        def as_decimal(field):
            # Decimal view of a pair field, missing keys read as 0.
            return self.safe_decimal(pair.get(field, 0))

        records.append({
            'site_id': site_id,
            'tenant_id': self.config.get("app.tenant_id", site_id),
            'assistant_id': assistant_id,
            # Prefer the dimension nickname, fall back to the one on the pair row.
            'assistant_nickname': assistant_row.get('nickname', pair.get('assistant_nickname')),
            'member_id': member_id,
            'member_nickname': member_row.get('nickname'),
            'member_mobile': self._mask_mobile(member_row.get('mobile')),
            'stat_date': stat_date,
            # Cumulative totals
            'first_service_date': pair.get('first_service_date'),
            'last_service_date': pair.get('last_service_date'),
            'total_service_count': as_int('total_service_count'),
            'total_service_hours': as_decimal('total_service_hours'),
            'total_service_amount': as_decimal('total_service_amount'),
            # Rolling windows: counts
            'service_count_7d': as_int('service_count_7d'),
            'service_count_10d': as_int('service_count_10d'),
            'service_count_15d': as_int('service_count_15d'),
            'service_count_30d': as_int('service_count_30d'),
            'service_count_60d': as_int('service_count_60d'),
            'service_count_90d': as_int('service_count_90d'),
            # Rolling windows: hours
            'service_hours_7d': as_decimal('service_hours_7d'),
            'service_hours_10d': as_decimal('service_hours_10d'),
            'service_hours_15d': as_decimal('service_hours_15d'),
            'service_hours_30d': as_decimal('service_hours_30d'),
            'service_hours_60d': as_decimal('service_hours_60d'),
            'service_hours_90d': as_decimal('service_hours_90d'),
            # Rolling windows: amounts
            'service_amount_7d': as_decimal('service_amount_7d'),
            'service_amount_10d': as_decimal('service_amount_10d'),
            'service_amount_15d': as_decimal('service_amount_15d'),
            'service_amount_30d': as_decimal('service_amount_30d'),
            'service_amount_60d': as_decimal('service_amount_60d'),
            'service_amount_90d': as_decimal('service_amount_90d'),
            # Activity indicators
            'days_since_last': self._calc_days_since(stat_date, pair.get('last_service_date')),
            'is_active_7d': as_int('service_count_7d') > 0,
            'is_active_30d': as_int('service_count_30d') > 0,
        })
    return records
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the records to the target table using delete-then-insert.

    Args:
        transformed: Records produced by ``transform``.
        context: Task context forwarded to ``delete_existing_data``.

    Returns:
        Dict with a ``counts`` summary and, when rows were written, an
        ``extra`` entry holding the number of deleted rows.
    """
    counts = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
    if not transformed:
        # NOTE(review): nothing is deleted on this path, so rows loaded by an
        # earlier run for the same window stay in place — confirm intended.
        self.logger.info("%s: 无数据需要写入", self.get_task_code())
        return {"counts": counts}

    # Remove what is already stored for the window, then insert in bulk.
    deleted = self.delete_existing_data(context, date_col="stat_date")
    inserted = self.bulk_insert(transformed)
    self.logger.info(
        "%s: 加载完成,删除 %d 行,插入 %d",
        self.get_task_code(), deleted, inserted
    )
    counts["fetched"] = len(transformed)
    counts["inserted"] = inserted
    return {"counts": counts, "extra": {"deleted": deleted}}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_pairs(
    self,
    site_id: int,
    stat_date: date
) -> List[Dict[str, Any]]:
    """
    Query per assistant/member service statistics as of ``stat_date``.

    Reads non-deleted rows of ``billiards_dwd.dwd_assistant_service_log`` for
    the site whose member id is neither NULL nor 0, and aggregates them per
    (assistant, member): first/last service date, cumulative totals and the
    7/10/15/30/60/90-day rolling windows ending on ``stat_date``.

    Only services dated on or before ``stat_date`` are considered, so that
    re-running the task for a past date does not pull later services into the
    totals, the windows or ``last_service_date``.

    Args:
        site_id: Site whose service log is read.
        stat_date: Statistics date; upper bound of every window.

    Returns:
        One dict per (assistant, member) pair; empty list when the query
        yields nothing.
    """
    # NOTE(review): the HAVING clause keeps pairs whose last service is up to
    # 90 days back while the "90d" window reaches 89 days back, so a pair last
    # served exactly 90 days ago is returned with all-zero windows — confirm.
    sql = """
        WITH service_base AS (
            SELECT
                site_assistant_id AS assistant_id,
                nickname AS assistant_nickname,
                tenant_member_id AS member_id,
                DATE(start_use_time) AS service_date,
                income_seconds,
                ledger_amount
            FROM billiards_dwd.dwd_assistant_service_log
            WHERE site_id = %s
              AND tenant_member_id IS NOT NULL
              AND tenant_member_id != 0
              AND is_delete = 0
              AND DATE(start_use_time) <= %s
        )
        SELECT
            assistant_id,
            MAX(assistant_nickname) AS assistant_nickname,
            member_id,
            MIN(service_date) AS first_service_date,
            MAX(service_date) AS last_service_date,
            -- 全量累计
            COUNT(*) AS total_service_count,
            SUM(income_seconds) / 3600.0 AS total_service_hours,
            SUM(ledger_amount) AS total_service_amount,
            -- 7天窗口
            COUNT(CASE WHEN service_date >= %s - INTERVAL '6 days' THEN 1 END) AS service_count_7d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '6 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_7d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '6 days' THEN ledger_amount ELSE 0 END) AS service_amount_7d,
            -- 10天窗口
            COUNT(CASE WHEN service_date >= %s - INTERVAL '9 days' THEN 1 END) AS service_count_10d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '9 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_10d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '9 days' THEN ledger_amount ELSE 0 END) AS service_amount_10d,
            -- 15天窗口
            COUNT(CASE WHEN service_date >= %s - INTERVAL '14 days' THEN 1 END) AS service_count_15d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '14 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_15d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '14 days' THEN ledger_amount ELSE 0 END) AS service_amount_15d,
            -- 30天窗口
            COUNT(CASE WHEN service_date >= %s - INTERVAL '29 days' THEN 1 END) AS service_count_30d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '29 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_30d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '29 days' THEN ledger_amount ELSE 0 END) AS service_amount_30d,
            -- 60天窗口
            COUNT(CASE WHEN service_date >= %s - INTERVAL '59 days' THEN 1 END) AS service_count_60d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '59 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_60d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '59 days' THEN ledger_amount ELSE 0 END) AS service_amount_60d,
            -- 90天窗口
            COUNT(CASE WHEN service_date >= %s - INTERVAL '89 days' THEN 1 END) AS service_count_90d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '89 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_90d,
            SUM(CASE WHEN service_date >= %s - INTERVAL '89 days' THEN ledger_amount ELSE 0 END) AS service_amount_90d
        FROM service_base
        GROUP BY assistant_id, member_id
        HAVING MAX(service_date) >= %s - INTERVAL '90 days'
    """
    # Placeholder order: site id, CTE upper bound, then three placeholders
    # (count / hours / amount) for each of the six windows, then HAVING.
    window_placeholders = 6 * 3
    params = (
        (site_id, stat_date)
        + (stat_date,) * window_placeholders
        + (stat_date,)
    )
    rows = self.db.query(sql, params)
    return [dict(row) for row in rows] if rows else []
def _extract_member_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
    """
    Load member id, nickname and mobile for a site.

    Args:
        site_id: Site whose rows of ``billiards_dwd.dim_member`` are read.

    Returns:
        Mapping of ``member_id`` to the row as a dict. When several rows
        share a member id, the one returned last by the query wins.
    """
    # NOTE(review): unlike the assistant dimension query, no "current
    # version" filter is applied here — verify dim_member has one row per member.
    sql = """
SELECT
member_id,
nickname,
mobile
FROM billiards_dwd.dim_member
WHERE site_id = %s
"""
    fetched = self.db.query(sql, (site_id,)) or []
    return {entry['member_id']: entry for entry in map(dict, fetched)}
def _extract_assistant_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
    """
    Load id and nickname of the current assistant rows for a site.

    Only rows of ``billiards_dwd.dim_assistant`` with ``scd2_is_current = 1``
    are read.

    Args:
        site_id: Site to read.

    Returns:
        Mapping of ``assistant_id`` to the row as a dict.
    """
    sql = """
SELECT
assistant_id,
nickname
FROM billiards_dwd.dim_assistant
WHERE site_id = %s
AND scd2_is_current = 1
"""
    fetched = self.db.query(sql, (site_id,)) or []
    return {entry['assistant_id']: entry for entry in map(dict, fetched)}
# ==========================================================================
# 工具方法
# ==========================================================================
def _mask_mobile(self, mobile: Optional[str]) -> Optional[str]:
    """
    Mask a mobile number, keeping its first three and last four characters.

    Empty values and values shorter than seven characters are returned
    unchanged.
    """
    if mobile and len(mobile) >= 7:
        return "".join((mobile[:3], "****", mobile[-4:]))
    return mobile
def _calc_days_since(self, stat_date: date, last_date: Optional[date]) -> Optional[int]:
    """
    Return the number of days between the last service and ``stat_date``.

    Args:
        stat_date: Statistics date.
        last_date: Date of the last service; a ``datetime`` is reduced to its
            date part first.

    Returns:
        ``None`` when ``last_date`` is empty, otherwise the day difference.
    """
    if not last_date:
        return None
    reference = last_date.date() if isinstance(last_date, datetime) else last_date
    elapsed = stat_date - reference
    return elapsed.days
# Names exported for external imports
__all__ = ['AssistantCustomerTask']

View File

@@ -0,0 +1,356 @@
# -*- coding: utf-8 -*-
"""
助教日度业绩明细任务
功能说明:
"助教+日期"为粒度,汇总每日业绩明细
数据来源:
- dwd_assistant_service_log: 助教服务流水
- dwd_assistant_trash_event: 废除记录(排除)
- dim_assistant: 助教维度SCD2获取当日等级
- cfg_skill_type: 技能→课程类型映射
目标表:
billiards_dws.dws_assistant_daily_detail
更新策略:
- 更新频率:每小时增量更新
- 幂等方式delete-before-insert按日期窗口
业务规则:
- 有效业绩需排除dwd_assistant_trash_event中的废除记录
- 助教等级使用SCD2 as-of取值获取统计日当日生效的等级
- 课程类型通过skill_id映射分为基础课和附加课
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, CourseType, TaskContext
class AssistantDailyTask(BaseDwsTask):
"""
Assistant daily performance detail task.

Aggregates, per assistant and day:
- service counts (total / base course / bonus course)
- billed duration (seconds / hours)
- billed amount
- number of distinct customers served
- number of distinct tables served
- statistics on voided (trashed) records
"""
def get_task_code(self) -> str:
    """Return the code that identifies this task."""
    task_code = "DWS_ASSISTANT_DAILY"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    target_table = "dws_assistant_daily_detail"
    return target_table
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table (a fresh list per call)."""
    key_columns = ("site_id", "assistant_id", "stat_date")
    return list(key_columns)
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Read assistant service rows and void events for the context window.

    Window bounds are reduced to ``date`` when they expose a ``date()``
    method. The configuration cache is loaded after both queries have run.

    Args:
        context: Task context providing ``window_start``, ``window_end`` and
            ``store_id``.

    Returns:
        Dict with ``service_records``, ``trash_records``, ``start_date``,
        ``end_date`` and ``site_id``.
    """
    def to_date(value):
        # Accept both datetime-like values and plain dates.
        return value.date() if hasattr(value, 'date') else value

    start_date = to_date(context.window_start)
    end_date = to_date(context.window_end)
    site_id = context.store_id
    self.logger.info(
        "%s: 提取数据,日期范围 %s ~ %s",
        self.get_task_code(), start_date, end_date
    )

    payload = {}
    # 1. Assistant service rows
    payload['service_records'] = self._extract_service_records(site_id, start_date, end_date)
    # 2. Void events
    payload['trash_records'] = self._extract_trash_records(site_id, start_date, end_date)
    # 3. Configuration cache
    self.load_config_cache()
    payload['start_date'] = start_date
    payload['end_date'] = end_date
    payload['site_id'] = site_id
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Aggregate the extracted service rows per assistant and day.

    Args:
        extracted: Dict produced by ``extract``.
        context: Task context (not used here).

    Returns:
        The aggregated records from ``_aggregate_by_assistant_date``.
    """
    services = extracted['service_records']
    voided = extracted['trash_records']
    site_id = extracted['site_id']
    self.logger.info(
        "%s: 转换数据,服务记录 %d 条,废除记录 %d",
        self.get_task_code(), len(services), len(voided)
    )
    # Index void events by assistant_service_id, then aggregate.
    void_lookup = self._build_trash_index(voided)
    return self._aggregate_by_assistant_date(services, void_lookup, site_id)
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the records to the target table using delete-then-insert.

    Args:
        transformed: Records produced by ``transform``.
        context: Task context forwarded to ``delete_existing_data``.

    Returns:
        Dict with a ``counts`` summary and, when rows were written, an
        ``extra`` entry holding the number of deleted rows.
    """
    summary = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
    if transformed:
        # Clear the stored rows for the window first so a re-run does not duplicate.
        deleted = self.delete_existing_data(context, date_col="stat_date")
        inserted = self.bulk_insert(transformed)
        self.logger.info(
            "%s: 加载完成,删除 %d 行,插入 %d",
            self.get_task_code(), deleted, inserted
        )
        summary["fetched"] = len(transformed)
        summary["inserted"] = inserted
        return {"counts": summary, "extra": {"deleted": deleted}}

    self.logger.info("%s: 无数据需要写入", self.get_task_code())
    return {"counts": summary}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_records(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> List[Dict[str, Any]]:
    """
    Read non-deleted assistant service rows for a site and date range.

    Rows come from ``billiards_dwd.dwd_assistant_service_log`` and are
    selected by the date of ``start_use_time``, both bounds inclusive.

    Returns:
        One dict per row; empty list when the query yields nothing.
    """
    sql = """
SELECT
asl.assistant_service_id,
asl.order_settle_id,
asl.site_assistant_id AS assistant_id,
asl.nickname AS assistant_nickname,
asl.assistant_level,
asl.skill_id,
asl.skill_name,
asl.tenant_member_id AS member_id,
asl.site_table_id AS table_id,
asl.income_seconds,
asl.real_use_seconds,
asl.ledger_amount,
asl.ledger_unit_price,
DATE(asl.start_use_time) AS service_date
FROM billiards_dwd.dwd_assistant_service_log asl
WHERE asl.site_id = %s
AND DATE(asl.start_use_time) >= %s
AND DATE(asl.start_use_time) <= %s
AND asl.is_delete = 0
"""
    fetched = self.db.query(sql, (site_id, start_date, end_date))
    if not fetched:
        return []
    return [dict(item) for item in fetched]
def _extract_trash_records(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> List[Dict[str, Any]]:
    """
    Read void events for a site and date range.

    Rows come from ``billiards_dwd.dwd_assistant_trash_event`` and are
    selected by the date of ``trash_time``, both bounds inclusive. Only
    records present in this table are treated as voided when computing
    valid performance.

    Returns:
        One dict per row; empty list when the query yields nothing.
    """
    # NOTE(review): selection is by trash_time, not by the service date, so a
    # service inside the window that was voided after end_date is not
    # returned here — confirm this matches the intended rule.
    sql = """
SELECT
assistant_service_id,
trash_seconds,
trash_reason,
trash_time
FROM billiards_dwd.dwd_assistant_trash_event
WHERE site_id = %s
AND DATE(trash_time) >= %s
AND DATE(trash_time) <= %s
"""
    fetched = self.db.query(sql, (site_id, start_date, end_date))
    if not fetched:
        return []
    return [dict(item) for item in fetched]
# ==========================================================================
# 数据转换方法
# ==========================================================================
def _build_trash_index(
    self,
    trash_records: List[Dict[str, Any]]
) -> Dict[int, Dict[str, Any]]:
    """
    Index void events by their ``assistant_service_id``.

    Records with a missing or falsy id are skipped; when an id occurs more
    than once the later record replaces the earlier one.
    """
    by_service_id = {}
    for entry in trash_records:
        service_id = entry.get('assistant_service_id')
        if not service_id:
            continue
        by_service_id[service_id] = entry
    return by_service_id
def _aggregate_by_assistant_date(
    self,
    service_records: List[Dict[str, Any]],
    trash_index: Dict[int, Dict[str, Any]],
    site_id: int
) -> List[Dict[str, Any]]:
    """
    Aggregate service rows into one record per (assistant, service date).

    Rows without an assistant id or service date are ignored. A row whose
    ``assistant_service_id`` appears in ``trash_index`` only feeds the
    ``trashed_*`` counters; every other row feeds the totals and, when its
    course type is BASE, BONUS or ROOM, the matching per-type counters.
    Distinct customers (non-guest members) and tables are counted for all
    rows, voided or not.

    Args:
        service_records: Rows from ``_extract_service_records``.
        trash_index: Void events keyed by ``assistant_service_id``.
        site_id: Site the records belong to.

    Returns:
        List of aggregated records with hour fields derived from the second
        totals and the distinct sets replaced by their sizes.
    """
    # Course type -> column-name prefix of the per-type counters.
    course_prefixes = (
        (CourseType.BASE, 'base'),
        (CourseType.BONUS, 'bonus'),
        (CourseType.ROOM, 'room'),
    )

    # (assistant_id, service_date) -> running aggregate
    agg_dict: Dict[Tuple[int, date], Dict[str, Any]] = {}
    for record in service_records:
        assistant_id = record.get('assistant_id')
        service_date = record.get('service_date')
        if not assistant_id or not service_date:
            continue

        key = (assistant_id, service_date)
        agg = agg_dict.get(key)
        if agg is None:
            # Level as of the service date; falls back to the level stored on
            # the service row when no as-of level is returned.
            level_info = self.get_assistant_level_asof(assistant_id, service_date)
            agg = agg_dict[key] = {
                'site_id': site_id,
                'tenant_id': self.config.get("app.tenant_id", site_id),
                'assistant_id': assistant_id,
                'assistant_nickname': record.get('assistant_nickname'),
                'stat_date': service_date,
                'assistant_level_code': level_info.get('level_code') if level_info else record.get('assistant_level'),
                'assistant_level_name': level_info.get('level_name') if level_info else None,
                'total_service_count': 0,
                'base_service_count': 0,
                'bonus_service_count': 0,
                'room_service_count': 0,
                'total_seconds': 0,
                'base_seconds': 0,
                'bonus_seconds': 0,
                'room_seconds': 0,
                'total_hours': Decimal('0'),
                'base_hours': Decimal('0'),
                'bonus_hours': Decimal('0'),
                'room_hours': Decimal('0'),
                'total_ledger_amount': Decimal('0'),
                'base_ledger_amount': Decimal('0'),
                'bonus_ledger_amount': Decimal('0'),
                'room_ledger_amount': Decimal('0'),
                'unique_customers': set(),
                'unique_tables': set(),
                'trashed_seconds': 0,
                'trashed_count': 0,
            }

        service_id = record.get('assistant_service_id')
        income_seconds = self.safe_int(record.get('income_seconds', 0))
        ledger_amount = self.safe_decimal(record.get('ledger_amount', 0))
        skill_id = record.get('skill_id')
        member_id = record.get('member_id')
        table_id = record.get('table_id')

        # Rows without a skill id are treated as base course.
        course_type = self.get_course_type(skill_id) if skill_id else CourseType.BASE

        if service_id in trash_index:
            # Voided rows are tracked separately. The void event row always
            # carries a 'trash_seconds' key, so a dict.get default would never
            # apply; fall back to the billed seconds when the value is None.
            raw_trash_seconds = trash_index[service_id].get('trash_seconds')
            trash_seconds = self.safe_int(
                income_seconds if raw_trash_seconds is None else raw_trash_seconds
            )
            agg['trashed_seconds'] += trash_seconds
            agg['trashed_count'] += 1
        else:
            agg['total_service_count'] += 1
            agg['total_seconds'] += income_seconds
            agg['total_ledger_amount'] += ledger_amount
            for known_type, prefix in course_prefixes:
                if course_type == known_type:
                    agg[prefix + '_service_count'] += 1
                    agg[prefix + '_seconds'] += income_seconds
                    agg[prefix + '_ledger_amount'] += ledger_amount
                    break

        # Distinct customers and tables are counted whether or not the row was voided.
        if member_id and not self.is_guest(member_id):
            agg['unique_customers'].add(member_id)
        if table_id:
            agg['unique_tables'].add(table_id)

    # Derive hour fields and collapse the distinct sets into counts.
    result = []
    for agg in agg_dict.values():
        agg['total_hours'] = self.seconds_to_hours(agg['total_seconds'])
        agg['base_hours'] = self.seconds_to_hours(agg['base_seconds'])
        agg['bonus_hours'] = self.seconds_to_hours(agg['bonus_seconds'])
        agg['room_hours'] = self.seconds_to_hours(agg['room_seconds'])
        agg['unique_customers'] = len(agg['unique_customers'])
        agg['unique_tables'] = len(agg['unique_tables'])
        result.append(agg)
    return result
# Names exported for external imports
__all__ = ['AssistantDailyTask']

View File

@@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
"""
助教收支分析任务
功能说明:
"日期+助教"为粒度,分析助教产出的收入和成本
数据来源:
- dwd_assistant_service_log: 助教服务流水(收入)
- dws_assistant_salary_calc: 工资计算(成本)
目标表:
billiards_dws.dws_assistant_finance_analysis
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按日期
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, CourseType, TaskContext
class AssistantFinanceTask(BaseDwsTask):
"""
Assistant revenue and cost analysis task.
"""
def get_task_code(self) -> str:
    """Return the code that identifies this task."""
    task_code = "DWS_ASSISTANT_FINANCE"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    target_table = "dws_assistant_finance_analysis"
    return target_table
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table (a fresh list per call)."""
    key_columns = ("site_id", "stat_date", "assistant_id")
    return list(key_columns)
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Read daily assistant revenue and monthly salary data for the window.

    Window bounds are reduced to ``date`` when they expose a ``date()``
    method. The configuration cache is loaded after both queries have run.

    Args:
        context: Task context providing ``window_start``, ``window_end`` and
            ``store_id``.

    Returns:
        Dict with ``daily_revenue``, ``monthly_salary``, ``start_date``,
        ``end_date`` and ``site_id``.
    """
    def to_date(value):
        # Accept both datetime-like values and plain dates.
        return value.date() if hasattr(value, 'date') else value

    start_date = to_date(context.window_start)
    end_date = to_date(context.window_end)
    site_id = context.store_id

    payload = {}
    # Revenue per assistant and day
    payload['daily_revenue'] = self._extract_daily_revenue(site_id, start_date, end_date)
    # Monthly salary, used to derive a per-day cost
    payload['monthly_salary'] = self._extract_monthly_salary(site_id, start_date, end_date)
    self.load_config_cache()
    payload['start_date'] = start_date
    payload['end_date'] = end_date
    payload['site_id'] = site_id
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Combine daily revenue with a per-day salary cost for each assistant.

    The daily cost is the month's ``gross_salary`` divided by ``work_days``
    (a zero or missing value is treated as 1). When no salary row matches the
    assistant and month, the cost is zero. ``gross_margin`` is zero whenever
    total revenue is not positive.

    Args:
        extracted: Dict produced by ``extract``.
        context: Task context (not used here).

    Returns:
        One record per daily revenue row.
    """
    site_id = extracted['site_id']

    # (assistant_id, salary_month) -> salary row
    salary_by_key = {}
    for entry in extracted['monthly_salary']:
        salary_assistant = entry.get('assistant_id')
        salary_month = entry.get('salary_month')
        if salary_assistant and salary_month:
            salary_by_key[(salary_assistant, salary_month)] = entry

    records = []
    for rev in extracted['daily_revenue']:
        assistant_id = rev.get('assistant_id')
        stat_date = rev.get('stat_date')

        # Salary rows are looked up by the first day of the revenue row's month.
        if isinstance(stat_date, date):
            month_key = stat_date.replace(day=1)
        else:
            month_key = None
        salary = salary_by_key.get((assistant_id, month_key), {})

        gross_salary = self.safe_decimal(salary.get('gross_salary', 0))
        work_days = self.safe_int(salary.get('work_days', 1)) or 1
        cost_daily = gross_salary / Decimal(str(work_days))

        revenue_total = self.safe_decimal(rev.get('revenue_total', 0))
        gross_profit = revenue_total - cost_daily
        if revenue_total > 0:
            gross_margin = gross_profit / revenue_total
        else:
            gross_margin = Decimal('0')

        records.append({
            'site_id': site_id,
            'tenant_id': self.config.get("app.tenant_id", site_id),
            'stat_date': stat_date,
            'assistant_id': assistant_id,
            'assistant_nickname': rev.get('assistant_nickname'),
            'revenue_total': revenue_total,
            'revenue_base': self.safe_decimal(rev.get('revenue_base', 0)),
            'revenue_bonus': self.safe_decimal(rev.get('revenue_bonus', 0)),
            'revenue_room': self.safe_decimal(rev.get('revenue_room', 0)),
            'cost_daily': cost_daily,
            'gross_profit': gross_profit,
            'gross_margin': gross_margin,
            'service_count': self.safe_int(rev.get('service_count', 0)),
            'service_hours': self.safe_decimal(rev.get('service_hours', 0)),
            'room_service_count': self.safe_int(rev.get('room_service_count', 0)),
            'room_service_hours': self.safe_decimal(rev.get('room_service_hours', 0)),
            'unique_customers': self.safe_int(rev.get('unique_customers', 0)),
        })
    return records
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the records to the target table using delete-then-insert.

    Args:
        transformed: Records produced by ``transform``.
        context: Task context forwarded to ``delete_existing_data``.

    Returns:
        Dict with a ``counts`` summary and, when rows were written, an
        ``extra`` entry holding the number of deleted rows.
    """
    summary = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
    if transformed:
        deleted = self.delete_existing_data(context, date_col="stat_date")
        inserted = self.bulk_insert(transformed)
        summary["fetched"] = len(transformed)
        summary["inserted"] = inserted
        return {"counts": summary, "extra": {"deleted": deleted}}
    return {"counts": summary}
def _extract_daily_revenue(self, site_id: int, start_date: date, end_date: date) -> List[Dict[str, Any]]:
    """
    Query revenue per assistant and day for a site and date range.

    Non-deleted rows of ``billiards_dwd.dwd_assistant_service_log`` are
    left-joined to active rows of ``billiards_dws.cfg_skill_type``; a skill
    without a match counts as ``'BASE'``. Rows are selected by the date of
    ``start_use_time``, both bounds inclusive.

    Returns:
        One dict per (date, assistant); empty list when the query yields
        nothing.
    """
    # NOTE(review): if cfg_skill_type can hold more than one active row per
    # skill_id the join would multiply service rows — verify uniqueness.
    sql = """
SELECT
DATE(s.start_use_time) AS stat_date,
s.site_assistant_id AS assistant_id,
MAX(s.nickname) AS assistant_nickname,
COUNT(*) AS service_count,
SUM(s.income_seconds) / 3600.0 AS service_hours,
SUM(s.ledger_amount) AS revenue_total,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'BASE' THEN s.ledger_amount ELSE 0 END) AS revenue_base,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'BONUS' THEN s.ledger_amount ELSE 0 END) AS revenue_bonus,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'ROOM' THEN s.ledger_amount ELSE 0 END) AS revenue_room,
COUNT(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'ROOM' THEN 1 END) AS room_service_count,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'ROOM' THEN s.income_seconds ELSE 0 END) / 3600.0 AS room_service_hours,
COUNT(DISTINCT CASE WHEN s.tenant_member_id > 0 THEN s.tenant_member_id END) AS unique_customers
FROM billiards_dwd.dwd_assistant_service_log s
LEFT JOIN billiards_dws.cfg_skill_type st
ON st.skill_id = s.skill_id AND st.is_active = TRUE
WHERE s.site_id = %s
AND DATE(s.start_use_time) >= %s
AND DATE(s.start_use_time) <= %s
AND s.is_delete = 0
GROUP BY DATE(s.start_use_time), s.site_assistant_id
"""
    fetched = self.db.query(sql, (site_id, start_date, end_date))
    if not fetched:
        return []
    return [dict(item) for item in fetched]
def _extract_monthly_salary(self, site_id: int, start_date: date, end_date: date) -> List[Dict[str, Any]]:
    """
    Load monthly salary rows and attach each assistant's work days per month.

    Salary rows are read from ``billiards_dws.dws_assistant_salary_calc`` for
    every month touched by ``[start_date, end_date]``. Work days are the
    number of distinct ``stat_date`` values per assistant and month in
    ``billiards_dws.dws_assistant_daily_detail``.

    The work-day count covers the complete calendar months, not just the
    processing window: the caller divides a whole month's gross salary by
    this number, so counting only the window's days (for example a single
    day) would charge the full monthly salary to that day.

    Args:
        site_id: Site to read.
        start_date: First date of the processing window.
        end_date: Last date of the processing window.

    Returns:
        Salary rows as dicts, each extended with ``work_days``; 20 is used
        when no daily detail exists for that assistant and month.
    """
    # Months touched by the window, as first-of-month dates.
    month_start = start_date.replace(day=1)
    month_end = end_date.replace(day=1)
    # First day of the month after the last touched month (exclusive bound).
    next_month_start = (month_end.replace(day=28) + timedelta(days=4)).replace(day=1)

    salary_sql = """
        SELECT
            assistant_id,
            salary_month,
            gross_salary,
            effective_hours
        FROM billiards_dws.dws_assistant_salary_calc
        WHERE site_id = %s
          AND salary_month >= %s
          AND salary_month <= %s
    """
    salary_rows = self.db.query(salary_sql, (site_id, month_start, month_end))

    work_days_sql = """
        SELECT
            assistant_id,
            DATE_TRUNC('month', stat_date)::DATE AS month,
            COUNT(DISTINCT stat_date) AS work_days
        FROM billiards_dws.dws_assistant_daily_detail
        WHERE site_id = %s
          AND stat_date >= %s
          AND stat_date < %s
        GROUP BY assistant_id, DATE_TRUNC('month', stat_date)
    """
    work_days_rows = self.db.query(work_days_sql, (site_id, month_start, next_month_start))
    work_days_index = {
        (row['assistant_id'], row['month']): row['work_days']
        for row in (work_days_rows or [])
    }

    results = []
    for row in (salary_rows or []):
        row_dict = dict(row)
        lookup_key = (row_dict.get('assistant_id'), row_dict.get('salary_month'))
        # Default of 20 work days when the assistant has no daily detail for the month.
        row_dict['work_days'] = work_days_index.get(lookup_key, 20)
        results.append(row_dict)
    return results
__all__ = ['AssistantFinanceTask']

View File

@@ -0,0 +1,600 @@
# -*- coding: utf-8 -*-
"""
助教月度业绩汇总任务
功能说明:
"助教+月份"为粒度,汇总月度业绩及档位计算
数据来源:
- dws_assistant_daily_detail: 日度明细(聚合)
- dim_assistant: 助教维度(入职日期、等级)
- cfg_performance_tier: 绩效档位配置
目标表:
billiards_dws.dws_assistant_monthly_summary
更新策略:
- 更新频率:每日更新当月数据
- 幂等方式delete-before-insert按月份
业务规则:
- 新入职判断入职日期在月1日0点之后则为新入职
- 有效业绩total_hours - trashed_hours
- 档位匹配根据有效业绩小时数匹配cfg_performance_tier
- 排名计算按有效业绩小时数降序考虑并列如2个第一则都是1下一个是3
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class AssistantMonthlyTask(BaseDwsTask):
"""
Assistant monthly performance summary task.

Summarises, per assistant and month:
- work days, service counts and durations
- valid performance (after deducting voided records)
- performance tier matching
- monthly ranking (used for the Top-3 bonus)
"""
def get_task_code(self) -> str:
    """Return the code that identifies this task."""
    task_code = "DWS_ASSISTANT_MONTHLY"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    target_table = "dws_assistant_monthly_summary"
    return target_table
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table (a fresh list per call)."""
    key_columns = ("site_id", "assistant_id", "stat_month")
    return list(key_columns)
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Gather the monthly inputs for the months selected from the window.

    The months touched by the window are computed and then narrowed by
    ``_filter_months_for_schedule``. When no month remains, an empty payload
    is returned and no query is issued.

    Args:
        context: Task context providing ``window_start``, ``window_end`` and
            ``store_id``.

    Returns:
        Dict with ``daily_aggregates``, ``monthly_uniques``,
        ``assistant_info``, ``months`` and ``site_id``.
    """
    def to_date(value):
        # Accept both datetime-like values and plain dates.
        return value.date() if hasattr(value, 'date') else value

    start_date = to_date(context.window_start)
    end_date = to_date(context.window_end)
    site_id = context.store_id

    # Months covered by the window, restricted by the scheduling rules.
    months = self._filter_months_for_schedule(
        self._get_months_in_range(start_date, end_date),
        end_date
    )
    self.logger.info(
        "%s: 提取数据,月份范围 %s",
        self.get_task_code(), [str(m) for m in months]
    )

    payload = {
        'daily_aggregates': [],
        'monthly_uniques': [],
        'assistant_info': {},
        'months': [],
        'site_id': site_id
    }
    if not months:
        self.logger.info("%s: 无需处理月份,跳过", self.get_task_code())
        return payload

    # 1. Daily detail aggregated per month
    payload['daily_aggregates'] = self._extract_daily_aggregates(site_id, months)
    # 1.1 Distinct customers / tables per month
    payload['monthly_uniques'] = self._extract_monthly_uniques(site_id, months)
    # 2. Assistant attributes
    payload['assistant_info'] = self._extract_assistant_info(site_id)
    # 3. Configuration cache
    self.load_config_cache()
    payload['months'] = months
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Build the monthly records for every selected month.

    The monthly distinct-count rows are indexed by (assistant id, month) and
    handed, with the other inputs, to ``_process_month`` once per month; the
    per-month results are concatenated in month order.

    Args:
        extracted: Dict produced by ``extract``.
        context: Task context (not used here).

    Returns:
        All records returned by ``_process_month``.
    """
    daily_aggregates = extracted['daily_aggregates']
    monthly_uniques = extracted['monthly_uniques']
    assistant_info = extracted['assistant_info']
    months = extracted['months']
    site_id = extracted['site_id']
    self.logger.info(
        "%s: 转换数据,%d 个月份,%d 条聚合记录",
        self.get_task_code(), len(months), len(daily_aggregates)
    )

    # (assistant_id, stat_month) -> distinct-count row; rows missing either part are skipped.
    unique_lookup = {}
    for row in (monthly_uniques or []):
        assistant_id = row.get('assistant_id')
        stat_month = row.get('stat_month')
        if assistant_id and stat_month:
            unique_lookup[(assistant_id, stat_month)] = row

    return [
        record
        for month in months
        for record in self._process_month(
            daily_aggregates,
            assistant_info,
            unique_lookup,
            month,
            site_id
        )
    ]
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the monthly records, replacing what is stored for their months.

    Args:
        transformed: Records produced by ``transform``.
        context: Task context forwarded to ``_delete_by_months``.

    Returns:
        Dict with a ``counts`` summary and, when rows were written, an
        ``extra`` entry holding the number of deleted rows.
    """
    summary = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
    if transformed:
        # Remove the stored rows of the affected months, then insert in bulk.
        deleted = self._delete_by_months(context, transformed)
        inserted = self.bulk_insert(transformed)
        self.logger.info(
            "%s: 加载完成,删除 %d 行,插入 %d",
            self.get_task_code(), deleted, inserted
        )
        summary["fetched"] = len(transformed)
        summary["inserted"] = inserted
        return {"counts": summary, "extra": {"deleted": deleted}}

    self.logger.info("%s: 无数据需要写入", self.get_task_code())
    return {"counts": summary}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _get_months_in_range(self, start_date: date, end_date: date) -> List[date]:
    """
    List the first day of every month touched by a date range.

    Args:
        start_date: First date of the range.
        end_date: Last date of the range.

    Returns:
        First-of-month values from the month of ``start_date`` through the
        month of ``end_date``, ascending; empty when the start month is
        after the end month.
    """
    cursor = start_date.replace(day=1)
    last_month = end_date.replace(day=1)
    collected = []
    while cursor <= last_month:
        collected.append(cursor)
        # December rolls over into January of the following year.
        year_carry, month_index = divmod(cursor.month, 12)
        cursor = cursor.replace(year=cursor.year + year_carry, month=month_index + 1)
    return collected
def _filter_months_for_schedule(self, months: List[date], end_date: date) -> List[date]:
    """
    Restrict the candidate months according to the scheduling configuration.

    Rules, in order of precedence:
    - ``dws.monthly.history_months`` > 0: keep the month of ``end_date`` and
      that many months before it.
    - ``dws.monthly.allow_history`` truthy: keep every month.
    - otherwise: keep the month of ``end_date`` and, while the day of
      ``end_date`` is within ``dws.monthly.prev_month_grace_days`` (default
      5), also the month before it.

    Months that are dropped are reported in an info log line.

    Args:
        months: Candidate first-of-month dates.
        end_date: End of the processing window.

    Returns:
        The retained months in their original order.
    """
    if not months:
        return []

    settings = self.config
    history_months = self.safe_int(settings.get("dws.monthly.history_months", 0))
    if history_months <= 0 and bool(settings.get("dws.monthly.allow_history", False)):
        return months

    current_month = self.get_month_first_day(end_date)
    allowed = {current_month}
    if history_months > 0:
        months_back = range(1, history_months + 1)
    else:
        grace_days = self.safe_int(settings.get("dws.monthly.prev_month_grace_days", 5))
        within_grace = grace_days > 0 and end_date.day <= grace_days
        months_back = (1,) if within_grace else ()
    for back in months_back:
        allowed.add(self.get_month_first_day(self._shift_months(current_month, -back)))

    kept, dropped = [], []
    for month in months:
        (kept if month in allowed else dropped).append(month)
    if dropped:
        self.logger.info(
            "%s: 跳过历史月份 %s",
            self.get_task_code(),
            [str(m) for m in dropped]
        )
    return kept
def _extract_daily_aggregates(
    self,
    site_id: int,
    months: List[date]
) -> List[Dict[str, Any]]:
    """
    Aggregate the daily assistant detail table into one row per assistant and month.

    The month boundaries are bound as query parameters rather than being
    formatted into the SQL text, matching the other extraction queries.

    Args:
        site_id: Site whose rows are aggregated.
        months: Month-start dates to include; each selects the half-open
            range [month start, next month start).

    Returns:
        One dict per group returned by the query; empty when ``months`` is
        empty or the query yields nothing.
    """
    if not months:
        return []
    # One placeholder pair per month; params stay aligned with clause order.
    range_clauses = []
    params: List[Any] = [site_id]
    for month in months:
        # Day 28 exists in every month; +4 days always lands in the next one.
        next_month = (month.replace(day=28) + timedelta(days=4)).replace(day=1)
        range_clauses.append("(stat_date >= %s AND stat_date < %s)")
        params.extend((month, next_month))
    month_where = " OR ".join(range_clauses)
    sql = f"""
SELECT
assistant_id,
assistant_nickname,
assistant_level_code,
assistant_level_name,
DATE_TRUNC('month', stat_date)::DATE AS stat_month,
COUNT(DISTINCT stat_date) AS work_days,
SUM(total_service_count) AS total_service_count,
SUM(base_service_count) AS base_service_count,
SUM(bonus_service_count) AS bonus_service_count,
SUM(room_service_count) AS room_service_count,
SUM(total_hours) AS total_hours,
SUM(base_hours) AS base_hours,
SUM(bonus_hours) AS bonus_hours,
SUM(room_hours) AS room_hours,
SUM(total_ledger_amount) AS total_ledger_amount,
SUM(base_ledger_amount) AS base_ledger_amount,
SUM(bonus_ledger_amount) AS bonus_ledger_amount,
SUM(room_ledger_amount) AS room_ledger_amount,
SUM(unique_customers) AS total_unique_customers,
SUM(unique_tables) AS total_unique_tables,
SUM(trashed_seconds) AS trashed_seconds,
SUM(trashed_count) AS trashed_count
FROM billiards_dws.dws_assistant_daily_detail
WHERE site_id = %s AND ({month_where})
GROUP BY assistant_id, assistant_nickname, assistant_level_code, assistant_level_name,
DATE_TRUNC('month', stat_date)
"""
    rows = self.db.query(sql, tuple(params))
    return [dict(row) for row in rows] if rows else []
def _extract_monthly_uniques(
self,
site_id: int,
months: List[date]
) -> List[Dict[str, Any]]:
"""
Count distinct customers and tables per assistant and month straight from the DWD service log.

Args:
    site_id: Site whose service-log rows are read.
    months: Month-start dates of interest.

Returns:
    One dict per (assistant, month) group with ``assistant_id``,
    ``stat_month``, ``unique_customers`` and ``unique_tables``; empty
    when ``months`` is empty or the query yields nothing.
"""
if not months:
return []
# A single contiguous window from the earliest requested month up to (but
# excluding) the month after the latest one; any month lying between
# non-adjacent entries is therefore queried as well.
start_month = min(months)
end_month = max(months)
next_month = (end_month.replace(day=28) + timedelta(days=4)).replace(day=1)
# Customers are counted only when tenant_member_id > 0; rows are limited to is_delete = 0.
sql = """
SELECT
site_assistant_id AS assistant_id,
DATE_TRUNC('month', start_use_time)::DATE AS stat_month,
COUNT(DISTINCT CASE WHEN tenant_member_id > 0 THEN tenant_member_id END) AS unique_customers,
COUNT(DISTINCT site_table_id) AS unique_tables
FROM billiards_dwd.dwd_assistant_service_log
WHERE site_id = %s
AND start_use_time >= %s
AND start_use_time < %s
AND is_delete = 0
GROUP BY site_assistant_id, DATE_TRUNC('month', start_use_time)
"""
rows = self.db.query(sql, (site_id, start_month, next_month))
return [dict(row) for row in rows] if rows else []
def _extract_assistant_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
    """
    Load the current assistant dimension rows of a site, keyed by assistant id.

    Args:
        site_id: Site whose assistants are loaded.

    Returns:
        Mapping of ``assistant_id`` to the row dict (``assistant_id``,
        ``nickname``, ``assistant_level``, ``hire_date``). When several rows
        share an id, the last one returned by the query wins.
    """
    sql = """
SELECT
assistant_id,
nickname,
level AS assistant_level,
entry_time AS hire_date
FROM billiards_dwd.dim_assistant
WHERE site_id = %s
AND scd2_is_current = 1 -- 当前有效记录
"""
    fetched = self.db.query(sql, (site_id,)) or []
    as_dicts = (dict(item) for item in fetched)
    return {entry['assistant_id']: entry for entry in as_dicts}
# ==========================================================================
# 数据转换方法
# ==========================================================================
def _process_month(
self,
daily_aggregates: List[Dict[str, Any]],
assistant_info: Dict[int, Dict[str, Any]],
monthly_unique_index: Dict[Tuple[int, date], Dict[str, Any]],
month: date,
site_id: int
) -> List[Dict[str, Any]]:
"""
Build the monthly summary records for a single month.

Args:
    daily_aggregates: Per-assistant monthly aggregates (all months mixed).
    assistant_info: Assistant dimension rows keyed by assistant id.
    monthly_unique_index: Distinct customer/table counts keyed by
        (assistant id, month).
    month: Month being processed; matched against each aggregate's
        ``stat_month``.
    site_id: Site the records belong to.

Returns:
    One record dict per aggregate of that month, with ranking fields
    filled in by ``_calculate_ranks``; empty when the month has no data.
"""
# Keep only the aggregates that belong to this month.
month_data = [
agg for agg in daily_aggregates
if agg.get('stat_month') == month
]
if not month_data:
return []
# Build one monthly summary record per aggregate row.
month_records = []
for agg in month_data:
assistant_id = agg.get('assistant_id')
asst_info = assistant_info.get(assistant_id, {})
# Effective performance = total hours minus trashed (voided) hours.
total_hours = self.safe_decimal(agg.get('total_hours', 0))
trashed_hours = self.seconds_to_hours(self.safe_int(agg.get('trashed_seconds', 0)))
effective_hours = total_hours - trashed_hours
# Decide whether the assistant was newly hired in this month.
hire_date = asst_info.get('hire_date')
is_new_hire = False
if hire_date:
# Normalise datetime to date before the month comparison.
if isinstance(hire_date, datetime):
hire_date = hire_date.date()
is_new_hire = self.is_new_hire_in_month(hire_date, month)
# Tier matching: new hires are tiered on projected hours and may be capped.
tier_hours = effective_hours
max_tier_level = None
if is_new_hire:
tier_hours = self._calc_new_hire_tier_hours(effective_hours, self.safe_int(agg.get('work_days', 0)))
if self._should_apply_new_hire_tier_cap(month, hire_date):
max_tier_level = self._get_new_hire_max_tier_level()
tier = self.get_performance_tier(
tier_hours,
is_new_hire,
effective_date=month,
max_tier_level=max_tier_level
)
# Level information as of the month's last day (stored on the record).
month_end = self._get_month_end(month)
level_info = self.get_assistant_level_asof(assistant_id, month_end)
# Monthly distinct customers/tables come from the DWD-level index; the summed
# daily values are only a fallback when the index has no entry.
unique_info = monthly_unique_index.get((assistant_id, month), {})
unique_customers = self.safe_int(
unique_info.get('unique_customers', agg.get('total_unique_customers', 0))
)
unique_tables = self.safe_int(
unique_info.get('unique_tables', agg.get('total_unique_tables', 0))
)
record = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'assistant_id': assistant_id,
'assistant_nickname': agg.get('assistant_nickname'),
'stat_month': month,
'assistant_level_code': level_info.get('level_code') if level_info else agg.get('assistant_level_code'),
'assistant_level_name': level_info.get('level_name') if level_info else agg.get('assistant_level_name'),
'hire_date': hire_date,
'is_new_hire': is_new_hire,
'work_days': self.safe_int(agg.get('work_days', 0)),
'total_service_count': self.safe_int(agg.get('total_service_count', 0)),
'base_service_count': self.safe_int(agg.get('base_service_count', 0)),
'bonus_service_count': self.safe_int(agg.get('bonus_service_count', 0)),
'room_service_count': self.safe_int(agg.get('room_service_count', 0)),
'total_hours': total_hours,
'base_hours': self.safe_decimal(agg.get('base_hours', 0)),
'bonus_hours': self.safe_decimal(agg.get('bonus_hours', 0)),
'room_hours': self.safe_decimal(agg.get('room_hours', 0)),
'effective_hours': effective_hours,
'trashed_hours': trashed_hours,
'total_ledger_amount': self.safe_decimal(agg.get('total_ledger_amount', 0)),
'base_ledger_amount': self.safe_decimal(agg.get('base_ledger_amount', 0)),
'bonus_ledger_amount': self.safe_decimal(agg.get('bonus_ledger_amount', 0)),
'room_ledger_amount': self.safe_decimal(agg.get('room_ledger_amount', 0)),
'unique_customers': unique_customers,
'unique_tables': unique_tables,
'avg_service_seconds': self._calc_avg_service_seconds(agg),
'tier_id': tier.get('tier_id') if tier else None,
'tier_code': tier.get('tier_code') if tier else None,
'tier_name': tier.get('tier_name') if tier else None,
'rank_by_hours': None, # filled in by _calculate_ranks below
'rank_with_ties': None, # filled in by _calculate_ranks below
}
month_records.append(record)
# Assign rankings across all records of the month (mutates them in place).
self._calculate_ranks(month_records)
return month_records
def _get_month_end(self, month: date) -> date:
    """
    Return the last calendar day of the month containing ``month``.

    Args:
        month: Any date inside the month of interest.

    Returns:
        The final day of that month.
    """
    # Day 28 exists in every month, and 28 + 4 days always crosses into the
    # following month; snapping to day 1 gives the next month's start.
    following_start = (month.replace(day=28) + timedelta(days=4)).replace(day=1)
    return following_start - timedelta(days=1)
def _calc_avg_service_seconds(self, agg: Dict[str, Any]) -> Decimal:
    """
    Compute the average duration of one service, in seconds.

    Args:
        agg: Aggregate row providing ``total_service_count`` and
            ``total_hours``.

    Returns:
        ``total_hours`` converted to seconds and divided by the service
        count, or ``Decimal('0')`` when the count is zero.
    """
    service_count = self.safe_int(agg.get('total_service_count', 0))
    if service_count != 0:
        seconds = self.safe_decimal(agg.get('total_hours', 0)) * Decimal('3600')
        return seconds / Decimal(str(service_count))
    return Decimal('0')
def _calc_new_hire_tier_hours(self, effective_hours: Decimal, work_days: int) -> Decimal:
    """
    Project a new hire's hours onto a 30-day month for tier matching.

    Per the original note, this figure is used for tier assignment only and
    does not affect bonuses or ranking.

    Args:
        effective_hours: Effective hours actually worked.
        work_days: Number of days worked.

    Returns:
        Daily average multiplied by 30, or ``Decimal('0')`` when
        ``work_days`` is not positive.
    """
    if work_days > 0:
        daily_average = effective_hours / Decimal(str(work_days))
        return daily_average * Decimal('30')
    return Decimal('0')
def _should_apply_new_hire_tier_cap(self, stat_month: date, hire_date: Optional[date]) -> bool:
    """
    Decide whether the new-hire tier cap applies.

    The cap applies only when both hold:
      - ``stat_month`` is on or after the rule's effective month, and
      - the day-of-month of ``hire_date`` is later than the cap day.

    Args:
        stat_month: Month being summarised.
        hire_date: Hire date, or ``None`` when unknown.

    Returns:
        ``True`` when the cap should be applied; ``False`` otherwise,
        including when ``hire_date`` is missing.
    """
    if not hire_date:
        return False
    rule_start = self._get_new_hire_cap_effective_from()
    threshold_day = self._get_new_hire_cap_day()
    if stat_month < rule_start:
        return False
    return hire_date.day > threshold_day
def _get_new_hire_cap_effective_from(self) -> date:
    """
    Read the month from which the new-hire tier cap takes effect.

    The value comes from ``dws.monthly.new_hire_cap_effective_from`` and may
    be a ``datetime``, a ``date`` or a ``YYYY-MM-DD`` string.

    Returns:
        The configured date, or 2026-03-01 when the setting is absent,
        unparsable or of an unsupported type.
    """
    fallback = date(2026, 3, 1)
    configured = self.config.get("dws.monthly.new_hire_cap_effective_from", "2026-03-01")
    if isinstance(configured, str):
        try:
            return datetime.strptime(configured.strip(), "%Y-%m-%d").date()
        except ValueError:
            return fallback
    # datetime must be tested before date because it is a subclass of date.
    if isinstance(configured, datetime):
        return configured.date()
    if isinstance(configured, date):
        return configured
    return fallback
def _get_new_hire_cap_day(self) -> int:
    """
    Read the day-of-month threshold for the new-hire tier cap.

    Returns:
        ``dws.monthly.new_hire_cap_day`` (default 25), clamped to 1..31.
    """
    configured = self.safe_int(self.config.get("dws.monthly.new_hire_cap_day", 25))
    if configured < 1:
        return 1
    if configured > 31:
        return 31
    return configured
def _get_new_hire_max_tier_level(self) -> int:
    """
    Read the highest tier level a capped new hire may receive.

    Returns:
        ``dws.monthly.new_hire_max_tier_level`` (default 2), never below 0.
    """
    configured = self.safe_int(self.config.get("dws.monthly.new_hire_max_tier_level", 2))
    return configured if configured > 0 else 0
def _calculate_ranks(self, records: List[Dict[str, Any]]) -> None:
    """
    Rank the records by effective hours and write the ranks onto them.

    Per the original note, Top 3 ranking is based on total effective hours
    and ties share a rank (two firsts are followed by a third).

    Args:
        records: Monthly records; each gets ``rank_by_hours`` and
            ``rank_with_ties`` set in place to the same tie-aware rank.

    Returns:
        None. Does nothing for an empty list.
    """
    if not records:
        return
    zero = Decimal('0')
    # Highest effective hours first.
    by_hours_desc = sorted(
        records,
        key=lambda rec: rec.get('effective_hours', zero),
        reverse=True
    )
    scored = [
        (rec.get('assistant_id'), rec.get('effective_hours', zero))
        for rec in by_hours_desc
    ]
    # The helper yields (assistant_id, rank, dense_rank); only rank is used.
    rank_by_assistant = {}
    for assistant_id, rank, _dense_rank in self.calculate_rank_with_ties(scored):
        rank_by_assistant[assistant_id] = rank
    for rec in records:
        key = rec.get('assistant_id')
        if key in rank_by_assistant:
            rec['rank_by_hours'] = rec['rank_with_ties'] = rank_by_assistant[key]
def _delete_by_months(
    self,
    context: TaskContext,
    records: List[Dict[str, Any]]
) -> int:
    """
    Delete existing target-table rows for every month present in ``records``.

    Args:
        context: Task context; ``context.store_id`` selects the site.
        records: Records about to be inserted; their ``stat_month`` values
            determine which months are cleared.

    Returns:
        Total number of rows deleted, as reported by the cursor.
    """
    target_months = {rec.get('stat_month') for rec in records if rec.get('stat_month')}
    if not target_months:
        return 0
    target_table = self.get_target_table()
    full_table = f"{self.DWS_SCHEMA}.{target_table}"
    # The statement text is identical for every month, so build it once.
    statement = f"""
DELETE FROM {full_table}
WHERE site_id = %s AND stat_month = %s
"""
    removed = 0
    with self.db.conn.cursor() as cursor:
        for target in target_months:
            cursor.execute(statement, (context.store_id, target))
            removed += cursor.rowcount
    return removed
# Explicit export list so the task class can be imported from outside the module.
__all__ = ['AssistantMonthlyTask']

View File

@@ -0,0 +1,437 @@
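# -*- coding: utf-8 -*-
"""
Assistant salary calculation task.

Purpose:
    Compute monthly salary details at "assistant + month" granularity.

Data sources:
    - dws_assistant_monthly_summary: monthly performance summary
    - dws_assistant_recharge_commission: recharge commission (Excel import)
    - cfg_performance_tier: performance tier configuration
    - cfg_assistant_level_price: level pricing configuration
    - cfg_bonus_rules: bonus rule configuration

Target table:
    billiards_dws.dws_assistant_salary_calc

Update strategy:
    - Frequency: at the start of each month, compute the previous month's salary
    - Idempotency: delete-before-insert (per month)

Business rules (from "DWS数据库处理需求.md"):
    - Base-course income = base-course hours × (customer price - professional-course commission)
      Example: intermediate assistant, 170 base-course hours, tier 3 = 170 × (108 - 13) = 16,150 yuan
    - Bonus-course income = bonus-course hours × bonus-course price × (1 - tip-course commission ratio)
      Example: 15 bonus-course hours, tier 3 = 15 × 190 × (1 - 0.35) = 1,852.5 yuan
    - Room-course income = room-course hours × (room-course customer price - professional-course commission)
    - Sprint bonus: configured in the rule table (legacy rule: not cumulative, the highest tier applies)
    - Top 3 bonus: 1st: 1000, 2nd: 600, 3rd: 400 (tied ranks all receive it)
    - Recharge commission: taken from dws_assistant_recharge_commission
    - SCD2 rule: level pricing uses the historical value valid for the salary month

Author: ETL team
Created: 2026-02-01
"""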
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class AssistantSalaryTask(BaseDwsTask):
"""
Assistant salary calculation task.

Computes each assistant's monthly salary breakdown:
- Course income (base course + bonus course)
- Deductions (tier deduction + other)
- Bonuses (tier bonus + sprint + Top 3 + recharge commission + other)
- Gross salary payable
"""
def get_task_code(self) -> str:
    """Return the code string that identifies this task."""
    task_code = "DWS_ASSISTANT_SALARY"
    return task_code
def get_target_table(self) -> str:
    """Return the unqualified name of the table this task writes to."""
    table_name = "dws_assistant_salary_calc"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the column names that form the target table's primary key."""
    key_columns = ["site_id", "assistant_id", "salary_month"]
    return key_columns
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Gather the inputs needed to compute one month's salaries.

    Args:
        context: Task context supplying ``window_end`` and ``store_id``.

    Returns:
        Dict with ``monthly_summary``, ``recharge_commission``,
        ``salary_month`` and ``site_id``. When the run falls outside the
        settlement period the two lists are empty and ``salary_month`` is
        ``None``.
    """
    window_end = context.window_end
    # window_end may be a datetime or already a date.
    end_date = window_end.date() if hasattr(window_end, 'date') else window_end
    site_id = context.store_id
    if self._should_skip_run(end_date):
        self.logger.info("%s: 非工资结算期,跳过", self.get_task_code())
        return {
            'monthly_summary': [],
            'recharge_commission': [],
            'salary_month': None,
            'site_id': site_id,
        }
    salary_month = self._get_salary_month(end_date)
    self.logger.info(
        "%s: 提取数据,工资月份 %s",
        self.get_task_code(), salary_month
    )
    # Evaluated in order: monthly summary, then recharge commission.
    payload = {
        'monthly_summary': self._extract_monthly_summary(site_id, salary_month),
        'recharge_commission': self._extract_recharge_commission(site_id, salary_month),
        'salary_month': salary_month,
        'site_id': site_id
    }
    # Configuration cache is loaded after the data queries.
    self.load_config_cache()
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Turn the extracted inputs into one salary record per monthly summary row.

    Args:
        extracted: Output of ``extract``.
        context: Task context (not used here).

    Returns:
        Salary records produced by ``_calculate_salary``; empty when
        ``salary_month`` is missing.
    """
    if not extracted.get('salary_month'):
        return []
    monthly_summary = extracted['monthly_summary']
    recharge_commission = extracted['recharge_commission']
    salary_month = extracted['salary_month']
    site_id = extracted['site_id']
    self.logger.info(
        "%s: 转换数据,%d 条月度汇总记录",
        self.get_task_code(), len(monthly_summary)
    )
    # Sum recharge commission per assistant; rows without an id are ignored.
    commission_index = {}
    for entry in recharge_commission:
        assistant_id = entry.get('assistant_id')
        if not assistant_id:
            continue
        running_total = commission_index.get(assistant_id, Decimal('0'))
        commission_index[assistant_id] = running_total + self.safe_decimal(entry.get('commission_amount', 0))
    return [
        self._calculate_salary(summary, commission_index, salary_month, site_id)
        for summary in monthly_summary
    ]
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the salary records using delete-then-insert.

    Args:
        transformed: Records produced by ``transform``.
        context: Task context passed through to the delete step.

    Returns:
        Dict with a ``counts`` section and, when anything was written, an
        ``extra`` section carrying the number of deleted rows.
    """
    if not transformed:
        self.logger.info("%s: 无数据需要写入", self.get_task_code())
        return {"counts": {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}}
    # Clear the affected months first, then insert the fresh rows.
    deleted = self._delete_by_month(context, transformed)
    inserted = self.bulk_insert(transformed)
    self.logger.info(
        "%s: 加载完成,删除 %d 行,插入 %d",
        self.get_task_code(), deleted, inserted
    )
    counts = {
        "fetched": len(transformed),
        "inserted": inserted,
        "updated": 0,
        "skipped": 0,
        "errors": 0
    }
    return {"counts": counts, "extra": {"deleted": deleted}}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _get_salary_month(self, end_date: date) -> date:
    """
    Determine which month's salary a run should compute.

    A run inside the start-of-month settlement period computes the previous
    month; a later run computes the current month (e.g. an adjustment).
    The settlement period uses the same ``dws.salary.run_days`` setting
    (default 5) that gates the run, so enlarging it no longer makes days
    beyond the 5th target the current month by mistake.

    Args:
        end_date: End date of the processing window.

    Returns:
        First day of the salary month.
    """
    cutoff_day = self.safe_int(self.config.get("dws.salary.run_days", 5))
    if cutoff_day <= 0:
        # A non-positive setting disables the run gate; keep the historical
        # 5-day boundary for choosing the month.
        cutoff_day = 5
    if end_date.day > cutoff_day:
        # Outside the settlement period: compute the current month.
        return end_date.replace(day=1)
    # Inside the settlement period: compute the previous month.
    if end_date.month == 1:
        return date(end_date.year - 1, 12, 1)
    return date(end_date.year, end_date.month - 1, 1)
def _should_skip_run(self, end_date: date) -> bool:
    """
    Tell whether the salary run falls outside the settlement period.

    The run is never skipped when ``dws.salary.allow_out_of_cycle`` is
    truthy or ``dws.salary.run_days`` (default 5) is not positive;
    otherwise it is skipped once the day-of-month exceeds ``run_days``.

    Args:
        end_date: End date of the processing window.

    Returns:
        ``True`` when the run should be skipped.
    """
    if bool(self.config.get("dws.salary.allow_out_of_cycle", False)):
        return False
    run_days = self.safe_int(self.config.get("dws.salary.run_days", 5))
    return run_days > 0 and end_date.day > run_days
def _extract_monthly_summary(
self,
site_id: int,
salary_month: date
) -> List[Dict[str, Any]]:
"""
Read the monthly performance summary rows for one site and month.

Args:
    site_id: Site whose rows are read.
    salary_month: Month matched against ``stat_month``.

Returns:
    One dict per summary row; empty when the query yields nothing.
"""
sql = """
SELECT
assistant_id,
assistant_nickname,
stat_month,
assistant_level_code,
assistant_level_name,
hire_date,
is_new_hire,
effective_hours,
base_hours,
bonus_hours,
room_hours,
tier_id,
tier_code,
tier_name,
rank_with_ties
FROM billiards_dws.dws_assistant_monthly_summary
WHERE site_id = %s AND stat_month = %s
"""
rows = self.db.query(sql, (site_id, salary_month))
return [dict(row) for row in rows] if rows else []
def _extract_recharge_commission(
self,
site_id: int,
salary_month: date
) -> List[Dict[str, Any]]:
"""
Read the recharge commission rows for one site and month.

Args:
    site_id: Site whose rows are read.
    salary_month: Month matched against ``commission_month``.

Returns:
    One dict per row with ``assistant_id`` and ``commission_amount``;
    empty when the query yields nothing.
"""
sql = """
SELECT
assistant_id,
commission_amount
FROM billiards_dws.dws_assistant_recharge_commission
WHERE site_id = %s AND commission_month = %s
"""
rows = self.db.query(sql, (site_id, salary_month))
return [dict(row) for row in rows] if rows else []
# ==========================================================================
# 工资计算方法
# ==========================================================================
def _calculate_salary(
    self,
    summary: Dict[str, Any],
    commission_index: Dict[int, Decimal],
    salary_month: date,
    site_id: int
) -> Dict[str, Any]:
    """
    Compute one assistant's salary record for a month.

    Formulas:
      - base income  = base hours × (base course price - base deduction)
      - bonus income = bonus hours × bonus course price × (1 - bonus deduction ratio)
      - room income  = room hours × (room course price - base deduction)
      - gross salary = course income + sprint bonus + top-rank bonus
                       + recharge commission + other bonus

    Args:
        summary: Monthly summary row of the assistant.
        commission_index: Recharge commission totals keyed by assistant id.
        salary_month: Month being paid; also the as-of date for pricing,
            tier and bonus lookups.
        site_id: Site the record belongs to.

    Returns:
        The salary record as a dict. ``tier_id`` reflects the tier that was
        actually used, including when it was re-derived from hours.
    """
    assistant_id = summary.get('assistant_id')
    level_code = summary.get('assistant_level_code')
    effective_hours = self.safe_decimal(summary.get('effective_hours', 0))
    base_hours = self.safe_decimal(summary.get('base_hours', 0))
    bonus_hours = self.safe_decimal(summary.get('bonus_hours', 0))
    room_hours = self.safe_decimal(summary.get('room_hours', 0))
    is_new_hire = summary.get('is_new_hire', False)
    rank = summary.get('rank_with_ties')

    # Level pricing as of the salary month; defaults apply when no price row
    # is found. The room course price comes from configuration.
    level_price = self.get_level_price(level_code, salary_month)
    base_course_price = self.safe_decimal(
        level_price.get('base_course_price', 98) if level_price else 98
    )
    bonus_course_price = self.safe_decimal(
        level_price.get('bonus_course_price', 190) if level_price else 190
    )
    room_course_price = self.safe_decimal(
        self.config.get("dws.salary.room_course_price", 138)
    )

    # Tier: prefer the tier stored on the summary, otherwise re-derive it
    # from effective hours.
    tier_id = summary.get('tier_id')
    tier = self.get_performance_tier_by_id(tier_id, salary_month)
    if not tier:
        tier = self.get_performance_tier(
            effective_hours,
            is_new_hire,
            effective_date=salary_month
        )
        if tier:
            # Keep tier_id consistent with the tier_code/tier_name written
            # below instead of storing the summary's missing or stale id.
            tier_id = tier.get('tier_id', tier_id)
    # base_deduction: per-hour amount withheld on base and room courses.
    # bonus_deduction_ratio: share withheld from bonus-course income.
    base_deduction = self.safe_decimal(tier.get('base_deduction', 18)) if tier else Decimal('18')
    bonus_deduction_ratio = self.safe_decimal(tier.get('bonus_deduction_ratio', 0.40)) if tier else Decimal('0.40')
    vacation_days = tier.get('vacation_days', 0) if tier else 0
    vacation_unlimited = tier.get('vacation_unlimited', False) if tier else False

    # Course income, e.g. 170 h × (108 - 13) = 16,150 and
    # 15 h × 190 × (1 - 0.35) = 1,852.5.
    base_income = base_hours * (base_course_price - base_deduction)
    bonus_income = bonus_hours * bonus_course_price * (Decimal('1') - bonus_deduction_ratio)
    room_income = room_hours * (room_course_price - base_deduction)
    total_course_income = base_income + bonus_income + room_income

    # Bonuses.
    sprint_bonus = self.calculate_sprint_bonus(effective_hours, salary_month)
    top_rank_bonus = Decimal('0')
    if rank and rank <= 3:
        top_rank_bonus = self.calculate_top_rank_bonus(rank, salary_month)
    recharge_commission = commission_index.get(assistant_id, Decimal('0'))
    other_bonus = Decimal('0')  # placeholder for other bonuses
    total_bonus = sprint_bonus + top_rank_bonus + recharge_commission + other_bonus

    # Gross salary = course income + bonuses.
    gross_salary = total_course_income + total_bonus

    return {
        'site_id': site_id,
        'tenant_id': self.config.get("app.tenant_id", site_id),
        'assistant_id': assistant_id,
        'assistant_nickname': summary.get('assistant_nickname'),
        'salary_month': salary_month,
        'assistant_level_code': level_code,
        'assistant_level_name': summary.get('assistant_level_name'),
        'hire_date': summary.get('hire_date'),
        'is_new_hire': is_new_hire,
        'effective_hours': effective_hours,
        'base_hours': base_hours,
        'bonus_hours': bonus_hours,
        'room_hours': room_hours,
        'tier_id': tier_id,
        'tier_code': tier.get('tier_code') if tier else None,
        'tier_name': tier.get('tier_name') if tier else None,
        'rank_with_ties': rank,
        # Pricing
        'base_course_price': base_course_price,
        'bonus_course_price': bonus_course_price,
        'base_deduction': base_deduction,
        'bonus_deduction_ratio': bonus_deduction_ratio,
        # Income breakdown
        'base_income': base_income,
        'bonus_income': bonus_income,
        'room_income': room_income,
        'total_course_income': total_course_income,
        # Bonus breakdown
        'sprint_bonus': sprint_bonus,
        'top_rank_bonus': top_rank_bonus,
        'recharge_commission': recharge_commission,
        'other_bonus': other_bonus,
        'total_bonus': total_bonus,
        # Gross salary
        'gross_salary': gross_salary,
        # Vacation
        'vacation_days': vacation_days,
        'vacation_unlimited': vacation_unlimited,
        'calc_notes': self._build_calc_notes(summary, tier, sprint_bonus, top_rank_bonus),
    }
def _build_calc_notes(
    self,
    summary: Dict[str, Any],
    tier: Optional[Dict[str, Any]],
    sprint_bonus: Decimal,
    top_rank_bonus: Decimal
) -> Optional[str]:
    """
    Assemble the human-readable calculation note for a salary record.

    Args:
        summary: Monthly summary row (``is_new_hire``, ``rank_with_ties``).
        tier: Tier used for the calculation, if any.
        sprint_bonus: Sprint bonus amount; noted when positive.
        top_rank_bonus: Top-rank bonus amount; noted when positive.

    Returns:
        The note fragments joined by ``"; "``, or ``None`` when there is
        nothing to note.
    """
    fragments = []
    if summary.get('is_new_hire'):
        fragments.append("新入职首月")
    if tier:
        fragments.append(f"档位: {tier.get('tier_name', 'N/A')}")
    if sprint_bonus > 0:
        fragments.append(f"冲刺奖金: {sprint_bonus}")
    if top_rank_bonus > 0:
        fragments.append(f"Top{summary.get('rank_with_ties')}奖金: {top_rank_bonus}")
    if not fragments:
        return None
    return "; ".join(fragments)
def _delete_by_month(
    self,
    context: TaskContext,
    records: List[Dict[str, Any]]
) -> int:
    """
    Delete existing target-table rows for every salary month in ``records``.

    Args:
        context: Task context; ``context.store_id`` selects the site.
        records: Records about to be inserted; their ``salary_month``
            values determine which months are cleared.

    Returns:
        Total number of rows deleted, as reported by the cursor.
    """
    target_months = {rec.get('salary_month') for rec in records if rec.get('salary_month')}
    if not target_months:
        return 0
    target_table = self.get_target_table()
    full_table = f"{self.DWS_SCHEMA}.{target_table}"
    # The statement text is identical for every month, so build it once.
    statement = f"""
DELETE FROM {full_table}
WHERE site_id = %s AND salary_month = %s
"""
    removed = 0
    with self.db.conn.cursor() as cursor:
        for target in target_months:
            cursor.execute(statement, (context.store_id, target))
            removed += cursor.rowcount
    return removed
# Explicit export list so the task class can be imported from outside the module.
__all__ = ['AssistantSalaryTask']

1222
tasks/dws/base_dws_task.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,627 @@
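# -*- coding: utf-8 -*-
"""
Daily finance summary task.

Purpose:
    Summarise each day's financial data at "date" granularity.

Data sources:
    - dwd_settlement_head: settlement order header table
    - dwd_groupbuy_redemption: group-buy redemptions
    - dwd_recharge_order: recharge orders
    - dws_finance_expense_summary: expense summary (Excel import)
    - dws_platform_settlement: platform payouts / service fees (Excel import)

Target table:
    billiards_dws.dws_finance_daily_summary

Update strategy:
    - Frequency: the current day's data is refreshed hourly
    - Idempotency: delete-before-insert (per date)

Business rules:
    - Gross amount: table_charge_money + goods_money + assistant_pd_money + assistant_cx_money
    - Group-buy discount: coupon_amount - group-buy paid amount
    - Group-buy payment: pl_coupon_sale_amount, or the joined groupbuy_redemption.ledger_unit_price
    - First recharge / renewal: distinguished by the is_first field

Author: ETL team
Created: 2026-02-01
"""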
from __future__ import annotations
import calendar
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class FinanceDailyTask(BaseDwsTask):
"""
财务日度汇总任务
汇总每日的:
- 发生额(正价)
- 优惠拆分
- 确认收入
- 现金流(流入/流出)
- 充值统计(首充/续充)
- 订单统计
"""
def get_task_code(self) -> str:
    """Return the code string that identifies this task."""
    task_code = "DWS_FINANCE_DAILY"
    return task_code
def get_target_table(self) -> str:
    """Return the unqualified name of the table this task writes to."""
    table_name = "dws_finance_daily_summary"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the column names that form the target table's primary key."""
    key_columns = ["site_id", "stat_date"]
    return key_columns
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Collect every per-day input needed for the daily finance summary.

    Args:
        context: Task context supplying ``window_start``, ``window_end``
            and ``store_id``.

    Returns:
        Dict holding the seven extracted summaries plus ``start_date``,
        ``end_date`` and ``site_id``.
    """
    def _as_date(value):
        # Window bounds may be datetimes or already dates.
        return value.date() if hasattr(value, 'date') else value

    start_date = _as_date(context.window_start)
    end_date = _as_date(context.window_end)
    site_id = context.store_id
    self.logger.info(
        "%s: 提取数据,日期范围 %s ~ %s",
        self.get_task_code(), start_date, end_date
    )
    window = (site_id, start_date, end_date)
    # Entries are evaluated top to bottom: settlement, group-buy, recharge,
    # gift-card consumption, expense, platform settlement, big-customer discounts.
    return {
        'settlement_summary': self._extract_settlement_summary(*window),
        'groupbuy_summary': self._extract_groupbuy_summary(*window),
        'recharge_summary': self._extract_recharge_summary(*window),
        'gift_card_summary': self._extract_gift_card_consume_summary(*window),
        'expense_summary': self._extract_expense_summary(*window),
        'platform_summary': self._extract_platform_summary(*window),
        'big_customer_summary': self._extract_big_customer_discounts(*window),
        'start_date': start_date,
        'end_date': end_date,
        'site_id': site_id
    }
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Merge the extracted per-day summaries into one record per date.

    The set of dates is the union of dates found in the settlement,
    recharge, gift-card, expense and platform summaries. Group-buy and
    big-customer data are attached to those dates but do not add dates.

    Args:
        extracted: Output of ``extract``.
        context: Task context (not used here).

    Returns:
        Records built by ``_build_daily_record``, ordered by date.
    """
    settlement_summary = extracted['settlement_summary']
    groupbuy_summary = extracted['groupbuy_summary']
    recharge_summary = extracted['recharge_summary']
    gift_card_summary = extracted['gift_card_summary']
    expense_summary = extracted['expense_summary']
    platform_summary = extracted['platform_summary']
    big_customer_summary = extracted['big_customer_summary']
    site_id = extracted['site_id']
    self.logger.info(
        "%s: 转换数据,%d 天结账数据,%d 天充值数据",
        self.get_task_code(), len(settlement_summary), len(recharge_summary)
    )

    def _by_date(rows):
        # Index rows by their stat_date; later rows overwrite earlier ones.
        return {row['stat_date']: row for row in rows}

    date_sources = settlement_summary + recharge_summary + gift_card_summary + expense_summary + platform_summary
    dates = {row.get('stat_date') for row in date_sources if row.get('stat_date')}

    settle_index = _by_date(settlement_summary)
    groupbuy_index = _by_date(groupbuy_summary)
    recharge_index = _by_date(recharge_summary)
    gift_card_index = _by_date(gift_card_summary)
    expense_index = _by_date(expense_summary)
    platform_index = _by_date(platform_summary)
    big_customer_index = _by_date(big_customer_summary)

    return [
        self._build_daily_record(
            stat_date,
            settle_index.get(stat_date, {}),
            groupbuy_index.get(stat_date, {}),
            recharge_index.get(stat_date, {}),
            gift_card_index.get(stat_date, {}),
            expense_index.get(stat_date, {}),
            platform_index.get(stat_date, {}),
            big_customer_index.get(stat_date, {}),
            site_id
        )
        for stat_date in sorted(dates)
    ]
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the daily records using delete-then-insert.

    Args:
        transformed: Records produced by ``transform``.
        context: Task context passed to ``delete_existing_data``.

    Returns:
        Dict with a ``counts`` section and, when anything was written, an
        ``extra`` section carrying the number of deleted rows.
    """
    if not transformed:
        self.logger.info("%s: 无数据需要写入", self.get_task_code())
        return {"counts": {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}}
    # Clear existing rows by stat_date, then insert the fresh ones.
    deleted = self.delete_existing_data(context, date_col="stat_date")
    inserted = self.bulk_insert(transformed)
    self.logger.info(
        "%s: 加载完成,删除 %d 行,插入 %d",
        self.get_task_code(), deleted, inserted
    )
    counts = {
        "fetched": len(transformed),
        "inserted": inserted,
        "updated": 0,
        "skipped": 0,
        "errors": 0
    }
    return {"counts": counts, "extra": {"deleted": deleted}}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_settlement_summary(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
Aggregate settlement headers into one row per payment date.

Each row carries order counts (total / member / guest), the gross-amount
components, payment amounts, discount amounts and total consumption.

Args:
    site_id: Site whose settlements are read.
    start_date: First payment date included.
    end_date: Last payment date included.

Returns:
    One dict per date; empty when the query yields nothing.
"""
# NOTE(review): this query filters on DATE(pay_time), while the group-buy
# query in this file filters on pay_time directly with a half-open range —
# confirm whether the difference is intentional.
sql = """
SELECT
DATE(pay_time) AS stat_date,
COUNT(*) AS order_count,
COUNT(CASE WHEN member_id != 0 AND member_id IS NOT NULL THEN 1 END) AS member_order_count,
COUNT(CASE WHEN member_id = 0 OR member_id IS NULL THEN 1 END) AS guest_order_count,
-- 发生额(正价)
SUM(table_charge_money) AS table_fee_amount,
SUM(goods_money) AS goods_amount,
SUM(assistant_pd_money) AS assistant_pd_amount,
SUM(assistant_cx_money) AS assistant_cx_amount,
SUM(table_charge_money + goods_money + assistant_pd_money + assistant_cx_money) AS gross_amount,
-- 支付
SUM(pay_amount) AS cash_pay_amount,
SUM(recharge_card_amount) AS card_pay_amount,
SUM(balance_amount) AS balance_pay_amount,
-- 优惠
SUM(coupon_amount) AS coupon_amount,
SUM(adjust_amount) AS adjust_amount,
SUM(member_discount_amount) AS member_discount_amount,
SUM(rounding_amount) AS rounding_amount,
SUM(pl_coupon_sale_amount) AS pl_coupon_sale_amount,
-- 消费金额
SUM(consume_money) AS total_consume
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %s
AND DATE(pay_time) >= %s
AND DATE(pay_time) <= %s
GROUP BY DATE(pay_time)
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_groupbuy_summary(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
Aggregate group-buy usage into one row per payment date.

For settlements with ``coupon_amount > 0`` the paid amount is
``pl_coupon_sale_amount`` when positive, otherwise the joined
redemption's ``ledger_unit_price`` (0 when there is none).

Args:
    site_id: Site whose settlements are read.
    start_date: First payment date included.
    end_date: Last payment date included (the query adds one day and
        uses an exclusive upper bound).

Returns:
    One dict per date with ``stat_date``, ``groupbuy_count`` and
    ``groupbuy_pay_total``; empty when the query yields nothing.
"""
# NOTE(review): the LEFT JOIN is on order_settle_id only; if one settlement
# can have several non-deleted redemption rows, the count and the
# pl_coupon_sale_amount sum would be repeated per joined row — confirm the
# cardinality of dwd_groupbuy_redemption.
sql = """
SELECT
sh.pay_time::DATE AS stat_date,
COUNT(CASE WHEN sh.coupon_amount > 0 THEN 1 END) AS groupbuy_count,
SUM(
CASE
WHEN sh.coupon_amount > 0 THEN
CASE
WHEN sh.pl_coupon_sale_amount > 0 THEN sh.pl_coupon_sale_amount
ELSE COALESCE(gr.ledger_unit_price, 0)
END
ELSE 0
END
) AS groupbuy_pay_total
FROM billiards_dwd.dwd_settlement_head sh
LEFT JOIN billiards_dwd.dwd_groupbuy_redemption gr
ON gr.order_settle_id = sh.order_settle_id
AND COALESCE(gr.is_delete, 0) = 0
WHERE sh.site_id = %s
AND sh.pay_time >= %s
AND sh.pay_time < %s + INTERVAL '1 day'
GROUP BY sh.pay_time::DATE
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_recharge_summary(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
Aggregate recharge orders into one row per payment date.

Totals are reported overall and split into first recharges
(``is_first = 1``) and renewals (``is_first = 0`` or NULL), each with
count, total (paid + gift), paid amount and gift amount.

Args:
    site_id: Site whose recharge orders are read.
    start_date: First payment date included.
    end_date: Last payment date included.

Returns:
    One dict per date; empty when the query yields nothing.
"""
sql = """
SELECT
DATE(pay_time) AS stat_date,
COUNT(*) AS recharge_count,
SUM(pay_money + gift_money) AS recharge_total,
SUM(pay_money) AS recharge_cash,
SUM(gift_money) AS recharge_gift,
COUNT(CASE WHEN is_first = 1 THEN 1 END) AS first_recharge_count,
SUM(CASE WHEN is_first = 1 THEN pay_money + gift_money ELSE 0 END) AS first_recharge_total,
SUM(CASE WHEN is_first = 1 THEN pay_money ELSE 0 END) AS first_recharge_cash,
SUM(CASE WHEN is_first = 1 THEN gift_money ELSE 0 END) AS first_recharge_gift,
COUNT(CASE WHEN is_first = 0 OR is_first IS NULL THEN 1 END) AS renewal_count,
SUM(CASE WHEN is_first = 0 OR is_first IS NULL THEN pay_money + gift_money ELSE 0 END) AS renewal_total,
SUM(CASE WHEN is_first = 0 OR is_first IS NULL THEN pay_money ELSE 0 END) AS renewal_cash,
SUM(CASE WHEN is_first = 0 OR is_first IS NULL THEN gift_money ELSE 0 END) AS renewal_gift,
COUNT(DISTINCT member_id) AS recharge_member_count
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND DATE(pay_time) >= %s
AND DATE(pay_time) <= %s
GROUP BY DATE(pay_time)
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_gift_card_consume_summary(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> List[Dict[str, Any]]:
    """
    Sum gift-card consumption per day from the member balance-change log.

    Only negative changes with ``from_type = 1`` on the three gift card
    types listed below are included; amounts are reported as absolute values.

    Args:
        site_id: Site whose balance changes are read.
        start_date: First change date included.
        end_date: Last change date included (exclusive bound is end + 1 day).

    Returns:
        One dict per date with ``stat_date`` and ``gift_card_consume``;
        empty when the query yields nothing.
    """
    gift_card_type_ids = (
        2791990152417157,  # table-fee card
        2794699703437125,  # beverage card
        2793266846533445,  # activity voucher
    )
    # Fixed integer constants, so inlining them into the SQL text is safe.
    id_list = ", ".join(map(str, gift_card_type_ids))
    sql = f"""
SELECT
change_time::DATE AS stat_date,
SUM(ABS(change_amount)) AS gift_card_consume
FROM billiards_dwd.dwd_member_balance_change
WHERE site_id = %s
AND change_time >= %s
AND change_time < %s + INTERVAL '1 day'
AND from_type = 1
AND change_amount < 0
AND COALESCE(is_delete, 0) = 0
AND card_type_id IN ({id_list})
GROUP BY change_time::DATE
"""
    rows = self.db.query(sql, (site_id, start_date, end_date))
    if not rows:
        return []
    return [dict(row) for row in rows]
def _extract_expense_summary(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> List[Dict[str, Any]]:
    """
    Spread imported monthly expenses evenly over the days of the window.

    Each month's total is divided by the number of days in that month, and
    the resulting daily amount is assigned to every day of the month that
    lies within ``[start_date, end_date]``.

    Args:
        site_id: Site whose expenses are read.
        start_date: First day of the window.
        end_date: Last day of the window (inclusive).

    Returns:
        Dicts with ``stat_date`` and ``expense_amount``, ordered by date;
        empty for a reversed window or when no expense rows exist.
    """
    if start_date > end_date:
        return []
    month_window = (start_date.replace(day=1), end_date.replace(day=1))
    sql = """
SELECT
expense_month,
SUM(expense_amount) AS expense_amount
FROM billiards_dws.dws_finance_expense_summary
WHERE site_id = %s
AND expense_month >= %s
AND expense_month <= %s
GROUP BY expense_month
"""
    rows = self.db.query(sql, (site_id, *month_window))
    if not rows:
        return []
    daily_totals: Dict[date, Decimal] = {}
    for raw in rows:
        entry = dict(raw)
        month_date = entry.get('expense_month')
        if not month_date:
            continue
        month_total = self.safe_decimal(entry.get('expense_amount', 0))
        days_in_month = calendar.monthrange(month_date.year, month_date.month)[1]
        per_day = month_total / Decimal(str(days_in_month)) if days_in_month > 0 else Decimal('0')
        month_days = (
            date(month_date.year, month_date.month, day_no)
            for day_no in range(1, days_in_month + 1)
        )
        for stat_date in month_days:
            # Only days inside the requested window receive a share.
            if start_date <= stat_date <= end_date:
                daily_totals[stat_date] = daily_totals.get(stat_date, Decimal('0')) + per_day
    return [
        {'stat_date': stat_date, 'expense_amount': amount}
        for stat_date, amount in sorted(daily_totals.items())
    ]
def _extract_platform_summary(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
Aggregate imported platform settlement rows into one row per settlement date.

Args:
    site_id: Site whose platform settlements are read.
    start_date: First settlement date included.
    end_date: Last settlement date included.

Returns:
    One dict per date with ``stat_date``, ``settlement_amount``,
    ``commission_amount`` and ``service_fee``; empty when the query
    yields nothing.
"""
sql = """
SELECT
settlement_date AS stat_date,
SUM(settlement_amount) AS settlement_amount,
SUM(commission_amount) AS commission_amount,
SUM(service_fee) AS service_fee
FROM billiards_dws.dws_platform_settlement
WHERE site_id = %s
AND settlement_date >= %s
AND settlement_date <= %s
GROUP BY settlement_date
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_big_customer_discounts(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取大客户优惠(用于拆分手动调整)
"""
member_ids = self._parse_id_list(self.config.get("dws.discount.big_customer_member_ids"))
order_ids = self._parse_id_list(self.config.get("dws.discount.big_customer_order_ids"))
if not member_ids and not order_ids:
return []
sql = """
SELECT
pay_time::DATE AS stat_date,
order_settle_id,
member_id,
adjust_amount
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %s
AND pay_time >= %s
AND pay_time < %s + INTERVAL '1 day'
AND adjust_amount != 0
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
if not rows:
return []
result: Dict[date, Dict[str, Any]] = {}
for row in rows:
row_dict = dict(row)
stat_date = row_dict.get('stat_date')
if not stat_date:
continue
order_id = row_dict.get('order_settle_id')
member_id = row_dict.get('member_id')
if order_id not in order_ids and member_id not in member_ids:
continue
amount = abs(self.safe_decimal(row_dict.get('adjust_amount', 0)))
entry = result.setdefault(stat_date, {'stat_date': stat_date, 'big_customer_amount': Decimal('0'), 'big_customer_count': 0})
entry['big_customer_amount'] += amount
entry['big_customer_count'] += 1
return list(result.values())
def _parse_id_list(self, value: Any) -> set:
if not value:
return set()
if isinstance(value, str):
items = [v.strip() for v in value.split(",") if v.strip()]
return {int(v) for v in items if v.isdigit()}
if isinstance(value, (list, tuple, set)):
result = set()
for item in value:
if item is None:
continue
try:
result.add(int(item))
except (ValueError, TypeError):
continue
return result
return set()
# ==========================================================================
# 数据转换方法
# ==========================================================================
def _build_daily_record(
self,
stat_date: date,
settle: Dict[str, Any],
groupbuy: Dict[str, Any],
recharge: Dict[str, Any],
gift_card: Dict[str, Any],
expense: Dict[str, Any],
platform: Dict[str, Any],
big_customer: Dict[str, Any],
site_id: int
) -> Dict[str, Any]:
"""
构建日度财务记录
"""
# 发生额
gross_amount = self.safe_decimal(settle.get('gross_amount', 0))
table_fee_amount = self.safe_decimal(settle.get('table_fee_amount', 0))
goods_amount = self.safe_decimal(settle.get('goods_amount', 0))
assistant_pd_amount = self.safe_decimal(settle.get('assistant_pd_amount', 0))
assistant_cx_amount = self.safe_decimal(settle.get('assistant_cx_amount', 0))
# 支付
cash_pay_amount = self.safe_decimal(settle.get('cash_pay_amount', 0))
card_pay_amount = self.safe_decimal(settle.get('card_pay_amount', 0))
balance_pay_amount = self.safe_decimal(settle.get('balance_pay_amount', 0))
# 优惠
coupon_amount = self.safe_decimal(settle.get('coupon_amount', 0))
pl_coupon_sale = self.safe_decimal(settle.get('pl_coupon_sale_amount', 0))
groupbuy_pay = self.safe_decimal(groupbuy.get('groupbuy_pay_total', 0))
# 团购支付金额优先使用pl_coupon_sale_amount否则使用groupbuy核销金额
if pl_coupon_sale > 0:
groupbuy_pay_amount = pl_coupon_sale
else:
groupbuy_pay_amount = groupbuy_pay
# 团购优惠 = 团购抵消台费 - 团购支付金额
discount_groupbuy = coupon_amount - groupbuy_pay_amount
if discount_groupbuy < 0:
discount_groupbuy = Decimal('0')
adjust_amount = self.safe_decimal(settle.get('adjust_amount', 0))
member_discount = self.safe_decimal(settle.get('member_discount_amount', 0))
rounding_amount = self.safe_decimal(settle.get('rounding_amount', 0))
big_customer_amount = self.safe_decimal(big_customer.get('big_customer_amount', 0))
other_discount = adjust_amount - big_customer_amount
if other_discount < 0:
other_discount = Decimal('0')
# 赠送卡消费(来自余额变动)
gift_card_consume_amount = self.safe_decimal(gift_card.get('gift_card_consume', 0))
# 优惠合计
discount_total = discount_groupbuy + member_discount + gift_card_consume_amount + adjust_amount + rounding_amount
# 确认收入
confirmed_income = gross_amount - discount_total
# 现金流
platform_settlement_amount = self.safe_decimal(platform.get('settlement_amount', 0))
platform_fee_amount = (
self.safe_decimal(platform.get('commission_amount', 0))
+ self.safe_decimal(platform.get('service_fee', 0))
)
recharge_cash_inflow = self.safe_decimal(recharge.get('recharge_cash', 0))
platform_inflow = platform_settlement_amount if platform_settlement_amount > 0 else groupbuy_pay_amount
cash_inflow_total = cash_pay_amount + platform_inflow + recharge_cash_inflow
cash_outflow_total = self.safe_decimal(expense.get('expense_amount', 0)) + platform_fee_amount
cash_balance_change = cash_inflow_total - cash_outflow_total
# 卡消费
cash_card_consume = card_pay_amount + balance_pay_amount
gift_card_consume = gift_card_consume_amount
card_consume_total = cash_card_consume + gift_card_consume
# 充值统计
recharge_count = self.safe_int(recharge.get('recharge_count', 0))
recharge_total = self.safe_decimal(recharge.get('recharge_total', 0))
recharge_cash = self.safe_decimal(recharge.get('recharge_cash', 0))
recharge_gift = self.safe_decimal(recharge.get('recharge_gift', 0))
first_recharge_count = self.safe_int(recharge.get('first_recharge_count', 0))
first_recharge_amount = self.safe_decimal(recharge.get('first_recharge_total', 0))
renewal_count = self.safe_int(recharge.get('renewal_count', 0))
renewal_amount = self.safe_decimal(recharge.get('renewal_total', 0))
# 订单统计
order_count = self.safe_int(settle.get('order_count', 0))
member_order_count = self.safe_int(settle.get('member_order_count', 0))
guest_order_count = self.safe_int(settle.get('guest_order_count', 0))
avg_order_amount = gross_amount / order_count if order_count > 0 else Decimal('0')
return {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'stat_date': stat_date,
# 发生额
'gross_amount': gross_amount,
'table_fee_amount': table_fee_amount,
'goods_amount': goods_amount,
'assistant_pd_amount': assistant_pd_amount,
'assistant_cx_amount': assistant_cx_amount,
# 优惠
'discount_total': discount_total,
'discount_groupbuy': discount_groupbuy,
'discount_vip': member_discount,
'discount_gift_card': gift_card_consume_amount,
'discount_manual': adjust_amount,
'discount_rounding': rounding_amount,
'discount_other': other_discount,
# 确认收入
'confirmed_income': confirmed_income,
# 现金流
'cash_inflow_total': cash_inflow_total,
'cash_pay_amount': cash_pay_amount,
'groupbuy_pay_amount': groupbuy_pay_amount,
'platform_settlement_amount': platform_settlement_amount,
'platform_fee_amount': platform_fee_amount,
'recharge_cash_inflow': recharge_cash_inflow,
'card_consume_total': card_consume_total,
'cash_card_consume': cash_card_consume,
'gift_card_consume': gift_card_consume,
'cash_outflow_total': cash_outflow_total,
'cash_balance_change': cash_balance_change,
# 充值统计
'recharge_count': recharge_count,
'recharge_total': recharge_total,
'recharge_cash': recharge_cash,
'recharge_gift': recharge_gift,
'first_recharge_count': first_recharge_count,
'first_recharge_amount': first_recharge_amount,
'renewal_count': renewal_count,
'renewal_amount': renewal_amount,
# 订单统计
'order_count': order_count,
'member_order_count': member_order_count,
'guest_order_count': guest_order_count,
'avg_order_amount': avg_order_amount,
}
# 便于外部导入
__all__ = ['FinanceDailyTask']

View File

@@ -0,0 +1,486 @@
# -*- coding: utf-8 -*-
"""
优惠明细分析任务
功能说明:
"日期+优惠类型"为粒度,分析优惠构成
数据来源:
- dwd_settlement_head: 结账单头表(优惠字段)
- dwd_groupbuy_redemption: 团购核销(团购实付金额)
- dwd_member_balance_change: 余额变动(赠送卡消费)
目标表:
billiards_dws.dws_finance_discount_detail
更新策略:
- 更新频率:每日更新
- 幂等方式：delete-before-insert（按日期）
业务规则:
- 团购优惠 (GROUPBUY): coupon_amount - 团购实付金额
- 会员折扣 (VIP): member_discount_amount
- 赠送卡抵扣 (GIFT_CARD_*): dwd_member_balance_change台费卡/酒水卡/活动抵用券)
- 抹零 (ROUNDING): rounding_amount
- 大客户优惠 (BIG_CUSTOMER): 手动调整中标记的大客户订单
- 其他优惠 (OTHER): 手动调整中除大客户外的部分
作者：ETL团队
创建日期：2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class FinanceDiscountDetailTask(BaseDwsTask):
"""
优惠明细分析任务
分析各类优惠的使用情况:
- 团购优惠
- 会员折扣
- 赠送卡抵扣
- 手动调整
- 抹零
- 其他优惠
"""
def get_task_code(self) -> str:
    """Return the task code of the discount-detail task."""
    return "DWS_FINANCE_DISCOUNT_DETAIL"
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    return "dws_finance_discount_detail"
def get_primary_keys(self) -> List[str]:
    """Return the primary-key columns: one row per site, day and discount type."""
    return ["site_id", "stat_date", "discount_type_code"]
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Extract all discount-related inputs for the task window.

    Window bounds that expose a ``date()`` method are reduced to dates.

    Returns:
        Dict with ``discount_summary`` (settlement-head discount totals),
        ``groupbuy_payments`` (group-buy amounts actually paid),
        ``big_customer_summary`` (big-customer share of manual adjustments)
        and ``gift_card_consumes`` (gift-card consumption per card type).
    """
    bounds = []
    for bound in (context.window_start, context.window_end):
        bounds.append(bound.date() if hasattr(bound, 'date') else bound)
    start_date, end_date = bounds
    site_id = context.store_id

    return {
        'discount_summary': self._extract_discount_summary(site_id, start_date, end_date),
        'groupbuy_payments': self._extract_groupbuy_payments(site_id, start_date, end_date),
        'big_customer_summary': self._extract_big_customer_discounts(site_id, start_date, end_date),
        'gift_card_consumes': self._extract_gift_card_consumes(site_id, start_date, end_date),
    }
def _extract_discount_summary(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
从结账单头表抽取优惠汇总
字段说明:
- coupon_amount: 团购抵消台费金额
- adjust_amount: 手动调整金额(台费打折)
- member_discount_amount: 会员折扣
- rounding_amount: 抹零金额
- pl_coupon_sale_amount: 平台券销售金额团购实付路径1
"""
sql = """
SELECT
pay_time::DATE AS stat_date,
-- 团购相关
COALESCE(SUM(coupon_amount), 0) AS coupon_amount_total,
COALESCE(SUM(pl_coupon_sale_amount), 0) AS pl_coupon_sale_total,
COUNT(CASE WHEN coupon_amount > 0 THEN 1 END) AS coupon_order_count,
-- 手动调整
COALESCE(SUM(adjust_amount), 0) AS adjust_amount_total,
COUNT(CASE WHEN adjust_amount != 0 THEN 1 END) AS adjust_order_count,
-- 会员折扣
COALESCE(SUM(member_discount_amount), 0) AS member_discount_total,
COUNT(CASE WHEN member_discount_amount > 0 THEN 1 END) AS member_discount_order_count,
-- 抹零
COALESCE(SUM(rounding_amount), 0) AS rounding_amount_total,
COUNT(CASE WHEN rounding_amount != 0 THEN 1 END) AS rounding_order_count,
-- 总订单数
COUNT(*) AS total_orders
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %(site_id)s
AND pay_time >= %(start_date)s
AND pay_time < %(end_date)s + INTERVAL '1 day'
AND settle_status = 1 -- 已结账
GROUP BY pay_time::DATE
ORDER BY stat_date
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def _extract_groupbuy_payments(
self,
site_id: int,
start_date: date,
end_date: date
) -> Dict[date, Decimal]:
"""
从团购核销表获取团购实付金额
团购实付金额计算:
- 若 pl_coupon_sale_amount > 0使用该值
- 否则使用 groupbuy_redemption.ledger_unit_price
返回:{日期: 团购实付总额}
"""
sql = """
SELECT
sh.pay_time::DATE AS stat_date,
SUM(
CASE
WHEN sh.pl_coupon_sale_amount > 0 THEN sh.pl_coupon_sale_amount
ELSE COALESCE(gr.ledger_unit_price, 0)
END
) AS groupbuy_payment
FROM billiards_dwd.dwd_settlement_head sh
LEFT JOIN billiards_dwd.dwd_groupbuy_redemption gr
ON gr.order_settle_id = sh.order_settle_id
AND COALESCE(gr.is_delete, 0) = 0
WHERE sh.site_id = %(site_id)s
AND sh.pay_time >= %(start_date)s
AND sh.pay_time < %(end_date)s + INTERVAL '1 day'
AND sh.settle_status = 1
AND sh.coupon_amount > 0 -- 只统计有团购的订单
GROUP BY sh.pay_time::DATE
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
result = {}
if rows:
for row in rows:
result[row['stat_date']] = self.safe_decimal(row.get('groupbuy_payment', 0))
return result
def _extract_gift_card_consumes(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取赠送卡消费(按卡类型)
"""
gift_card_type_ids = (
2791990152417157, # 台费卡
2794699703437125, # 酒水卡
2793266846533445, # 活动抵用券
)
id_list = ", ".join(str(card_id) for card_id in gift_card_type_ids)
sql = f"""
SELECT
change_time::DATE AS stat_date,
card_type_id,
COUNT(*) AS consume_count,
SUM(ABS(change_amount)) AS consume_amount
FROM billiards_dwd.dwd_member_balance_change
WHERE site_id = %(site_id)s
AND change_time >= %(start_date)s
AND change_time < %(end_date)s + INTERVAL '1 day'
AND from_type = 1
AND change_amount < 0
AND COALESCE(is_delete, 0) = 0
AND card_type_id IN ({id_list})
GROUP BY change_time::DATE, card_type_id
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def transform(self, data: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Turn the extracted inputs into one record per day and discount type.

    For every day present in any input, eight types are emitted in this
    order: GROUPBUY, VIP, ROUNDING, the three GIFT_CARD_* types,
    BIG_CUSTOMER and OTHER.

    - GROUPBUY = coupon amount - group-buy amount paid, floored at 0.
    - VIP / ROUNDING use the absolute value of the settlement totals.
    - The manual adjustment is split into BIG_CUSTOMER and OTHER
      (OTHER = |adjust| - big customer, floored at 0).
    - ``discount_ratio`` is the type's share of the day's total, 4 decimals.
    """
    zero = Decimal('0')
    site_id = context.store_id
    tenant_id = self.config.get("app.tenant_id", site_id)

    groupbuy_paid_by_date = data.get('groupbuy_payments', {})
    big_customer_by_date = {
        item['stat_date']: item for item in data.get('big_customer_summary', [])
    }
    summary_by_date = {
        row.get('stat_date'): row
        for row in data.get('discount_summary', [])
        if row.get('stat_date')
    }

    # Types taken straight from the settlement summary:
    # (type_code, type_name, amount_field, count_field)
    plain_types = (
        ('VIP', '会员折扣', 'member_discount_total', 'member_discount_order_count'),
        ('ROUNDING', '抹零', 'rounding_amount_total', 'rounding_order_count'),
    )
    gift_card_type_map = {
        2791990152417157: ('GIFT_CARD_TABLE', '台费卡抵扣'),
        2794699703437125: ('GIFT_CARD_DRINK', '酒水卡抵扣'),
        2793266846533445: ('GIFT_CARD_COUPON', '活动抵用券抵扣'),
    }

    # Gift-card consumption grouped by day, then by type code.
    gift_by_date: Dict[date, Dict[str, Dict[str, Any]]] = {}
    for row in data.get('gift_card_consumes', []):
        day = row.get('stat_date')
        mapped = gift_card_type_map.get(row.get('card_type_id'))
        if not day or not mapped:
            continue
        code, label = mapped
        per_type = gift_by_date.setdefault(day, {})
        if code not in per_type:
            per_type[code] = {'type_name': label, 'amount': Decimal('0'), 'count': 0}
        per_type[code]['amount'] += self.safe_decimal(row.get('consume_amount', 0))
        per_type[code]['count'] += self.safe_int(row.get('consume_count', 0))

    all_dates = (
        set(summary_by_date)
        | set(groupbuy_paid_by_date)
        | set(big_customer_by_date)
        | set(gift_by_date)
    )

    records = []
    for stat_date in sorted(all_dates):
        day_summary = summary_by_date.get(stat_date, {})
        breakdown = {}  # insertion order is the output order
        day_total = Decimal('0')

        # Group-buy discount = coupon offset - amount actually paid.
        coupon_total = self.safe_decimal(day_summary.get('coupon_amount_total', 0))
        paid = groupbuy_paid_by_date.get(stat_date, Decimal('0'))
        groupbuy_discount = max(coupon_total - paid, zero)
        breakdown['GROUPBUY'] = {
            'type_name': '团购优惠',
            'amount': groupbuy_discount,
            'count': day_summary.get('coupon_order_count', 0) or 0,
        }
        day_total += groupbuy_discount

        for code, label, amount_field, count_field in plain_types:
            amount = abs(self.safe_decimal(day_summary.get(amount_field, 0)))
            breakdown[code] = {
                'type_name': label,
                'amount': amount,
                'count': day_summary.get(count_field, 0) or 0,
            }
            day_total += amount

        # Gift cards: every card type gets a row, zero when unused that day.
        day_gifts = gift_by_date.get(stat_date, {})
        for code, label in gift_card_type_map.values():
            usage = day_gifts.get(code, {'amount': Decimal('0'), 'count': 0})
            breakdown[code] = {
                'type_name': label,
                'amount': self.safe_decimal(usage.get('amount', 0)),
                'count': self.safe_int(usage.get('count', 0)),
            }
            day_total += self.safe_decimal(usage.get('amount', 0))

        # Split the manual adjustment into big-customer and other.
        adjust_total = abs(self.safe_decimal(day_summary.get('adjust_amount_total', 0)))
        adjust_orders = day_summary.get('adjust_order_count', 0) or 0
        big_info = big_customer_by_date.get(stat_date, {})
        big_amount = self.safe_decimal(big_info.get('big_customer_amount', 0))
        big_orders = big_info.get('big_customer_count', 0) or 0
        other_amount = adjust_total - big_amount
        if other_amount < 0:
            other_amount = Decimal('0')
        other_orders = adjust_orders - big_orders
        if other_orders < 0:
            other_orders = 0
        breakdown['BIG_CUSTOMER'] = {
            'type_name': '大客户优惠',
            'amount': big_amount,
            'count': big_orders,
        }
        breakdown['OTHER'] = {
            'type_name': '其他优惠',
            'amount': other_amount,
            'count': other_orders,
        }
        day_total += big_amount + other_amount

        for code, info in breakdown.items():
            amount = info['amount']
            uses = info['count']
            # Guard against division by zero on days without any discount.
            ratio = (amount / day_total) if day_total > 0 else Decimal('0')
            records.append({
                'site_id': site_id,
                'tenant_id': tenant_id,
                'stat_date': stat_date,
                'discount_type_code': code,
                'discount_type_name': info['type_name'],
                'discount_amount': amount,
                'discount_ratio': round(ratio, 4),
                'usage_count': uses,
                'affected_orders': uses,  # simplification: usage count = affected orders
            })
    return records
def _extract_big_customer_discounts(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取大客户优惠(基于手动调整)
"""
member_ids = self._parse_id_list(self.config.get("dws.discount.big_customer_member_ids"))
order_ids = self._parse_id_list(self.config.get("dws.discount.big_customer_order_ids"))
if not member_ids and not order_ids:
return []
sql = """
SELECT
pay_time::DATE AS stat_date,
order_settle_id,
member_id,
adjust_amount
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %(site_id)s
AND pay_time >= %(start_date)s
AND pay_time < %(end_date)s + INTERVAL '1 day'
AND adjust_amount != 0
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
if not rows:
return []
result: Dict[date, Dict[str, Any]] = {}
for row in rows:
row_dict = dict(row)
stat_date = row_dict.get('stat_date')
if not stat_date:
continue
order_id = row_dict.get('order_settle_id')
member_id = row_dict.get('member_id')
if order_id not in order_ids and member_id not in member_ids:
continue
amount = abs(self.safe_decimal(row_dict.get('adjust_amount', 0)))
entry = result.setdefault(stat_date, {'stat_date': stat_date, 'big_customer_amount': Decimal('0'), 'big_customer_count': 0})
entry['big_customer_amount'] += amount
entry['big_customer_count'] += 1
return list(result.values())
def _parse_id_list(self, value: Any) -> set:
if not value:
return set()
if isinstance(value, str):
items = [v.strip() for v in value.split(",") if v.strip()]
return {int(v) for v in items if v.isdigit()}
if isinstance(value, (list, tuple, set)):
result = set()
for item in value:
if item is None:
continue
try:
result.add(int(item))
except (ValueError, TypeError):
continue
return result
return set()
def load(self, records: List[Dict[str, Any]], context: TaskContext) -> Dict[str, Any]:
    """
    Write records to ``dws_finance_discount_detail`` (delete, then insert).

    Existing rows for the site inside the task window are deleted first and
    the new records are then inserted one statement per record. When
    ``records`` is empty nothing is deleted or inserted.

    Returns:
        ``{'deleted': <result of the delete call or 0>, 'inserted': <row count>}``
    """
    if not records:
        return {'inserted': 0, 'deleted': 0}

    window = []
    for bound in (context.window_start, context.window_end):
        window.append(bound.date() if hasattr(bound, 'date') else bound)
    start_date, end_date = window

    # Clear the window's existing rows.
    purge_sql = """
DELETE FROM billiards_dws.dws_finance_discount_detail
WHERE site_id = %(site_id)s
AND stat_date >= %(start_date)s
AND stat_date <= %(end_date)s
"""
    removed = self.db.execute(purge_sql, {
        'site_id': context.store_id,
        'start_date': start_date,
        'end_date': end_date,
    })

    row_sql = """
INSERT INTO billiards_dws.dws_finance_discount_detail (
site_id, tenant_id, stat_date,
discount_type_code, discount_type_name,
discount_amount, discount_ratio,
usage_count, affected_orders,
created_at, updated_at
) VALUES (
%(site_id)s, %(tenant_id)s, %(stat_date)s,
%(discount_type_code)s, %(discount_type_name)s,
%(discount_amount)s, %(discount_ratio)s,
%(usage_count)s, %(affected_orders)s,
NOW(), NOW()
)
"""
    written = 0
    for payload in records:
        self.db.execute(row_sql, payload)
        written += 1

    return {
        'deleted': removed or 0,
        'inserted': written,
    }

View File

@@ -0,0 +1,412 @@
# -*- coding: utf-8 -*-
"""
收入结构分析任务
功能说明:
"日期+区域/类型"为粒度,分析收入结构
数据来源:
- dwd_settlement_head: 结账单头表(台费、商品、助教正价)
- dwd_table_fee_log: 台费流水(区域关联)
- dwd_assistant_service_log: 助教服务流水(区域关联)
- cfg_area_category: 区域分类映射
目标表:
billiards_dws.dws_finance_income_structure
更新策略:
- 更新频率:每日更新
- 幂等方式：delete-before-insert（按日期+类型）
业务规则:
- 结构类型1（INCOME_TYPE）：按收入类型分析（台费/商品/助教基础课/助教附加课）
- 结构类型2（AREA）：按区域分析（普通台球区/VIP包厢/斯诺克/麻将/KTV等）
- 区域映射：使用cfg_area_category配置
作者：ETL团队
创建日期：2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class FinanceIncomeStructureTask(BaseDwsTask):
"""
收入结构分析任务
分析收入的两种维度:
1. INCOME_TYPE: 按收入类型(台费/商品/助教基础课/助教附加课)
2. AREA: 按区域使用cfg_area_category映射
"""
def get_task_code(self) -> str:
    """Return the task code of the income-structure task."""
    return "DWS_FINANCE_INCOME_STRUCTURE"
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    return "dws_finance_income_structure"
def get_primary_keys(self) -> List[str]:
    """Return the primary-key columns: site, day, structure type and category."""
    return ["site_id", "stat_date", "structure_type", "category_code"]
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Extract income data along two dimensions for the task window.

    Window bounds that expose a ``date()`` method are reduced to dates.

    Returns:
        Dict with ``income_by_type`` (per-day totals by income type) and
        ``income_by_area`` (per-day totals by table area).
    """
    bounds = []
    for bound in (context.window_start, context.window_end):
        bounds.append(bound.date() if hasattr(bound, 'date') else bound)
    start_date, end_date = bounds
    site_id = context.store_id

    return {
        'income_by_type': self._extract_income_by_type(site_id, start_date, end_date),
        'income_by_area': self._extract_income_by_area(site_id, start_date, end_date),
    }
def _extract_income_by_type(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
按收入类型汇总
收入类型分类:
- TABLE_FEE: 台费收入 (table_charge_money)
- GOODS: 商品收入 (goods_money)
- ASSISTANT_BASE: 助教基础课 (assistant_pd_money)
- ASSISTANT_BONUS: 助教附加课 (assistant_cx_money)
"""
sql = """
SELECT
pay_time::DATE AS stat_date,
-- 台费收入
COALESCE(SUM(table_charge_money), 0) AS table_fee_income,
COUNT(CASE WHEN table_charge_money > 0 THEN 1 END) AS table_fee_orders,
-- 商品收入
COALESCE(SUM(goods_money), 0) AS goods_income,
COUNT(CASE WHEN goods_money > 0 THEN 1 END) AS goods_orders,
-- 助教基础课收入PD=陪打)
COALESCE(SUM(assistant_pd_money), 0) AS assistant_base_income,
COUNT(CASE WHEN assistant_pd_money > 0 THEN 1 END) AS assistant_base_orders,
-- 助教附加课收入CX=超休/促销)
COALESCE(SUM(assistant_cx_money), 0) AS assistant_bonus_income,
COUNT(CASE WHEN assistant_cx_money > 0 THEN 1 END) AS assistant_bonus_orders,
-- 总订单数
COUNT(*) AS total_orders
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %(site_id)s
AND pay_time >= %(start_date)s
AND pay_time < %(end_date)s + INTERVAL '1 day'
AND settle_status = 1 -- 已结账
GROUP BY pay_time::DATE
ORDER BY stat_date
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def _extract_income_by_area(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
按区域汇总收入
关联dim_table获取区域名称再映射到cfg_area_category
"""
sql = """
WITH area_orders AS (
SELECT
tfl.pay_time::DATE AS stat_date,
dt.site_table_area_name AS area_name,
tfl.order_settle_id,
COALESCE(tfl.ledger_amount, 0) AS income_amount,
COALESCE(tfl.ledger_time_seconds, 0) AS duration_seconds
FROM billiards_dwd.dwd_table_fee_log tfl
LEFT JOIN billiards_dwd.dim_table dt
ON dt.site_table_id = tfl.site_table_id
WHERE tfl.site_id = %(site_id)s
AND tfl.pay_time >= %(start_date)s
AND tfl.pay_time < %(end_date)s + INTERVAL '1 day'
AND COALESCE(tfl.is_delete, 0) = 0
UNION ALL
SELECT
asl.start_use_time::DATE AS stat_date,
dt.site_table_area_name AS area_name,
asl.order_settle_id,
COALESCE(asl.ledger_amount, 0) AS income_amount,
COALESCE(asl.income_seconds, 0) AS duration_seconds
FROM billiards_dwd.dwd_assistant_service_log asl
LEFT JOIN billiards_dwd.dim_table dt
ON dt.site_table_id = asl.site_table_id
WHERE asl.site_id = %(site_id)s
AND asl.start_use_time >= %(start_date)s
AND asl.start_use_time < %(end_date)s + INTERVAL '1 day'
AND asl.is_delete = 0
)
SELECT
stat_date,
area_name,
COALESCE(SUM(income_amount), 0) AS income_amount,
COALESCE(SUM(duration_seconds), 0) AS duration_seconds,
COUNT(DISTINCT order_settle_id) AS order_count
FROM area_orders
GROUP BY stat_date, area_name
ORDER BY stat_date, area_name
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def transform(self, data: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Convert the extracted data into target-table records.

    Income-type records come first, followed by area records; each helper
    also computes the per-day ratio.
    """
    site_id = context.store_id
    tenant_id = self.config.get("app.tenant_id", site_id)

    by_type = self._transform_income_by_type(
        data.get('income_by_type', []), site_id, tenant_id
    )
    by_area = self._transform_income_by_area(
        data.get('income_by_area', []), site_id, tenant_id
    )
    return [*by_type, *by_area]
def _transform_income_by_type(
self,
income_data: List[Dict[str, Any]],
site_id: int,
tenant_id: int
) -> List[Dict[str, Any]]:
"""
转换按收入类型的数据
将每日汇总数据展开为4条记录台费/商品/基础课/附加课)
"""
# 收入类型定义
income_types = [
('TABLE_FEE', '台费收入', 'table_fee_income', 'table_fee_orders'),
('GOODS', '商品收入', 'goods_income', 'goods_orders'),
('ASSISTANT_BASE', '助教基础课', 'assistant_base_income', 'assistant_base_orders'),
('ASSISTANT_BONUS', '助教附加课', 'assistant_bonus_income', 'assistant_bonus_orders'),
]
records = []
for daily_data in income_data:
stat_date = daily_data.get('stat_date')
# 计算当日总收入(用于计算占比)
total_income = sum(
self.safe_decimal(daily_data.get(field, 0))
for _, _, field, _ in income_types
)
# 为每种收入类型生成一条记录
for type_code, type_name, income_field, order_field in income_types:
income_amount = self.safe_decimal(daily_data.get(income_field, 0))
order_count = daily_data.get(order_field, 0) or 0
# 计算占比(避免除零)
income_ratio = (income_amount / total_income) if total_income > 0 else Decimal('0')
records.append({
'site_id': site_id,
'tenant_id': tenant_id,
'stat_date': stat_date,
'structure_type': 'INCOME_TYPE',
'category_code': type_code,
'category_name': type_name,
'income_amount': income_amount,
'income_ratio': round(income_ratio, 4),
'order_count': order_count,
'duration_minutes': 0, # 收入类型维度不统计时长
})
return records
def _transform_income_by_area(
self,
area_data: List[Dict[str, Any]],
site_id: int,
tenant_id: int
) -> List[Dict[str, Any]]:
"""
转换按区域的数据
将区域名称映射到cfg_area_category的category_code
"""
records = []
# 加载区域分类配置
self.load_config_cache()
# 按日期分组计算总收入(用于计算占比)
daily_totals = {}
for row in area_data:
stat_date = row.get('stat_date')
income = self.safe_decimal(row.get('income_amount', 0))
daily_totals[stat_date] = daily_totals.get(stat_date, Decimal('0')) + income
# 按日期+区域聚合相同category_code需要合并
aggregated = {}
for row in area_data:
stat_date = row.get('stat_date')
area_name = row.get('area_name') or '未知区域'
income_amount = self.safe_decimal(row.get('income_amount', 0))
duration_seconds = row.get('duration_seconds', 0) or 0
order_count = row.get('order_count', 0) or 0
# 映射区域名称到分类代码
category = self.get_area_category(area_name)
category_code = category.get('category_code', 'OTHER')
category_name = category.get('category_name', '其他区域')
# 聚合键
key = (stat_date, category_code)
if key not in aggregated:
aggregated[key] = {
'stat_date': stat_date,
'category_code': category_code,
'category_name': category_name,
'income_amount': Decimal('0'),
'duration_seconds': 0,
'order_count': 0,
}
aggregated[key]['income_amount'] += income_amount
aggregated[key]['duration_seconds'] += duration_seconds
aggregated[key]['order_count'] += order_count
# 生成记录
for key, agg_data in aggregated.items():
stat_date = agg_data['stat_date']
total_income = daily_totals.get(stat_date, Decimal('1'))
income_amount = agg_data['income_amount']
# 计算占比
income_ratio = (income_amount / total_income) if total_income > 0 else Decimal('0')
records.append({
'site_id': site_id,
'tenant_id': tenant_id,
'stat_date': stat_date,
'structure_type': 'AREA',
'category_code': agg_data['category_code'],
'category_name': agg_data['category_name'],
'income_amount': income_amount,
'income_ratio': round(income_ratio, 4),
'order_count': agg_data['order_count'],
'duration_minutes': agg_data['duration_seconds'] // 60,
})
return records
def _map_area_to_category(
self,
area_name: str,
area_categories: Dict[str, Dict[str, Any]]
) -> Dict[str, Any]:
"""
兼容旧逻辑的映射方法(当前使用 get_area_category
"""
return self.get_area_category(area_name)
def load(self, records: List[Dict[str, Any]], context: TaskContext) -> Dict[str, Any]:
    """
    Write records to ``dws_finance_income_structure`` (delete, then insert).

    Existing rows for the site inside the task window are deleted first and
    the new records are then inserted one statement per record. When
    ``records`` is empty nothing is deleted or inserted.

    Returns:
        ``{'deleted': <result of the delete call or 0>, 'inserted': <row count>}``
    """
    if not records:
        return {'inserted': 0, 'deleted': 0}

    window = []
    for bound in (context.window_start, context.window_end):
        window.append(bound.date() if hasattr(bound, 'date') else bound)
    start_date, end_date = window

    # Clear the window's existing rows.
    purge_sql = """
DELETE FROM billiards_dws.dws_finance_income_structure
WHERE site_id = %(site_id)s
AND stat_date >= %(start_date)s
AND stat_date <= %(end_date)s
"""
    removed = self.db.execute(purge_sql, {
        'site_id': context.store_id,
        'start_date': start_date,
        'end_date': end_date,
    })

    row_sql = """
INSERT INTO billiards_dws.dws_finance_income_structure (
site_id, tenant_id, stat_date,
structure_type, category_code, category_name,
income_amount, income_ratio,
order_count, duration_minutes,
created_at, updated_at
) VALUES (
%(site_id)s, %(tenant_id)s, %(stat_date)s,
%(structure_type)s, %(category_code)s, %(category_name)s,
%(income_amount)s, %(income_ratio)s,
%(order_count)s, %(duration_minutes)s,
NOW(), NOW()
)
"""
    written = 0
    for payload in records:
        self.db.execute(row_sql, payload)
        written += 1

    return {
        'deleted': removed or 0,
        'inserted': written,
    }

View File

@@ -0,0 +1,173 @@
# -*- coding: utf-8 -*-
"""
充值统计任务
功能说明:
"日期"为粒度,统计充值数据
数据来源:
- dwd_recharge_order: 充值订单
- dim_member_card_account: 会员卡账户(余额快照)
目标表:
billiards_dws.dws_finance_recharge_summary
更新策略:
- 更新频率:每日更新
- 幂等方式：delete-before-insert（按日期）
业务规则:
- 首充/续充:通过 is_first 字段区分
- 现金/赠送:通过 pay_money/gift_money 区分
- 卡余额:区分储值卡和赠送卡
作者：ETL团队
创建日期：2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class FinanceRechargeTask(BaseDwsTask):
"""
充值统计任务
"""
def get_task_code(self) -> str:
    """Return the task code of the recharge-statistics task."""
    return "DWS_FINANCE_RECHARGE"
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    return "dws_finance_recharge_summary"
def get_primary_keys(self) -> List[str]:
    """Return the primary-key columns: one row per site and day."""
    return ["site_id", "stat_date"]
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Extract the recharge summary for the window plus card balances.

    Window bounds that expose a ``date()`` method are reduced to dates. Card
    balances are requested once, for the window's end date.

    Returns:
        Dict with ``recharge_summary``, ``card_balances``, ``start_date``,
        ``end_date`` and ``site_id``.
    """
    bounds = []
    for bound in (context.window_start, context.window_end):
        bounds.append(bound.date() if hasattr(bound, 'date') else bound)
    start_date, end_date = bounds
    site_id = context.store_id

    return {
        'recharge_summary': self._extract_recharge_summary(site_id, start_date, end_date),
        'card_balances': self._extract_card_balances(site_id, end_date),
        'start_date': start_date,
        'end_date': end_date,
        'site_id': site_id
    }
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Build one output row per daily recharge aggregate.

    Only a single (current) card-balance snapshot is available, so the same
    balances are written onto every row; this avoids zero balances for the
    other dates of the window.

    Args:
        extracted: Result of ``extract``.
        context: Task context (not used here).

    Returns:
        List of row dicts ready for loading.
    """
    daily_summaries = extracted['recharge_summary']
    balance_snapshot = extracted['card_balances']
    store = extracted['site_id']

    # Columns converted with safe_int; every other metric uses safe_decimal.
    count_columns = frozenset((
        'recharge_count', 'first_recharge_count', 'renewal_count',
        'recharge_member_count', 'new_member_count',
    ))
    # Metric columns in output order (source key == target key).
    metric_columns = (
        'recharge_count', 'recharge_total', 'recharge_cash', 'recharge_gift',
        'first_recharge_count', 'first_recharge_cash',
        'first_recharge_gift', 'first_recharge_total',
        'renewal_count', 'renewal_cash', 'renewal_gift', 'renewal_total',
        'recharge_member_count', 'new_member_count',
    )
    # (target column, key in the balance snapshot)
    balance_columns = (
        ('total_card_balance', 'total_balance'),
        ('cash_card_balance', 'cash_balance'),
        ('gift_card_balance', 'gift_balance'),
    )

    rows = []
    for summary in daily_summaries:
        row = {
            'site_id': store,
            'tenant_id': self.config.get("app.tenant_id", store),
            'stat_date': summary.get('stat_date'),
        }
        for column in metric_columns:
            convert = self.safe_int if column in count_columns else self.safe_decimal
            row[column] = convert(summary.get(column, 0))
        for target, source in balance_columns:
            row[target] = self.safe_decimal(balance_snapshot.get(source, 0))
        rows.append(row)
    return rows
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the transformed rows using delete-before-insert.

    When there is nothing to write, no delete is issued and all counters
    are zero.

    Returns:
        ``{"counts": {...}}`` and, when rows were written, an ``"extra"``
        entry carrying the value returned by the delete step.
    """
    counts = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
    if not transformed:
        return {"counts": counts}

    # Clear existing rows (matched on stat_date) before inserting.
    removed = self.delete_existing_data(context, date_col="stat_date")
    written = self.bulk_insert(transformed)
    counts.update(fetched=len(transformed), inserted=written)
    return {"counts": counts, "extra": {"deleted": removed}}
def _extract_recharge_summary(self, site_id: int, start_date: date, end_date: date) -> List[Dict[str, Any]]:
sql = """
SELECT
DATE(pay_time) AS stat_date,
COUNT(*) AS recharge_count,
SUM(pay_money + gift_money) AS recharge_total,
SUM(pay_money) AS recharge_cash,
SUM(gift_money) AS recharge_gift,
COUNT(CASE WHEN is_first = 1 THEN 1 END) AS first_recharge_count,
SUM(CASE WHEN is_first = 1 THEN pay_money ELSE 0 END) AS first_recharge_cash,
SUM(CASE WHEN is_first = 1 THEN gift_money ELSE 0 END) AS first_recharge_gift,
SUM(CASE WHEN is_first = 1 THEN pay_money + gift_money ELSE 0 END) AS first_recharge_total,
COUNT(CASE WHEN is_first != 1 OR is_first IS NULL THEN 1 END) AS renewal_count,
SUM(CASE WHEN is_first != 1 OR is_first IS NULL THEN pay_money ELSE 0 END) AS renewal_cash,
SUM(CASE WHEN is_first != 1 OR is_first IS NULL THEN gift_money ELSE 0 END) AS renewal_gift,
SUM(CASE WHEN is_first != 1 OR is_first IS NULL THEN pay_money + gift_money ELSE 0 END) AS renewal_total,
COUNT(DISTINCT member_id) AS recharge_member_count,
COUNT(DISTINCT CASE WHEN is_first = 1 THEN member_id END) AS new_member_count
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s AND DATE(pay_time) >= %s AND DATE(pay_time) <= %s
GROUP BY DATE(pay_time)
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_card_balances(self, site_id: int, stat_date: date) -> Dict[str, Decimal]:
CASH_CARD_TYPE_ID = 2793249295533893
GIFT_CARD_TYPE_IDS = [2791990152417157, 2793266846533445, 2794699703437125]
sql = """
SELECT card_type_id, SUM(balance) AS total_balance
FROM billiards_dwd.dim_member_card_account
WHERE site_id = %s AND scd2_is_current = 1
AND COALESCE(is_delete, 0) = 0
GROUP BY card_type_id
"""
rows = self.db.query(sql, (site_id,))
cash_balance = Decimal('0')
gift_balance = Decimal('0')
for row in (rows or []):
card_type_id = row['card_type_id']
balance = self.safe_decimal(row['total_balance'])
if card_type_id == CASH_CARD_TYPE_ID:
cash_balance += balance
elif card_type_id in GIFT_CARD_TYPE_IDS:
gift_balance += balance
return {
'cash_balance': cash_balance,
'gift_balance': gift_balance,
'total_balance': cash_balance + gift_balance
}
__all__ = ['FinanceRechargeTask']

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
"""
指数算法任务模块
包含:
- WinbackIndexTask: 老客挽回指数 (WBI)
- NewconvIndexTask: 新客转化指数 (NCI)
- RecallIndexTask: 客户召回指数计算任务(旧版)
- IntimacyIndexTask: 客户-助教亲密指数计算任务
- MlManualImportTask: ML 人工台账导入任务
- RelationIndexTask: 关系指数计算任务（RS/OS/MS/ML）
"""
from .recall_index_task import RecallIndexTask
from .intimacy_index_task import IntimacyIndexTask
from .winback_index_task import WinbackIndexTask
from .newconv_index_task import NewconvIndexTask
from .ml_manual_import_task import MlManualImportTask
from .relation_index_task import RelationIndexTask
__all__ = [
'WinbackIndexTask',
'NewconvIndexTask',
'RecallIndexTask',
'IntimacyIndexTask',
'MlManualImportTask',
'RelationIndexTask',
]

View File

@@ -0,0 +1,571 @@
# -*- coding: utf-8 -*-
"""
指数算法任务基类
功能说明:
- 提供半衰期时间衰减函数
- 提供分位数计算和分位截断
- 提供0-10映射方法
- 提供算法参数加载
- 提供分位点历史记录用于EWMA平滑
算法原理:
1. 时间衰减函数（半衰期模型）：decay(d; h) = exp(-ln(2) * d / h)
当 d=h 时权重衰减到 0.5,越近权重越大
2. 0-10映射流程：
Raw Score → Winsorize(P5, P95) → [可选：Log/asinh压缩] → MinMax(0, 10)
作者：ETL团队
创建日期：2026-02-03
"""
from __future__ import annotations
import math
from abc import abstractmethod
from dataclasses import dataclass
from datetime import date, datetime
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from ..base_dws_task import BaseDwsTask, TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class IndexParameters:
"""Algorithm parameters of one index type together with their load time."""
# Parameter name -> numeric value.
params: Dict[str, float]
# Moment the parameters were loaded.
loaded_at: datetime
@dataclass
class PercentileHistory:
"""One stored percentile-history record."""
# Raw lower / upper percentile values.
percentile_5: float
percentile_95: float
# Smoothed lower / upper percentile values.
percentile_5_smoothed: float
percentile_95_smoothed: float
# Number of records in that calculation.
record_count: int
# Timestamp of that calculation.
calc_time: datetime
# =============================================================================
# 指数任务基类
# =============================================================================
class BaseIndexTask(BaseDwsTask):
"""
Base class for index-algorithm tasks.
Provides functionality shared by index calculations:
1. Half-life time-decay function
2. Percentile calculation and clipping
3. Normalization onto a 0-10 scale
4. Algorithm-parameter loading
5. Percentile-history management (EWMA smoothing)
"""
# Index type; subclasses are expected to define it.
INDEX_TYPE: str = ""
# Parameter-cache TTL (compared against elapsed seconds when loading parameters).
_index_params_ttl: int = 300
def __init__(self, config, db_connection, api_client, logger):
super().__init__(config, db_connection, api_client, logger)
# Parameter cache, isolated per index_type so that a task working with
# several index types does not mix up their parameters.
self._index_params_cache_by_type: Dict[str, IndexParameters] = {}
# Default parameters
DEFAULT_LOOKBACK_DAYS = 60
DEFAULT_PERCENTILE_LOWER = 5
DEFAULT_PERCENTILE_UPPER = 95
DEFAULT_EWMA_ALPHA = 0.2
# ==========================================================================
# 抽象方法(子类需实现)
# ==========================================================================
@abstractmethod
def get_index_type(self) -> str:
"""Return the index type (e.g. RECALL / INTIMACY); must be implemented by subclasses."""
raise NotImplementedError
# ==========================================================================
# 时间衰减函数
# ==========================================================================
def decay(self, days: float, halflife: float) -> float:
    """
    Half-life decay weight.

    Formula: decay(d; h) = exp(-ln(2) * d / h). At d == h the weight has
    dropped to 0.5; more recent events weigh more.

    Args:
        days: Age of the event in days; negative values are treated as 0.
        halflife: Half-life in days, must be > 0.

    Returns:
        The decayed weight (1.0 for d == 0, 0.5 for d == h, 0.25 for d == 2h).

    Raises:
        ValueError: If ``halflife`` is not positive.
    """
    if halflife <= 0:
        raise ValueError("半衰期必须大于0")
    # Events dated in the future are clamped to "today".
    elapsed = 0 if days < 0 else days
    return math.exp(-math.log(2) * elapsed / halflife)
# ==========================================================================
# 分位数计算
# ==========================================================================
def calculate_percentiles(
    self,
    scores: List[float],
    lower: int = 5,
    upper: int = 95
) -> Tuple[float, float]:
    """
    Pick the lower and upper percentile values from a list of scores.

    The values are taken directly from the sorted scores (no interpolation):
    the lower one at index ``int(n * lower / 100) - 1`` (floored at 0), the
    upper one at index ``int(n * upper / 100)`` (capped at ``n - 1``).

    Args:
        scores: Score list.
        lower: Lower percentile in percent (default 5).
        upper: Upper percentile in percent (default 95).

    Returns:
        ``(lower_value, upper_value)``; ``(0.0, 0.0)`` for empty input.
    """
    if not scores:
        return 0.0, 0.0

    ordered = list(scores)
    ordered.sort()
    count = len(ordered)

    low_pos = int(count * lower / 100) - 1
    if low_pos < 0:
        low_pos = 0
    high_pos = int(count * upper / 100)
    if high_pos > count - 1:
        high_pos = count - 1

    return ordered[low_pos], ordered[high_pos]
def winsorize(self, value: float, lower: float, upper: float) -> float:
    """
    Clip ``value`` into the range [lower, upper] (winsorizing).

    Args:
        value: Raw value.
        lower: Lower bound (e.g. the P5 value).
        upper: Upper bound (e.g. the P95 value).

    Returns:
        The clipped value.
    """
    # Raise to the lower bound first, then cap at the upper bound.
    raised = lower if lower > value else value
    return upper if upper < raised else raised
# ==========================================================================
# 0-10映射
# ==========================================================================
def normalize_to_display(
    self,
    value: float,
    min_val: float,
    max_val: float,
    use_log: bool = False,
    compression: Optional[str] = None,
    epsilon: float = 1e-6
) -> float:
    """
    Map a value onto the 0-10 display scale.

    Steps:
        1. Optional compression of value and both bounds with log1p or asinh.
        2. Min-max mapping: score = 10 * (y - min) / (max - min).

    Args:
        value: Raw value (callers pass an already winsorized value).
        min_val: Lower bound of the scale (typically P5).
        max_val: Upper bound of the scale (typically P95).
        use_log: Legacy switch selecting log1p compression.
        compression: none / log1p / asinh; takes precedence over ``use_log``.
        epsilon: Ranges smaller than this are treated as degenerate.

    Returns:
        Score within [0, 10]; 5.0 when the range is degenerate.
    """
    mode = self._resolve_compression(compression, use_log)
    squash = {"log1p": math.log1p, "asinh": math.asinh}.get(mode)
    if squash is not None:
        value, min_val, max_val = squash(value), squash(min_val), squash(max_val)

    span = max_val - min_val
    if span < epsilon:
        # (Almost) everyone has the same value: use the middle of the scale.
        return 5.0

    scaled = 10.0 * (value - min_val) / span
    # Keep the result inside 0-10.
    return max(0.0, min(10.0, scaled))
def batch_normalize_to_display(
    self,
    raw_scores: List[Tuple[Any, float]],  # [(entity_id, raw_score), ...]
    use_log: bool = False,
    compression: Optional[str] = None,
    percentile_lower: int = 5,
    percentile_upper: int = 95,
    use_smoothing: bool = False,
    site_id: Optional[int] = None,
    index_type: Optional[str] = None,
) -> List[Tuple[Any, float, float]]:
    """
    Convert a batch of raw scores into display scores.

    Steps:
        1. Compute the percentile bounds over all raw scores.
        2. Optionally EWMA-smooth the bounds (only when ``use_smoothing`` is
           set and ``site_id`` is given).
        3. Winsorize every raw score to the bounds.
        4. Min-max map onto 0-10 and round to two decimals.

    Args:
        raw_scores: ``(entity_id, raw_score)`` tuples.
        use_log: Legacy switch selecting log1p compression.
        compression: none / log1p / asinh; takes precedence over ``use_log``.
        percentile_lower: Lower percentile in percent.
        percentile_upper: Upper percentile in percent.
        use_smoothing: Whether to EWMA-smooth the percentile bounds.
        site_id: Site ID, required for smoothing.
        index_type: Index type used to isolate the percentile history.

    Returns:
        ``(entity_id, raw_score, display_score)`` tuples in input order.
    """
    if not raw_scores:
        return []

    values = [score for _, score in raw_scores]
    low_cut, high_cut = self.calculate_percentiles(values, percentile_lower, percentile_upper)

    if use_smoothing and site_id is not None:
        low_cut, high_cut = self._apply_ewma_smoothing(
            site_id=site_id,
            current_p5=low_cut,
            current_p95=high_cut,
            index_type=index_type,
        )

    mode = self._resolve_compression(compression, use_log)

    def _display(raw_score):
        clipped = self.winsorize(raw_score, low_cut, high_cut)
        return self.normalize_to_display(clipped, low_cut, high_cut, compression=mode)

    return [
        (entity_id, raw_score, round(_display(raw_score), 2))
        for entity_id, raw_score in raw_scores
    ]
# ==========================================================================
# 算法参数加载
# ==========================================================================
def load_index_parameters(
    self,
    index_type: Optional[str] = None,
    force_reload: bool = False
) -> Dict[str, float]:
    """
    Load the algorithm parameters of an index type, using a TTL cache.

    Cached parameters are returned while they are younger than
    ``_index_params_ttl`` seconds, unless ``force_reload`` is set. Otherwise
    the currently effective rows are read from the parameter table; when a
    parameter name occurs more than once, the row with the latest
    ``effective_from`` wins.

    Args:
        index_type: Index type; defaults to ``get_index_type()``.
        force_reload: Bypass the cache.

    Returns:
        Mapping of parameter name to float value.
    """
    if index_type is None:
        index_type = self.get_index_type()

    now = datetime.now(self.tz)
    cache_key = str(index_type).upper()
    cached = self._index_params_cache_by_type.get(cache_key)
    if not force_reload and cached is not None:
        age_seconds = (now - cached.loaded_at).total_seconds()
        if age_seconds < self._index_params_ttl:
            return cached.params

    self.logger.debug("加载指数算法参数: %s", index_type)
    sql = """
SELECT param_name, param_value
FROM billiards_dws.cfg_index_parameters
WHERE index_type = %s
AND effective_from <= CURRENT_DATE
AND (effective_to IS NULL OR effective_to >= CURRENT_DATE)
ORDER BY effective_from DESC
"""
    fetched = self.db.query(sql, (index_type,))

    params = {}
    for entry in fetched or []:
        record = dict(entry)
        name = record['param_name']
        if name in params:
            # Rows arrive newest first; keep the first value seen per name.
            continue
        params[name] = float(record['param_value'])

    self._index_params_cache_by_type[cache_key] = IndexParameters(
        params=params,
        loaded_at=now
    )
    return params
def get_param(
    self,
    name: str,
    default: float = 0.0,
    index_type: Optional[str] = None,
) -> float:
    """
    Look up a single algorithm parameter.

    Args:
        name: Parameter name.
        default: Value returned when the parameter is not present.
        index_type: Index type forwarded to ``load_index_parameters``.

    Returns:
        The parameter value, or ``default``.
    """
    loaded = self.load_index_parameters(index_type=index_type)
    if name in loaded:
        return loaded[name]
    return default
# ==========================================================================
# 分位点历史管理EWMA平滑
# ==========================================================================
def get_last_percentile_history(
    self,
    site_id: int,
    index_type: Optional[str] = None
) -> Optional[PercentileHistory]:
    """
    Fetch the most recent percentile-history record of a site and index type.

    Args:
        site_id: Site ID.
        index_type: Index type; defaults to ``get_index_type()``.

    Returns:
        The latest record (by ``calc_time``), or ``None`` when there is none.
        NULL numeric columns are read as 0.
    """
    if index_type is None:
        index_type = self.get_index_type()

    sql = """
SELECT
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, calc_time
FROM billiards_dws.dws_index_percentile_history
WHERE site_id = %s AND index_type = %s
ORDER BY calc_time DESC
LIMIT 1
"""
    fetched = self.db.query(sql, (site_id, index_type))
    if not fetched:
        return None

    latest = dict(fetched[0])

    def _number(column):
        return float(latest[column] or 0)

    return PercentileHistory(
        percentile_5=_number('percentile_5'),
        percentile_95=_number('percentile_95'),
        percentile_5_smoothed=_number('percentile_5_smoothed'),
        percentile_95_smoothed=_number('percentile_95_smoothed'),
        record_count=int(latest['record_count'] or 0),
        calc_time=latest['calc_time']
    )
def save_percentile_history(
self,
site_id: int,
percentile_5: float,
percentile_95: float,
percentile_5_smoothed: float,
percentile_95_smoothed: float,
record_count: int,
min_raw: float,
max_raw: float,
avg_raw: float,
index_type: Optional[str] = None
) -> None:
"""
Insert one percentile-history row and commit.
Args:
site_id: Site ID
percentile_5: Raw lower percentile value
percentile_95: Raw upper percentile value
percentile_5_smoothed: Smoothed lower percentile value
percentile_95_smoothed: Smoothed upper percentile value
record_count: Number of records
min_raw: Minimum raw score
max_raw: Maximum raw score
avg_raw: Average raw score
index_type: Index type; defaults to get_index_type()
"""
if index_type is None:
index_type = self.get_index_type()
# calc_time is set by the database (NOW()).
sql = """
INSERT INTO billiards_dws.dws_index_percentile_history (
site_id, index_type, calc_time,
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, min_raw_score, max_raw_score, avg_raw_score
) VALUES (%s, %s, NOW(), %s, %s, %s, %s, %s, %s, %s, %s)
"""
with self.db.conn.cursor() as cur:
cur.execute(sql, (
site_id, index_type,
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, min_raw, max_raw, avg_raw
))
# Commits on the connection directly.
self.db.conn.commit()
def _apply_ewma_smoothing(
self,
site_id: int,
current_p5: float,
current_p95: float,
alpha: Optional[float] = None,
index_type: Optional[str] = None,
) -> Tuple[float, float]:
"""
应用EWMA平滑到分位点
公式: Q_t = (1 - α) * Q_{t-1} + α * Q_now
Args:
site_id: 门店ID
current_p5: 当前5分位
current_p95: 当前95分位
alpha: 平滑系数默认0.2
index_type: 指数类型(用于参数和历史隔离)
Returns:
(平滑后的P5, 平滑后的P95)
"""
if index_type is None:
index_type = self.get_index_type()
if alpha is None:
alpha = self.get_param(
'ewma_alpha',
self.DEFAULT_EWMA_ALPHA,
index_type=index_type,
)
history = self.get_last_percentile_history(site_id, index_type=index_type)
if history is None:
# 首次计算,不平滑
return current_p5, current_p95
smoothed_p5 = (1 - alpha) * history.percentile_5_smoothed + alpha * current_p5
smoothed_p95 = (1 - alpha) * history.percentile_95_smoothed + alpha * current_p95
return smoothed_p5, smoothed_p95
# ==========================================================================
# 统计工具方法
# ==========================================================================
def calculate_median(self, values: List[float]) -> float:
    """Return the median of ``values`` (0.0 for empty input)."""
    if not values:
        return 0.0
    ordered = sorted(values)
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[half]
    # Even count: mean of the two middle values.
    return (ordered[half - 1] + ordered[half]) / 2
def calculate_mad(self, values: List[float]) -> float:
    """
    Return the median absolute deviation of ``values`` (0.0 for empty input).

    MAD = median(|x - median(x)|); a dispersion measure that is more robust
    against extreme values than the standard deviation.
    """
    if not values:
        return 0.0
    center = self.calculate_median(values)
    return self.calculate_median([abs(item - center) for item in values])
def safe_log(self, value: float, default: float = 0.0) -> float:
    """Return the natural log of ``value``, or ``default`` when ``value <= 0``."""
    return default if value <= 0 else math.log(value)
def safe_ln1p(self, value: float) -> float:
    """
    Return ln(1 + value), or 0.0 when the logarithm is undefined.

    ``math.log1p`` is only defined for ``value > -1`` and raises ValueError
    at exactly -1 as well, so the boundary is included in the guard.
    """
    if value <= -1:
        return 0.0
    return math.log1p(value)
def _resolve_compression(self, compression: Optional[str], use_log: bool) -> str:
"""规范化压缩方式"""
if compression is None:
return "log1p" if use_log else "none"
compression_key = str(compression).strip().lower()
if compression_key in ("none", "log1p", "asinh"):
return compression_key
if hasattr(self, "logger"):
self.logger.warning("未知压缩方式: %s,已降级为 none", compression)
return "none"

View File

@@ -0,0 +1,694 @@
# -*- coding: utf-8 -*-
"""
客户-助教亲密指数计算任务
功能说明:
- 衡量客户与助教的关系强度和近期温度
- 用于助教约课精力分配和约课成功率预估
- 附加课权重 = 基础课的1.5倍
- 检测频率激增并放大权重
算法公式:
Raw Score = (w_F × F + w_R × R + w_M × M + w_D × D) × mult
其中:
- F = Σ(τ_i × decay(d_i, h_sess)) # 频次强度
- R = decay(d_last, h_last) # 最近温度
- M = Σ(ln(1+amt/A0) × decay(d_r, h_pay)) # 归因充值强度
- D = Σ(sqrt(dur/60) × τ × decay(d, h)) # 时长贡献
- mult = 1 + γ × burst # 激增放大
- burst = max(0, ln(1 + (F_short/F_long - 1)))
特殊逻辑:
- 会话合并：同一客人对同一助教,间隔<4小时算同次服务
- 充值归因：服务结束后1小时内的充值算做该助教贡献
数据来源:
- dwd_assistant_service_log: 服务记录
- dwd_recharge_order: 充值记录
更新频率：每4小时
作者：ETL团队
创建日期：2026-02-03
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import CourseType, TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class ServiceSession:
"""A service session after merging adjacent service records."""
# Start and end of the merged session.
session_start: datetime
session_end: datetime
# Summed duration of the merged records, in minutes.
total_duration_minutes: int = 0
course_weight: float = 1.0 # 1.0 = basic course, 1.5 = bonus course
is_incentive: bool = False # whether this is a bonus course
@dataclass
class AttributedRecharge:
"""A recharge attributed to an assistant."""
# Payment time of the recharge.
pay_time: datetime
# Recharge amount.
pay_amount: float
# Age of the recharge in (fractional) days.
days_ago: float
@dataclass
class MemberAssistantIntimacyData:
"""Intimacy data of one member-assistant pair."""
member_id: int
assistant_id: int # Assistant ID (dim_assistant.assistant_id, obtained by joining on user_id)
assistant_user_id: int # Assistant user_id (from the service log, used for the intermediate join)
site_id: int
tenant_id: int
# Input features of the calculation
session_count: int = 0
total_duration_minutes: int = 0
basic_session_count: int = 0
incentive_session_count: int = 0
days_since_last_session: Optional[int] = None
attributed_recharge_count: int = 0
attributed_recharge_amount: float = 0.0
# Component scores
score_frequency: float = 0.0
score_recency: float = 0.0
score_recharge: float = 0.0
score_duration: float = 0.0
burst_multiplier: float = 1.0
# Final scores
raw_score: float = 0.0
display_score: float = 0.0
# Intermediate data
sessions: List[ServiceSession] = field(default_factory=list)
recharges: List[AttributedRecharge] = field(default_factory=list)
# =============================================================================
# 亲密指数任务
# =============================================================================
class IntimacyIndexTask(BaseIndexTask):
"""
Member-assistant intimacy index calculation task.
Calculation flow:
1. Extract the assistant service records of the lookback window (60 days by default)
2. Group by (member_id, assistant_id) and merge services that fall within the merge threshold (4 hours by default)
3. Extract attributed recharges (within 1 hour after a service ended, by default)
4. Compute the component scores (frequency, recency, recharge, duration, burst)
5. Combine them into the raw score
6. Percentile clipping + compression + min-max mapping onto 0-10
7. Write to the DWS table
"""
INDEX_TYPE = "INTIMACY"
# Default parameters (used for any parameter that is not loaded from the parameter table)
DEFAULT_PARAMS = {
'lookback_days': 60,
'halflife_session': 14.0,
'halflife_last': 10.0,
'halflife_recharge': 21.0,
'halflife_short': 7.0,
'halflife_long': 30.0,
'amount_base': 500.0,
'incentive_weight': 1.5,
'session_merge_hours': 4,
'recharge_attribute_hours': 1,
'weight_frequency': 2.0,
'weight_recency': 1.5,
'weight_recharge': 2.0,
'weight_duration': 0.5,
'burst_gamma': 0.6,
'compression_mode': 1, # 0=none, 1=log1p, 2=asinh
'use_smoothing': 1, # 1 = EWMA smoothing enabled, 0 = disabled
'percentile_lower': 5,
'percentile_upper': 95,
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
    """Return the code string that identifies this task."""
    task_code = "DWS_INTIMACY_INDEX"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the DWS table this task writes to."""
    table_name = "dws_member_assistant_intimacy"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the key columns of the target table (a new list on every call)."""
    key_columns = ['site_id', 'member_id']
    key_columns.append('assistant_id')
    return key_columns
def get_index_type(self) -> str:
    """Return the index type, taken from the ``INDEX_TYPE`` attribute."""
    index_type = self.INDEX_TYPE
    return index_type
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """
    Run the intimacy index calculation end to end.

    Steps: load parameters, extract service records for the lookback window,
    merge them into sessions per (member, assistant) pair, attribute
    recharges, compute component and raw scores, normalize to display
    scores, store the percentile history and write the results.

    Args:
        context: Task context, forwarded to ``_get_site_id``.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no service
        records exist, otherwise a dict with ``status``, ``pair_count`` and
        ``records_inserted``.
    """
    log = self.logger
    log.info("开始计算客户-助教亲密指数")

    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # The window ends "now" and reaches lookback_days into the past.
    now = datetime.now(self.tz)
    window_start = now - timedelta(days=lookback_days)
    log.info(
        "参数: lookback=%d天, h_sess=%.1f, h_last=%.1f, h_pay=%.1f, γ=%.2f",
        lookback_days, params['halflife_session'], params['halflife_last'],
        params['halflife_recharge'], params['burst_gamma']
    )

    # 1. Service records
    services = self._extract_service_records(site_id, window_start, now)
    log.info("提取到 %d 条原始服务记录", len(services))
    if not services:
        log.warning("没有服务记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Sessions per (member_id, assistant_id) pair
    pair_data = self._group_and_merge_sessions(services, params, now)
    log.info("合并为 %d 个客户-助教对", len(pair_data))

    # 3. Attributed recharges (updates the pair objects in place)
    self._extract_attributed_recharges(site_id, pair_data, params, now)

    # 4. Component scores and raw score per pair
    scored: List[MemberAssistantIntimacyData] = []
    for pair in pair_data.values():
        pair.site_id = site_id
        pair.tenant_id = tenant_id
        self._calculate_component_scores(pair, params, now)
        weighted_sum = (
            params['weight_frequency'] * pair.score_frequency +
            params['weight_recency'] * pair.score_recency +
            params['weight_recharge'] * pair.score_recharge +
            params['weight_duration'] * pair.score_duration
        )
        pair.raw_score = weighted_sum * pair.burst_multiplier
        scored.append(pair)
    log.info("计算完成 %d 个pair的Raw Score", len(scored))

    # 5. Display scores (compression_mode: 1 = log1p, 2 = asinh, else none)
    mode_code = int(params.get('compression_mode', 1))
    compression = {1: "log1p", 2: "asinh"}.get(mode_code, "none")
    use_smoothing = bool(int(params.get('use_smoothing', 1)))
    raw_scores = [((pair.member_id, pair.assistant_id), pair.raw_score) for pair in scored]
    normalized = self.batch_normalize_to_display(
        raw_scores,
        compression=compression,
        percentile_lower=int(params['percentile_lower']),
        percentile_upper=int(params['percentile_upper']),
        use_smoothing=use_smoothing,
        site_id=site_id
    )
    display_by_pair = {pair_key: display for pair_key, _raw, display in normalized}
    for pair in scored:
        pair_key = (pair.member_id, pair.assistant_id)
        if pair_key in display_by_pair:
            pair.display_score = display_by_pair[pair_key]

    # 6. Percentile history
    if scored:
        all_raw = [pair.raw_score for pair in scored]
        q_l, q_u = self.calculate_percentiles(
            all_raw,
            int(params['percentile_lower']),
            int(params['percentile_upper'])
        )
        if use_smoothing:
            smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        else:
            smoothed_l, smoothed_u = q_l, q_u
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw)
        )

    # 7. Persist
    inserted = self._save_intimacy_data(scored)
    log.info("亲密指数计算完成,写入 %d 条记录", inserted)
    return {
        'status': 'success',
        'pair_count': len(scored),
        'records_inserted': inserted
    }
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_records(
self,
site_id: int,
start_datetime: datetime,
end_datetime: datetime
) -> List[Dict[str, Any]]:
"""
提取服务记录
注意: 使用 assistant_no (助教工号) 作为助教标识,而不是 site_assistant_id
因为 site_assistant_id 在数据中是每次服务的唯一ID不是助教的唯一标识
Returns:
[{'member_id', 'assistant_no', 'assistant_nickname', 'start_time', 'end_time', 'duration_minutes', 'skill_id'}, ...]
"""
# 通过 user_id 关联 dim_assistant 获取 assistant_id
sql = """
SELECT
s.tenant_member_id AS member_id,
s.user_id AS assistant_user_id,
d.assistant_id,
s.start_use_time,
s.last_use_time,
COALESCE(s.income_seconds, 0) / 60 AS duration_minutes,
s.skill_id
FROM billiards_dwd.dwd_assistant_service_log s
JOIN billiards_dwd.dim_assistant d
ON s.user_id = d.user_id AND d.scd2_is_current = 1
WHERE s.site_id = %s
AND s.tenant_member_id > 0 -- 排除散客
AND s.is_delete = 0
AND s.user_id > 0 -- 确保有助教user_id
AND s.last_use_time >= %s
AND s.last_use_time < %s
ORDER BY s.tenant_member_id, d.assistant_id, s.start_use_time
"""
rows = self.db.query(sql, (site_id, start_datetime, end_datetime))
result = []
for row in (rows or []):
row_dict = dict(row)
assistant_id = row_dict['assistant_id']
if assistant_id:
result.append({
'member_id': int(row_dict['member_id']),
'assistant_id': int(assistant_id), # 助教IDdim_assistant主键
'assistant_user_id': int(row_dict['assistant_user_id']), # user_id用于中间处理
'start_time': row_dict['start_use_time'],
'end_time': row_dict['last_use_time'],
'duration_minutes': int(row_dict['duration_minutes'] or 0),
'skill_id': int(row_dict['skill_id'] or 0)
})
return result
def _group_and_merge_sessions(
self,
raw_services: List[Dict[str, Any]],
params: Dict[str, float],
now: datetime
) -> Dict[Tuple[int, int], MemberAssistantIntimacyData]:
"""
按(member_id, assistant_id)分组并合并会话
合并逻辑:同一客人对同一助教,间隔<4小时算同次服务
"""
merge_threshold_hours = int(params['session_merge_hours'])
merge_threshold = timedelta(hours=merge_threshold_hours)
incentive_weight = params['incentive_weight']
pair_data: Dict[Tuple[int, int], MemberAssistantIntimacyData] = {}
# 按pair分组使用assistant_id
pair_services: Dict[Tuple[int, int], List[Dict[str, Any]]] = {}
for svc in raw_services:
key = (svc['member_id'], svc['assistant_id'])
if key not in pair_services:
pair_services[key] = []
pair_services[key].append(svc)
# 对每个pair合并会话
for key, services in pair_services.items():
member_id, assistant_id = key
# 取第一个服务记录的user_id
assistant_user_id = services[0]['assistant_user_id'] if services else 0
data = MemberAssistantIntimacyData(
member_id=member_id,
assistant_id=assistant_id,
assistant_user_id=assistant_user_id,
site_id=0, # 稍后填充
tenant_id=0
)
# 按开始时间排序
sorted_services = sorted(services, key=lambda x: x['start_time'])
# 合并会话
current_session: Optional[ServiceSession] = None
for svc in sorted_services:
start_time = svc['start_time']
end_time = svc['end_time']
duration = svc['duration_minutes']
skill_id = svc['skill_id']
# 判断课型(附加课权重更高,包厢课按基础课处理)
course_type = self.get_course_type(skill_id)
is_incentive = course_type == CourseType.BONUS
weight = incentive_weight if is_incentive else 1.0
if current_session is None:
# 开始新会话
current_session = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive
)
elif start_time - current_session.session_end <= merge_threshold:
# 合并到当前会话
current_session.session_end = max(current_session.session_end, end_time)
current_session.total_duration_minutes += duration
# 同次服务取最高权重
current_session.course_weight = max(current_session.course_weight, weight)
current_session.is_incentive = current_session.is_incentive or is_incentive
else:
# 保存当前会话,开始新会话
data.sessions.append(current_session)
current_session = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive
)
# 保存最后一个会话
if current_session is not None:
data.sessions.append(current_session)
# 统计特征
data.session_count = len(data.sessions)
data.total_duration_minutes = sum(s.total_duration_minutes for s in data.sessions)
data.basic_session_count = sum(1 for s in data.sessions if not s.is_incentive)
data.incentive_session_count = sum(1 for s in data.sessions if s.is_incentive)
# 最近一次服务
if data.sessions:
last_session = max(data.sessions, key=lambda s: s.session_end)
data.days_since_last_session = (now - last_session.session_end).days
pair_data[key] = data
return pair_data
def _extract_attributed_recharges(
self,
site_id: int,
pair_data: Dict[Tuple[int, int], MemberAssistantIntimacyData],
params: Dict[str, float],
now: datetime
) -> None:
"""
提取归因充值
归因逻辑服务结束后1小时内的充值算做该助教贡献
"""
attribution_hours = int(params['recharge_attribute_hours'])
attribution_window = timedelta(hours=attribution_hours)
# 获取所有相关会员ID
member_ids = set(key[0] for key in pair_data.keys())
if not member_ids:
return
member_ids_str = ','.join(str(m) for m in member_ids)
# 查询充值记录
sql = f"""
SELECT
member_id,
pay_time,
pay_amount
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND member_id IN ({member_ids_str})
AND settle_type = 5 -- 充值订单
AND pay_time >= %s
"""
lookback_days = int(params['lookback_days'])
start_datetime = now - timedelta(days=lookback_days)
rows = self.db.query(sql, (site_id, start_datetime))
# 为每个充值找到归因助教
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
pay_time = row_dict['pay_time']
pay_amount = float(row_dict['pay_amount'] or 0)
if pay_amount <= 0:
continue
# 查找该会员在pay_time前1小时内结束服务的助教
for key, data in pair_data.items():
if key[0] != member_id:
continue
for session in data.sessions:
# 服务结束后1小时内的充值
if (session.session_end <= pay_time and
pay_time - session.session_end <= attribution_window):
# 归因给这个助教
data.attributed_recharge_count += 1
data.attributed_recharge_amount += pay_amount
data.recharges.append(AttributedRecharge(
pay_time=pay_time,
pay_amount=pay_amount,
days_ago=(now - pay_time).total_seconds() / 86400
))
break # 一笔充值只归因给一个助教
# ==========================================================================
# 分数计算方法
# ==========================================================================
def _calculate_component_scores(
self,
data: MemberAssistantIntimacyData,
params: Dict[str, float],
now: datetime
) -> None:
"""计算5项分数"""
epsilon = 1e-6
lookback_days = int(params['lookback_days'])
h_sess = params['halflife_session']
h_last = params['halflife_last']
h_pay = params['halflife_recharge']
h_short = params['halflife_short']
h_long = params['halflife_long']
A0 = params['amount_base']
gamma = params['burst_gamma']
# 1. 频次强度 F = Σ(τ_i × decay(d_i, h_sess))
F = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
days_ago = min(days_ago, lookback_days)
F += session.course_weight * self.decay(days_ago, h_sess)
data.score_frequency = F
# 2. 最近温度 R = decay(d_last, h_last)
if data.days_since_last_session is not None:
data.score_recency = self.decay(min(data.days_since_last_session, lookback_days), h_last)
else:
data.score_recency = 0.0
# 3. 归因充值强度 M = Σ(ln(1+amt/A0) × decay(d_r, h_pay))
M = 0.0
for recharge in data.recharges:
m_amt = math.log1p(recharge.pay_amount / A0)
M += m_amt * self.decay(min(recharge.days_ago, lookback_days), h_pay)
data.score_recharge = M
# 4. 时长贡献 D = Σ(sqrt(dur/60) × τ × decay(d, h_sess))
D = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
dur_hours = session.total_duration_minutes / 60.0
days_ago = min(days_ago, lookback_days)
D += math.sqrt(dur_hours) * session.course_weight * self.decay(days_ago, h_sess)
data.score_duration = D
# 5. 频率激增放大 mult = 1 + γ × burst
# F_short = Σ(τ × decay(d, h_short))
# F_long = Σ(τ × decay(d, h_long))
F_short = 0.0
F_long = 0.0
for session in data.sessions:
days_ago = (now - session.session_end).total_seconds() / 86400
days_ago = min(days_ago, lookback_days)
F_short += session.course_weight * self.decay(days_ago, h_short)
F_long += session.course_weight * self.decay(days_ago, h_long)
# burst = max(0, ln(1 + (F_short/F_long - 1)))
ratio = F_short / (F_long + epsilon)
if ratio > 1:
burst = self.safe_ln1p(ratio - 1)
else:
burst = 0.0
data.burst_multiplier = 1 + gamma * burst
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_intimacy_data(self, data_list: List[MemberAssistantIntimacyData]) -> int:
    """Replace the stored intimacy rows for the given member/assistant pairs.

    Existing rows for every ``(member_id, assistant_id)`` pair are deleted and
    the fresh rows are inserted in the same transaction. The transaction is
    committed on success; on any failure it is rolled back and the error is
    re-raised, so a delete without its matching inserts is never left pending
    on the shared connection.

    Args:
        data_list: Computed intimacy records. The ``site_id`` of the first
            record scopes the delete, so all records are expected to belong
            to the same site.

    Returns:
        Sum of ``cursor.rowcount`` over the insert statements (0 for an
        empty ``data_list``).
    """
    if not data_list:
        return 0

    site_id = data_list[0].site_id

    # One "(member_id = %s AND assistant_id = %s)" clause per record; the ids
    # are bound by the driver instead of being interpolated into the SQL text.
    conditions = " OR ".join(
        "(member_id = %s AND assistant_id = %s)" for _ in data_list
    )
    delete_params = [site_id]
    for record in data_list:
        delete_params.extend((record.member_id, record.assistant_id))

    delete_sql = f"""
DELETE FROM billiards_dws.dws_member_assistant_intimacy
WHERE site_id = %s AND ({conditions})
"""
    insert_sql = """
INSERT INTO billiards_dws.dws_member_assistant_intimacy (
site_id, tenant_id, member_id, assistant_id,
session_count, total_duration_minutes,
basic_session_count, incentive_session_count,
days_since_last_session,
attributed_recharge_count, attributed_recharge_amount,
score_frequency, score_recency, score_recharge, score_duration,
burst_multiplier, raw_score, display_score,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s, %s,
%s, %s,
%s, %s,
%s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
NOW(), NOW(), NOW()
)
"""

    inserted = 0
    try:
        # Delete first so the inserts below never collide with stale rows.
        with self.db.conn.cursor() as cur:
            cur.execute(delete_sql, delete_params)

        with self.db.conn.cursor() as cur:
            for data in data_list:
                cur.execute(insert_sql, (
                    data.site_id, data.tenant_id, data.member_id, data.assistant_id,
                    data.session_count, data.total_duration_minutes,
                    data.basic_session_count, data.incentive_session_count,
                    data.days_since_last_session,
                    data.attributed_recharge_count, data.attributed_recharge_amount,
                    data.score_frequency, data.score_recency, data.score_recharge, data.score_duration,
                    data.burst_multiplier, data.raw_score, data.display_score,
                ))
                inserted += cur.rowcount

        self.db.conn.commit()
    except Exception:
        # Undo the delete as well; the caller still sees the original error.
        self.db.conn.rollback()
        raise
    return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
    """Return ``DEFAULT_PARAMS`` overlaid with the loaded index parameters.

    Values returned by ``load_index_parameters()`` win over the class
    defaults; ``DEFAULT_PARAMS`` itself is not modified.
    """
    overrides = self.load_index_parameters()
    merged = {name: value for name, value in self.DEFAULT_PARAMS.items()}
    merged.update(overrides)
    return merged
def _get_site_id(self, context: Optional[TaskContext]) -> int:
    """Resolve the site (store) id for this run.

    Sources are tried in order: ``context.store_id``, the config key
    ``app.default_site_id``, then any ``site_id`` present in
    ``billiards_dwd.dwd_assistant_service_log``.

    Raises:
        ValueError: If none of the sources yields a site id.
    """
    store_id = getattr(context, 'store_id', None) if context else None
    if store_id:
        return store_id

    configured = self.config.get('app.default_site_id')
    if configured:
        return int(configured)

    # Last resort: take whatever site appears in the service log.
    rows = self.db.query(
        "SELECT DISTINCT site_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
    )
    if not rows:
        raise ValueError("无法确定门店ID")
    return int(dict(rows[0])['site_id'])
def _get_tenant_id(self) -> int:
    """Resolve the tenant id from config, else from the service log.

    Returns 0 when neither the config key ``app.tenant_id`` nor
    ``billiards_dwd.dwd_assistant_service_log`` provides a value.
    """
    configured = self.config.get('app.tenant_id')
    if configured:
        return int(configured)

    rows = self.db.query(
        "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_assistant_service_log LIMIT 1"
    )
    return int(dict(rows[0])['tenant_id']) if rows else 0

View File

@@ -0,0 +1,461 @@
# -*- coding: utf-8 -*-
"""
会员层召回/转化指数共享逻辑
"""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask
from ..base_dws_task import TaskContext
@dataclass
class MemberActivityData:
    """Per-member activity features shared by the WBI and NCI index tasks."""

    # Identity
    member_id: int
    site_id: int
    tenant_id: int

    # Key timestamps (None when unknown)
    member_create_time: Optional[datetime] = None
    first_visit_time: Optional[datetime] = None
    last_visit_time: Optional[datetime] = None
    last_recharge_time: Optional[datetime] = None

    # Capped day counts: since last visit, since last recharge, and the
    # smaller of the two
    t_v: float = 60.0
    t_r: float = 60.0
    t_a: float = 60.0

    # Uncapped day counts
    days_since_first_visit: Optional[int] = None
    days_since_last_visit: Optional[int] = None
    days_since_last_recharge: Optional[int] = None

    # Visit-day counters
    visits_14d: int = 0
    visits_60d: int = 0
    visits_total: int = 0

    # Money features
    spend_30d: float = 0.0
    spend_180d: float = 0.0
    sv_balance: float = 0.0
    recharge_60d_amt: float = 0.0

    # Gaps between consecutive visit days and the age of each gap
    interval_count: int = 0
    intervals: List[float] = field(default_factory=list)
    interval_ages_days: List[int] = field(default_factory=list)

    # 1 when the latest recharge is newer than the latest visit
    recharge_unconsumed: int = 0
class MemberIndexBaseTask(BaseIndexTask):
    """Shared extraction and feature building for the WBI/NCI index tasks."""

    # Fallback visit window (days) when 'visit_lookback_days' is not configured.
    DEFAULT_VISIT_LOOKBACK_DAYS = 180
    # Fallback recency window (days) when 'lookback_days_recency' is not configured.
    DEFAULT_RECENCY_LOOKBACK_DAYS = 60
    # card_type_id whose balances are summed as the stored-value balance.
    CASH_CARD_TYPE_ID = 2793249295533893
def _get_site_id(self, context: Optional[TaskContext]) -> int:
    """Resolve the site (store) id, falling back to 0 with a warning.

    Sources are tried in order: ``context.store_id``, the config keys
    ``app.default_site_id`` / ``app.store_id``, then any non-null ``site_id``
    in ``billiards_dwd.dwd_settlement_head``.
    """
    store_id = getattr(context, 'store_id', None) if context else None
    if store_id:
        return store_id

    configured = self.config.get('app.default_site_id') or self.config.get('app.store_id')
    if configured is not None:
        return int(configured)

    rows = self.db.query(
        "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head WHERE site_id IS NOT NULL LIMIT 1"
    )
    found = dict(rows[0]).get('site_id') if rows else None
    if found is not None:
        return int(found)

    self.logger.warning("无法确定门店ID使用 0 继续执行")
    return 0
def _get_tenant_id(self) -> int:
    """Resolve the tenant id, falling back to 0 with a warning.

    Uses the config key ``app.tenant_id`` when set, otherwise any non-null
    ``tenant_id`` found in ``billiards_dwd.dwd_settlement_head``.
    """
    configured = self.config.get('app.tenant_id')
    if configured is not None:
        return int(configured)

    rows = self.db.query(
        "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head WHERE tenant_id IS NOT NULL LIMIT 1"
    )
    found = dict(rows[0]).get('tenant_id') if rows else None
    if found is not None:
        return int(found)

    self.logger.warning("无法确定租户ID使用 0 继续执行")
    return 0
def _load_params(self) -> Dict[str, float]:
    """Load index parameters: defaults, stored overrides, then runtime override.

    ``DEFAULT_PARAMS`` is overlaid with ``load_index_parameters()``. When the
    config key ``run.index_lookback_days`` is set it replaces
    ``lookback_days_recency`` after being clamped to the range [7, 180]; a
    value that cannot be converted to ``int`` is ignored with a warning.
    """
    overrides = self.load_index_parameters()
    merged = {name: value for name, value in self.DEFAULT_PARAMS.items()}
    merged.update(overrides)

    # The GUI / environment may override the recency window at run time.
    raw_days = self.config.get('run.index_lookback_days')
    if raw_days is None:
        return merged

    try:
        days = int(raw_days)
    except (TypeError, ValueError):
        self.logger.warning(
            "%s: run.index_lookback_days=%s is invalid; ignore override and use parameter table value",
            self.get_task_code(),
            raw_days,
        )
        return merged

    if not 7 <= days <= 180:
        self.logger.warning(
            "%s: run.index_lookback_days=%s 超出建议范围[7,180],已自动截断",
            self.get_task_code(),
            raw_days,
        )
        days = min(180, max(7, days))

    merged['lookback_days_recency'] = float(days)
    self.logger.info(
        "%s: 使用回溯天数覆盖 lookback_days_recency=%d",
        self.get_task_code(),
        days,
    )
    return merged
def _build_visit_condition_sql(self) -> str:
    """Return the SQL predicate that decides which settlements count as visits.

    The predicate refers to the settlement row through the alias ``s``. A row
    qualifies when ``settle_type = 1``, or when ``settle_type = 3`` and a
    non-deleted assistant service log for the same order, site and member
    uses an active skill whose ``course_type_code`` is ``'BONUS'``.
    """
    visit_condition = """
(
s.settle_type = 1
OR (
s.settle_type = 3
AND EXISTS (
SELECT 1
FROM billiards_dwd.dwd_assistant_service_log asl
JOIN billiards_dws.cfg_skill_type st
ON asl.skill_id = st.skill_id
AND st.course_type_code = 'BONUS'
AND st.is_active = TRUE
WHERE asl.order_settle_id = s.order_settle_id
AND asl.site_id = s.site_id
AND asl.tenant_member_id = s.member_id
AND asl.is_delete = 0
)
)
)
"""
    return visit_condition
def _extract_visit_day_rows(
    self,
    site_id: int,
    start_date: date,
    end_date: date,
) -> List[Dict[str, Any]]:
    """Fetch visit records collapsed to one row per member per day.

    Settlements of ``site_id`` paid from ``start_date`` up to and including
    ``end_date`` that satisfy the visit predicate are grouped by member and
    calendar day. Each returned dict has ``member_id``, ``visit_date``,
    ``last_visit_time`` and ``day_pay_amount``. When the settlement carries no
    member id, the member is taken from the joined card account.
    """
    visit_filter = self._build_visit_condition_sql()
    query = f"""
WITH visit_source AS (
SELECT
COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
s.pay_time,
s.pay_amount
FROM billiards_dwd.dwd_settlement_head s
LEFT JOIN billiards_dwd.dim_member_card_account mca
ON s.member_card_account_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = s.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE s.site_id = %s
AND s.pay_time >= %s
AND s.pay_time < %s + INTERVAL '1 day'
AND {visit_filter}
)
SELECT
canonical_member_id AS member_id,
DATE(pay_time) AS visit_date,
MAX(pay_time) AS last_visit_time,
SUM(COALESCE(pay_amount, 0)) AS day_pay_amount
FROM visit_source
WHERE canonical_member_id > 0
GROUP BY canonical_member_id, DATE(pay_time)
ORDER BY canonical_member_id, visit_date
"""
    fetched = self.db.query(query, (site_id, start_date, end_date))
    return list(map(dict, fetched or []))
def _extract_recharge_rows(
    self,
    site_id: int,
    start_date: date,
    end_date: date,
) -> Dict[int, Dict[str, Any]]:
    """Fetch per-member recharge aggregates for the given date window.

    Only recharge orders of ``site_id`` with ``settle_type = 5`` paid from
    ``start_date`` up to and including ``end_date`` are considered. The
    result maps member id to a dict holding ``member_id``,
    ``last_recharge_time`` and ``recharge_60d_amt`` (the summed pay amount
    over whatever window the caller passed).
    """
    query = """
WITH recharge_source AS (
SELECT
COALESCE(NULLIF(r.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
r.pay_time,
r.pay_amount
FROM billiards_dwd.dwd_recharge_order r
LEFT JOIN billiards_dwd.dim_member_card_account mca
ON r.tenant_member_card_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = r.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE r.site_id = %s
AND r.settle_type = 5
AND r.pay_time >= %s
AND r.pay_time < %s + INTERVAL '1 day'
)
SELECT
canonical_member_id AS member_id,
MAX(pay_time) AS last_recharge_time,
SUM(COALESCE(pay_amount, 0)) AS recharge_60d_amt
FROM recharge_source
WHERE canonical_member_id > 0
GROUP BY canonical_member_id
"""
    fetched = self.db.query(query, (site_id, start_date, end_date))
    return {
        int(record['member_id']): record
        for record in map(dict, fetched or [])
    }
def _extract_member_create_times(self, member_ids: List[int]) -> Dict[int, datetime]:
    """Look up the profile creation time of each given member.

    Reads the current (``scd2_is_current = 1``) rows of
    ``billiards_dwd.dim_member``. Members without a creation time are left
    out of the result; an empty ``member_ids`` returns ``{}`` without
    querying.
    """
    if not member_ids:
        return {}

    id_csv = ','.join(map(str, member_ids))
    query = f"""
SELECT
member_id,
create_time
FROM billiards_dwd.dim_member
WHERE member_id IN ({id_csv})
AND scd2_is_current = 1
"""
    records = map(dict, self.db.query(query) or [])
    pairs = ((int(rec['member_id']), rec.get('create_time')) for rec in records)
    return {member_id: created for member_id, created in pairs if created}
def _extract_first_visit_times(self, site_id: int, member_ids: List[int]) -> Dict[int, datetime]:
    """Find the earliest qualifying visit time of each given member.

    No date window is applied: every settlement of ``site_id`` that
    satisfies the visit predicate is scanned. Members without any visit are
    left out of the result; an empty ``member_ids`` returns ``{}`` without
    querying.
    """
    if not member_ids:
        return {}

    id_csv = ','.join(map(str, member_ids))
    visit_filter = self._build_visit_condition_sql()
    query = f"""
WITH visit_source AS (
SELECT
COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
s.pay_time
FROM billiards_dwd.dwd_settlement_head s
LEFT JOIN billiards_dwd.dim_member_card_account mca
ON s.member_card_account_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = s.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE s.site_id = %s
AND {visit_filter}
)
SELECT
canonical_member_id AS member_id,
MIN(pay_time) AS first_visit_time
FROM visit_source
WHERE canonical_member_id IN ({id_csv})
GROUP BY canonical_member_id
"""
    records = map(dict, self.db.query(query, (site_id,)) or [])
    pairs = ((int(rec['member_id']), rec.get('first_visit_time')) for rec in records)
    return {member_id: first_seen for member_id, first_seen in pairs if first_seen}
def _extract_sv_balances(self, site_id: int, tenant_id: int, member_ids: List[int]) -> Dict[int, Decimal]:
    """Sum each member's balance over cards of type ``CASH_CARD_TYPE_ID``.

    Only current, non-deleted card accounts registered at ``site_id`` under
    ``tenant_id`` are read. A member whose sum is NULL or zero maps to
    ``Decimal('0')``; an empty ``member_ids`` returns ``{}`` without querying.
    """
    if not member_ids:
        return {}

    id_csv = ','.join(map(str, member_ids))
    query = f"""
SELECT
tenant_member_id AS member_id,
SUM(CASE WHEN card_type_id = %s THEN balance ELSE 0 END) AS sv_balance
FROM billiards_dwd.dim_member_card_account
WHERE tenant_id = %s
AND register_site_id = %s
AND scd2_is_current = 1
AND COALESCE(is_delete, 0) = 0
AND tenant_member_id IN ({id_csv})
GROUP BY tenant_member_id
"""
    fetched = self.db.query(query, (self.CASH_CARD_TYPE_ID, tenant_id, site_id))
    balances: Dict[int, Decimal] = {}
    for record in map(dict, fetched or []):
        balances[int(record['member_id'])] = record.get('sv_balance') or Decimal('0')
    return balances
def _build_member_activity(
    self,
    site_id: int,
    tenant_id: int,
    params: Dict[str, float],
) -> Dict[int, MemberActivityData]:
    """Assemble the activity features of every recently active member.

    A member is included when it has a visit day inside the visit window
    (``visit_lookback_days``) or a recharge inside the recency window
    (``lookback_days_recency``), both ending today in ``self.tz``.

    Args:
        site_id: Site whose settlements and recharges are read.
        tenant_id: Tenant used for the stored-value balance lookup.
        params: Index parameters; missing windows fall back to the class
            defaults.

    Returns:
        Mapping of member id to its ``MemberActivityData``; empty when no
        member qualifies.
    """
    base_date = datetime.now(self.tz).date()
    visit_window = int(params.get('visit_lookback_days', self.DEFAULT_VISIT_LOOKBACK_DAYS))
    recency_days = int(params.get('lookback_days_recency', self.DEFAULT_RECENCY_LOOKBACK_DAYS))

    # Visit days grouped per member.
    visit_rows = self._extract_visit_day_rows(
        site_id, base_date - timedelta(days=visit_window), base_date
    )
    rows_by_member: Dict[int, List[Dict[str, Any]]] = {}
    for row in (visit_rows or []):
        rows_by_member.setdefault(int(row['member_id']), []).append(row)

    recharge_by_member = self._extract_recharge_rows(
        site_id, base_date - timedelta(days=recency_days), base_date
    )

    member_ids = set(rows_by_member.keys()) | set(recharge_by_member.keys())
    if not member_ids:
        return {}

    id_list = list(member_ids)
    create_times = self._extract_member_create_times(id_list)
    first_visits = self._extract_first_visit_times(site_id, id_list)
    balances = self._extract_sv_balances(site_id, tenant_id, id_list)

    # Fixed cut-off dates for the 14/30/60-day counters.
    cutoff_14d = base_date - timedelta(days=14)
    cutoff_30d = base_date - timedelta(days=30)
    cutoff_60d = base_date - timedelta(days=60)

    results: Dict[int, MemberActivityData] = {}
    for member_id in member_ids:
        data = MemberActivityData(
            member_id=member_id,
            site_id=site_id,
            tenant_id=tenant_id,
        )

        day_rows = sorted(rows_by_member.get(member_id, []), key=lambda r: r['visit_date'])
        if day_rows:
            data.visits_total = len(day_rows)
            data.last_visit_time = max(r.get('last_visit_time') for r in day_rows)

            visit_dates = []
            for r in day_rows:
                visit_date = r.get('visit_date')
                day_pay = float(r.get('day_pay_amount') or 0)
                # Every fetched day counts towards the long-window spend.
                data.spend_180d += day_pay
                if visit_date is None:
                    continue
                if visit_date >= cutoff_14d:
                    data.visits_14d += 1
                if visit_date >= cutoff_60d:
                    data.visits_60d += 1
                if visit_date >= cutoff_30d:
                    data.spend_30d += day_pay
                visit_dates.append(visit_date)

            # Gap in days between consecutive visit days (capped at the
            # recency window) and how long ago each gap ended.
            later_dates = visit_dates[1:]
            data.intervals = [
                float(min(recency_days, (current - previous).days))
                for previous, current in zip(visit_dates, later_dates)
            ]
            data.interval_ages_days = [
                max(0, (base_date - current).days) for current in later_dates
            ]
            data.interval_count = len(data.intervals)

        recharge_info = recharge_by_member.get(member_id)
        if recharge_info:
            data.last_recharge_time = recharge_info.get('last_recharge_time')
            data.recharge_60d_amt = float(recharge_info.get('recharge_60d_amt') or 0)

        data.member_create_time = create_times.get(member_id)
        data.first_visit_time = first_visits.get(member_id)

        balance = balances.get(member_id)
        if balance is not None:
            data.sv_balance = float(balance)

        # Whole-day distances from today.
        if data.first_visit_time:
            data.days_since_first_visit = (base_date - data.first_visit_time.date()).days
        if data.last_visit_time:
            data.days_since_last_visit = (base_date - data.last_visit_time.date()).days
        if data.last_recharge_time:
            data.days_since_last_recharge = (base_date - data.last_recharge_time.date()).days

        # t_v / t_r: distances capped at the recency window (the window
        # itself when unknown); t_a is the smaller of the two.
        if data.days_since_last_visit is None:
            data.t_v = float(recency_days)
        else:
            data.t_v = float(min(recency_days, data.days_since_last_visit))
        if data.days_since_last_recharge is None:
            data.t_r = float(recency_days)
        else:
            data.t_r = float(min(recency_days, data.days_since_last_recharge))
        data.t_a = float(min(data.t_v, data.t_r))

        # Flag a recharge that has not been followed by a visit yet.
        if data.last_recharge_time:
            if data.last_visit_time is None or data.last_recharge_time > data.last_visit_time:
                data.recharge_unconsumed = 1

        results[member_id] = data

    return results
def classify_segment(
    self,
    data: MemberActivityData,
    params: Dict[str, float],
) -> Tuple[str, str, bool]:
    """Classify a member into the NEW / OLD / STOP buckets.

    A member whose ``t_a`` has reached the recency window is STOP; when the
    high-balance exception is enabled and the stored-value balance reaches
    the threshold, the second element becomes ``"STOP_HIGH_BALANCE"``.
    Otherwise the member is NEW when any of the "new" rules holds (few
    visits, recent first visit, or a recent unconsumed recharge with a
    limited visit count), else OLD.

    Returns:
        A 3-tuple of two string labels and a boolean; the boolean is False
        only for a plain STOP.
    """
    recency_days = int(params.get('lookback_days_recency', self.DEFAULT_RECENCY_LOOKBACK_DAYS))
    stop_exception_on = int(params.get('enable_stop_high_balance_exception', 0)) == 1
    balance_floor = float(params.get('high_balance_threshold', 1000))

    if data.t_a >= recency_days:
        if stop_exception_on and data.sv_balance >= balance_floor:
            return "STOP", "STOP_HIGH_BALANCE", True
        return "STOP", "STOP", False

    max_new_visits = int(params.get('new_visit_threshold', 2))
    max_new_days = int(params.get('new_days_threshold', 30))
    max_recharge_age = int(params.get('recharge_recent_days', 14))
    max_recharge_visits = int(params.get('new_recharge_max_visits', 10))

    few_visits = data.visits_total <= max_new_visits
    recent_first_visit = (
        data.days_since_first_visit is not None
        and data.days_since_first_visit <= max_new_days
    )
    fresh_recharge = (
        data.recharge_unconsumed == 1
        and data.days_since_last_recharge is not None
        and data.days_since_last_recharge <= max_recharge_age
        and data.visits_total <= max_recharge_visits
    )

    label = "NEW" if any((few_visits, recent_first_visit, fresh_recharge)) else "OLD"
    return label, label, True

View File

@@ -0,0 +1,623 @@
# -*- coding: utf-8 -*-
"""
ML 人工台账导入任务。
设计目标:
1. 人工台账作为 ML 唯一真源;
2. 同一订单支持多助教归因,默认均分;
3. 覆盖策略:
- 近 30 天:按 site_id + biz_date 日覆盖;
- 超过 30 天按固定纪元2026-01-01切 30 天批次覆盖。
"""
from __future__ import annotations
import os
import uuid
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from decimal import Decimal
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
from .base_index_task import BaseIndexTask
from ..base_dws_task import TaskContext
@dataclass(frozen=True)
class ImportScope:
    """Range of ledger data that one import replaces."""

    site_id: int
    scope_type: str  # "DAY" or "P30"
    start_date: date
    end_date: date

    @property
    def scope_key(self) -> str:
        """Stable text key: ``DAY:<site>:<date>`` or ``P30:<site>:<start>:<end>``."""
        start = self.start_date.isoformat()
        if self.scope_type != "DAY":
            return f"P30:{self.site_id}:{start}:{self.end_date.isoformat()}"
        return f"DAY:{self.site_id}:{start}"
class MlManualImportTask(BaseIndexTask):
    """Import the ML manual ledger into a wide order table and a narrow per-assistant allocation table."""

    INDEX_TYPE = "ML"
    # Fixed origin from which the 30-day historical buckets are counted.
    EPOCH_ANCHOR = date(2026, 1, 1)
    # Bucket length in days; also the age limit up to which rows are replaced per day.
    HISTORICAL_BUCKET_DAYS = 30
    # Number of assistant_id_N / assistant_name_N column pairs per ledger row.
    ASSISTANT_SLOT_COUNT = 5

    # Excel template fields, in column order.
    TEMPLATE_COLUMNS = [
        "site_id",
        "biz_date",
        "external_id",
        "member_id",
        "pay_time",
        "order_amount",
        "currency",
        "assistant_id_1",
        "assistant_name_1",
        "assistant_id_2",
        "assistant_name_2",
        "assistant_id_3",
        "assistant_name_3",
        "assistant_id_4",
        "assistant_name_4",
        "assistant_id_5",
        "assistant_name_5",
        "remark",
    ]
def get_task_code(self) -> str:
    """Return the task code of the ML manual ledger import."""
    return "DWS_ML_MANUAL_IMPORT"
def get_target_table(self) -> str:
    """Return the name of the wide ledger table this task reports as its target."""
    return "dws_ml_manual_order_source"
def get_primary_keys(self) -> List[str]:
    """Return the column names that identify one row of the target table."""
    return ["site_id", "external_id", "import_scope_key", "row_no"]
def get_index_type(self) -> str:
    """Return the index type tag stored in ``INDEX_TYPE``."""
    return self.INDEX_TYPE
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Import the manual ledger file, replacing the scopes it touches.

    The task is file driven and does not use ``context``. Every ledger row is
    normalized and assigned to an import scope; all rows already stored for
    the touched scopes are deleted, then the new wide and allocation rows
    are written. Delete and write share one transaction: it is committed on
    success and rolled back (the error is re-raised) on failure, so a
    failed import cannot leave a scope deleted but not refilled.

    Returns:
        ``{"status": "SUCCESS", "counts": {...}}`` with the inserted and
        deleted row counts and the number of scopes.

    Raises:
        ValueError: If no ledger file is configured.
    """
    file_path = self._resolve_file_path()
    if not file_path:
        raise ValueError(
            "未找到 ML 台账文件,请通过环境变量 ML_MANUAL_LEDGER_FILE 或配置 run.ml_manual_ledger_file 指定"
        )

    rows = self._read_excel_rows(file_path)
    if not rows:
        self.logger.warning("台账文件为空:%s", file_path)
        return {
            "status": "SUCCESS",
            "counts": {
                "source_rows": 0,
                "alloc_rows": 0,
                "deleted_source_rows": 0,
                "deleted_alloc_rows": 0,
                "scopes": 0,
            },
        }

    now = datetime.now(self.tz)
    today = now.date()
    import_batch_no = self._build_import_batch_no(now)
    import_file_name = Path(file_path).name
    import_user = self._resolve_import_user()

    source_rows: List[Dict[str, Any]] = []
    alloc_rows: List[Dict[str, Any]] = []
    # Keyed by the scope's identity so each scope is deleted only once.
    scope_set: Dict[Tuple[int, str, date, date], ImportScope] = {}

    # Numbering starts at 2 because row 1 of the sheet is the header.
    # NOTE: blank rows are dropped by the reader, so row_no counts non-empty
    # rows and can be lower than the sheet row when blanks are interleaved.
    for idx, raw in enumerate(rows, start=2):
        normalized = self._normalize_row(raw, row_no=idx, file_path=file_path)
        row_scope = self.resolve_scope(
            site_id=normalized["site_id"],
            biz_date=normalized["biz_date"],
            today=today,
        )
        scope_set[(row_scope.site_id, row_scope.scope_type, row_scope.start_date, row_scope.end_date)] = row_scope

        source_rows.append(
            self._build_source_row(
                normalized=normalized,
                scope=row_scope,
                import_batch_no=import_batch_no,
                import_file_name=import_file_name,
                import_user=import_user,
                import_time=now,
            )
        )
        alloc_rows.extend(
            self._build_alloc_rows(
                normalized=normalized,
                scope=row_scope,
                import_batch_no=import_batch_no,
                import_file_name=import_file_name,
                import_user=import_user,
                import_time=now,
            )
        )

    scopes = list(scope_set.values())
    try:
        deleted_source_rows, deleted_alloc_rows = self._delete_by_scopes(scopes)
        inserted_source = self._insert_source_rows(source_rows)
        upserted_alloc = self._upsert_alloc_rows(alloc_rows)
        self.db.conn.commit()
    except Exception:
        # Undo the scope deletes too; otherwise a later commit on the same
        # connection could persist them without the replacement rows.
        self.db.conn.rollback()
        raise

    self.logger.info(
        "ML 人工台账导入完成: file=%s source=%d alloc=%d scopes=%d",
        file_path,
        inserted_source,
        upserted_alloc,
        len(scopes),
    )
    return {
        "status": "SUCCESS",
        "counts": {
            "source_rows": inserted_source,
            "alloc_rows": upserted_alloc,
            "deleted_source_rows": deleted_source_rows,
            "deleted_alloc_rows": deleted_alloc_rows,
            "scopes": len(scopes),
        },
    }
def _resolve_file_path(self) -> Optional[str]:
    """Locate the ledger file to import.

    The path is taken from the config key ``run.ml_manual_ledger_file``, then
    ``run.ml_manual_file``, then the environment variable
    ``ML_MANUAL_LEDGER_FILE``. ``~`` is expanded and a relative path is
    resolved against the current working directory.

    Returns:
        The absolute path as a string, or ``None`` when nothing is configured.

    Raises:
        FileNotFoundError: If a path is configured but does not exist.
    """
    configured = None
    for key in ("run.ml_manual_ledger_file", "run.ml_manual_file"):
        configured = self.config.get(key)
        if configured:
            break
    raw_path = configured or os.getenv("ML_MANUAL_LEDGER_FILE")
    if not raw_path:
        return None

    ledger = Path(str(raw_path)).expanduser()
    if not ledger.is_absolute():
        ledger = Path.cwd() / ledger
    if ledger.exists():
        return str(ledger)
    raise FileNotFoundError(f"台账文件不存在: {ledger}")
def _read_excel_rows(self, file_path: str) -> List[Dict[str, Any]]:
    """Read the active worksheet into a list of header-keyed dicts.

    The first row supplies the keys (stripped; an empty header cell becomes
    ``""``). Rows that ``_is_empty_row`` reports as empty are skipped, and
    cells beyond the header width are ignored.

    Raises:
        RuntimeError: If ``openpyxl`` cannot be imported.
    """
    try:
        from openpyxl import load_workbook
    except Exception as exc:  # noqa: BLE001
        raise RuntimeError(
            "缺少 openpyxl 依赖,无法读取 Excel请先安装 openpyxl"
        ) from exc

    # data_only=True makes formula cells yield their cached values.
    workbook = load_workbook(file_path, data_only=True)
    sheet = workbook.active

    first_row = next(sheet.iter_rows(min_row=1, max_row=1, values_only=True), None)
    if not first_row:
        return []
    headers = ["" if cell is None else str(cell).strip() for cell in first_row]
    if not headers:
        return []

    records: List[Dict[str, Any]] = []
    for values in sheet.iter_rows(min_row=2, values_only=True):
        if values is None:
            continue
        # zip stops at the shorter side, like the header/values minimum.
        record = dict(zip(headers, values))
        if not self._is_empty_row(record):
            records.append(record)
    return records
@staticmethod
def _is_empty_row(row: Dict[str, Any]) -> bool:
    """Return True when every cell is ``None`` or a whitespace-only string."""
    return all(
        cell is None or (isinstance(cell, str) and not cell.strip())
        for cell in row.values()
    )
def _normalize_row(
    self,
    raw: Dict[str, Any],
    row_no: int,
    file_path: str,
) -> Dict[str, Any]:
    """Convert one raw ledger row into typed, validated fields.

    Args:
        raw: Header-keyed cell values of the row.
        row_no: Row number used in error messages and stored with the row.
        file_path: Ledger file, used in error messages only.

    Returns:
        Dict with ``site_id``, ``biz_date``, ``external_id``, ``member_id``,
        ``pay_time``, ``order_amount``, ``currency``, ``assistants`` (list of
        ``(assistant_id, assistant_name)`` for the filled slots, in slot
        order), ``remark`` and ``row_no``.

    Raises:
        ValueError: If the row has no usable ``site_id`` (neither in the row
            nor in config ``app.store_id``), no ``external_id``, or an
            unparseable ``biz_date``.
    """
    # The site id scopes every delete of the import, so a missing or
    # non-numeric value must stop the import instead of flowing on as None.
    site_id = self._to_int(raw.get("site_id"), fallback=None)
    if site_id is None:
        site_id = self._to_int(self.config.get("app.store_id"), fallback=None)
    if site_id is None:
        raise ValueError(f"台账行 {row_no} 缺少有效的 site_id: {file_path}")

    try:
        biz_date = self._to_date(raw.get("biz_date"))
    except ValueError as exc:
        # Re-raise with the row number so the operator can find the cell.
        raise ValueError(f"台账行 {row_no} biz_date 无效: {exc}: {file_path}") from exc
    pay_time = self._to_datetime(raw.get("pay_time"), fallback_date=biz_date)

    raw_external_id = raw.get("external_id")
    if isinstance(raw_external_id, float) and raw_external_id.is_integer():
        # Numeric cells may arrive as floats; keep "123" instead of "123.0".
        raw_external_id = int(raw_external_id)
    external_id = str(raw_external_id or "").strip()
    if not external_id:
        raise ValueError(f"台账行 {row_no} 缺少 external_id订单ID: {file_path}")

    member_id = self._to_int(raw.get("member_id"), fallback=0)
    order_amount = self._to_decimal(raw.get("order_amount"))
    currency = str(raw.get("currency") or "CNY").strip().upper() or "CNY"
    remark = str(raw.get("remark") or "").strip()

    # Keep only the slots that carry an assistant id, preserving slot order.
    assistants: List[Tuple[int, str]] = []
    for idx in range(1, self.ASSISTANT_SLOT_COUNT + 1):
        aid = self._to_int(raw.get(f"assistant_id_{idx}"), fallback=None)
        if aid is None:
            continue
        name = str(raw.get(f"assistant_name_{idx}") or "").strip()
        assistants.append((aid, name))

    return {
        "site_id": site_id,
        "biz_date": biz_date,
        "external_id": external_id,
        "member_id": member_id,
        "pay_time": pay_time,
        "order_amount": order_amount,
        "currency": currency,
        "assistants": assistants,
        "remark": remark,
        "row_no": row_no,
    }
def _build_source_row(
    self,
    *,
    normalized: Dict[str, Any],
    scope: ImportScope,
    import_batch_no: str,
    import_file_name: str,
    import_user: str,
    import_time: datetime,
) -> Dict[str, Any]:
    """Build the wide-table row for one normalized ledger row.

    The order fields are copied from ``normalized``, the import metadata is
    attached, and the assistants fill ``assistant_id_N`` / ``assistant_name_N``
    in order; unused slots hold ``None``.
    """
    assistants: Sequence[Tuple[int, str]] = normalized["assistants"]

    order_fields = (
        "site_id",
        "biz_date",
        "external_id",
        "member_id",
        "pay_time",
        "order_amount",
        "currency",
    )
    row = {name: normalized[name] for name in order_fields}
    row["import_batch_no"] = import_batch_no
    row["import_file_name"] = import_file_name
    row["import_scope_key"] = scope.scope_key
    row["import_time"] = import_time
    row["import_user"] = import_user
    row["row_no"] = normalized["row_no"]
    row["remark"] = normalized["remark"]

    # Pad the assistant list with empty pairs up to the slot count.
    slots = self.ASSISTANT_SLOT_COUNT
    padded = list(assistants[:slots]) + [(None, None)] * max(0, slots - len(assistants))
    for idx, (aid, aname) in enumerate(padded, start=1):
        row[f"assistant_id_{idx}"] = aid
        row[f"assistant_name_{idx}"] = aname
    return row
def _build_alloc_rows(
    self,
    *,
    normalized: Dict[str, Any],
    scope: ImportScope,
    import_batch_no: str,
    import_file_name: str,
    import_user: str,
    import_time: datetime,
) -> List[Dict[str, Any]]:
    """Build the per-assistant allocation rows for one normalized ledger row.

    The order amount is split evenly between the distinct assistants of the
    row. An assistant id that is listed more than once counts once (the
    first name wins): the allocation table holds one row per
    ``(site_id, external_id, assistant_id)``, so counting duplicates in the
    divisor would make the stored shares add up to less than the order.

    Returns:
        One dict per distinct assistant, or an empty list when the row has
        no assistant.
    """
    assistants: Sequence[Tuple[int, str]] = normalized["assistants"]
    if not assistants:
        return []

    # Insertion-ordered de-duplication by assistant id.
    unique_assistants: Dict[int, str] = {}
    for assistant_id, assistant_name in assistants:
        unique_assistants.setdefault(assistant_id, assistant_name)

    share_ratio = Decimal("1") / Decimal(len(unique_assistants))
    allocated_amount = normalized["order_amount"] * share_ratio

    return [
        {
            "site_id": normalized["site_id"],
            "biz_date": normalized["biz_date"],
            "external_id": normalized["external_id"],
            "member_id": normalized["member_id"],
            "pay_time": normalized["pay_time"],
            "order_amount": normalized["order_amount"],
            "assistant_id": assistant_id,
            "assistant_name": assistant_name,
            "share_ratio": share_ratio,
            "allocated_amount": allocated_amount,
            "currency": normalized["currency"],
            "import_scope_key": scope.scope_key,
            "import_batch_no": import_batch_no,
            "import_file_name": import_file_name,
            "import_time": import_time,
            "import_user": import_user,
        }
        for assistant_id, assistant_name in unique_assistants.items()
    ]
@classmethod
def resolve_scope(cls, site_id: int, biz_date: date, today: date) -> ImportScope:
    """Decide which stored range a ledger row of ``biz_date`` replaces.

    A date at most ``HISTORICAL_BUCKET_DAYS`` days before ``today`` (or in
    the future) gets a single-day ``"DAY"`` scope; an older date gets the
    ``"P30"`` scope of its fixed 30-day bucket.
    """
    age_days = (today - biz_date).days
    if age_days > cls.HISTORICAL_BUCKET_DAYS:
        scope_type = "P30"
        first_day, last_day = cls.resolve_p30_bucket(biz_date)
    else:
        scope_type = "DAY"
        first_day = last_day = biz_date
    return ImportScope(
        site_id=site_id,
        scope_type=scope_type,
        start_date=first_day,
        end_date=last_day,
    )
@classmethod
def resolve_p30_bucket(cls, biz_date: date) -> Tuple[date, date]:
    """Return the first and last day of the fixed bucket containing ``biz_date``.

    Buckets are ``HISTORICAL_BUCKET_DAYS`` long and aligned to
    ``EPOCH_ANCHOR``; dates before the anchor fall into buckets counted
    backwards from it.
    """
    size = cls.HISTORICAL_BUCKET_DAYS
    # Python's modulo is non-negative for a positive divisor, so this is the
    # distance back to the bucket start on either side of the anchor.
    days_into_bucket = (biz_date - cls.EPOCH_ANCHOR).days % size
    first_day = biz_date - timedelta(days=days_into_bucket)
    last_day = first_day + timedelta(days=size - 1)
    return first_day, last_day
def _delete_by_scopes(self, scopes: Iterable[ImportScope]) -> Tuple[int, int]:
    """Delete the stored rows of every scope before the new rows are written.

    A ``"DAY"`` scope removes one ``biz_date`` of a site; any other scope
    removes the inclusive date range ``start_date``..``end_date``. Both the
    wide table and the allocation table are cleared. Nothing is committed
    here.

    Returns:
        ``(deleted_source_rows, deleted_alloc_rows)``; negative row counts
        reported by the cursor are counted as 0.
    """
    # (wide-table statement, allocation-table statement) per scope kind.
    day_statements = (
        """
DELETE FROM billiards_dws.dws_ml_manual_order_source
WHERE site_id = %s AND biz_date = %s
""",
        """
DELETE FROM billiards_dws.dws_ml_manual_order_alloc
WHERE site_id = %s AND biz_date = %s
""",
    )
    range_statements = (
        """
DELETE FROM billiards_dws.dws_ml_manual_order_source
WHERE site_id = %s AND biz_date >= %s AND biz_date <= %s
""",
        """
DELETE FROM billiards_dws.dws_ml_manual_order_alloc
WHERE site_id = %s AND biz_date >= %s AND biz_date <= %s
""",
    )

    totals = [0, 0]
    with self.db.conn.cursor() as cur:
        for scope in scopes:
            if scope.scope_type == "DAY":
                statements = day_statements
                args = (scope.site_id, scope.start_date)
            else:
                statements = range_statements
                args = (scope.site_id, scope.start_date, scope.end_date)
            for slot, statement in enumerate(statements):
                cur.execute(statement, args)
                totals[slot] += max(cur.rowcount, 0)
    return totals[0], totals[1]
def _insert_source_rows(self, rows: List[Dict[str, Any]]) -> int:
    """Insert the wide-table rows one by one; nothing is committed here.

    ``created_at`` and ``updated_at`` are both filled from each row's
    ``import_time``.

    Returns:
        Sum of the cursor row counts (negative counts are treated as 0).
    """
    if not rows:
        return 0

    # Columns whose value is read from the row dict under the same name.
    payload_columns = [
        "site_id",
        "biz_date",
        "external_id",
        "member_id",
        "pay_time",
        "order_amount",
        "currency",
        "assistant_id_1",
        "assistant_name_1",
        "assistant_id_2",
        "assistant_name_2",
        "assistant_id_3",
        "assistant_name_3",
        "assistant_id_4",
        "assistant_name_4",
        "assistant_id_5",
        "assistant_name_5",
        "import_batch_no",
        "import_file_name",
        "import_scope_key",
        "import_time",
        "import_user",
        "row_no",
        "remark",
    ]
    columns = payload_columns + ["created_at", "updated_at"]
    column_sql = ", ".join(columns)
    placeholder_sql = ", ".join(["%s"] * len(columns))
    sql = f"""
INSERT INTO billiards_dws.dws_ml_manual_order_source ({column_sql})
VALUES ({placeholder_sql})
"""

    inserted = 0
    with self.db.conn.cursor() as cur:
        for row in rows:
            stamp = row.get("import_time")
            values = [row.get(name) for name in payload_columns] + [stamp, stamp]
            cur.execute(sql, values)
            inserted += max(cur.rowcount, 0)
    return inserted
def _upsert_alloc_rows(self, rows: List[Dict[str, Any]]) -> int:
    """Insert or update the allocation rows one by one; nothing is committed here.

    A row that collides on ``(site_id, external_id, assistant_id)`` has its
    other columns overwritten and ``updated_at`` set to ``NOW()``. For new
    rows ``created_at`` and ``updated_at`` come from the row's
    ``import_time``.

    Returns:
        Sum of the cursor row counts (negative counts are treated as 0).
    """
    if not rows:
        return 0

    # Columns whose value is read from the row dict under the same name.
    payload_columns = [
        "site_id",
        "biz_date",
        "external_id",
        "member_id",
        "pay_time",
        "order_amount",
        "assistant_id",
        "assistant_name",
        "share_ratio",
        "allocated_amount",
        "currency",
        "import_scope_key",
        "import_batch_no",
        "import_file_name",
        "import_time",
        "import_user",
    ]
    columns = payload_columns + ["created_at", "updated_at"]
    column_sql = ", ".join(columns)
    placeholder_sql = ", ".join(["%s"] * len(columns))
    sql = f"""
INSERT INTO billiards_dws.dws_ml_manual_order_alloc ({column_sql})
VALUES ({placeholder_sql})
ON CONFLICT (site_id, external_id, assistant_id)
DO UPDATE SET
biz_date = EXCLUDED.biz_date,
member_id = EXCLUDED.member_id,
pay_time = EXCLUDED.pay_time,
order_amount = EXCLUDED.order_amount,
assistant_name = EXCLUDED.assistant_name,
share_ratio = EXCLUDED.share_ratio,
allocated_amount = EXCLUDED.allocated_amount,
currency = EXCLUDED.currency,
import_scope_key = EXCLUDED.import_scope_key,
import_batch_no = EXCLUDED.import_batch_no,
import_file_name = EXCLUDED.import_file_name,
import_time = EXCLUDED.import_time,
import_user = EXCLUDED.import_user,
updated_at = NOW()
"""

    affected = 0
    with self.db.conn.cursor() as cur:
        for row in rows:
            stamp = row.get("import_time")
            values = [row.get(name) for name in payload_columns] + [stamp, stamp]
            cur.execute(sql, values)
            affected += max(cur.rowcount, 0)
    return affected
@staticmethod
def _to_int(value: Any, fallback: Optional[int] = None) -> Optional[int]:
    """Convert a cell value to ``int``, returning ``fallback`` when it cannot be.

    ``None`` and blank strings are treated as missing. Text holding an
    integral number in decimal or exponent form (``"12.0"``, ``"1e3"``) is
    accepted, so an id typed into a text cell is not silently dropped;
    text with a fractional part, NaN and infinities yield ``fallback``.
    """
    if value is None:
        return fallback
    if isinstance(value, str):
        value = value.strip()
        if not value:
            return fallback

    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        pass

    # Second chance for integral numbers written with a decimal point or
    # exponent, which int() rejects when given as text.
    try:
        number = Decimal(str(value))
    except ArithmeticError:  # decimal.InvalidOperation
        return fallback
    if number.is_finite() and number == number.to_integral_value():
        return int(number)
    return fallback
@staticmethod
def _to_decimal(value: Any) -> Decimal:
    """Convert a cell value to ``Decimal``; missing values become 0.

    ``None`` and empty or whitespace-only strings count as missing, the same
    way ``_to_int`` treats blank cells. Other values go through ``str`` so
    that floats keep their short decimal form.

    Raises:
        decimal.InvalidOperation: If the text is not a valid number.
    """
    if value is None:
        return Decimal("0")
    if isinstance(value, str):
        value = value.strip()
        if not value:
            return Decimal("0")
    return Decimal(str(value))
@staticmethod
def _to_date(value: Any) -> date:
    """Convert a ``biz_date`` cell to a ``date``.

    Accepts ``datetime`` / ``date`` objects and text. Text may use ``-`` or
    ``/`` as separator (matching what ``_to_datetime`` accepts), may carry a
    trailing time part, and may omit zero padding (``2026/1/5``).

    Raises:
        ValueError: If the value is blank, not a date-like type, or the
            text cannot be parsed.
    """
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    if isinstance(value, str):
        text = value.strip()
        if not text:
            raise ValueError("biz_date 不能为空")
        text = text.replace("/", "-")
        try:
            return datetime.fromisoformat(text[:10]).date()
        except ValueError:
            # Non-padded dates are shorter than 10 characters, so the slice
            # above may cut into the time part; parse the date token alone.
            date_token = text.replace("T", " ").split()[0]
            return datetime.strptime(date_token, "%Y-%m-%d").date()
    raise ValueError(f"无法解析 biz_date: {value}")
@staticmethod
def _to_datetime(value: Any, fallback_date: date) -> datetime:
    """Convert a ``pay_time`` cell to a ``datetime``.

    ``datetime`` values pass through, ``date`` values become midnight, and
    text is parsed after replacing ``/`` with ``-``. Blank text and values
    of any other type yield midnight of ``fallback_date``.
    """
    midnight = datetime.min.time()
    # datetime is a subclass of date, so it has to be tested first.
    if isinstance(value, datetime):
        return value
    if isinstance(value, date):
        return datetime.combine(value, midnight)

    text = value.strip() if isinstance(value, str) else ""
    if not text:
        return datetime.combine(fallback_date, midnight)

    text = text.replace("/", "-")
    try:
        return datetime.fromisoformat(text)
    except Exception:  # noqa: BLE001
        # Retry on the leading timestamp, or on the date part when the text
        # is too short to hold a full "YYYY-MM-DD HH:MM:SS".
        if len(text) < 19:
            return datetime.fromisoformat(text[:10])
        return datetime.strptime(text[:19], "%Y-%m-%d %H:%M:%S")
@staticmethod
def _build_import_batch_no(now: datetime) -> str:
    """Return a batch number ``MLM_<YYYYmmddHHMMSS>_<8 hex chars>``.

    The timestamp comes from ``now``; the suffix is the start of a random
    UUID, which keeps batches created in the same second apart.
    """
    timestamp = now.strftime('%Y%m%d%H%M%S')
    suffix = uuid.uuid4().hex[:8]
    return f"MLM_{timestamp}_{suffix}"
@staticmethod
def _resolve_import_user() -> str:
    """Return the operator name recorded with the import.

    The first non-empty environment variable among ``ETL_OPERATOR``,
    ``USERNAME`` and ``USER`` wins; ``"system"`` is used when none is set.
    """
    for variable in ("ETL_OPERATOR", "USERNAME", "USER"):
        operator = os.getenv(variable)
        if operator:
            return operator
    return "system"
# Names exported by `from <this module> import *`.
__all__ = ["MlManualImportTask", "ImportScope"]

View File

@@ -0,0 +1,381 @@
# -*- coding: utf-8 -*-
"""
新客转化指数NCI计算任务。"""
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from .member_index_base import MemberActivityData, MemberIndexBaseTask
from ..base_dws_task import TaskContext
@dataclass
class MemberNewconvData:
    """Working record for one member during the NCI calculation."""

    # Inputs: the member's activity snapshot and its classification.
    activity: MemberActivityData
    status: str
    segment: str

    # Component scores.
    need_new: float = 0.0
    salvage_new: float = 0.0
    recharge_new: float = 0.0
    value_new: float = 0.0
    welcome_new: float = 0.0

    # Raw scores (None until computed).
    raw_score_welcome: Optional[float] = None
    raw_score_convert: Optional[float] = None
    raw_score: Optional[float] = None

    # Display scores (None until normalised).
    display_score_welcome: Optional[float] = None
    display_score_convert: Optional[float] = None
    display_score: Optional[float] = None
class NewconvIndexTask(MemberIndexBaseTask):
"""New-customer conversion index (NCI) calculation task."""
# Value returned by get_index_type().
INDEX_TYPE = "NCI"
# Default values for the NCI parameters.
DEFAULT_PARAMS = {
# General parameters
'lookback_days_recency': 60,
'visit_lookback_days': 180,
'percentile_lower': 5,
'percentile_upper': 95,
'compression_mode': 0,
'use_smoothing': 1,
'ewma_alpha': 0.2,
# Segmentation parameters
'new_visit_threshold': 2,
'new_days_threshold': 30,
'recharge_recent_days': 14,
'new_recharge_max_visits': 10,
# NCI parameters
'no_touch_days_new': 3,
't2_target_days': 7,
'salvage_start': 30,
'salvage_end': 60,
'welcome_window_days': 3,
'active_new_visit_threshold_14d': 2,
'active_new_recency_days': 7,
'active_new_penalty': 0.2,
'h_recharge': 7,
'amount_base_M0': 300,
'balance_base_B0': 500,
'value_w_spend': 1.0,
'value_w_bal': 0.8,
'w_welcome': 1.0,
'w_need': 1.6,
'w_re': 0.8,
'w_value': 1.0,
# STOP high-balance exception (off by default)
'enable_stop_high_balance_exception': 0,
'high_balance_threshold': 1000,
}
def get_task_code(self) -> str:
    """Return the code identifying the NCI task."""
    return "DWS_NEWCONV_INDEX"
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    return "dws_member_newconv_index"
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    return ['site_id', 'member_id']
def get_index_type(self) -> str:
return self.INDEX_TYPE
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run the NCI calculation for one store.

    Loads parameters and member activity, keeps in-scope members whose
    segment is ``"NEW"``, computes their raw scores, normalises the total /
    welcome / convert scores into display scores, records percentile history
    for the total score and finally persists the rows.

    Returns:
        ``{'status': 'skipped', 'reason': ...}`` when there is nothing to
        compute, otherwise ``{'status': 'success', 'member_count': ...,
        'records_inserted': ...}``.
    """
    self.logger.info("开始计算新客转化指数(NCI)")

    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()

    activity_map = self._build_member_activity(site_id, tenant_id, params)
    if not activity_map:
        self.logger.warning("No member activity data available; skip calculation")
        return {'status': 'skipped', 'reason': 'no_data'}

    newconv_list: List[MemberNewconvData] = []
    for activity in activity_map.values():
        segment, status, in_scope = self.classify_segment(activity, params)
        if not in_scope or segment != "NEW":
            continue
        record = MemberNewconvData(activity=activity, status=status, segment=segment)
        self._calculate_nci_scores(record, params)
        newconv_list.append(record)

    if not newconv_list:
        self.logger.warning("No new-member rows to calculate")
        return {'status': 'skipped', 'reason': 'no_new_members'}

    # Normalise to display scores.
    raw_scores = [
        (record.activity.member_id, record.raw_score)
        for record in newconv_list
        if record.raw_score is not None
    ]
    if raw_scores:
        use_smoothing = int(params.get('use_smoothing', 1)) == 1

        # (raw attribute, display attribute, smoothing flag); only the total
        # score is normalised with the configured smoothing setting.
        score_passes = (
            ('raw_score', 'display_score', use_smoothing),
            ('raw_score_welcome', 'display_score_welcome', False),
            ('raw_score_convert', 'display_score_convert', False),
        )
        for raw_attr, display_attr, smoothing in score_passes:
            pairs = [
                (record.activity.member_id, getattr(record, raw_attr))
                for record in newconv_list
                if getattr(record, raw_attr) is not None
            ]
            display_by_member = self._normalize_score_pairs(
                pairs,
                params=params,
                site_id=site_id,
                use_smoothing=smoothing,
            )
            for record in newconv_list:
                member_id = record.activity.member_id
                if member_id in display_by_member:
                    setattr(record, display_attr, display_by_member[member_id])

        # Persist the percentile history of the total raw score.
        all_raw = [float(score) for _, score in raw_scores]
        q_l, q_u = self.calculate_percentiles(
            all_raw,
            int(params['percentile_lower']),
            int(params['percentile_upper'])
        )
        if use_smoothing:
            smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        else:
            smoothed_l, smoothed_u = q_l, q_u
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw)
        )

    inserted = self._save_newconv_data(newconv_list)
    self.logger.info("NCI calculation finished, inserted %d rows", inserted)
    return {
        'status': 'success',
        'member_count': len(newconv_list),
        'records_inserted': inserted
    }
def _calculate_nci_scores(self, data: MemberNewconvData, params: Dict[str, float]) -> None:
"""计算 NCI 分项与 Raw Score"""
activity = data.activity
# 1) 紧迫度
no_touch_days = float(params['no_touch_days_new'])
t2_target_days = float(params['t2_target_days'])
t2_max_days = t2_target_days * 2.0
if t2_max_days <= no_touch_days:
data.need_new = 0.0
else:
data.need_new = self._clip(
(activity.t_v - no_touch_days) / (t2_max_days - no_touch_days),
0.0, 1.0
)
# 2) Salvage30-60天线性衰减
salvage_start = float(params['salvage_start'])
salvage_end = float(params['salvage_end'])
if salvage_end <= salvage_start:
data.salvage_new = 0.0
elif activity.t_a <= salvage_start:
data.salvage_new = 1.0
elif activity.t_a >= salvage_end:
data.salvage_new = 0.0
else:
data.salvage_new = (salvage_end - activity.t_a) / (salvage_end - salvage_start)
# 3) 充值未回访压力
if activity.recharge_unconsumed == 1:
data.recharge_new = self.decay(activity.t_r, params['h_recharge'])
else:
data.recharge_new = 0.0
# 4) 价值分
m0 = float(params['amount_base_M0'])
b0 = float(params['balance_base_B0'])
spend_score = math.log1p(activity.spend_180d / m0) if m0 > 0 else 0.0
bal_score = math.log1p(activity.sv_balance / b0) if b0 > 0 else 0.0
data.value_new = float(params['value_w_spend']) * spend_score + float(params['value_w_bal']) * bal_score
# 5) 欢迎建联分:优先首访后立即触达
welcome_window_days = float(params.get('welcome_window_days', 3))
data.welcome_new = 0.0
if welcome_window_days > 0 and activity.visits_total <= 1 and activity.t_v <= welcome_window_days:
data.welcome_new = self._clip(1.0 - (activity.t_v / welcome_window_days), 0.0, 1.0)
# 6) 抑制高活跃新客在转化召回排名中的权重
active_visit_threshold = int(params.get('active_new_visit_threshold_14d', 2))
active_recency_days = float(params.get('active_new_recency_days', 7))
active_penalty = float(params.get('active_new_penalty', 0.2))
if activity.visits_14d >= active_visit_threshold and activity.t_v <= active_recency_days:
active_multiplier = self._clip(active_penalty, 0.0, 1.0)
else:
active_multiplier = 1.0
# 7) 价值/充值分主要在进入免打扰窗口后生效
if no_touch_days > 0:
touch_multiplier = self._clip(activity.t_v / no_touch_days, 0.0, 1.0)
else:
touch_multiplier = 1.0
data.raw_score_welcome = float(params.get('w_welcome', 1.0)) * data.welcome_new
data.raw_score_convert = active_multiplier * (
float(params['w_need']) * (data.need_new * data.salvage_new)
+ float(params['w_re']) * data.recharge_new * touch_multiplier
+ float(params['w_value']) * data.value_new * touch_multiplier
)
data.raw_score_welcome = max(0.0, data.raw_score_welcome)
data.raw_score_convert = max(0.0, data.raw_score_convert)
data.raw_score = data.raw_score_welcome + data.raw_score_convert
if data.raw_score < 0:
data.raw_score = 0.0
def _save_newconv_data(self, data_list: List[MemberNewconvData]) -> int:
"""保存 NCI 数据"""
if not data_list:
return 0
site_id = data_list[0].activity.site_id
# 按门店全量刷新,避免因分群变化导致过期数据残留。
delete_sql = """
DELETE FROM billiards_dws.dws_member_newconv_index
WHERE site_id = %s
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
insert_sql = """
INSERT INTO billiards_dws.dws_member_newconv_index (
site_id, tenant_id, member_id,
status, segment,
member_create_time, first_visit_time, last_visit_time, last_recharge_time,
t_v, t_r, t_a,
visits_14d, visits_60d, visits_total,
spend_30d, spend_180d, sv_balance, recharge_60d_amt,
interval_count,
need_new, salvage_new, recharge_new, value_new,
welcome_new,
raw_score_welcome, raw_score_convert, raw_score,
display_score_welcome, display_score_convert, display_score,
last_wechat_touch_time,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
%s, %s, %s,
%s, %s, %s, %s,
%s,
%s, %s, %s, %s,
%s,
%s, %s, %s,
%s, %s, %s,
%s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
activity = data.activity
cur.execute(insert_sql, (
activity.site_id, activity.tenant_id, activity.member_id,
data.status, data.segment,
activity.member_create_time, activity.first_visit_time, activity.last_visit_time, activity.last_recharge_time,
activity.t_v, activity.t_r, activity.t_a,
activity.visits_14d, activity.visits_60d, activity.visits_total,
activity.spend_30d, activity.spend_180d, activity.sv_balance, activity.recharge_60d_amt,
activity.interval_count,
data.need_new, data.salvage_new, data.recharge_new, data.value_new,
data.welcome_new,
data.raw_score_welcome, data.raw_score_convert, data.raw_score,
data.display_score_welcome, data.display_score_convert, data.display_score,
None,
))
inserted += cur.rowcount
self.db.conn.commit()
return inserted
def _clip(self, value: float, low: float, high: float) -> float:
return max(low, min(high, value))
def _map_compression(self, params: Dict[str, float]) -> str:
mode = int(params.get('compression_mode', 0))
if mode == 1:
return "log1p"
if mode == 2:
return "asinh"
return "none"
def _normalize_score_pairs(
self,
raw_scores: List[tuple[int, Optional[float]]],
params: Dict[str, float],
site_id: int,
use_smoothing: bool,
) -> Dict[int, float]:
valid_scores = [(member_id, float(score)) for member_id, score in raw_scores if score is not None]
if not valid_scores:
return {}
# 全为0时直接返回避免 MinMax 归一化退化
if all(abs(score) <= 1e-9 for _, score in valid_scores):
return {member_id: 0.0 for member_id, _ in valid_scores}
compression = self._map_compression(params)
normalized = self.batch_normalize_to_display(
valid_scores,
compression=compression,
percentile_lower=int(params['percentile_lower']),
percentile_upper=int(params['percentile_upper']),
use_smoothing=use_smoothing,
site_id=site_id
)
return {member_id: display for member_id, _, display in normalized}
__all__ = ['NewconvIndexTask']

View File

@@ -0,0 +1,587 @@
# -*- coding: utf-8 -*-
"""
客户召回指数计算任务
功能说明:
- 衡量客户召回的必要性和紧急程度
- 尊重客户个人到店周期(μ=中位数, σ=MAD
- 对新客户、刚充值客户增加召回倾向
- 检测"热了又断"的情况
算法公式:
Raw Score = w_over × overdue + w_new × new_bonus + w_re × re_bonus + w_hot × hot_drop
其中:
- overdue = 1 - exp(-max(0, (t-μ)/σ)) # 超期紧急性
- new_bonus = decay(d_first, h_new) # 新客户加分
- re_bonus = decay(d_recharge, h_re) # 刚充值加分
- hot_drop = max(0, ln(1 + (r14/r60 - 1))) # 热度断档加分
数据来源:
- dwd_settlement_head: 会员到店记录
- dwd_recharge_order: 充值记录
- dim_member: 首访时间
更新频率每2小时
作者ETL团队
创建日期2026-02-03
"""
from __future__ import annotations
import math
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask, PercentileHistory
from ..base_dws_task import TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class MemberRecallData:
    """Recall-index working record for one member of one store."""

    member_id: int
    site_id: int
    tenant_id: int

    # Input features
    days_since_last_visit: Optional[int] = None
    visit_interval_median: Optional[float] = None
    visit_interval_mad: Optional[float] = None
    days_since_first_visit: Optional[int] = None
    days_since_last_recharge: Optional[int] = None
    visits_last_14_days: int = 0
    visits_last_60_days: int = 0

    # Component scores
    score_overdue: float = 0.0
    score_new_bonus: float = 0.0
    score_recharge_bonus: float = 0.0
    score_hot_drop: float = 0.0

    # Final scores
    raw_score: float = 0.0
    display_score: float = 0.0
# =============================================================================
# 召回指数任务
# =============================================================================
class RecallIndexTask(BaseIndexTask):
"""
Customer recall index calculation task.
Processing steps:
1. Extract members with visit records in the last 60 days
2. Compute each member's visit-interval features (median, MAD)
3. Compute four component scores: overdue, new customer, recharge, hot-drop
4. Combine them into the raw score
5. Percentile clipping + min-max mapping to 0-10
6. Write to the DWS table
"""
# Value returned by get_index_type().
INDEX_TYPE = "RECALL"
# Default parameters
DEFAULT_PARAMS = {
'lookback_days': 60,
'sigma_min': 2.0,
'halflife_new': 7.0,
'halflife_recharge': 10.0,
'weight_overdue': 3.0,
'weight_new': 1.0,
'weight_recharge': 1.0,
'weight_hot': 1.0,
'percentile_lower': 5,
'percentile_upper': 95,
}
# ==========================================================================
# 抽象方法实现
# ==========================================================================
def get_task_code(self) -> str:
    """Return the code identifying the recall index task."""
    return "DWS_RECALL_INDEX"
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    return "dws_member_recall_index"
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    return ['site_id', 'member_id']
def get_index_type(self) -> str:
return self.INDEX_TYPE
# ==========================================================================
# 任务执行
# ==========================================================================
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Compute the recall index for one store and persist it.

    Steps: extract visits, recharges and first-visit dates for the lookback
    window, derive per-member features and the four component scores, combine
    them into a weighted raw score, normalise to display scores, record the
    percentile history and write the rows.

    The reference date is taken in the task's ``tz`` when that attribute is
    set, so day differences do not depend on the server's local clock;
    without it the local date is used.

    Returns:
        ``{'status': 'skipped', 'reason': 'no_data'}`` when no member visited
        in the window, otherwise ``{'status': 'success', 'member_count': ...,
        'records_inserted': ...}``.
    """
    self.logger.info("开始计算客户召回指数")

    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()

    params = self._load_params()
    lookback_days = int(params['lookback_days'])

    # datetime.now(None) is plain local time, so a task without tz still works.
    base_date = datetime.now(getattr(self, 'tz', None)).date()
    start_date = base_date - timedelta(days=lookback_days)
    self.logger.info(
        "参数: lookback=%d天, sigma_min=%.1f, h_new=%.1f, h_re=%.1f",
        lookback_days, params['sigma_min'], params['halflife_new'], params['halflife_recharge']
    )

    # 1. Member visits
    member_visits = self._extract_member_visits(site_id, start_date, base_date)
    self.logger.info("提取到 %d 个会员的到店记录", len(member_visits))
    if not member_visits:
        self.logger.warning("没有会员到店记录,跳过计算")
        return {'status': 'skipped', 'reason': 'no_data'}

    # 2. Latest recharge per member
    recharge_data = self._extract_recharge_data(site_id, start_date, base_date)
    self.logger.info("提取到 %d 个会员的充值记录", len(recharge_data))

    # 3. First-visit date per member
    first_visit_data = self._extract_first_visit_data(site_id, list(member_visits.keys()))
    self.logger.info("提取到 %d 个会员的首访时间", len(first_visit_data))

    # 4. Features, component scores and raw score per member
    recall_data_list: List[MemberRecallData] = []
    for member_id, visit_dates in member_visits.items():
        data = MemberRecallData(
            member_id=member_id,
            site_id=site_id,
            tenant_id=tenant_id
        )
        self._calculate_visit_features(data, visit_dates, base_date, params)

        if member_id in recharge_data:
            data.days_since_last_recharge = (base_date - recharge_data[member_id]).days
        if member_id in first_visit_data:
            data.days_since_first_visit = (base_date - first_visit_data[member_id]).days

        self._calculate_component_scores(data, params)
        data.raw_score = (
            params['weight_overdue'] * data.score_overdue +
            params['weight_new'] * data.score_new_bonus +
            params['weight_recharge'] * data.score_recharge_bonus +
            params['weight_hot'] * data.score_hot_drop
        )
        recall_data_list.append(data)
    self.logger.info("计算完成 %d 个会员的Raw Score", len(recall_data_list))

    # 5. Normalise to display scores
    raw_scores = [(d.member_id, d.raw_score) for d in recall_data_list]
    normalized = self.batch_normalize_to_display(
        raw_scores,
        use_log=False,
        percentile_lower=int(params['percentile_lower']),
        percentile_upper=int(params['percentile_upper']),
        use_smoothing=True,
        site_id=site_id
    )
    display_by_member = {member_id: display for member_id, _, display in normalized}
    for data in recall_data_list:
        if data.member_id in display_by_member:
            data.display_score = display_by_member[data.member_id]

    # 6. Percentile history
    if recall_data_list:
        all_raw = [d.raw_score for d in recall_data_list]
        q_l, q_u = self.calculate_percentiles(
            all_raw,
            int(params['percentile_lower']),
            int(params['percentile_upper'])
        )
        smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_l,
            percentile_95=q_u,
            percentile_5_smoothed=smoothed_l,
            percentile_95_smoothed=smoothed_u,
            record_count=len(all_raw),
            min_raw=min(all_raw),
            max_raw=max(all_raw),
            avg_raw=sum(all_raw) / len(all_raw)
        )

    # 7. Persist
    inserted = self._save_recall_data(recall_data_list)
    self.logger.info("召回指数计算完成,写入 %d 条记录", inserted)
    return {
        'status': 'success',
        'member_count': len(recall_data_list),
        'records_inserted': inserted
    }
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_member_visits(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> Dict[int, List[date]]:
    """Fetch the distinct visit dates of each member within the date range.

    The query keeps settlements with ``settle_type = 1``, plus
    ``settle_type = 3`` settlements that have a non-deleted assistant service
    log whose skill is an active ``BONUS`` course type.

    Returns:
        ``{member_id: [visit_date, ...]}`` with dates in ascending order.
    """
    sql = """
SELECT
member_id,
DATE(pay_time) AS visit_date
FROM billiards_dwd.dwd_settlement_head s
WHERE s.site_id = %s
AND s.member_id > 0 -- 排除散客
AND s.pay_time >= %s
AND s.pay_time < %s + INTERVAL '1 day'
AND (
s.settle_type = 1
OR (
s.settle_type = 3
AND EXISTS (
SELECT 1
FROM billiards_dwd.dwd_assistant_service_log asl
JOIN billiards_dws.cfg_skill_type st
ON asl.skill_id = st.skill_id
AND st.course_type_code = 'BONUS'
AND st.is_active = TRUE
WHERE asl.order_settle_id = s.order_settle_id
AND asl.site_id = s.site_id
AND asl.tenant_member_id = s.member_id
AND asl.is_delete = 0
)
)
)
GROUP BY member_id, DATE(pay_time)
ORDER BY member_id, visit_date
"""
    rows = self.db.query(sql, (site_id, start_date, end_date))

    visits_by_member: Dict[int, List[date]] = {}
    for row in (rows or []):
        record = dict(row)
        visits_by_member.setdefault(int(record['member_id']), []).append(record['visit_date'])
    return visits_by_member
def _extract_recharge_data(
    self,
    site_id: int,
    start_date: date,
    end_date: date
) -> Dict[int, date]:
    """Fetch each member's most recent recharge date within the date range.

    Returns:
        ``{member_id: last_recharge_date}``.
    """
    sql = """
SELECT
member_id,
MAX(DATE(pay_time)) AS last_recharge_date
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND member_id > 0
AND settle_type = 5 -- 充值订单
AND pay_time >= %s
AND pay_time < %s + INTERVAL '1 day'
GROUP BY member_id
"""
    rows = self.db.query(sql, (site_id, start_date, end_date))
    return {
        int(record['member_id']): record['last_recharge_date']
        for record in map(dict, rows or [])
    }
def _extract_first_visit_data(
self,
site_id: int,
member_ids: List[int]
) -> Dict[int, date]:
"""
提取首访时间
优先使用dim_member.create_time如果没有则使用dwd_settlement_head中的首次消费时间
Returns:
{member_id: first_visit_date}
"""
if not member_ids:
return {}
# 使用dim_member的create_time作为首访时间
member_ids_str = ','.join(str(m) for m in member_ids)
sql = f"""
SELECT
member_id,
DATE(create_time) AS first_visit_date
FROM billiards_dwd.dim_member
WHERE member_id IN ({member_ids_str})
AND scd2_is_current = 1
"""
rows = self.db.query(sql)
result: Dict[int, date] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
first_date = row_dict['first_visit_date']
if first_date:
result[member_id] = first_date
return result
# ==========================================================================
# 特征计算方法
# ==========================================================================
def _calculate_visit_features(
self,
data: MemberRecallData,
visit_dates: List[date],
base_date: date,
params: Dict[str, float]
) -> None:
"""计算到店特征"""
if not visit_dates:
return
# 最近一次到店
last_visit = max(visit_dates)
data.days_since_last_visit = (base_date - last_visit).days
# 到店间隔
sorted_dates = sorted(visit_dates)
intervals = []
for i in range(1, len(sorted_dates)):
interval = (sorted_dates[i] - sorted_dates[i-1]).days
intervals.append(float(interval))
if intervals:
# 中位数(μ)
data.visit_interval_median = self.calculate_median(intervals)
# MADσ下限为sigma_min
mad = self.calculate_mad(intervals)
data.visit_interval_mad = max(mad, params['sigma_min'])
else:
# 只有一次到店,使用默认值
data.visit_interval_median = 7.0 # 默认周期7天
data.visit_interval_mad = params['sigma_min']
# 近14天/60天到店次数
days_14_ago = base_date - timedelta(days=14)
days_60_ago = base_date - timedelta(days=60)
data.visits_last_14_days = sum(1 for d in visit_dates if d >= days_14_ago)
data.visits_last_60_days = sum(1 for d in visit_dates if d >= days_60_ago)
def _calculate_component_scores(
self,
data: MemberRecallData,
params: Dict[str, float]
) -> None:
"""计算4项分数"""
# 1. 超期紧急性
if data.days_since_last_visit is not None and data.visit_interval_median is not None:
t = data.days_since_last_visit
mu = data.visit_interval_median
sigma = data.visit_interval_mad or params['sigma_min']
# z = max(0, (t - μ) / σ)
z = max(0.0, (t - mu) / sigma)
# overdue = 1 - exp(-z)
data.score_overdue = 1.0 - math.exp(-z)
# 2. 新客户加分
lookback_days = int(params['lookback_days'])
if data.days_since_first_visit is not None and data.days_since_first_visit <= lookback_days:
data.score_new_bonus = self.decay(
data.days_since_first_visit,
params['halflife_new']
)
# 3. 刚充值加分
if data.days_since_last_recharge is not None and data.days_since_last_recharge <= lookback_days:
data.score_recharge_bonus = self.decay(
data.days_since_last_recharge,
params['halflife_recharge']
)
# 4. 热度断档加分
epsilon = 1e-6
n14 = data.visits_last_14_days
n60 = data.visits_last_60_days
r14 = n14 / 14.0
r60 = (n60 + 1) / 60.0 # +1 平滑
hot_ratio = r14 / (r60 + epsilon)
# hot_drop = max(0, ln(1 + (hot_ratio - 1)))
if hot_ratio > 1:
data.score_hot_drop = self.safe_ln1p(hot_ratio - 1)
else:
data.score_hot_drop = 0.0
# ==========================================================================
# 数据保存方法
# ==========================================================================
def _save_recall_data(self, data_list: List[MemberRecallData]) -> int:
"""保存召回数据到DWS表"""
if not data_list:
return 0
# 先删除已存在的记录
site_id = data_list[0].site_id
member_ids = [d.member_id for d in data_list]
member_ids_str = ','.join(str(m) for m in member_ids)
delete_sql = f"""
DELETE FROM billiards_dws.dws_member_recall_index
WHERE site_id = %s AND member_id IN ({member_ids_str})
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
# 插入新记录
insert_sql = """
INSERT INTO billiards_dws.dws_member_recall_index (
site_id, tenant_id, member_id,
days_since_last_visit, visit_interval_median, visit_interval_mad,
days_since_first_visit, days_since_last_recharge,
visits_last_14_days, visits_last_60_days,
score_overdue, score_new_bonus, score_recharge_bonus, score_hot_drop,
raw_score, display_score,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s,
%s, %s, %s,
%s, %s,
%s, %s,
%s, %s, %s, %s,
%s, %s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
cur.execute(insert_sql, (
data.site_id, data.tenant_id, data.member_id,
data.days_since_last_visit, data.visit_interval_median, data.visit_interval_mad,
data.days_since_first_visit, data.days_since_last_recharge,
data.visits_last_14_days, data.visits_last_60_days,
data.score_overdue, data.score_new_bonus, data.score_recharge_bonus, data.score_hot_drop,
data.raw_score, data.display_score
))
inserted += cur.rowcount
# 提交事务
self.db.conn.commit()
return inserted
# ==========================================================================
# 辅助方法
# ==========================================================================
def _load_params(self) -> Dict[str, float]:
"""加载参数,缺失时使用默认值"""
params = self.load_index_parameters()
result = dict(self.DEFAULT_PARAMS)
result.update(params)
return result
def _get_site_id(self, context: Optional[TaskContext]) -> int:
"""获取门店ID"""
if context and hasattr(context, 'store_id') and context.store_id:
return context.store_id
# 从配置获取默认门店ID
site_id = self.config.get('app.default_site_id') or self.config.get('app.store_id')
if site_id is not None:
return int(site_id)
# 查询数据库获取第一个门店
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_settlement_head WHERE site_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
value = dict(rows[0]).get('site_id')
if value is not None:
return int(value)
self.logger.warning("无法确定门店ID使用 0 继续执行")
return 0
def _get_tenant_id(self) -> int:
"""获取租户ID"""
tenant_id = self.config.get('app.tenant_id')
if tenant_id is not None:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_settlement_head WHERE tenant_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
value = dict(rows[0]).get('tenant_id')
if value is not None:
return int(value)
self.logger.warning("无法确定租户ID使用 0 继续执行")
return 0

View File

@@ -0,0 +1,771 @@
# -*- coding: utf-8 -*-
"""
关系指数任务RS/OS/MS/ML
设计说明:
1. 单任务一次产出 RS / OS / MS / ML写入统一关系表
2. RS/MS 复用服务日志 + 会话合并口径;
3. ML 以人工台账窄表为唯一真源last-touch 仅保留备用路径(默认关闭);
4. RS/MS/ML 的 display 映射按 index_type 隔离分位历史。
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask
from ..base_dws_task import CourseType, TaskContext
@dataclass
class ServiceSession:
    """One service session, produced by merging nearby service records."""

    # Time span covered by the merged records.
    session_start: datetime
    session_end: datetime
    # Sum of the merged records' durations, in minutes.
    total_duration_minutes: int
    # Largest course weight among the merged records.
    course_weight: float
    # True when any merged record was an incentive (bonus) course.
    is_incentive: bool
@dataclass
class RelationPairMetrics:
    """Computed metrics for a single member-assistant pair."""

    # Identity of the pair.
    site_id: int
    tenant_id: int
    member_id: int
    assistant_id: int

    # Merged sessions and their summary statistics.
    sessions: List[ServiceSession] = field(default_factory=list)
    days_since_last_session: Optional[int] = None
    session_count: int = 0
    total_duration_minutes: int = 0
    basic_session_count: int = 0
    incentive_session_count: int = 0

    # RS components and scores.
    rs_f: float = 0.0
    rs_d: float = 0.0
    rs_r: float = 0.0
    rs_raw: float = 0.0
    rs_display: float = 0.0

    # MS components and scores.
    ms_f_short: float = 0.0
    ms_f_long: float = 0.0
    ms_raw: float = 0.0
    ms_display: float = 0.0

    # ML scores and their supporting figures.
    ml_raw: float = 0.0
    ml_display: float = 0.0
    ml_order_count: int = 0
    ml_allocated_amount: float = 0.0

    # OS share, label and rank.
    os_share: float = 0.0
    os_label: str = "POOL"
    os_rank: Optional[int] = None
class RelationIndexTask(BaseIndexTask):
"""Relation index task: a single task that produces RS / OS / MS / ML."""
# Default index type, returned by get_index_type().
INDEX_TYPE = "RS"
# Default parameters for RS.
DEFAULT_PARAMS_RS: Dict[str, float] = {
"lookback_days": 60,
"session_merge_hours": 4,
"incentive_weight": 1.5,
"halflife_session": 14.0,
"halflife_last": 10.0,
"weight_f": 1.0,
"weight_d": 0.7,
"gate_alpha": 0.6,
"percentile_lower": 5.0,
"percentile_upper": 95.0,
"compression_mode": 1.0,
"use_smoothing": 1.0,
"ewma_alpha": 0.2,
}
# Default parameters for OS.
DEFAULT_PARAMS_OS: Dict[str, float] = {
"min_rs_raw_for_ownership": 0.05,
"min_total_rs_raw": 0.10,
"ownership_main_threshold": 0.60,
"ownership_comanage_threshold": 0.35,
"ownership_gap_threshold": 0.15,
"eps": 1e-6,
}
# Default parameters for MS.
DEFAULT_PARAMS_MS: Dict[str, float] = {
"lookback_days": 60,
"session_merge_hours": 4,
"incentive_weight": 1.5,
"halflife_short": 7.0,
"halflife_long": 30.0,
"eps": 1e-6,
"percentile_lower": 5.0,
"percentile_upper": 95.0,
"compression_mode": 1.0,
"use_smoothing": 1.0,
"ewma_alpha": 0.2,
}
# Default parameters for ML.
DEFAULT_PARAMS_ML: Dict[str, float] = {
"lookback_days": 60,
"source_mode": 0.0, # 0=manual_only, 1=last_touch_fallback
"recharge_attribute_hours": 1.0,
"amount_base": 500.0,
"halflife_recharge": 21.0,
"percentile_lower": 5.0,
"percentile_upper": 95.0,
"compression_mode": 1.0,
"use_smoothing": 1.0,
"ewma_alpha": 0.2,
}
def get_task_code(self) -> str:
    """Return the code identifying the relation index task."""
    return "DWS_RELATION_INDEX"
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    return "dws_member_assistant_relation_index"
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    return ["site_id", "member_id", "assistant_id"]
def get_index_type(self) -> str:
# 多指数任务保留一个默认 index_type调用处应显式传 RS/MS/ML
return self.INDEX_TYPE
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Compute RS / MS / ML / OS for every member-assistant pair of a store.

    Service records are read once, over the longer of the RS and MS lookback
    windows, and merged into sessions using the larger of the RS / MS merge
    gap and incentive weight. The indices are then calculated, display scores
    applied and the rows saved.

    NOTE(review): the status value here is upper-case ``"SUCCESS"`` -
    confirm that callers expect this casing.

    Returns:
        ``{"status": "SUCCESS", "records_inserted": ..., "pair_count": ...}``.
    """
    self.logger.info("开始计算关系指数RS/OS/MS/ML")

    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    now = datetime.now(self.tz)

    params_rs = self._load_params("RS", self.DEFAULT_PARAMS_RS)
    params_os = self._load_params("OS", self.DEFAULT_PARAMS_OS)
    params_ms = self._load_params("MS", self.DEFAULT_PARAMS_MS)
    params_ml = self._load_params("ML", self.DEFAULT_PARAMS_ML)

    def larger_of(key, default, cast):
        """Return the larger of the RS and MS values of a shared parameter."""
        return max(cast(params_rs.get(key, default)), cast(params_ms.get(key, default)))

    service_start = now - timedelta(days=larger_of("lookback_days", 60, int))
    merge_hours = larger_of("session_merge_hours", 4, int)

    raw_services = self._extract_service_records(site_id, service_start, now)
    pair_map = self._group_and_merge_sessions(
        raw_services=raw_services,
        merge_hours=merge_hours,
        incentive_weight=larger_of("incentive_weight", 1.5, float),
        now=now,
        site_id=site_id,
        tenant_id=tenant_id,
    )
    self.logger.info("服务关系对数量: %d", len(pair_map))

    self._calculate_rs(pair_map, params_rs, now)
    self._calculate_ms(pair_map, params_ms, now)
    self._calculate_ml(pair_map, params_ml, site_id, now)
    self._calculate_os(pair_map, params_os)
    self._apply_display_scores(pair_map, params_rs, params_ms, params_ml, site_id)

    inserted = self._save_relation_rows(site_id, list(pair_map.values()))
    self.logger.info("关系指数计算完成,写入 %d 条记录", inserted)
    return {
        "status": "SUCCESS",
        "records_inserted": inserted,
        "pair_count": len(pair_map),
    }
def _load_params(self, index_type: str, defaults: Dict[str, float]) -> Dict[str, float]:
params = dict(defaults)
params.update(self.load_index_parameters(index_type=index_type))
return params
def _extract_service_records(
    self,
    site_id: int,
    start_datetime: datetime,
    end_datetime: datetime,
) -> List[Dict[str, Any]]:
    """Fetch the store's non-deleted assistant service records.

    Records are selected by ``last_use_time`` in ``[start_datetime,
    end_datetime)`` and joined to the current, non-deleted assistant row.

    Returns:
        One dict per record with the keys ``member_id``, ``assistant_id``,
        ``start_time``, ``end_time``, ``duration_minutes`` and ``skill_id``.
    """
    sql = """
SELECT
s.tenant_member_id AS member_id,
d.assistant_id AS assistant_id,
s.start_use_time AS start_time,
s.last_use_time AS end_time,
COALESCE(s.income_seconds, 0) / 60 AS duration_minutes,
s.skill_id
FROM billiards_dwd.dwd_assistant_service_log s
JOIN billiards_dwd.dim_assistant d
ON s.user_id = d.user_id
AND d.scd2_is_current = 1
AND COALESCE(d.is_delete, 0) = 0
WHERE s.site_id = %s
AND s.tenant_member_id > 0
AND s.user_id > 0
AND s.is_delete = 0
AND s.last_use_time >= %s
AND s.last_use_time < %s
ORDER BY s.tenant_member_id, d.assistant_id, s.start_use_time
"""
    rows = self.db.query(sql, (site_id, start_datetime, end_datetime))
    return list(map(dict, rows or []))
def _group_and_merge_sessions(
self,
*,
raw_services: List[Dict[str, Any]],
merge_hours: int,
incentive_weight: float,
now: datetime,
site_id: int,
tenant_id: int,
) -> Dict[Tuple[int, int], RelationPairMetrics]:
"""按 (member_id, assistant_id) 分组并合并会话。"""
result: Dict[Tuple[int, int], RelationPairMetrics] = {}
if not raw_services:
return result
merge_threshold = timedelta(hours=max(0, merge_hours))
grouped: Dict[Tuple[int, int], List[Dict[str, Any]]] = {}
for row in raw_services:
member_id = int(row["member_id"])
assistant_id = int(row["assistant_id"])
grouped.setdefault((member_id, assistant_id), []).append(row)
for (member_id, assistant_id), records in grouped.items():
metrics = RelationPairMetrics(
site_id=site_id,
tenant_id=tenant_id,
member_id=member_id,
assistant_id=assistant_id,
)
sorted_records = sorted(records, key=lambda r: r["start_time"])
current: Optional[ServiceSession] = None
for svc in sorted_records:
start_time = svc["start_time"]
end_time = svc["end_time"]
duration = int(svc.get("duration_minutes") or 0)
skill_id = int(svc.get("skill_id") or 0)
course_type = self.get_course_type(skill_id)
is_incentive = course_type == CourseType.BONUS
weight = incentive_weight if is_incentive else 1.0
if current is None:
current = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive,
)
continue
if start_time - current.session_end <= merge_threshold:
current.session_end = max(current.session_end, end_time)
current.total_duration_minutes += duration
current.course_weight = max(current.course_weight, weight)
current.is_incentive = current.is_incentive or is_incentive
else:
metrics.sessions.append(current)
current = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive,
)
if current is not None:
metrics.sessions.append(current)
metrics.session_count = len(metrics.sessions)
metrics.total_duration_minutes = sum(s.total_duration_minutes for s in metrics.sessions)
metrics.basic_session_count = sum(1 for s in metrics.sessions if not s.is_incentive)
metrics.incentive_session_count = sum(1 for s in metrics.sessions if s.is_incentive)
if metrics.sessions:
last_session = max(metrics.sessions, key=lambda s: s.session_end)
metrics.days_since_last_session = (now - last_session.session_end).days
result[(member_id, assistant_id)] = metrics
return result
def _calculate_rs(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
now: datetime,
) -> None:
lookback_days = int(params.get("lookback_days", 60))
halflife_session = float(params.get("halflife_session", 14.0))
halflife_last = float(params.get("halflife_last", 10.0))
weight_f = float(params.get("weight_f", 1.0))
weight_d = float(params.get("weight_d", 0.7))
gate_alpha = max(0.0, float(params.get("gate_alpha", 0.6)))
for metrics in pair_map.values():
f_score = 0.0
d_score = 0.0
for session in metrics.sessions:
days_ago = min(
lookback_days,
max(0.0, (now - session.session_end).total_seconds() / 86400.0),
)
decay_factor = self.decay(days_ago, halflife_session)
f_score += session.course_weight * decay_factor
d_score += (
math.sqrt(max(session.total_duration_minutes, 0) / 60.0)
* session.course_weight
* decay_factor
)
if metrics.days_since_last_session is None:
r_score = 0.0
else:
r_score = self.decay(min(lookback_days, metrics.days_since_last_session), halflife_last)
base = weight_f * f_score + weight_d * d_score
gate = math.pow(r_score, gate_alpha) if r_score > 0 else 0.0
metrics.rs_f = f_score
metrics.rs_d = d_score
metrics.rs_r = r_score
metrics.rs_raw = max(0.0, base * gate)
def _calculate_ms(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
now: datetime,
) -> None:
lookback_days = int(params.get("lookback_days", 60))
halflife_short = float(params.get("halflife_short", 7.0))
halflife_long = float(params.get("halflife_long", 30.0))
eps = float(params.get("eps", 1e-6))
for metrics in pair_map.values():
f_short = 0.0
f_long = 0.0
for session in metrics.sessions:
days_ago = min(
lookback_days,
max(0.0, (now - session.session_end).total_seconds() / 86400.0),
)
f_short += session.course_weight * self.decay(days_ago, halflife_short)
f_long += session.course_weight * self.decay(days_ago, halflife_long)
ratio = (f_short + eps) / (f_long + eps)
metrics.ms_f_short = f_short
metrics.ms_f_long = f_long
metrics.ms_raw = max(0.0, self.safe_log(ratio, 0.0))
def _calculate_ml(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
site_id: int,
now: datetime,
) -> None:
lookback_days = int(params.get("lookback_days", 60))
source_mode = int(params.get("source_mode", 0))
amount_base = float(params.get("amount_base", 500.0))
halflife_recharge = float(params.get("halflife_recharge", 21.0))
start_time = now - timedelta(days=lookback_days)
manual_rows = self._extract_manual_alloc(site_id, start_time, now)
for row in manual_rows:
member_id = int(row["member_id"])
assistant_id = int(row["assistant_id"])
key = (member_id, assistant_id)
if key not in pair_map:
pair_map[key] = RelationPairMetrics(
site_id=site_id,
tenant_id=pair_map[next(iter(pair_map))].tenant_id if pair_map else self._get_tenant_id(),
member_id=member_id,
assistant_id=assistant_id,
)
metrics = pair_map[key]
amount = float(row.get("allocated_amount") or 0.0)
pay_time = row.get("pay_time")
if amount <= 0 or pay_time is None:
continue
days_ago = min(lookback_days, max(0.0, (now - pay_time).total_seconds() / 86400.0))
metrics.ml_raw += math.log1p(amount / max(amount_base, 1e-6)) * self.decay(
days_ago,
halflife_recharge,
)
metrics.ml_order_count += 1
metrics.ml_allocated_amount += amount
# 备用路径:仅在明确打开且人工台账为空时使用 last-touch。
if source_mode == 1 and not manual_rows:
self.logger.warning("ML source_mode=1 且人工台账为空,启用 last-touch 备用归因")
self._apply_last_touch_ml(pair_map, params, site_id, now)
def _extract_manual_alloc(
self,
site_id: int,
start_time: datetime,
end_time: datetime,
) -> List[Dict[str, Any]]:
sql = """
SELECT
member_id,
assistant_id,
pay_time,
allocated_amount
FROM billiards_dws.dws_ml_manual_order_alloc
WHERE site_id = %s
AND pay_time >= %s
AND pay_time < %s
"""
rows = self.db.query(sql, (site_id, start_time, end_time))
return [dict(row) for row in (rows or [])]
def _apply_last_touch_ml(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
site_id: int,
now: datetime,
) -> None:
lookback_days = int(params.get("lookback_days", 60))
attribution_hours = int(params.get("recharge_attribute_hours", 1))
amount_base = float(params.get("amount_base", 500.0))
halflife_recharge = float(params.get("halflife_recharge", 21.0))
start_time = now - timedelta(days=lookback_days)
end_time = now
# 为 last-touch 建立 member -> sessions 索引
member_sessions: Dict[int, List[Tuple[datetime, int]]] = {}
for metrics in pair_map.values():
for session in metrics.sessions:
member_sessions.setdefault(metrics.member_id, []).append(
(session.session_end, metrics.assistant_id)
)
for sessions in member_sessions.values():
sessions.sort(key=lambda item: item[0])
sql = """
SELECT member_id, pay_time, pay_amount
FROM billiards_dwd.dwd_recharge_order
WHERE site_id = %s
AND settle_type = 5
AND COALESCE(is_delete, 0) = 0
AND member_id > 0
AND pay_time >= %s
AND pay_time < %s
"""
rows = self.db.query(sql, (site_id, start_time, end_time))
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict.get("member_id") or 0)
pay_time = row_dict.get("pay_time")
pay_amount = float(row_dict.get("pay_amount") or 0.0)
if member_id <= 0 or pay_time is None or pay_amount <= 0:
continue
candidates = member_sessions.get(member_id, [])
selected_assistant: Optional[int] = None
selected_end: Optional[datetime] = None
for end_time_candidate, assistant_id in candidates:
if end_time_candidate > pay_time:
continue
if pay_time - end_time_candidate > timedelta(hours=attribution_hours):
continue
if selected_end is None or end_time_candidate > selected_end:
selected_end = end_time_candidate
selected_assistant = assistant_id
if selected_assistant is None:
continue
key = (member_id, selected_assistant)
if key not in pair_map:
pair_map[key] = RelationPairMetrics(
site_id=site_id,
tenant_id=pair_map[next(iter(pair_map))].tenant_id if pair_map else self._get_tenant_id(),
member_id=member_id,
assistant_id=selected_assistant,
)
metrics = pair_map[key]
days_ago = min(lookback_days, max(0.0, (now - pay_time).total_seconds() / 86400.0))
metrics.ml_raw += math.log1p(pay_amount / max(amount_base, 1e-6)) * self.decay(
days_ago,
halflife_recharge,
)
metrics.ml_order_count += 1
metrics.ml_allocated_amount += pay_amount
def _calculate_os(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
) -> None:
min_rs = float(params.get("min_rs_raw_for_ownership", 0.05))
min_total = float(params.get("min_total_rs_raw", 0.10))
main_threshold = float(params.get("ownership_main_threshold", 0.60))
comanage_threshold = float(params.get("ownership_comanage_threshold", 0.35))
gap_threshold = float(params.get("ownership_gap_threshold", 0.15))
member_groups: Dict[int, List[RelationPairMetrics]] = {}
for metrics in pair_map.values():
member_groups.setdefault(metrics.member_id, []).append(metrics)
for _, rows in member_groups.items():
eligible = [row for row in rows if row.rs_raw >= min_rs]
sum_rs = sum(row.rs_raw for row in eligible)
if sum_rs < min_total:
for row in rows:
row.os_share = 0.0
row.os_label = "UNASSIGNED"
row.os_rank = None
continue
for row in rows:
if row.rs_raw >= min_rs:
row.os_share = row.rs_raw / sum_rs
else:
row.os_share = 0.0
sorted_eligible = sorted(
eligible,
key=lambda item: (
-item.os_share,
-item.rs_raw,
item.days_since_last_session if item.days_since_last_session is not None else 10**9,
item.assistant_id,
),
)
for idx, row in enumerate(sorted_eligible, start=1):
row.os_rank = idx
top1 = sorted_eligible[0]
top2_share = sorted_eligible[1].os_share if len(sorted_eligible) > 1 else 0.0
gap = top1.os_share - top2_share
has_main = top1.os_share >= main_threshold and gap >= gap_threshold
if has_main:
for row in rows:
if row is top1:
row.os_label = "MAIN"
elif row.os_share >= comanage_threshold:
row.os_label = "COMANAGE"
else:
row.os_label = "POOL"
else:
for row in rows:
if row.os_share >= comanage_threshold and row.rs_raw >= min_rs:
row.os_label = "COMANAGE"
else:
row.os_label = "POOL"
# 非 eligible 不赋 rank
for row in rows:
if row.rs_raw < min_rs:
row.os_rank = None
def _apply_display_scores(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params_rs: Dict[str, float],
params_ms: Dict[str, float],
params_ml: Dict[str, float],
site_id: int,
) -> None:
pair_items = list(pair_map.items())
rs_map = self._normalize_and_record(
raw_pairs=[(key, item.rs_raw) for key, item in pair_items],
params=params_rs,
index_type="RS",
site_id=site_id,
)
ms_map = self._normalize_and_record(
raw_pairs=[(key, item.ms_raw) for key, item in pair_items],
params=params_ms,
index_type="MS",
site_id=site_id,
)
ml_map = self._normalize_and_record(
raw_pairs=[(key, item.ml_raw) for key, item in pair_items],
params=params_ml,
index_type="ML",
site_id=site_id,
)
for key, item in pair_items:
item.rs_display = rs_map.get(key, 0.0)
item.ms_display = ms_map.get(key, 0.0)
item.ml_display = ml_map.get(key, 0.0)
def _normalize_and_record(
self,
*,
raw_pairs: List[Tuple[Any, float]],
params: Dict[str, float],
index_type: str,
site_id: int,
) -> Dict[Any, float]:
if not raw_pairs:
return {}
if all(abs(score) <= 1e-9 for _, score in raw_pairs):
return {entity: 0.0 for entity, _ in raw_pairs}
percentile_lower = int(params.get("percentile_lower", 5))
percentile_upper = int(params.get("percentile_upper", 95))
use_smoothing = int(params.get("use_smoothing", 1)) == 1
compression = self._map_compression(params)
normalized = self.batch_normalize_to_display(
raw_scores=raw_pairs,
compression=compression,
percentile_lower=percentile_lower,
percentile_upper=percentile_upper,
use_smoothing=use_smoothing,
site_id=site_id,
index_type=index_type,
)
display_map = {entity: display for entity, _, display in normalized}
raw_values = [float(score) for _, score in raw_pairs]
q_l, q_u = self.calculate_percentiles(raw_values, percentile_lower, percentile_upper)
if use_smoothing:
smoothed_l, smoothed_u = self._apply_ewma_smoothing(
site_id=site_id,
current_p5=q_l,
current_p95=q_u,
index_type=index_type,
)
else:
smoothed_l, smoothed_u = q_l, q_u
self.save_percentile_history(
site_id=site_id,
percentile_5=q_l,
percentile_95=q_u,
percentile_5_smoothed=smoothed_l,
percentile_95_smoothed=smoothed_u,
record_count=len(raw_values),
min_raw=min(raw_values),
max_raw=max(raw_values),
avg_raw=sum(raw_values) / len(raw_values),
index_type=index_type,
)
return display_map
@staticmethod
def _map_compression(params: Dict[str, float]) -> str:
mode = int(params.get("compression_mode", 0))
if mode == 1:
return "log1p"
if mode == 2:
return "asinh"
return "none"
def _save_relation_rows(self, site_id: int, rows: List[RelationPairMetrics]) -> int:
with self.db.conn.cursor() as cur:
cur.execute(
"DELETE FROM billiards_dws.dws_member_assistant_relation_index WHERE site_id = %s",
(site_id,),
)
if not rows:
self.db.conn.commit()
return 0
insert_sql = """
INSERT INTO billiards_dws.dws_member_assistant_relation_index (
site_id, tenant_id, member_id, assistant_id,
session_count, total_duration_minutes, basic_session_count, incentive_session_count,
days_since_last_session,
rs_f, rs_d, rs_r, rs_raw, rs_display,
os_share, os_label, os_rank,
ms_f_short, ms_f_long, ms_raw, ms_display,
ml_order_count, ml_allocated_amount, ml_raw, ml_display,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s, %s,
%s, %s, %s, %s,
%s,
%s, %s, %s, %s, %s,
%s, %s, %s,
%s, %s, %s, %s,
%s, %s, %s, %s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
for row in rows:
cur.execute(
insert_sql,
(
row.site_id,
row.tenant_id,
row.member_id,
row.assistant_id,
row.session_count,
row.total_duration_minutes,
row.basic_session_count,
row.incentive_session_count,
row.days_since_last_session,
row.rs_f,
row.rs_d,
row.rs_r,
row.rs_raw,
row.rs_display,
row.os_share,
row.os_label,
row.os_rank,
row.ms_f_short,
row.ms_f_long,
row.ms_raw,
row.ms_display,
row.ml_order_count,
row.ml_allocated_amount,
row.ml_raw,
row.ml_display,
),
)
inserted += max(cur.rowcount, 0)
self.db.conn.commit()
return inserted
def _get_site_id(self, context: Optional[TaskContext]) -> int:
if context and getattr(context, "store_id", None):
return int(context.store_id)
site_id = self.config.get("app.default_site_id") or self.config.get("app.store_id")
if site_id is not None:
return int(site_id)
sql = "SELECT DISTINCT site_id FROM billiards_dwd.dwd_assistant_service_log WHERE site_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0]).get("site_id") or 0)
self.logger.warning("无法确定门店ID使用 0 继续执行")
return 0
def _get_tenant_id(self) -> int:
tenant_id = self.config.get("app.tenant_id")
if tenant_id is not None:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM billiards_dwd.dwd_assistant_service_log WHERE tenant_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0]).get("tenant_id") or 0)
self.logger.warning("无法确定租户ID使用 0 继续执行")
return 0
# Names exported by ``from <module> import *``.
__all__ = ["RelationIndexTask", "RelationPairMetrics", "ServiceSession"]

View File

@@ -0,0 +1,402 @@
# -*- coding: utf-8 -*-
"""
Old-customer winback index (WBI) calculation task."""
from __future__ import annotations
import math
from dataclasses import dataclass
from datetime import date, timedelta
from typing import Any, Dict, List, Optional, Tuple
from .member_index_base import MemberActivityData, MemberIndexBaseTask
from ..base_dws_task import TaskContext
@dataclass
class MemberWinbackData:
    """Working record of the WBI calculation for one member.

    The first three fields are required; all score fields start at their
    defaults and are filled in by the calculation.
    """

    # Source activity snapshot and classification result.
    activity: MemberActivityData
    status: str
    segment: str

    # Component scores.
    overdue_old: float = 0.0
    overdue_cdf_p: float = 0.0
    drop_old: float = 0.0
    recharge_old: float = 0.0
    value_old: float = 0.0

    # Visit-interval estimate and the next-visit date derived from it.
    ideal_interval_days: Optional[float] = None
    ideal_next_visit_date: Optional[date] = None

    # Final scores; None until computed.
    raw_score: Optional[float] = None
    display_score: Optional[float] = None
class WinbackIndexTask(MemberIndexBaseTask):
"""Old-customer winback index (WBI) calculation task."""
# Index tag returned by get_index_type().
INDEX_TYPE = "WBI"
# Default parameter values. How they are combined with stored configuration
# is not visible in this class - presumably via _load_params(); confirm there.
DEFAULT_PARAMS = {
# General parameters
'lookback_days_recency': 60,
'visit_lookback_days': 180,
'percentile_lower': 5,
'percentile_upper': 95,
'compression_mode': 0,
'use_smoothing': 1,
'ewma_alpha': 0.2,
# Segmentation parameters
'new_visit_threshold': 2,
'new_days_threshold': 30,
'recharge_recent_days': 14,
'new_recharge_max_visits': 10,
'recency_hard_floor_days': 14,
'recency_gate_days': 14,
'recency_gate_slope_days': 3,
# WBI parameters
'overdue_alpha': 2.0,
'overdue_weight_halflife_days': 30,
'overdue_weight_blend_min_samples': 8,
'h_recharge': 7,
'amount_base_M0': 300,
'balance_base_B0': 500,
'value_w_spend': 1.0,
'value_w_bal': 1.0,
'w_over': 2.0,
'w_drop': 1.0,
'w_re': 0.4,
'w_value': 1.2,
# STOP high-balance exception (disabled by default)
'enable_stop_high_balance_exception': 0,
'high_balance_threshold': 1000,
}
def get_task_code(self) -> str:
    """Return the task code of the WBI task."""
    task_code = "DWS_WINBACK_INDEX"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    table_name = "dws_member_winback_index"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    key_columns = ("site_id", "member_id")
    return list(key_columns)
def get_index_type(self) -> str:
    """Return the index tag stored in ``INDEX_TYPE``."""
    index_type = self.INDEX_TYPE
    return index_type
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
    """Run the WBI calculation for one site and persist the result.

    Steps: resolve site, tenant and parameters; build member activity; keep
    in-scope members that are in the ``OLD`` segment or carry the
    ``STOP_HIGH_BALANCE`` status (scores are only computed for ``OLD``);
    normalise raw scores to display scores and record the percentile
    snapshot; save all rows.

    Returns:
        ``{'status': 'skipped', 'reason': ...}`` when there is no activity
        data or no qualifying member, otherwise a success summary with
        ``member_count`` and ``records_inserted``.
    """
    self.logger.info("开始计算老客挽回指数 (WBI)")

    site_id = self._get_site_id(context)
    tenant_id = self._get_tenant_id()
    params = self._load_params()

    activity_map = self._build_member_activity(site_id, tenant_id, params)
    if not activity_map:
        self.logger.warning("No member activity data available; skip calculation")
        return {'status': 'skipped', 'reason': 'no_data'}

    winback_list: List[MemberWinbackData] = []
    for activity in activity_map.values():
        segment, status, in_scope = self.classify_segment(activity, params)
        is_old = segment == "OLD"
        if not in_scope or not (is_old or status == "STOP_HIGH_BALANCE"):
            continue
        entry = MemberWinbackData(activity=activity, status=status, segment=segment)
        if is_old:
            self._calculate_wbi_scores(entry, params)
        winback_list.append(entry)

    if not winback_list:
        self.logger.warning("No old-member rows to calculate")
        return {'status': 'skipped', 'reason': 'no_old_members'}

    # Normalise to display scores; rows without a raw score are left out.
    scored = [
        (entry.activity.member_id, entry.raw_score)
        for entry in winback_list
        if entry.raw_score is not None
    ]
    if scored:
        compression = self._map_compression(params)
        smoothing_on = int(params.get('use_smoothing', 1)) == 1
        lower_pct = int(params['percentile_lower'])
        upper_pct = int(params['percentile_upper'])

        normalized = self.batch_normalize_to_display(
            scored,
            compression=compression,
            percentile_lower=lower_pct,
            percentile_upper=upper_pct,
            use_smoothing=smoothing_on,
            site_id=site_id
        )
        display_by_member = {member_id: display for member_id, _, display in normalized}
        for entry in winback_list:
            member_id = entry.activity.member_id
            if member_id in display_by_member:
                entry.display_score = display_by_member[member_id]

        # Record the percentile snapshot of this run.
        raw_values = [float(score) for _, score in scored]
        q_low, q_high = self.calculate_percentiles(raw_values, lower_pct, upper_pct)
        if smoothing_on:
            smooth_low, smooth_high = self._apply_ewma_smoothing(site_id, q_low, q_high)
        else:
            smooth_low, smooth_high = q_low, q_high
        self.save_percentile_history(
            site_id=site_id,
            percentile_5=q_low,
            percentile_95=q_high,
            percentile_5_smoothed=smooth_low,
            percentile_95_smoothed=smooth_high,
            record_count=len(raw_values),
            min_raw=min(raw_values),
            max_raw=max(raw_values),
            avg_raw=sum(raw_values) / len(raw_values)
        )

    inserted = self._save_winback_data(winback_list)
    self.logger.info("WBI calculation finished, inserted %d rows", inserted)
    return {
        'status': 'success',
        'member_count': len(winback_list),
        'records_inserted': inserted
    }
def _weighted_cdf(
self,
samples: List[Tuple[float, int]],
t_v: float,
halflife_days: float,
blend_min_samples: int,
) -> float:
if not samples:
return 0.5
if halflife_days <= 0:
p_equal = sum(1.0 for interval, _ in samples if interval <= t_v) / len(samples)
return self._clip(p_equal, 0.0, 1.0)
ln2 = math.log(2.0)
weighted_hit = 0.0
weight_sum = 0.0
equal_hit = 0.0
for interval, age_days in samples:
weight = math.exp(-ln2 * float(age_days) / halflife_days)
indicator = 1.0 if interval <= t_v else 0.0
weighted_hit += weight * indicator
weight_sum += weight
equal_hit += indicator
p_weighted = 0.5 if weight_sum <= 0 else (weighted_hit / weight_sum)
p_equal = equal_hit / len(samples)
lam = min(1.0, float(len(samples)) / float(max(1, blend_min_samples)))
p_final = lam * p_weighted + (1.0 - lam) * p_equal
return self._clip(p_final, 0.0, 1.0)
def _weighted_quantile(
self,
samples: List[Tuple[float, int]],
quantile: float,
halflife_days: float,
blend_min_samples: int,
) -> Optional[float]:
if not samples:
return None
q = self._clip(quantile, 0.0, 1.0)
equal_weight = 1.0 / float(len(samples))
if halflife_days <= 0:
weighted = [(interval, equal_weight) for interval, _ in samples]
else:
ln2 = math.log(2.0)
raw_weighted: List[Tuple[float, float]] = []
total = 0.0
for interval, age_days in samples:
w = math.exp(-ln2 * float(age_days) / halflife_days)
raw_weighted.append((interval, w))
total += w
if total <= 0:
weighted = [(interval, equal_weight) for interval, _ in samples]
else:
weighted = [(interval, w / total) for interval, w in raw_weighted]
# 对小样本混合加权分布与等权分布。
lam = min(1.0, float(len(samples)) / float(max(1, blend_min_samples)))
blended: List[Tuple[float, float]] = []
for (interval_w, w), (interval_e, _) in zip(weighted, samples):
_ = interval_e # keep tuple alignment explicit
blended_weight = lam * w + (1.0 - lam) * equal_weight
blended.append((interval_w, blended_weight))
blended.sort(key=lambda item: item[0])
cumulative = 0.0
for interval, weight in blended:
cumulative += weight
if cumulative >= q:
return float(interval)
return float(blended[-1][0])
def _calculate_wbi_scores(self, data: MemberWinbackData, params: Dict[str, float]) -> None:
"""计算 WBI 分项与 Raw Score"""
activity = data.activity
# 1) 超期紧急性基于近期加权经验CDF
overdue_alpha = float(params['overdue_alpha'])
half_life_days = float(params.get('overdue_weight_halflife_days', 30))
blend_min_samples = int(params.get('overdue_weight_blend_min_samples', 8))
if activity.interval_count <= 0:
p = 0.5
ideal_interval = None
else:
if len(activity.interval_ages_days) == activity.interval_count:
samples = list(zip(activity.intervals, activity.interval_ages_days))
else:
samples = [(interval, 0) for interval in activity.intervals]
p = self._weighted_cdf(
samples=samples,
t_v=activity.t_v,
halflife_days=half_life_days,
blend_min_samples=blend_min_samples,
)
ideal_interval = self._weighted_quantile(
samples=samples,
quantile=0.5,
halflife_days=half_life_days,
blend_min_samples=blend_min_samples,
)
data.overdue_cdf_p = p
data.overdue_old = math.pow(p, overdue_alpha)
data.ideal_interval_days = ideal_interval
if ideal_interval is not None and activity.last_visit_time is not None:
ideal_days = max(0, int(round(ideal_interval)))
data.ideal_next_visit_date = activity.last_visit_time.date() + timedelta(days=ideal_days)
else:
data.ideal_next_visit_date = None
# 2) 降频分
expected14 = activity.visits_60d * 14.0 / 60.0
data.drop_old = self._clip((expected14 - activity.visits_14d) / (expected14 + 1), 0.0, 1.0)
# 3) 充值未回访压力
if activity.recharge_unconsumed == 1:
data.recharge_old = self.decay(activity.t_r, params['h_recharge'])
else:
data.recharge_old = 0.0
# 4) 价值分
m0 = float(params['amount_base_M0'])
b0 = float(params['balance_base_B0'])
spend_score = math.log1p(activity.spend_180d / m0) if m0 > 0 else 0.0
bal_score = math.log1p(activity.sv_balance / b0) if b0 > 0 else 0.0
data.value_old = float(params['value_w_spend']) * spend_score + float(params['value_w_bal']) * bal_score
data.raw_score = (
float(params['w_over']) * data.overdue_old
+ float(params['w_drop']) * data.drop_old
+ float(params['w_re']) * data.recharge_old
+ float(params['w_value']) * data.value_old
)
hard_floor_days = float(params.get('recency_hard_floor_days', 0))
gate_days = float(params.get('recency_gate_days', 14))
slope_days = float(params.get('recency_gate_slope_days', 3))
if hard_floor_days > 0 and activity.t_v < hard_floor_days:
suppression = 0.0
elif slope_days <= 0:
suppression = 1.0 if activity.t_v >= gate_days else 0.0
else:
x = (activity.t_v - gate_days) / slope_days
x = self._clip(x, -60.0, 60.0)
suppression = 1.0 / (1.0 + math.exp(-x))
data.raw_score *= suppression
# 限制在 0 以上
if data.raw_score < 0:
data.raw_score = 0.0
def _save_winback_data(self, data_list: List[MemberWinbackData]) -> int:
"""保存 WBI 数据"""
if not data_list:
return 0
site_id = data_list[0].activity.site_id
# 按门店全量刷新,避免因分群变化导致过期数据残留。
delete_sql = """
DELETE FROM billiards_dws.dws_member_winback_index
WHERE site_id = %s
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
insert_sql = """
INSERT INTO billiards_dws.dws_member_winback_index (
site_id, tenant_id, member_id,
status, segment,
member_create_time, first_visit_time, last_visit_time, last_recharge_time,
t_v, t_r, t_a,
visits_14d, visits_60d, visits_total,
spend_30d, spend_180d, sv_balance, recharge_60d_amt,
interval_count,
overdue_old, overdue_cdf_p, drop_old, recharge_old, value_old,
ideal_interval_days, ideal_next_visit_date,
raw_score, display_score,
last_wechat_touch_time,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
%s, %s, %s,
%s, %s, %s, %s,
%s,
%s, %s, %s, %s, %s,
%s, %s,
%s, %s,
%s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
activity = data.activity
cur.execute(insert_sql, (
activity.site_id, activity.tenant_id, activity.member_id,
data.status, data.segment,
activity.member_create_time, activity.first_visit_time, activity.last_visit_time, activity.last_recharge_time,
activity.t_v, activity.t_r, activity.t_a,
activity.visits_14d, activity.visits_60d, activity.visits_total,
activity.spend_30d, activity.spend_180d, activity.sv_balance, activity.recharge_60d_amt,
activity.interval_count,
data.overdue_old, data.overdue_cdf_p, data.drop_old, data.recharge_old, data.value_old,
data.ideal_interval_days, data.ideal_next_visit_date,
data.raw_score, data.display_score,
None,
))
inserted += cur.rowcount
self.db.conn.commit()
return inserted
def _clip(self, value: float, low: float, high: float) -> float:
return max(low, min(high, value))
def _map_compression(self, params: Dict[str, float]) -> str:
mode = int(params.get('compression_mode', 0))
if mode == 1:
return "log1p"
if mode == 2:
return "asinh"
return "none"
# Names exported by ``from <module> import *``.
__all__ = ['WinbackIndexTask']

View File

@@ -0,0 +1,370 @@
# -*- coding: utf-8 -*-
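"""
Member consumption summary task.

Purpose:
    Aggregate consumption behaviour and rolling-window metrics at member granularity.
Data sources:
    - dwd_settlement_head: settlement order header table
    - dim_member: member dimension
    - dim_member_card_account: member card accounts
Target table:
    billiards_dws.dws_member_consumption_summary
Update strategy:
    - Frequency: refreshed daily
    - Idempotency: delete-before-insert (by statistics date)
Business rules:
    - Walk-in guests: member_id=0 is not written to this table
    - Rolling windows: 7/10/15/30/60/90 days
    - Card balances: stored-value (cash) cards are kept separate from gift cards
Author: ETL team
Created: 2026-02-01
"""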
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class MemberConsumptionTask(BaseDwsTask):
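"""
Member consumption summary task.

Computes for every member:
- first / most recent consumption date
- cumulative consumption statistics
- rolling-window statistics (7/10/15/30/60/90 days)
- card balance snapshot
- activity indicators and customer tier
"""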
def get_task_code(self) -> str:
    """Return the task code of the member consumption task."""
    task_code = "DWS_MEMBER_CONSUMPTION"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    table_name = "dws_member_consumption_summary"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the primary-key column names of the target table."""
    key_columns = ("site_id", "member_id", "stat_date")
    return list(key_columns)
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """Collect all inputs for one statistics date.

    The statistics date is ``context.window_end`` reduced to a date when it
    has a ``date()`` method, otherwise the value itself. The site is
    ``context.store_id``.

    Returns:
        A dict with ``consumption_stats``, ``member_info``, ``card_balances``,
        ``stat_date`` and ``site_id``.
    """
    window_end = context.window_end
    stat_date = window_end.date() if hasattr(window_end, 'date') else window_end
    site_id = context.store_id

    self.logger.info(
        "%s: 提取数据,统计日期 %s",
        self.get_task_code(), stat_date
    )

    extracted: Dict[str, Any] = {}
    # 1. Per-member consumption statistics, including rolling windows.
    extracted['consumption_stats'] = self._extract_consumption_stats(site_id, stat_date)
    # 2. Member profile data.
    extracted['member_info'] = self._extract_member_info(site_id)
    # 3. Member card balances.
    extracted['card_balances'] = self._extract_card_balances(site_id)
    extracted['stat_date'] = stat_date
    extracted['site_id'] = site_id
    return extracted
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """Build one target-table record per non-guest member.

    Combines the consumption statistics with the member profile and card
    balances, and derives the days since the last consumption, the activity
    flags and the customer tier.

    Args:
        extracted: Result of ``extract``.
        context: Task context (not used here).

    Returns:
        Records in the order of ``extracted['consumption_stats']``, without
        the members for which ``is_guest`` is true.
    """
    consumption_stats = extracted['consumption_stats']
    member_info = extracted['member_info']
    card_balances = extracted['card_balances']
    stat_date = extracted['stat_date']
    site_id = extracted['site_id']

    self.logger.info(
        "%s: 转换数据,%d 条会员消费记录",
        self.get_task_code(), len(consumption_stats)
    )

    window_days = (7, 10, 15, 30, 60, 90)
    results = []
    for stats in consumption_stats:
        member_id = stats.get('member_id')
        # Walk-in guests are not written to this table.
        if self.is_guest(member_id):
            continue

        profile = member_info.get(member_id, {})
        balance = card_balances.get(member_id, {})

        # Activity and customer tier.
        days_since_last = self._calc_days_since(stat_date, stats.get('last_consume_date'))
        customer_tier = self._calculate_customer_tier(stats, days_since_last)

        record = {
            'site_id': site_id,
            # Passes site_id as the default for a missing tenant id -
            # NOTE(review): confirm config.get has dict-like default semantics.
            'tenant_id': self.config.get("app.tenant_id", site_id),
            'member_id': member_id,
            'stat_date': stat_date,
            # Member profile.
            'member_nickname': profile.get('nickname'),
            'member_mobile': self._mask_mobile(profile.get('mobile')),
            'card_grade_name': profile.get('member_card_grade_name'),
            'register_date': profile.get('register_date'),
            # Lifetime totals.
            'first_consume_date': stats.get('first_consume_date'),
            'last_consume_date': stats.get('last_consume_date'),
            'total_visit_count': self.safe_int(stats.get('total_visit_count', 0)),
            'total_consume_amount': self.safe_decimal(stats.get('total_consume_amount', 0)),
            'total_recharge_amount': self.safe_decimal(profile.get('recharge_money_sum', 0)),
            'total_table_fee': self.safe_decimal(stats.get('total_table_fee', 0)),
            'total_goods_amount': self.safe_decimal(stats.get('total_goods_amount', 0)),
            'total_assistant_amount': self.safe_decimal(stats.get('total_assistant_amount', 0)),
        }

        # Rolling windows: all visit counts first, then all amounts.
        for days in window_days:
            visit_key = f'visit_count_{days}d'
            record[visit_key] = self.safe_int(stats.get(visit_key, 0))
        for days in window_days:
            amount_key = f'consume_amount_{days}d'
            record[amount_key] = self.safe_decimal(stats.get(amount_key, 0))

        # Card balances.
        record['cash_card_balance'] = self.safe_decimal(balance.get('cash_balance', 0))
        record['gift_card_balance'] = self.safe_decimal(balance.get('gift_balance', 0))
        record['total_card_balance'] = self.safe_decimal(balance.get('total_balance', 0))

        # Activity indicators.
        record['days_since_last'] = days_since_last
        record['is_active_7d'] = self.safe_int(stats.get('visit_count_7d', 0)) > 0
        record['is_active_30d'] = self.safe_int(stats.get('visit_count_30d', 0)) > 0
        record['is_active_90d'] = self.safe_int(stats.get('visit_count_90d', 0)) > 0

        # Customer tier.
        record['customer_tier'] = customer_tier

        results.append(record)

    return results
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """Write the transformed records using delete-before-insert.

    With no records nothing is deleted or inserted. Otherwise existing rows
    are removed through ``delete_existing_data`` (date column ``stat_date``)
    and the new records are bulk-inserted.

    Returns:
        A dict with a ``counts`` summary and, when rows were written, an
        ``extra`` entry holding the number of deleted rows.
    """
    def counts(fetched: int, inserted: int) -> Dict[str, int]:
        return dict(fetched=fetched, inserted=inserted, updated=0, skipped=0, errors=0)

    if not transformed:
        self.logger.info("%s: 无数据需要写入", self.get_task_code())
        return {"counts": counts(0, 0)}

    deleted = self.delete_existing_data(context, date_col="stat_date")
    inserted = self.bulk_insert(transformed)

    self.logger.info(
        "%s: 加载完成,删除 %d 行,插入 %d",
        self.get_task_code(), deleted, inserted
    )

    return {
        "counts": counts(len(transformed), inserted),
        "extra": {"deleted": deleted},
    }
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_consumption_stats(
self,
site_id: int,
stat_date: date
) -> List[Dict[str, Any]]:
"""
Extract per-member consumption statistics, including rolling windows.

Runs one aggregate query over billiards_dwd.dwd_settlement_head for the
given site, skipping rows whose member_id is NULL or 0, and returns one
dict per member (an empty list when the query yields nothing).

Each N-day window counts rows whose consumption date is on or after
stat_date minus (N - 1) days, so the window includes stat_date itself.
"""
# NOTE(review): neither the lifetime totals nor the rolling windows have an
# upper bound on consume_date, so rows dated after stat_date would also be
# counted when an older stat_date is re-run - confirm whether backfills
# need an additional "consume_date <= stat_date" filter.
sql = """
WITH consume_base AS (
SELECT
member_id,
DATE(pay_time) AS consume_date,
consume_money,
table_charge_money,
goods_money,
assistant_pd_money + assistant_cx_money AS assistant_amount
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %s
AND member_id IS NOT NULL
AND member_id != 0
)
SELECT
member_id,
MIN(consume_date) AS first_consume_date,
MAX(consume_date) AS last_consume_date,
-- 全量累计
COUNT(*) AS total_visit_count,
SUM(consume_money) AS total_consume_amount,
SUM(table_charge_money) AS total_table_fee,
SUM(goods_money) AS total_goods_amount,
SUM(assistant_amount) AS total_assistant_amount,
-- 滚动窗口
COUNT(CASE WHEN consume_date >= %s - INTERVAL '6 days' THEN 1 END) AS visit_count_7d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '9 days' THEN 1 END) AS visit_count_10d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '14 days' THEN 1 END) AS visit_count_15d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '29 days' THEN 1 END) AS visit_count_30d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '59 days' THEN 1 END) AS visit_count_60d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '89 days' THEN 1 END) AS visit_count_90d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '6 days' THEN consume_money ELSE 0 END) AS consume_amount_7d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '9 days' THEN consume_money ELSE 0 END) AS consume_amount_10d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '14 days' THEN consume_money ELSE 0 END) AS consume_amount_15d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '29 days' THEN consume_money ELSE 0 END) AS consume_amount_30d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '59 days' THEN consume_money ELSE 0 END) AS consume_amount_60d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '89 days' THEN consume_money ELSE 0 END) AS consume_amount_90d
FROM consume_base
GROUP BY member_id
"""
# One placeholder for the site filter, then one stat_date for each of the
# 12 window expressions (6 counts + 6 sums), in the order they appear.
params = [site_id] + [stat_date] * 12
rows = self.db.query(sql, tuple(params))
return [dict(row) for row in rows] if rows else []
def _extract_member_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取会员信息
"""
sql = """
SELECT
member_id,
nickname,
mobile,
member_card_grade_name,
DATE(create_time) AS register_date,
recharge_money_sum
FROM billiards_dwd.dim_member
WHERE site_id = %s
AND scd2_is_current = 1
"""
rows = self.db.query(sql, (site_id,))
result = {}
for row in (rows or []):
row_dict = dict(row)
result[row_dict['member_id']] = row_dict
return result
def _extract_card_balances(self, site_id: int) -> Dict[int, Dict[str, Decimal]]:
"""
提取会员卡余额
"""
# 卡类型ID
CASH_CARD_TYPE_ID = 2793249295533893
GIFT_CARD_TYPE_IDS = [2791990152417157, 2793266846533445, 2794699703437125]
sql = """
SELECT
tenant_member_id AS member_id,
card_type_id,
balance
FROM billiards_dwd.dim_member_card_account
WHERE site_id = %s
AND scd2_is_current = 1
AND COALESCE(is_delete, 0) = 0
"""
rows = self.db.query(sql, (site_id,))
result: Dict[int, Dict[str, Decimal]] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = row_dict.get('member_id')
card_type_id = row_dict.get('card_type_id')
balance = self.safe_decimal(row_dict.get('balance', 0))
if member_id not in result:
result[member_id] = {
'cash_balance': Decimal('0'),
'gift_balance': Decimal('0'),
'total_balance': Decimal('0')
}
if card_type_id == CASH_CARD_TYPE_ID:
result[member_id]['cash_balance'] += balance
elif card_type_id in GIFT_CARD_TYPE_IDS:
result[member_id]['gift_balance'] += balance
result[member_id]['total_balance'] = (
result[member_id]['cash_balance'] + result[member_id]['gift_balance']
)
return result
# ==========================================================================
# 工具方法
# ==========================================================================
def _mask_mobile(self, mobile: Optional[str]) -> Optional[str]:
"""手机号脱敏"""
if not mobile or len(mobile) < 7:
return mobile
return mobile[:3] + "****" + mobile[-4:]
def _calc_days_since(self, stat_date: date, last_date: Optional[date]) -> Optional[int]:
"""计算距离最近消费的天数"""
if not last_date:
return None
if isinstance(last_date, datetime):
last_date = last_date.date()
return (stat_date - last_date).days
def _calculate_customer_tier(
self,
stats: Dict[str, Any],
days_since_last: Optional[int]
) -> str:
"""
计算客户分层
分层规则:
- 高价值90天内消费>=3次 且 消费金额>=1000
- 中等30天内有消费
- 低活跃90天内有消费但30天内无消费
- 流失90天内无消费
"""
visit_90d = self.safe_int(stats.get('visit_count_90d', 0))
visit_30d = self.safe_int(stats.get('visit_count_30d', 0))
amount_90d = self.safe_decimal(stats.get('consume_amount_90d', 0))
if visit_90d >= 3 and amount_90d >= 1000:
return "高价值"
elif visit_30d > 0:
return "中等"
elif visit_90d > 0:
return "低活跃"
else:
return "流失"
# 便于外部导入
__all__ = ['MemberConsumptionTask']

View File

@@ -0,0 +1,423 @@
# -*- coding: utf-8 -*-
"""
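Member visit detail task.
Purpose:
Record every store visit at "member + order" granularity.
Data sources:
- dwd_settlement_head: settlement header table
- dwd_assistant_service_log: assistant service log
- dwd_table_fee_log: table fee log (real table-use seconds)
- dim_member: member dimension
- dim_table: table dimension
- cfg_area_category: area category mapping
Target table:
billiards_dws.dws_member_visit_detail
Update strategy:
- Frequency: daily incremental update
- Idempotency: delete-before-insert (by date window)
Business rules:
- Walk-in guests: rows with member_id=0 do not enter this table
- Area category: resolved through the cfg_area_category mapping
- Assistant services: the details of multiple assistants are stored as JSON
Author: ETL team
Created: 2026-02-01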
"""
from __future__ import annotations
import json
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class MemberVisitTask(BaseDwsTask):
"""
Member visit detail task.
For every visit of every member this records:
- table information and area category
- consumption amount breakdown
- payment method breakdown
- assistant service details (JSON format)
"""
def get_task_code(self) -> str:
    """Return the task code string of the member visit task."""
    task_code = "DWS_MEMBER_VISIT"
    return task_code
def get_target_table(self) -> str:
    """Return the name of the table this task writes to."""
    table_name = "dws_member_visit_detail"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the key columns of the target table: site, member and order."""
    key_columns = ["site_id", "member_id", "order_settle_id"]
    return key_columns
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Collect every input the transform step needs.

    Args:
        context: Task context; ``window_start`` / ``window_end`` bound the
            date range (values exposing ``.date()`` are reduced to dates)
            and ``store_id`` is used as the site id.

    Returns:
        Dict with the settlements, assistant services, member info, table
        info and table-fee durations, plus the resolved ``start_date``,
        ``end_date`` and ``site_id``.
    """
    def _to_date(value):
        # Accept both datetime-like objects and plain dates.
        return value.date() if hasattr(value, 'date') else value

    start_date = _to_date(context.window_start)
    end_date = _to_date(context.window_end)
    site_id = context.store_id

    self.logger.info(
        "%s: 提取数据,日期范围 %s ~ %s",
        self.get_task_code(), start_date, end_date
    )

    window = (site_id, start_date, end_date)
    # 1. settlements in the window
    settlements = self._extract_settlements(*window)
    # 2. assistant service details
    assistant_services = self._extract_assistant_services(*window)
    # 2.1 table-fee durations (real seconds)
    table_fee_durations = self._extract_table_fee_durations(*window)
    # 3. member dimension
    member_info = self._extract_member_info(site_id)
    # 4. table dimension
    table_info = self._extract_table_info(site_id)
    # 5. configuration cache
    self.load_config_cache()

    payload: Dict[str, Any] = {}
    payload['settlements'] = settlements
    payload['assistant_services'] = assistant_services
    payload['member_info'] = member_info
    payload['table_info'] = table_info
    payload['table_fee_durations'] = table_fee_durations
    payload['start_date'] = start_date
    payload['end_date'] = end_date
    payload['site_id'] = site_id
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Turn extracted settlements into visit-detail records.

    Each settlement that ``self.is_guest`` does not flag as a guest becomes
    one record, enriched with member info, table info and area category,
    table / assistant durations in minutes and the assistant-service JSON.

    Args:
        extracted: Output of ``extract``.
        context: Task context (not read here).

    Returns:
        One dict per non-guest settlement, keyed by target column name.
    """
    settlements = extracted['settlements']
    raw_services = extracted['assistant_services']
    members = extracted['member_info']
    tables = extracted['table_info']
    fee_rows = extracted['table_fee_durations']
    site_id = extracted['site_id']

    self.logger.info(
        "%s: 转换数据,%d 条结账单",
        self.get_task_code(), len(settlements)
    )

    # order_settle_id -> list of assistant services
    services_by_order = self._build_service_index(raw_services)

    # order_settle_id -> total table-use seconds; rows without an order id are ignored
    seconds_by_order: Dict[Any, int] = {}
    for fee_row in (fee_rows or []):
        order_key = fee_row.get('order_settle_id')
        if order_key:
            seconds_by_order[order_key] = self.safe_int(fee_row.get('table_use_seconds', 0))

    def _money(row: Dict[str, Any], field: str):
        return self.safe_decimal(row.get(field, 0))

    records: List[Dict[str, Any]] = []
    for settle in settlements:
        member_id = settle.get('member_id')
        if self.is_guest(member_id):
            continue  # walk-in guests are not part of this table

        order_settle_id = settle.get('order_settle_id')
        table_id = settle.get('table_id')
        member_row = members.get(member_id, {})
        table_row = tables.get(table_id, {})
        order_services = services_by_order.get(order_settle_id, [])

        area_name = table_row.get('area_name')
        category = self.get_area_category(area_name)
        services_json = self._build_assistant_services_json(order_services)

        table_minutes = self._calc_table_duration(seconds_by_order.get(order_settle_id, 0))
        # Seconds are summed across services first, then floored to minutes.
        service_seconds = 0
        for svc in order_services:
            service_seconds += self.safe_int(svc.get('income_seconds', 0))
        assistant_minutes = service_seconds // 60

        records.append({
            'site_id': site_id,
            'tenant_id': self.config.get("app.tenant_id", site_id),
            'member_id': member_id,
            'order_settle_id': order_settle_id,
            'visit_date': settle.get('visit_date'),
            'visit_time': settle.get('create_time'),
            # member
            'member_nickname': member_row.get('nickname'),
            'member_mobile': self._mask_mobile(member_row.get('mobile')),
            'member_birthday': member_row.get('birthday'),
            # table
            'table_id': table_id,
            'table_name': table_row.get('table_name'),
            'area_name': area_name,
            'area_category': category.get('category_name'),
            # amounts
            'table_fee': _money(settle, 'table_charge_money'),
            'goods_amount': _money(settle, 'goods_money'),
            'assistant_amount': _money(settle, 'assistant_pd_money') + _money(settle, 'assistant_cx_money'),
            'total_consume': _money(settle, 'consume_money'),
            'total_discount': self._calc_total_discount(settle),
            'actual_pay': _money(settle, 'pay_amount'),
            # payment methods
            # NOTE(review): cash_pay reads the same field as actual_pay (pay_amount) - confirm intended.
            'cash_pay': _money(settle, 'pay_amount'),
            'cash_card_pay': _money(settle, 'balance_amount'),
            'gift_card_pay': _money(settle, 'gift_card_amount'),
            'groupbuy_pay': _money(settle, 'coupon_amount'),
            # durations (minutes)
            'table_duration_min': table_minutes,
            'assistant_duration_min': assistant_minutes,
            # assistant service details
            'assistant_services': services_json,
        })
    return records
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the transformed records with delete-before-insert semantics.

    When there is nothing to write, neither the delete nor the insert is
    executed and an all-zero count dict is returned.

    Args:
        transformed: Records produced by ``transform``.
        context: Task context passed to ``delete_existing_data``.

    Returns:
        ``{"counts": {...}}`` and, when rows were written, an ``extra``
        entry carrying the number of deleted rows.
    """
    counts = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
    if not transformed:
        self.logger.info("%s: 无数据需要写入", self.get_task_code())
        return {"counts": counts}

    # Clear the window keyed on visit_date first, then insert the new rows.
    removed = self.delete_existing_data(context, date_col="visit_date")
    written = self.bulk_insert(transformed)
    self.logger.info(
        "%s: 加载完成,删除 %d 行,插入 %d",
        self.get_task_code(), removed, written
    )

    counts["fetched"] = len(transformed)
    counts["inserted"] = written
    return {"counts": counts, "extra": {"deleted": removed}}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_settlements(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取结账单
"""
sql = """
SELECT
order_settle_id,
order_trade_no,
table_id,
member_id,
create_time,
pay_time,
DATE(pay_time) AS visit_date,
consume_money,
pay_amount,
table_charge_money,
goods_money,
assistant_pd_money,
assistant_cx_money,
coupon_amount,
adjust_amount,
member_discount_amount,
rounding_amount,
gift_card_amount,
balance_amount,
recharge_card_amount
FROM billiards_dwd.dwd_settlement_head
WHERE site_id = %s
AND DATE(pay_time) >= %s
AND DATE(pay_time) <= %s
AND member_id IS NOT NULL
AND member_id != 0
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_assistant_services(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取助教服务明细
"""
sql = """
SELECT
order_settle_id,
site_assistant_id AS assistant_id,
nickname AS assistant_nickname,
income_seconds,
ledger_amount
FROM billiards_dwd.dwd_assistant_service_log
WHERE site_id = %s
AND DATE(start_use_time) >= %s
AND DATE(start_use_time) <= %s
AND is_delete = 0
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_table_fee_durations(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取台费时长(真实秒数)
"""
sql = """
SELECT
order_settle_id,
SUM(COALESCE(real_table_use_seconds, 0)) AS table_use_seconds
FROM billiards_dwd.dwd_table_fee_log
WHERE site_id = %s
AND DATE(ledger_end_time) >= %s
AND DATE(ledger_end_time) <= %s
AND COALESCE(is_delete, 0) = 0
GROUP BY order_settle_id
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_member_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取会员信息
"""
sql = """
SELECT
member_id,
nickname,
mobile,
birthday
FROM billiards_dwd.dim_member
WHERE site_id = %s
AND scd2_is_current = 1
"""
rows = self.db.query(sql, (site_id,))
return {r['member_id']: dict(r) for r in (rows or [])}
def _extract_table_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取台桌信息
"""
sql = """
SELECT
site_table_id AS table_id,
site_table_name AS table_name,
site_table_area_name AS area_name
FROM billiards_dwd.dim_table
WHERE site_id = %s
AND scd2_is_current = 1
"""
rows = self.db.query(sql, (site_id,))
return {r['table_id']: dict(r) for r in (rows or [])}
# ==========================================================================
# 工具方法
# ==========================================================================
def _build_service_index(
self,
services: List[Dict[str, Any]]
) -> Dict[int, List[Dict[str, Any]]]:
"""
构建助教服务索引
"""
index: Dict[int, List[Dict[str, Any]]] = {}
for service in services:
order_id = service.get('order_settle_id')
if order_id:
if order_id not in index:
index[order_id] = []
index[order_id].append(service)
return index
def _build_assistant_services_json(
self,
services: List[Dict[str, Any]]
) -> Optional[str]:
"""
构建助教服务JSON
"""
if not services:
return None
json_data = []
for s in services:
json_data.append({
'assistant_id': s.get('assistant_id'),
'nickname': s.get('assistant_nickname'),
'duration_min': self.safe_int(s.get('income_seconds', 0)) // 60,
'amount': float(self.safe_decimal(s.get('ledger_amount', 0)))
})
return json.dumps(json_data, ensure_ascii=False)
def _calc_table_duration(self, table_use_seconds: int) -> int:
"""
计算台桌使用时长(分钟)
使用真实台费流水秒数
"""
if not table_use_seconds or table_use_seconds <= 0:
return 0
return int(table_use_seconds // 60)
def _calc_total_discount(self, settle: Dict[str, Any]) -> Decimal:
"""
计算总优惠
"""
adjust = self.safe_decimal(settle.get('adjust_amount', 0))
member_discount = self.safe_decimal(settle.get('member_discount_amount', 0))
rounding = self.safe_decimal(settle.get('rounding_amount', 0))
return adjust + member_discount + rounding
def _mask_mobile(self, mobile: Optional[str]) -> Optional[str]:
"""手机号脱敏"""
if not mobile or len(mobile) < 7:
return mobile
return mobile[:3] + "****" + mobile[-4:]
# 便于外部导入
__all__ = ['MemberVisitTask']

View File

@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
"""
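DWS materialized-view refresh task.
Notes:
- Refreshes materialized views by the L1/L2/L3/L4 time layers.
- By default the refresh is controlled jointly by the dws.mv.enabled and dws.retention.* settings.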
"""
from __future__ import annotations
import json
from typing import Any, Dict, List, Optional
from .base_dws_task import BaseDwsTask, TaskContext, TimeLayer
class BaseMvRefreshTask(BaseDwsTask):
"""Base class for materialized-view refresh tasks."""
# Name of the base table; returned as the target table and used as the
# middle part of every view name. Subclasses are expected to override it.
BASE_TABLE: str = ""
# Date column of the base table (not referenced by the methods of this class).
DATE_COL: str = ""
# Prefix placed in front of every materialized-view name.
VIEW_PREFIX = "mv_"
# Time layers in the order in which their views are refreshed.
LAYER_ORDER = [
TimeLayer.LAST_2_DAYS,
TimeLayer.LAST_1_MONTH,
TimeLayer.LAST_3_MONTHS,
TimeLayer.LAST_6_MONTHS,
]
# View-name suffix of each supported time layer.
LAYER_SUFFIX = {
TimeLayer.LAST_2_DAYS: "l1",
TimeLayer.LAST_1_MONTH: "l2",
TimeLayer.LAST_3_MONTHS: "l3",
TimeLayer.LAST_6_MONTHS: "l4",
}
def get_target_table(self) -> str:
    """Return the base table name (``BASE_TABLE``) as the target table."""
    base_table = self.BASE_TABLE
    return base_table
def get_primary_keys(self) -> List[str]:
    """Return an empty list: the refresh task declares no primary keys."""
    no_keys: List[str] = []
    return no_keys
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """Return only the site id, taken from ``context.store_id``."""
    payload = {"site_id": context.store_id}
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
    """Pass the extracted payload through unchanged (same object)."""
    passthrough = extracted
    return passthrough
def load(self, transformed: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
    """
    Refresh the materialized view of every resolved time layer.

    Layers without a view name are skipped silently; views that do not
    exist are skipped with a warning.

    Args:
        transformed: Output of ``transform`` (not read here).
        context: Task context (not read here).

    Returns:
        ``{"counts": {"refreshed": n}}`` plus, when refreshing is enabled,
        an ``extra.details`` list naming each refreshed view and its layer.
    """
    if not self._is_enabled():
        self.logger.info("%s: 未启用物化刷新,跳过", self.get_task_code())
        return {"counts": {"refreshed": 0}}

    details = []
    for layer in self._resolve_layers():
        view_name = self._get_view_name(layer)
        if not view_name:
            continue
        if self._view_exists(view_name):
            self._refresh_view(view_name)
            details.append({"view": view_name, "layer": layer.value})
        else:
            self.logger.warning("%s: 物化视图不存在,跳过 %s", self.get_task_code(), view_name)

    # One detail entry is recorded per refreshed view.
    refreshed = len(details)
    self.logger.info("%s: 刷新完成,物化视图数=%d", self.get_task_code(), refreshed)
    return {"counts": {"refreshed": refreshed}, "extra": {"details": details}}
def _is_enabled(self) -> bool:
enabled = bool(self.config.get("dws.mv.enabled", False))
if not enabled:
return False
tables = self._parse_list(self.config.get("dws.mv.tables"))
if not tables:
tables = self._parse_list(self.config.get("dws.retention.tables"))
if tables and self.BASE_TABLE not in tables:
return False
return True
def _resolve_layers(self) -> List[TimeLayer]:
# 显式配置优先
configured = self._parse_layers(self.config.get("dws.mv.layers"))
if configured:
return configured
# 表级覆盖:优先 mv.table_layers其次 retention.table_layers
table_layers = self._resolve_layer_map(
self.config.get("dws.mv.table_layers") or self.config.get("dws.retention.table_layers")
)
layer_name = table_layers.get(self.BASE_TABLE)
if layer_name:
layer = self._get_layer(layer_name)
if layer and layer != TimeLayer.ALL:
return self._layers_up_to(layer)
# 默认使用 retention.layer
retention_layer = self._get_layer(self.config.get("dws.retention.layer"))
if retention_layer and retention_layer != TimeLayer.ALL:
return self._layers_up_to(retention_layer)
return list(self.LAYER_ORDER)
def _layers_up_to(self, target: TimeLayer) -> List[TimeLayer]:
layers = []
for layer in self.LAYER_ORDER:
layers.append(layer)
if layer == target:
break
return layers
def _get_view_name(self, layer: TimeLayer) -> Optional[str]:
suffix = self.LAYER_SUFFIX.get(layer)
if not suffix or not self.BASE_TABLE:
return None
return f"{self.VIEW_PREFIX}{self.BASE_TABLE}_{suffix}"
def _view_exists(self, view_name: str) -> bool:
sql = "SELECT to_regclass(%s) AS reg"
rows = self.db.query(sql, (f"{self.DWS_SCHEMA}.{view_name}",))
return bool(rows and rows[0].get("reg"))
def _refresh_view(self, view_name: str) -> None:
concurrently = bool(self.config.get("dws.mv.refresh_concurrently", False))
keyword = "CONCURRENTLY " if concurrently else ""
sql = f"REFRESH MATERIALIZED VIEW {keyword}{self.DWS_SCHEMA}.{view_name}"
self.db.execute(sql)
def _get_layer(self, layer_name: Optional[str]) -> Optional[TimeLayer]:
if not layer_name:
return None
name = str(layer_name).upper()
try:
return TimeLayer[name]
except KeyError:
return None
def _resolve_layer_map(self, raw: Any) -> Dict[str, str]:
if not raw:
return {}
if isinstance(raw, dict):
return {str(k): str(v) for k, v in raw.items()}
if isinstance(raw, str):
try:
parsed = json.loads(raw)
if isinstance(parsed, dict):
return {str(k): str(v) for k, v in parsed.items()}
except json.JSONDecodeError:
return {}
return {}
def _parse_layers(self, raw: Any) -> List[TimeLayer]:
if not raw:
return []
if isinstance(raw, str):
items = [v.strip() for v in raw.split(",") if v.strip()]
elif isinstance(raw, (list, tuple, set)):
items = [str(v).strip() for v in raw if str(v).strip()]
else:
return []
layers = []
for item in items:
layer = self._get_layer(item)
if layer and layer not in layers:
layers.append(layer)
return layers
def _parse_list(self, raw: Any) -> List[str]:
if not raw:
return []
if isinstance(raw, str):
return [v.strip() for v in raw.split(",") if v.strip()]
if isinstance(raw, (list, tuple, set)):
return [str(v).strip() for v in raw if str(v).strip()]
return []
class DwsMvRefreshFinanceDailyTask(BaseMvRefreshTask):
    """Refresh task for the views built on ``dws_finance_daily_summary``."""

    BASE_TABLE = "dws_finance_daily_summary"
    DATE_COL = "stat_date"

    def get_task_code(self) -> str:
        """Return the task code string of this refresh task."""
        task_code = "DWS_MV_REFRESH_FINANCE_DAILY"
        return task_code
class DwsMvRefreshAssistantDailyTask(BaseMvRefreshTask):
    """Refresh task for the views built on ``dws_assistant_daily_detail``."""

    BASE_TABLE = "dws_assistant_daily_detail"
    DATE_COL = "stat_date"

    def get_task_code(self) -> str:
        """Return the task code string of this refresh task."""
        task_code = "DWS_MV_REFRESH_ASSISTANT_DAILY"
        return task_code
__all__ = ["DwsMvRefreshFinanceDailyTask", "DwsMvRefreshAssistantDailyTask"]

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""
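DWS time-layer retention cleanup task.
Purpose:
Deletes historical rows from DWS tables according to the configured time-layer range.
The task is disabled by default and must be switched on explicitly through configuration.
Configuration example (.env / settings):
DWS_RETENTION_ENABLED=true
DWS_RETENTION_LAYER=LAST_3_MONTHS
DWS_RETENTION_TABLES=dws_finance_daily_summary,dws_assistant_daily_detail
DWS_RETENTION_TABLE_LAYERS={"dws_finance_expense_summary":"ALL"}
Author: ETL team
Created: 2026-02-03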
"""
from __future__ import annotations
import json
from datetime import date
from typing import Any, Dict, List, Optional
from .base_dws_task import BaseDwsTask, TaskContext, TimeLayer
class DwsRetentionCleanupTask(BaseDwsTask):
"""
DWS time-layer retention cleanup task.
"""
# Tables eligible for cleanup, each with the column its cutoff is compared
# against. Columns ending in "_month" are treated as monthly by
# _normalize_cutoff.
DEFAULT_TABLES = [
{"table": "dws_assistant_daily_detail", "date_col": "stat_date"},
{"table": "dws_assistant_monthly_summary", "date_col": "stat_month"},
{"table": "dws_assistant_customer_stats", "date_col": "stat_date"},
{"table": "dws_assistant_salary_calc", "date_col": "salary_month"},
{"table": "dws_assistant_recharge_commission", "date_col": "commission_month"},
{"table": "dws_assistant_finance_analysis", "date_col": "stat_date"},
{"table": "dws_member_consumption_summary", "date_col": "stat_date"},
{"table": "dws_member_visit_detail", "date_col": "visit_date"},
{"table": "dws_finance_daily_summary", "date_col": "stat_date"},
{"table": "dws_finance_income_structure", "date_col": "stat_date"},
{"table": "dws_finance_discount_detail", "date_col": "stat_date"},
{"table": "dws_finance_recharge_summary", "date_col": "stat_date"},
{"table": "dws_finance_expense_summary", "date_col": "expense_month"},
{"table": "dws_platform_settlement", "date_col": "settlement_date"},
]
def get_task_code(self) -> str:
    """Return the task code string of the retention cleanup task."""
    task_code = "DWS_RETENTION_CLEANUP"
    return task_code
def get_target_table(self) -> str:
    """
    Return the nominal target table name.

    NOTE(review): the task cleans several tables; this single name looks
    like a placeholder for the base-class interface - confirm.
    """
    table_name = "dws_finance_daily_summary"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return an empty list: the cleanup task declares no primary keys."""
    no_keys: List[str] = []
    return no_keys
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """Return only the site id, taken from ``context.store_id``."""
    payload = {"site_id": context.store_id}
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
    """Pass the extracted payload through unchanged (same object)."""
    passthrough = extracted
    return passthrough
def load(self, transformed: Dict[str, Any], context: TaskContext) -> Dict:
    """
    Delete rows older than the retention cutoff from each target table.

    The default layer comes from ``dws.retention.layer`` (``"ALL"`` when
    unset); a per-table entry from ``_resolve_table_layers`` overrides it.
    Tables whose effective layer is ``TimeLayer.ALL`` are left untouched.

    Args:
        transformed: Output of ``transform`` (not read here).
        context: Task context; ``window_end`` is the base date of the
            layer range and ``store_id`` scopes the delete to one site.

    Returns:
        ``{"counts": {"cleaned": total}}`` plus, when cleanup ran, an
        ``extra.details`` list with table, deleted row count and cutoff.
    """
    if not self._is_retention_enabled():
        self.logger.info("%s: 未启用清理配置,跳过", self.get_task_code())
        return {"counts": {"cleaned": 0}}

    base_date = context.window_end.date() if hasattr(context.window_end, "date") else context.window_end
    default_layer = self._get_retention_layer(self.config.get("dws.retention.layer", "ALL"))
    if default_layer is None:
        self.logger.warning("%s: 未识别的清理层级,跳过", self.get_task_code())
        return {"counts": {"cleaned": 0}}

    target_tables = self._resolve_target_tables()
    if not target_tables:
        self.logger.info("%s: 未配置需要清理的表,跳过", self.get_task_code())
        return {"counts": {"cleaned": 0}}

    table_layers = self._resolve_table_layers()
    total_deleted = 0
    details = []
    for item in target_tables:
        table = item["table"]
        date_col = item["date_col"]
        if table in table_layers:
            layer = self._get_retention_layer(table_layers[table])
            if layer is None:
                # A misspelt override must not disable cleanup unnoticed.
                self.logger.warning(
                    "%s: 表 %s 的清理层级无法识别,跳过",
                    self.get_task_code(), table
                )
                continue
        else:
            # Use the resolved member directly. Passing ``default_layer.value``
            # back through a lookup by member NAME only works when every
            # value is spelled like its name.
            layer = default_layer
        if layer == TimeLayer.ALL:
            continue
        time_range = self.get_time_layer_range(layer, base_date)
        cutoff = self._normalize_cutoff(date_col, time_range.start)
        deleted = self._cleanup_table(table, date_col, cutoff, context.store_id)
        total_deleted += deleted
        details.append({"table": table, "deleted": deleted, "cutoff": str(cutoff)})

    self.logger.info("%s: 清理完成,总删除 %d", self.get_task_code(), total_deleted)
    return {"counts": {"cleaned": total_deleted}, "extra": {"details": details}}
def _is_retention_enabled(self) -> bool:
return bool(self.config.get("dws.retention.enabled", False))
def _get_retention_layer(self, layer_name: Optional[str]) -> Optional[TimeLayer]:
if not layer_name:
return None
name = str(layer_name).upper()
try:
return TimeLayer[name]
except KeyError:
return None
def _resolve_target_tables(self) -> List[Dict[str, str]]:
table_list = self.config.get("dws.retention.tables")
if not table_list:
return self.DEFAULT_TABLES
if isinstance(table_list, str):
names = [t.strip() for t in table_list.split(",") if t.strip()]
else:
names = list(table_list)
selected = []
for item in self.DEFAULT_TABLES:
if item["table"] in names:
selected.append(item)
return selected
def _resolve_table_layers(self) -> Dict[str, str]:
raw = self.config.get("dws.retention.table_layers")
if not raw:
return {}
if isinstance(raw, dict):
return {str(k): str(v) for k, v in raw.items()}
if isinstance(raw, str):
try:
parsed = json.loads(raw)
if isinstance(parsed, dict):
return {str(k): str(v) for k, v in parsed.items()}
except json.JSONDecodeError:
return {}
return {}
def _normalize_cutoff(self, date_col: str, cutoff: date) -> date:
monthly_cols = {"stat_month", "salary_month", "commission_month", "expense_month"}
if date_col in monthly_cols:
return cutoff.replace(day=1)
return cutoff
def _cleanup_table(self, table: str, date_col: str, cutoff: date, site_id: int) -> int:
full_table = f"{self.DWS_SCHEMA}.{table}"
sql = f"DELETE FROM {full_table} WHERE site_id = %s AND {date_col} < %s"
with self.db.conn.cursor() as cur:
cur.execute(sql, (site_id, cutoff))
return cur.rowcount
__all__ = ["DwsRetentionCleanupTask"]