在准备环境前提交次全部更改。

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

View File

@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
# AI_CHANGELOG [2026-02-13] 移除 RecallIndexTask/IntimacyIndexTask 导入,更新 __all__
"""
DWS层ETL任务模块
包含:
- BaseDwsTask: DWS任务基类
- 助教维度任务
- 客户维度任务
- 财务维度任务
- 指数算法任务
"""
from .base_dws_task import BaseDwsTask, TimeLayer, TimeWindow, CourseType, DiscountType
from .assistant_daily_task import AssistantDailyTask
from .assistant_monthly_task import AssistantMonthlyTask
from .assistant_customer_task import AssistantCustomerTask
from .assistant_salary_task import AssistantSalaryTask
from .assistant_finance_task import AssistantFinanceTask
from .member_consumption_task import MemberConsumptionTask
from .member_visit_task import MemberVisitTask
from .finance_daily_task import FinanceDailyTask
from .finance_recharge_task import FinanceRechargeTask
from .finance_income_task import FinanceIncomeStructureTask
from .finance_discount_task import FinanceDiscountDetailTask
from .finance_base_task import FinanceBaseTask
from .maintenance_task import DwsMaintenanceTask
# 指数算法任务
from .index import (
WinbackIndexTask,
NewconvIndexTask,
MlManualImportTask,
RelationIndexTask,
)
__all__ = [
# 基类
"BaseDwsTask",
"TimeLayer",
"TimeWindow",
"CourseType",
"DiscountType",
# 助教维度
"AssistantDailyTask",
"AssistantMonthlyTask",
"AssistantCustomerTask",
"AssistantSalaryTask",
"AssistantFinanceTask",
# 客户维度
"MemberConsumptionTask",
"MemberVisitTask",
# 财务维度
"FinanceBaseTask",
"FinanceDailyTask",
"FinanceRechargeTask",
"FinanceIncomeStructureTask",
"FinanceDiscountDetailTask",
"DwsMaintenanceTask",
# 指数算法
"WinbackIndexTask",
"NewconvIndexTask",
"MlManualImportTask",
"RelationIndexTask",
]

View File

@@ -0,0 +1,298 @@
# -*- coding: utf-8 -*-
"""
助教服务客户统计任务
功能说明:
"助教+客户"为粒度,统计服务关系和滚动窗口指标
数据来源:
- dwd_assistant_service_log: 助教服务流水
- dim_member: 会员维度
目标表:
dws.dws_assistant_customer_stats
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按统计日期
业务规则:
- 散客处理member_id=0 不进入此表统计
- 滚动窗口7/10/15/30/60/90天
- 活跃度近7天/30天是否有服务
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
from .dws_helpers import mask_mobile, calc_days_since
class AssistantCustomerTask(BaseDwsTask):
"""
助教服务客户统计任务
统计每个助教与每个客户的服务关系:
- 首次/最近服务日期
- 累计服务统计
- 滚动窗口统计7/10/15/30/60/90天
- 活跃度指标
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列,供基类默认 load() 使用
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_ASSISTANT_CUSTOMER"
def get_target_table(self) -> str:
return "dws_assistant_customer_stats"
def get_primary_keys(self) -> List[str]:
return ["site_id", "assistant_id", "member_id", "stat_date"]
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
提取数据
"""
stat_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
self.logger.info(
"%s: 提取数据,统计日期 %s",
self.get_task_code(), stat_date
)
# 计算最大回溯日期90天窗口
lookback_start = stat_date - timedelta(days=90)
# 1. 获取助教-客户服务记录(包含历史全量用于累计统计)
service_pairs = self._extract_service_pairs(site_id, stat_date)
# 2. 获取会员信息
member_info = self._extract_member_info(site_id)
# 3. 获取助教信息
assistant_info = self._extract_assistant_info(site_id)
return {
'service_pairs': service_pairs,
'member_info': member_info,
'assistant_info': assistant_info,
'stat_date': stat_date,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据:计算各窗口统计
"""
service_pairs = extracted['service_pairs']
member_info = extracted['member_info']
assistant_info = extracted['assistant_info']
stat_date = extracted['stat_date']
site_id = extracted['site_id']
self.logger.info(
"%s: 转换数据,%d 条服务关系记录",
self.get_task_code(), len(service_pairs)
)
# 构建统计记录
results = []
for pair in service_pairs:
assistant_id = pair.get('assistant_id')
member_id = pair.get('member_id')
# 跳过散客
if self.is_guest(member_id):
continue
asst_info = assistant_info.get(assistant_id, {})
memb_info = member_info.get(member_id, {})
# 构建记录
record = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'assistant_id': assistant_id,
'assistant_nickname': asst_info.get('nickname', pair.get('assistant_nickname')),
'member_id': member_id,
'member_nickname': memb_info.get('nickname'),
'member_mobile': self._mask_mobile(memb_info.get('mobile')),
'stat_date': stat_date,
# 全量累计统计
'first_service_date': pair.get('first_service_date'),
'last_service_date': pair.get('last_service_date'),
'total_service_count': self.safe_int(pair.get('total_service_count', 0)),
'total_service_hours': self.safe_decimal(pair.get('total_service_hours', 0)),
'total_service_amount': self.safe_decimal(pair.get('total_service_amount', 0)),
# 滚动窗口统计
'service_count_7d': self.safe_int(pair.get('service_count_7d', 0)),
'service_count_10d': self.safe_int(pair.get('service_count_10d', 0)),
'service_count_15d': self.safe_int(pair.get('service_count_15d', 0)),
'service_count_30d': self.safe_int(pair.get('service_count_30d', 0)),
'service_count_60d': self.safe_int(pair.get('service_count_60d', 0)),
'service_count_90d': self.safe_int(pair.get('service_count_90d', 0)),
'service_hours_7d': self.safe_decimal(pair.get('service_hours_7d', 0)),
'service_hours_10d': self.safe_decimal(pair.get('service_hours_10d', 0)),
'service_hours_15d': self.safe_decimal(pair.get('service_hours_15d', 0)),
'service_hours_30d': self.safe_decimal(pair.get('service_hours_30d', 0)),
'service_hours_60d': self.safe_decimal(pair.get('service_hours_60d', 0)),
'service_hours_90d': self.safe_decimal(pair.get('service_hours_90d', 0)),
'service_amount_7d': self.safe_decimal(pair.get('service_amount_7d', 0)),
'service_amount_10d': self.safe_decimal(pair.get('service_amount_10d', 0)),
'service_amount_15d': self.safe_decimal(pair.get('service_amount_15d', 0)),
'service_amount_30d': self.safe_decimal(pair.get('service_amount_30d', 0)),
'service_amount_60d': self.safe_decimal(pair.get('service_amount_60d', 0)),
'service_amount_90d': self.safe_decimal(pair.get('service_amount_90d', 0)),
# 活跃度指标
'days_since_last': self._calc_days_since(stat_date, pair.get('last_service_date')),
'is_active_7d': self.safe_int(pair.get('service_count_7d', 0)) > 0,
'is_active_30d': self.safe_int(pair.get('service_count_30d', 0)) > 0,
}
results.append(record)
return results
# load() 已移除——使用 BaseDwsTask 默认实现DATE_COL="stat_date"
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_pairs(
self,
site_id: int,
stat_date: date
) -> List[Dict[str, Any]]:
"""
提取助教-客户服务统计(含滚动窗口)
"""
sql = """
WITH service_base AS (
SELECT
site_assistant_id AS assistant_id,
nickname AS assistant_nickname,
tenant_member_id AS member_id,
DATE(start_use_time) AS service_date,
income_seconds,
ledger_amount
FROM dwd.dwd_assistant_service_log
WHERE site_id = %s
AND tenant_member_id IS NOT NULL
AND tenant_member_id != 0
AND is_delete = 0
)
SELECT
assistant_id,
MAX(assistant_nickname) AS assistant_nickname,
member_id,
MIN(service_date) AS first_service_date,
MAX(service_date) AS last_service_date,
-- 全量累计
COUNT(*) AS total_service_count,
SUM(income_seconds) / 3600.0 AS total_service_hours,
SUM(ledger_amount) AS total_service_amount,
-- 7天窗口
COUNT(CASE WHEN service_date >= %s - INTERVAL '6 days' THEN 1 END) AS service_count_7d,
SUM(CASE WHEN service_date >= %s - INTERVAL '6 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_7d,
SUM(CASE WHEN service_date >= %s - INTERVAL '6 days' THEN ledger_amount ELSE 0 END) AS service_amount_7d,
-- 10天窗口
COUNT(CASE WHEN service_date >= %s - INTERVAL '9 days' THEN 1 END) AS service_count_10d,
SUM(CASE WHEN service_date >= %s - INTERVAL '9 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_10d,
SUM(CASE WHEN service_date >= %s - INTERVAL '9 days' THEN ledger_amount ELSE 0 END) AS service_amount_10d,
-- 15天窗口
COUNT(CASE WHEN service_date >= %s - INTERVAL '14 days' THEN 1 END) AS service_count_15d,
SUM(CASE WHEN service_date >= %s - INTERVAL '14 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_15d,
SUM(CASE WHEN service_date >= %s - INTERVAL '14 days' THEN ledger_amount ELSE 0 END) AS service_amount_15d,
-- 30天窗口
COUNT(CASE WHEN service_date >= %s - INTERVAL '29 days' THEN 1 END) AS service_count_30d,
SUM(CASE WHEN service_date >= %s - INTERVAL '29 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_30d,
SUM(CASE WHEN service_date >= %s - INTERVAL '29 days' THEN ledger_amount ELSE 0 END) AS service_amount_30d,
-- 60天窗口
COUNT(CASE WHEN service_date >= %s - INTERVAL '59 days' THEN 1 END) AS service_count_60d,
SUM(CASE WHEN service_date >= %s - INTERVAL '59 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_60d,
SUM(CASE WHEN service_date >= %s - INTERVAL '59 days' THEN ledger_amount ELSE 0 END) AS service_amount_60d,
-- 90天窗口
COUNT(CASE WHEN service_date >= %s - INTERVAL '89 days' THEN 1 END) AS service_count_90d,
SUM(CASE WHEN service_date >= %s - INTERVAL '89 days' THEN income_seconds ELSE 0 END) / 3600.0 AS service_hours_90d,
SUM(CASE WHEN service_date >= %s - INTERVAL '89 days' THEN ledger_amount ELSE 0 END) AS service_amount_90d
FROM service_base
GROUP BY assistant_id, member_id
HAVING MAX(service_date) >= %s - INTERVAL '90 days'
"""
# 构建参数每个窗口需要3个日期参数
params = [site_id]
for _ in range(6): # 6个窗口每个3个参数
params.extend([stat_date, stat_date, stat_date])
params.append(stat_date) # HAVING条件
rows = self.db.query(sql, tuple(params))
return [dict(row) for row in rows] if rows else []
def _extract_member_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取会员信息
"""
sql = """
SELECT
member_id,
nickname,
mobile
FROM dwd.dim_member
WHERE site_id = %s
"""
rows = self.db.query(sql, (site_id,))
result = {}
for row in (rows or []):
row_dict = dict(row)
result[row_dict['member_id']] = row_dict
return result
def _extract_assistant_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取助教信息
"""
sql = """
SELECT
assistant_id,
nickname
FROM dwd.dim_assistant
WHERE site_id = %s
AND scd2_is_current = 1
"""
rows = self.db.query(sql, (site_id,))
result = {}
for row in (rows or []):
row_dict = dict(row)
result[row_dict['assistant_id']] = row_dict
return result
# ==========================================================================
# 工具方法
# ==========================================================================
def _mask_mobile(self, mobile: Optional[str]) -> Optional[str]:
return mask_mobile(mobile)
def _calc_days_since(self, stat_date: date, last_date: Optional[date]) -> Optional[int]:
return calc_days_since(stat_date, last_date)
# 便于外部导入
__all__ = ['AssistantCustomerTask']

View File

@@ -0,0 +1,331 @@
# -*- coding: utf-8 -*-
"""
助教日度业绩明细任务
功能说明:
"助教+日期"为粒度,汇总每日业绩明细
数据来源:
- dwd_assistant_service_log: 助教服务流水
- dwd_assistant_trash_event: 废除记录(排除)
- dim_assistant: 助教维度SCD2获取当日等级
- cfg_skill_type: 技能→课程类型映射
目标表:
dws.dws_assistant_daily_detail
更新策略:
- 更新频率:每小时增量更新
- 幂等方式delete-before-insert按日期窗口
业务规则:
- 有效业绩需排除dwd_assistant_trash_event中的废除记录
- 助教等级使用SCD2 as-of取值获取统计日当日生效的等级
- 课程类型通过skill_id映射分为基础课和附加课
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, CourseType, TaskContext
class AssistantDailyTask(BaseDwsTask):
"""
助教日度业绩明细任务
汇总每个助教每天的:
- 服务次数(总/基础课/附加课)
- 计费时长(秒/小时)
- 计费金额
- 服务客户数(去重)
- 服务台桌数(去重)
- 被废除的记录统计
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列,供基类默认 load() 使用
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_ASSISTANT_DAILY"
def get_target_table(self) -> str:
return "dws_assistant_daily_detail"
def get_primary_keys(self) -> List[str]:
return ["site_id", "assistant_id", "stat_date"]
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
提取数据从DWD层读取助教服务记录
"""
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
self.logger.info(
"%s: 提取数据,日期范围 %s ~ %s",
self.get_task_code(), start_date, end_date
)
# 1. 获取助教服务记录
service_records = self._extract_service_records(site_id, start_date, end_date)
# 2. 获取废除记录
trash_records = self._extract_trash_records(site_id, start_date, end_date)
# 3. 加载配置缓存
self.load_config_cache()
return {
'service_records': service_records,
'trash_records': trash_records,
'start_date': start_date,
'end_date': end_date,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据:按助教+日期聚合
"""
service_records = extracted['service_records']
trash_records = extracted['trash_records']
site_id = extracted['site_id']
self.logger.info(
"%s: 转换数据,服务记录 %d 条,废除记录 %d",
self.get_task_code(), len(service_records), len(trash_records)
)
# 构建废除记录索引assistant_service_id -> trash_info
trash_index = self._build_trash_index(trash_records)
# 按助教+日期聚合
aggregated = self._aggregate_by_assistant_date(
service_records,
trash_index,
site_id
)
return aggregated
# load() 已移除——使用 BaseDwsTask 默认实现DATE_COL="stat_date"
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_service_records(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取助教服务记录
"""
sql = """
SELECT
asl.assistant_service_id,
asl.order_settle_id,
asl.site_assistant_id AS assistant_id,
asl.nickname AS assistant_nickname,
asl.assistant_level,
asl.skill_id,
asl.skill_name,
asl.tenant_member_id AS member_id,
asl.site_table_id AS table_id,
asl.income_seconds,
asl.real_use_seconds,
asl.ledger_amount,
asl.ledger_unit_price,
DATE(asl.start_use_time) AS service_date
FROM dwd.dwd_assistant_service_log asl
WHERE asl.site_id = %s
AND DATE(asl.start_use_time) >= %s
AND DATE(asl.start_use_time) <= %s
AND asl.is_delete = 0
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_trash_records(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取废除记录
有效业绩的排除规则:仅对"助教废除表"的记录进行处理排除
"""
sql = """
SELECT
assistant_service_id,
trash_seconds,
trash_reason,
trash_time
FROM dwd.dwd_assistant_trash_event
WHERE site_id = %s
AND DATE(trash_time) >= %s
AND DATE(trash_time) <= %s
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
# ==========================================================================
# 数据转换方法
# ==========================================================================
def _build_trash_index(
self,
trash_records: List[Dict[str, Any]]
) -> Dict[int, Dict[str, Any]]:
"""
构建废除记录索引
"""
index = {}
for record in trash_records:
service_id = record.get('assistant_service_id')
if service_id:
index[service_id] = record
return index
def _aggregate_by_assistant_date(
self,
service_records: List[Dict[str, Any]],
trash_index: Dict[int, Dict[str, Any]],
site_id: int
) -> List[Dict[str, Any]]:
"""
按助教+日期聚合服务记录
"""
# 聚合字典:(assistant_id, service_date) -> aggregated_data
agg_dict: Dict[Tuple[int, date], Dict[str, Any]] = {}
for record in service_records:
assistant_id = record.get('assistant_id')
service_date = record.get('service_date')
if not assistant_id or not service_date:
continue
key = (assistant_id, service_date)
# 初始化聚合数据
if key not in agg_dict:
# 获取助教当日等级SCD2 as-of
level_info = self.get_assistant_level_asof(assistant_id, service_date)
agg_dict[key] = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'assistant_id': assistant_id,
'assistant_nickname': record.get('assistant_nickname'),
'stat_date': service_date,
'assistant_level_code': level_info.get('level_code') if level_info else record.get('assistant_level'),
'assistant_level_name': level_info.get('level_name') if level_info else None,
'total_service_count': 0,
'base_service_count': 0,
'bonus_service_count': 0,
'room_service_count': 0,
'total_seconds': 0,
'base_seconds': 0,
'bonus_seconds': 0,
'room_seconds': 0,
'total_hours': Decimal('0'),
'base_hours': Decimal('0'),
'bonus_hours': Decimal('0'),
'room_hours': Decimal('0'),
'total_ledger_amount': Decimal('0'),
'base_ledger_amount': Decimal('0'),
'bonus_ledger_amount': Decimal('0'),
'room_ledger_amount': Decimal('0'),
'unique_customers': set(),
'unique_tables': set(),
'trashed_seconds': 0,
'trashed_count': 0,
}
agg = agg_dict[key]
# 获取服务信息
service_id = record.get('assistant_service_id')
income_seconds = self.safe_int(record.get('income_seconds', 0))
ledger_amount = self.safe_decimal(record.get('ledger_amount', 0))
skill_id = record.get('skill_id')
member_id = record.get('member_id')
table_id = record.get('table_id')
# 判断课程类型
course_type = self.get_course_type(skill_id) if skill_id else CourseType.BASE
is_base = course_type == CourseType.BASE
is_bonus = course_type == CourseType.BONUS
is_room = course_type == CourseType.ROOM
# 检查是否被废除
is_trashed = service_id in trash_index
if is_trashed:
# 废除记录单独统计
trash_info = trash_index[service_id]
trash_seconds = self.safe_int(trash_info.get('trash_seconds', income_seconds))
agg['trashed_seconds'] += trash_seconds
agg['trashed_count'] += 1
else:
# 正常记录累加
agg['total_service_count'] += 1
agg['total_seconds'] += income_seconds
agg['total_ledger_amount'] += ledger_amount
if is_base:
agg['base_service_count'] += 1
agg['base_seconds'] += income_seconds
agg['base_ledger_amount'] += ledger_amount
elif is_bonus:
agg['bonus_service_count'] += 1
agg['bonus_seconds'] += income_seconds
agg['bonus_ledger_amount'] += ledger_amount
elif is_room:
agg['room_service_count'] += 1
agg['room_seconds'] += income_seconds
agg['room_ledger_amount'] += ledger_amount
# 客户和台桌去重统计(不论是否废除)
if member_id and not self.is_guest(member_id):
agg['unique_customers'].add(member_id)
if table_id:
agg['unique_tables'].add(table_id)
# 转换为列表并计算派生字段
result = []
for key, agg in agg_dict.items():
# 计算小时数
agg['total_hours'] = self.seconds_to_hours(agg['total_seconds'])
agg['base_hours'] = self.seconds_to_hours(agg['base_seconds'])
agg['bonus_hours'] = self.seconds_to_hours(agg['bonus_seconds'])
agg['room_hours'] = self.seconds_to_hours(agg['room_seconds'])
# 转换set为count
agg['unique_customers'] = len(agg['unique_customers'])
agg['unique_tables'] = len(agg['unique_tables'])
result.append(agg)
return result
# 便于外部导入
__all__ = ['AssistantDailyTask']

View File

@@ -0,0 +1,198 @@
# -*- coding: utf-8 -*-
"""
助教收支分析任务
功能说明:
"日期+助教"为粒度,分析助教产出的收入和成本
数据来源:
- dwd_assistant_service_log: 助教服务流水(收入)
- dws_assistant_salary_calc: 工资计算(成本)
目标表:
dws.dws_assistant_finance_analysis
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按日期
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, CourseType, TaskContext
class AssistantFinanceTask(BaseDwsTask):
"""
助教收支分析任务
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列,供基类默认 load() 使用
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_ASSISTANT_FINANCE"
def get_target_table(self) -> str:
return "dws_assistant_finance_analysis"
def get_primary_keys(self) -> List[str]:
return ["site_id", "stat_date", "assistant_id"]
def extract(self, context: TaskContext) -> Dict[str, Any]:
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
# 获取助教日度收入
daily_revenue = self._extract_daily_revenue(site_id, start_date, end_date)
# 获取月度工资(用于计算日均成本)
monthly_salary = self._extract_monthly_salary(site_id, start_date, end_date)
# 加载配置
self.load_config_cache()
return {
'daily_revenue': daily_revenue,
'monthly_salary': monthly_salary,
'start_date': start_date,
'end_date': end_date,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
daily_revenue = extracted['daily_revenue']
monthly_salary = extracted['monthly_salary']
site_id = extracted['site_id']
# 构建月度工资索引
salary_index = {}
for sal in monthly_salary:
asst_id = sal.get('assistant_id')
month = sal.get('salary_month')
if asst_id and month:
salary_index[(asst_id, month)] = sal
results = []
for rev in daily_revenue:
assistant_id = rev.get('assistant_id')
stat_date = rev.get('stat_date')
# 获取对应月份的工资
month_start = stat_date.replace(day=1) if isinstance(stat_date, date) else None
salary = salary_index.get((assistant_id, month_start), {})
# 计算日均成本
gross_salary = self.safe_decimal(salary.get('gross_salary', 0))
work_days = self.safe_int(salary.get('work_days', 1)) or 1
cost_daily = gross_salary / Decimal(str(work_days))
revenue_total = self.safe_decimal(rev.get('revenue_total', 0))
gross_profit = revenue_total - cost_daily
gross_margin = gross_profit / revenue_total if revenue_total > 0 else Decimal('0')
record = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'stat_date': stat_date,
'assistant_id': assistant_id,
'assistant_nickname': rev.get('assistant_nickname'),
'revenue_total': revenue_total,
'revenue_base': self.safe_decimal(rev.get('revenue_base', 0)),
'revenue_bonus': self.safe_decimal(rev.get('revenue_bonus', 0)),
'revenue_room': self.safe_decimal(rev.get('revenue_room', 0)),
'cost_daily': cost_daily,
'gross_profit': gross_profit,
'gross_margin': gross_margin,
'service_count': self.safe_int(rev.get('service_count', 0)),
'service_hours': self.safe_decimal(rev.get('service_hours', 0)),
'room_service_count': self.safe_int(rev.get('room_service_count', 0)),
'room_service_hours': self.safe_decimal(rev.get('room_service_hours', 0)),
'unique_customers': self.safe_int(rev.get('unique_customers', 0)),
}
results.append(record)
return results
# load() 已移除——使用 BaseDwsTask 默认实现DATE_COL="stat_date"
def _extract_daily_revenue(self, site_id: int, start_date: date, end_date: date) -> List[Dict[str, Any]]:
sql = """
SELECT
DATE(s.start_use_time) AS stat_date,
s.site_assistant_id AS assistant_id,
MAX(s.nickname) AS assistant_nickname,
COUNT(*) AS service_count,
SUM(s.income_seconds) / 3600.0 AS service_hours,
SUM(s.ledger_amount) AS revenue_total,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'BASE' THEN s.ledger_amount ELSE 0 END) AS revenue_base,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'BONUS' THEN s.ledger_amount ELSE 0 END) AS revenue_bonus,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'ROOM' THEN s.ledger_amount ELSE 0 END) AS revenue_room,
COUNT(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'ROOM' THEN 1 END) AS room_service_count,
SUM(CASE WHEN COALESCE(st.course_type_code, 'BASE') = 'ROOM' THEN s.income_seconds ELSE 0 END) / 3600.0 AS room_service_hours,
COUNT(DISTINCT CASE WHEN s.tenant_member_id > 0 THEN s.tenant_member_id END) AS unique_customers
FROM dwd.dwd_assistant_service_log s
LEFT JOIN dws.cfg_skill_type st
ON st.skill_id = s.skill_id AND st.is_active = TRUE
WHERE s.site_id = %s
AND DATE(s.start_use_time) >= %s
AND DATE(s.start_use_time) <= %s
AND s.is_delete = 0
GROUP BY DATE(s.start_use_time), s.site_assistant_id
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_monthly_salary(self, site_id: int, start_date: date, end_date: date) -> List[Dict[str, Any]]:
# 获取涉及的月份
month_start = start_date.replace(day=1)
month_end = end_date.replace(day=1)
sql = """
SELECT
assistant_id,
salary_month,
gross_salary,
effective_hours
FROM dws.dws_assistant_salary_calc
WHERE site_id = %s
AND salary_month >= %s
AND salary_month <= %s
"""
rows = self.db.query(sql, (site_id, month_start, month_end))
# 获取每月工作天数
work_days_sql = """
SELECT
assistant_id,
DATE_TRUNC('month', stat_date)::DATE AS month,
COUNT(DISTINCT stat_date) AS work_days
FROM dws.dws_assistant_daily_detail
WHERE site_id = %s
AND stat_date >= %s
AND stat_date <= %s
GROUP BY assistant_id, DATE_TRUNC('month', stat_date)
"""
work_days_rows = self.db.query(work_days_sql, (site_id, start_date, end_date))
work_days_index = {(r['assistant_id'], r['month']): r['work_days'] for r in (work_days_rows or [])}
results = []
for row in (rows or []):
row_dict = dict(row)
asst_id = row_dict.get('assistant_id')
month = row_dict.get('salary_month')
row_dict['work_days'] = work_days_index.get((asst_id, month), 20)
results.append(row_dict)
return results
__all__ = ['AssistantFinanceTask']

View File

@@ -0,0 +1,603 @@
# -*- coding: utf-8 -*-
"""
助教月度业绩汇总任务
功能说明:
"助教+月份"为粒度,汇总月度业绩及档位计算
数据来源:
- dws_assistant_daily_detail: 日度明细(聚合)
- dim_assistant: 助教维度(入职日期、等级)
- cfg_performance_tier: 绩效档位配置
目标表:
dws.dws_assistant_monthly_summary
更新策略:
- 更新频率:每日更新当月数据
- 幂等方式delete-before-insert按月份
业务规则:
- 新入职判断入职日期在月1日0点之后则为新入职
- 有效业绩total_hours - trashed_hours
- 档位匹配根据有效业绩小时数匹配cfg_performance_tier
- 排名计算按有效业绩小时数降序考虑并列如2个第一则都是1下一个是3
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class AssistantMonthlyTask(BaseDwsTask):
"""
助教月度业绩汇总任务
汇总每个助教每月的:
- 工作天数、服务次数、时长
- 有效业绩(扣除废除记录后)
- 档位匹配
- 月度排名用于Top3奖金
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列(月度粒度)
DATE_COL = "stat_month"
def get_task_code(self) -> str:
return "DWS_ASSISTANT_MONTHLY"
def get_target_table(self) -> str:
return "dws_assistant_monthly_summary"
def get_primary_keys(self) -> List[str]:
return ["site_id", "assistant_id", "stat_month"]
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
提取数据:从日度明细表聚合
"""
# 确定月份范围
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
# 获取涉及的月份列表
months = self._get_months_in_range(start_date, end_date)
months = self._filter_months_for_schedule(months, end_date)
self.logger.info(
"%s: 提取数据,月份范围 %s",
self.get_task_code(), [str(m) for m in months]
)
if not months:
self.logger.info("%s: 无需处理月份,跳过", self.get_task_code())
return {
'daily_aggregates': [],
'monthly_uniques': [],
'assistant_info': {},
'months': [],
'site_id': site_id
}
# 1. 获取日度明细聚合数据
daily_aggregates = self._extract_daily_aggregates(site_id, months)
# 1.1 获取月度去重客户/台桌统计从DWD直接去重
monthly_uniques = self._extract_monthly_uniques(site_id, months)
# 2. 获取助教基本信息
assistant_info = self._extract_assistant_info(site_id)
# 3. 加载配置缓存
self.load_config_cache()
return {
'daily_aggregates': daily_aggregates,
'monthly_uniques': monthly_uniques,
'assistant_info': assistant_info,
'months': months,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据:计算月度汇总、档位匹配、排名
"""
daily_aggregates = extracted['daily_aggregates']
monthly_uniques = extracted['monthly_uniques']
assistant_info = extracted['assistant_info']
months = extracted['months']
site_id = extracted['site_id']
self.logger.info(
"%s: 转换数据,%d 个月份,%d 条聚合记录",
self.get_task_code(), len(months), len(daily_aggregates)
)
# 月度去重索引
monthly_unique_index = {
(row.get('assistant_id'), row.get('stat_month')): row
for row in (monthly_uniques or [])
if row.get('assistant_id') and row.get('stat_month')
}
# 按月份处理
all_results = []
for month in months:
month_results = self._process_month(
daily_aggregates,
assistant_info,
monthly_unique_index,
month,
site_id
)
all_results.extend(month_results)
return all_results
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
"""
加载数据写入DWS表
"""
if not transformed:
self.logger.info("%s: 无数据需要写入", self.get_task_code())
return {"counts": {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}}
# 删除已存在的数据(按月份)
deleted = self._delete_by_months(context, transformed)
# 批量插入
inserted = self.bulk_insert(transformed)
self.logger.info(
"%s: 加载完成,删除 %d 行,插入 %d",
self.get_task_code(), deleted, inserted
)
return {
"counts": {
"fetched": len(transformed),
"inserted": inserted,
"updated": 0,
"skipped": 0,
"errors": 0
},
"extra": {"deleted": deleted}
}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _get_months_in_range(self, start_date: date, end_date: date) -> List[date]:
"""
获取日期范围内的所有月份(月第一天)
"""
months = []
current = start_date.replace(day=1)
end_month = end_date.replace(day=1)
while current <= end_month:
months.append(current)
# 下个月
if current.month == 12:
current = current.replace(year=current.year + 1, month=1)
else:
current = current.replace(month=current.month + 1)
return months
def _filter_months_for_schedule(self, months: List[date], end_date: date) -> List[date]:
"""
按调度口径过滤历史月份(默认仅当月,月初可包含上月)
"""
if not months:
return []
history_months = self.safe_int(self.config.get("dws.monthly.history_months", 0))
if history_months > 0:
current_month = self.get_month_first_day(end_date)
allowed = {current_month}
for offset in range(1, history_months + 1):
allowed.add(self.get_month_first_day(self._shift_months(current_month, -offset)))
filtered = [m for m in months if m in allowed]
skipped = [m for m in months if m not in allowed]
if skipped:
self.logger.info(
"%s: 跳过历史月份 %s",
self.get_task_code(),
[str(m) for m in skipped]
)
return filtered
allow_history = bool(self.config.get("dws.monthly.allow_history", False))
if allow_history:
return months
current_month = self.get_month_first_day(end_date)
allowed = {current_month}
grace_days = self.safe_int(self.config.get("dws.monthly.prev_month_grace_days", 5))
if grace_days > 0 and end_date.day <= grace_days:
prev_month = self.get_month_first_day(self._shift_months(current_month, -1))
allowed.add(prev_month)
filtered = [m for m in months if m in allowed]
skipped = [m for m in months if m not in allowed]
if skipped:
self.logger.info(
"%s: 跳过历史月份 %s",
self.get_task_code(),
[str(m) for m in skipped]
)
return filtered
def _extract_daily_aggregates(
self,
site_id: int,
months: List[date]
) -> List[Dict[str, Any]]:
"""
从日度明细表提取并按月聚合
"""
if not months:
return []
# 构建月份条件
month_conditions = []
for month in months:
next_month = (month.replace(day=28) + timedelta(days=4)).replace(day=1)
month_conditions.append(f"(stat_date >= '{month}' AND stat_date < '{next_month}')")
month_where = " OR ".join(month_conditions)
sql = f"""
SELECT
assistant_id,
assistant_nickname,
assistant_level_code,
assistant_level_name,
DATE_TRUNC('month', stat_date)::DATE AS stat_month,
COUNT(DISTINCT stat_date) AS work_days,
SUM(total_service_count) AS total_service_count,
SUM(base_service_count) AS base_service_count,
SUM(bonus_service_count) AS bonus_service_count,
SUM(room_service_count) AS room_service_count,
SUM(total_hours) AS total_hours,
SUM(base_hours) AS base_hours,
SUM(bonus_hours) AS bonus_hours,
SUM(room_hours) AS room_hours,
SUM(total_ledger_amount) AS total_ledger_amount,
SUM(base_ledger_amount) AS base_ledger_amount,
SUM(bonus_ledger_amount) AS bonus_ledger_amount,
SUM(room_ledger_amount) AS room_ledger_amount,
SUM(unique_customers) AS total_unique_customers,
SUM(unique_tables) AS total_unique_tables,
SUM(trashed_seconds) AS trashed_seconds,
SUM(trashed_count) AS trashed_count
FROM dws.dws_assistant_daily_detail
WHERE site_id = %s AND ({month_where})
GROUP BY assistant_id, assistant_nickname, assistant_level_code, assistant_level_name,
DATE_TRUNC('month', stat_date)
"""
rows = self.db.query(sql, (site_id,))
return [dict(row) for row in rows] if rows else []
def _extract_monthly_uniques(
self,
site_id: int,
months: List[date]
) -> List[Dict[str, Any]]:
"""
从DWD按月直接去重客户与台桌
"""
if not months:
return []
start_month = min(months)
end_month = max(months)
next_month = (end_month.replace(day=28) + timedelta(days=4)).replace(day=1)
sql = """
SELECT
site_assistant_id AS assistant_id,
DATE_TRUNC('month', start_use_time)::DATE AS stat_month,
COUNT(DISTINCT CASE WHEN tenant_member_id > 0 THEN tenant_member_id END) AS unique_customers,
COUNT(DISTINCT site_table_id) AS unique_tables
FROM dwd.dwd_assistant_service_log
WHERE site_id = %s
AND start_use_time >= %s
AND start_use_time < %s
AND is_delete = 0
GROUP BY site_assistant_id, DATE_TRUNC('month', start_use_time)
"""
rows = self.db.query(sql, (site_id, start_month, next_month))
return [dict(row) for row in rows] if rows else []
def _extract_assistant_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取助教基本信息
"""
sql = """
SELECT
assistant_id,
nickname,
level AS assistant_level,
entry_time AS hire_date
FROM dwd.dim_assistant
WHERE site_id = %s
AND scd2_is_current = 1 -- 当前有效记录
"""
rows = self.db.query(sql, (site_id,))
result = {}
for row in (rows or []):
row_dict = dict(row)
result[row_dict['assistant_id']] = row_dict
return result
# ==========================================================================
# 数据转换方法
# ==========================================================================
def _process_month(
self,
daily_aggregates: List[Dict[str, Any]],
assistant_info: Dict[int, Dict[str, Any]],
monthly_unique_index: Dict[Tuple[int, date], Dict[str, Any]],
month: date,
site_id: int
) -> List[Dict[str, Any]]:
"""
处理单个月份的数据
"""
# 筛选该月份的数据
month_data = [
agg for agg in daily_aggregates
if agg.get('stat_month') == month
]
if not month_data:
return []
# 构建月度汇总记录
month_records = []
for agg in month_data:
assistant_id = agg.get('assistant_id')
asst_info = assistant_info.get(assistant_id, {})
# 计算有效业绩
total_hours = self.safe_decimal(agg.get('total_hours', 0))
trashed_hours = self.seconds_to_hours(self.safe_int(agg.get('trashed_seconds', 0)))
effective_hours = total_hours - trashed_hours
# 判断是否新入职
hire_date = asst_info.get('hire_date')
is_new_hire = False
if hire_date:
if isinstance(hire_date, datetime):
hire_date = hire_date.date()
is_new_hire = self.is_new_hire_in_month(hire_date, month)
# 匹配档位
tier_hours = effective_hours
max_tier_level = None
if is_new_hire:
tier_hours = self._calc_new_hire_tier_hours(effective_hours, self.safe_int(agg.get('work_days', 0)))
if self._should_apply_new_hire_tier_cap(month, hire_date):
max_tier_level = self._get_new_hire_max_tier_level()
tier = self.get_performance_tier(
tier_hours,
is_new_hire,
effective_date=month,
max_tier_level=max_tier_level
)
# 获取月末的等级信息(用于记录)
month_end = self._get_month_end(month)
level_info = self.get_assistant_level_asof(assistant_id, month_end)
# 月度去重客户/台桌从DWD直接去重
unique_info = monthly_unique_index.get((assistant_id, month), {})
unique_customers = self.safe_int(
unique_info.get('unique_customers', agg.get('total_unique_customers', 0))
)
unique_tables = self.safe_int(
unique_info.get('unique_tables', agg.get('total_unique_tables', 0))
)
record = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'assistant_id': assistant_id,
'assistant_nickname': agg.get('assistant_nickname'),
'stat_month': month,
'assistant_level_code': level_info.get('level_code') if level_info else agg.get('assistant_level_code'),
'assistant_level_name': level_info.get('level_name') if level_info else agg.get('assistant_level_name'),
'hire_date': hire_date,
'is_new_hire': is_new_hire,
'work_days': self.safe_int(agg.get('work_days', 0)),
'total_service_count': self.safe_int(agg.get('total_service_count', 0)),
'base_service_count': self.safe_int(agg.get('base_service_count', 0)),
'bonus_service_count': self.safe_int(agg.get('bonus_service_count', 0)),
'room_service_count': self.safe_int(agg.get('room_service_count', 0)),
'total_hours': total_hours,
'base_hours': self.safe_decimal(agg.get('base_hours', 0)),
'bonus_hours': self.safe_decimal(agg.get('bonus_hours', 0)),
'room_hours': self.safe_decimal(agg.get('room_hours', 0)),
'effective_hours': effective_hours,
'trashed_hours': trashed_hours,
'total_ledger_amount': self.safe_decimal(agg.get('total_ledger_amount', 0)),
'base_ledger_amount': self.safe_decimal(agg.get('base_ledger_amount', 0)),
'bonus_ledger_amount': self.safe_decimal(agg.get('bonus_ledger_amount', 0)),
'room_ledger_amount': self.safe_decimal(agg.get('room_ledger_amount', 0)),
'unique_customers': unique_customers,
'unique_tables': unique_tables,
'avg_service_seconds': self._calc_avg_service_seconds(agg),
'tier_id': tier.get('tier_id') if tier else None,
'tier_code': tier.get('tier_code') if tier else None,
'tier_name': tier.get('tier_name') if tier else None,
'rank_by_hours': None, # 后面计算
'rank_with_ties': None, # 后面计算
}
month_records.append(record)
# 计算排名
self._calculate_ranks(month_records)
return month_records
def _get_month_end(self, month: date) -> date:
"""
获取月末日期
"""
if month.month == 12:
next_month = month.replace(year=month.year + 1, month=1, day=1)
else:
next_month = month.replace(month=month.month + 1, day=1)
return next_month - timedelta(days=1)
def _calc_avg_service_seconds(self, agg: Dict[str, Any]) -> Decimal:
"""
计算平均单次服务时长
"""
total_count = self.safe_int(agg.get('total_service_count', 0))
if total_count == 0:
return Decimal('0')
total_hours = self.safe_decimal(agg.get('total_hours', 0))
total_seconds = total_hours * Decimal('3600')
return total_seconds / Decimal(str(total_count))
def _calc_new_hire_tier_hours(self, effective_hours: Decimal, work_days: int) -> Decimal:
"""
新入职定档:日均 * 30仅用于定档不影响奖金与排名
"""
if work_days <= 0:
return Decimal('0')
return (effective_hours / Decimal(str(work_days))) * Decimal('30')
def _should_apply_new_hire_tier_cap(self, stat_month: date, hire_date: Optional[date]) -> bool:
"""
新入职封顶规则是否生效:
- 仅在规则生效月及之后(默认 2026-03-01 起)
- 仅当入职日期晚于封顶日(默认当月 25 日)
"""
if not hire_date:
return False
effective_from = self._get_new_hire_cap_effective_from()
cap_day = self._get_new_hire_cap_day()
return stat_month >= effective_from and hire_date.day > cap_day
def _get_new_hire_cap_effective_from(self) -> date:
"""
获取新入职封顶规则生效月份(默认 2026-03-01
"""
raw_value = self.config.get("dws.monthly.new_hire_cap_effective_from", "2026-03-01")
if isinstance(raw_value, datetime):
return raw_value.date()
if isinstance(raw_value, date):
return raw_value
if isinstance(raw_value, str):
try:
return datetime.strptime(raw_value.strip(), "%Y-%m-%d").date()
except ValueError:
pass
return date(2026, 3, 1)
def _get_new_hire_cap_day(self) -> int:
"""
获取新入职封顶日(默认 25
"""
value = self.safe_int(self.config.get("dws.monthly.new_hire_cap_day", 25))
return min(max(value, 1), 31)
def _get_new_hire_max_tier_level(self) -> int:
"""
获取新入职封顶档位等级(默认 2 档)
"""
value = self.safe_int(self.config.get("dws.monthly.new_hire_max_tier_level", 2))
return max(value, 0)
def _calculate_ranks(self, records: List[Dict[str, Any]]) -> None:
"""
计算排名(考虑并列)
Top3排名口径按有效业绩总小时数排名
如遇并列则都算比如2个第一则记为2个第一一个第三
"""
if not records:
return
# 按有效业绩降序排序
sorted_records = sorted(
records,
key=lambda x: x.get('effective_hours', Decimal('0')),
reverse=True
)
# 计算考虑并列的排名
values = [
(r.get('assistant_id'), r.get('effective_hours', Decimal('0')))
for r in sorted_records
]
ranked = self.calculate_rank_with_ties(values)
# 创建排名映射
rank_map = {
assistant_id: (rank, dense_rank)
for assistant_id, rank, dense_rank in ranked
}
# 更新记录
for record in records:
assistant_id = record.get('assistant_id')
if assistant_id in rank_map:
rank, _ = rank_map[assistant_id]
record['rank_by_hours'] = rank
record['rank_with_ties'] = rank # 使用考虑并列的排名
def _delete_by_months(
self,
context: TaskContext,
records: List[Dict[str, Any]]
) -> int:
"""
按月份删除已存在的数据
"""
# 获取涉及的月份
months = set(r.get('stat_month') for r in records if r.get('stat_month'))
if not months:
return 0
target_table = self.get_target_table()
full_table = f"{self.DWS_SCHEMA}.{target_table}"
total_deleted = 0
with self.db.conn.cursor() as cur:
for month in months:
sql = f"""
DELETE FROM {full_table}
WHERE site_id = %s AND stat_month = %s
"""
cur.execute(sql, (context.store_id, month))
total_deleted += cur.rowcount
return total_deleted
# 便于外部导入
__all__ = ['AssistantMonthlyTask']

View File

@@ -0,0 +1,440 @@
# -*- coding: utf-8 -*-
"""
助教工资计算任务
功能说明:
"助教+月份"为粒度,计算月度工资明细
数据来源:
- dws_assistant_monthly_summary: 月度业绩汇总
- dws_assistant_recharge_commission: 充值提成Excel导入
- cfg_performance_tier: 绩效档位配置
- cfg_assistant_level_price: 等级定价配置
- cfg_bonus_rules: 奖金规则配置
目标表:
dws.dws_assistant_salary_calc
更新策略:
- 更新频率:月初计算上月工资
- 幂等方式delete-before-insert按月份
业务规则来自DWS数据库处理需求.md
- 基础课收入 = 基础课小时数 × (客户支付价格 - 专业课抽成)
中级助教基础课170小时3档 = 170 × (108 - 13) = 16,150元
- 附加课收入 = 附加课小时数 × 附加课价格 × (1 - 打赏课抽成比例)
附加课15小时3档 = 15 × 190 × (1 - 0.35) = 1,852.5元
- 包厢课收入 = 包厢课小时数 × (包厢课客户支付价格 - 专业课抽成)
- 冲刺奖金:按规则表配置(历史口径,不累计取最高档)
- Top3奖金1st:1000, 2nd:600, 3rd:400并列都算
- 充值提成来自dws_assistant_recharge_commission
- SCD2口径等级定价使用月份对应的历史值
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class AssistantSalaryTask(BaseDwsTask):
"""
助教工资计算任务
计算每个助教每月的工资明细:
- 课时收入(基础课+附加课)
- 扣款(档位扣款+其他)
- 奖金(档位奖金+冲刺+Top3+充值提成+其他)
- 应发工资
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列(月度粒度,按 salary_month 删除)
DATE_COL = "salary_month"
def get_task_code(self) -> str:
return "DWS_ASSISTANT_SALARY"
def get_target_table(self) -> str:
return "dws_assistant_salary_calc"
def get_primary_keys(self) -> List[str]:
return ["site_id", "assistant_id", "salary_month"]
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
提取数据
"""
# 确定工资月份(通常是上月)
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
if self._should_skip_run(end_date):
self.logger.info("%s: 非工资结算期,跳过", self.get_task_code())
return {
'monthly_summary': [],
'recharge_commission': [],
'salary_month': None,
'site_id': context.store_id,
}
salary_month = self._get_salary_month(end_date)
site_id = context.store_id
self.logger.info(
"%s: 提取数据,工资月份 %s",
self.get_task_code(), salary_month
)
# 1. 获取月度业绩汇总
monthly_summary = self._extract_monthly_summary(site_id, salary_month)
# 2. 获取充值提成
recharge_commission = self._extract_recharge_commission(site_id, salary_month)
# 3. 加载配置缓存
self.load_config_cache()
return {
'monthly_summary': monthly_summary,
'recharge_commission': recharge_commission,
'salary_month': salary_month,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据:计算工资
"""
if not extracted.get('salary_month'):
return []
monthly_summary = extracted['monthly_summary']
recharge_commission = extracted['recharge_commission']
salary_month = extracted['salary_month']
site_id = extracted['site_id']
self.logger.info(
"%s: 转换数据,%d 条月度汇总记录",
self.get_task_code(), len(monthly_summary)
)
# 构建充值提成索引
commission_index = {}
for comm in recharge_commission:
asst_id = comm.get('assistant_id')
if asst_id:
commission_index[asst_id] = commission_index.get(asst_id, Decimal('0')) + \
self.safe_decimal(comm.get('commission_amount', 0))
# 计算工资
results = []
for summary in monthly_summary:
record = self._calculate_salary(summary, commission_index, salary_month, site_id)
results.append(record)
return results
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
"""
加载数据
"""
if not transformed:
self.logger.info("%s: 无数据需要写入", self.get_task_code())
return {"counts": {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}}
# 删除已存在的数据
deleted = self._delete_by_month(context, transformed)
# 批量插入
inserted = self.bulk_insert(transformed)
self.logger.info(
"%s: 加载完成,删除 %d 行,插入 %d",
self.get_task_code(), deleted, inserted
)
return {
"counts": {
"fetched": len(transformed),
"inserted": inserted,
"updated": 0,
"skipped": 0,
"errors": 0
},
"extra": {"deleted": deleted}
}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _get_salary_month(self, end_date: date) -> date:
"""
获取工资月份(默认为上月)
"""
# 如果是月初,计算上月工资
if end_date.day <= 5:
if end_date.month == 1:
return date(end_date.year - 1, 12, 1)
else:
return date(end_date.year, end_date.month - 1, 1)
else:
# 否则计算当月(可能是调整)
return end_date.replace(day=1)
def _should_skip_run(self, end_date: date) -> bool:
"""
工资计算仅在月初运行(默认前 N 天)
"""
allow_out_of_cycle = bool(self.config.get("dws.salary.allow_out_of_cycle", False))
if allow_out_of_cycle:
return False
run_days = self.safe_int(self.config.get("dws.salary.run_days", 5))
if run_days <= 0:
return False
return end_date.day > run_days
def _extract_monthly_summary(
self,
site_id: int,
salary_month: date
) -> List[Dict[str, Any]]:
"""
提取月度业绩汇总
"""
sql = """
SELECT
assistant_id,
assistant_nickname,
stat_month,
assistant_level_code,
assistant_level_name,
hire_date,
is_new_hire,
effective_hours,
base_hours,
bonus_hours,
room_hours,
tier_id,
tier_code,
tier_name,
rank_with_ties
FROM dws.dws_assistant_monthly_summary
WHERE site_id = %s AND stat_month = %s
"""
rows = self.db.query(sql, (site_id, salary_month))
return [dict(row) for row in rows] if rows else []
def _extract_recharge_commission(
self,
site_id: int,
salary_month: date
) -> List[Dict[str, Any]]:
"""
提取充值提成
"""
sql = """
SELECT
assistant_id,
commission_amount
FROM dws.dws_assistant_recharge_commission
WHERE site_id = %s AND commission_month = %s
"""
rows = self.db.query(sql, (site_id, salary_month))
return [dict(row) for row in rows] if rows else []
# ==========================================================================
# 工资计算方法
# ==========================================================================
def _calculate_salary(
self,
summary: Dict[str, Any],
commission_index: Dict[int, Decimal],
salary_month: date,
site_id: int
) -> Dict[str, Any]:
"""
计算单个助教的月度工资
"""
assistant_id = summary.get('assistant_id')
level_code = summary.get('assistant_level_code')
effective_hours = self.safe_decimal(summary.get('effective_hours', 0))
base_hours = self.safe_decimal(summary.get('base_hours', 0))
bonus_hours = self.safe_decimal(summary.get('bonus_hours', 0))
room_hours = self.safe_decimal(summary.get('room_hours', 0))
is_new_hire = summary.get('is_new_hire', False)
rank = summary.get('rank_with_ties')
# 获取等级定价SCD2口径按月份取值
# base_course_price: 客户支付价格初级98/中级108/高级118/星级138
# bonus_course_price: 附加课客户支付价格固定190元
# room_course_price: 包厢课客户支付价格固定138元
level_price = self.get_level_price(level_code, salary_month)
base_course_price = self.safe_decimal(
level_price.get('base_course_price', 98) if level_price else 98
)
bonus_course_price = self.safe_decimal(
level_price.get('bonus_course_price', 190) if level_price else 190
)
room_course_price = self.safe_decimal(
self.config.get("dws.salary.room_course_price", 138)
)
# 获取档位配置
# base_deduction: 专业课抽成(元/小时),球房从每小时扣除
# bonus_deduction_ratio: 打赏课抽成比例,球房从附加课收入扣除的比例
tier = self.get_performance_tier_by_id(summary.get('tier_id'), salary_month)
if not tier:
tier = self.get_performance_tier(
effective_hours,
is_new_hire,
effective_date=salary_month
)
base_deduction = self.safe_decimal(tier.get('base_deduction', 18)) if tier else Decimal('18')
bonus_deduction_ratio = self.safe_decimal(tier.get('bonus_deduction_ratio', 0.40)) if tier else Decimal('0.40')
vacation_days = tier.get('vacation_days', 0) if tier else 0
vacation_unlimited = tier.get('vacation_unlimited', False) if tier else False
# ============================================================
# 工资计算公式来自DWS数据库处理需求.md
# ============================================================
# 基础课收入 = 基础课小时数 × (客户支付价格 - 专业课抽成)
# 例中级助教170小时3档 = 170 × (108 - 13) = 16,150元
base_income = base_hours * (base_course_price - base_deduction)
# 附加课收入 = 附加课小时数 × 附加课价格 × (1 - 打赏课抽成比例)
# 例15小时3档 = 15 × 190 × (1 - 0.35) = 1,852.5元
bonus_income = bonus_hours * bonus_course_price * (Decimal('1') - bonus_deduction_ratio)
# 包厢课收入(按包厢课统一价格口径)
room_income = room_hours * (room_course_price - base_deduction)
# 课时收入合计
total_course_income = base_income + bonus_income + room_income
# 计算冲刺奖金(按规则表配置,不累计取最高)
sprint_bonus = self.calculate_sprint_bonus(effective_hours, salary_month)
# 计算Top3排名奖金1st:1000, 2nd:600, 3rd:400并列都算
top_rank_bonus = Decimal('0')
if rank and rank <= 3:
top_rank_bonus = self.calculate_top_rank_bonus(rank, salary_month)
# 获取充值提成
recharge_commission = commission_index.get(assistant_id, Decimal('0'))
# 汇总奖金
other_bonus = Decimal('0') # 预留其他奖金
total_bonus = sprint_bonus + top_rank_bonus + recharge_commission + other_bonus
# 计算应发工资 = 课时收入 + 奖金
gross_salary = total_course_income + total_bonus
# 构建记录
return {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'assistant_id': assistant_id,
'assistant_nickname': summary.get('assistant_nickname'),
'salary_month': salary_month,
'assistant_level_code': level_code,
'assistant_level_name': summary.get('assistant_level_name'),
'hire_date': summary.get('hire_date'),
'is_new_hire': is_new_hire,
'effective_hours': effective_hours,
'base_hours': base_hours,
'bonus_hours': bonus_hours,
'room_hours': room_hours,
'tier_id': summary.get('tier_id'),
'tier_code': tier.get('tier_code') if tier else None,
'tier_name': tier.get('tier_name') if tier else None,
'rank_with_ties': rank,
# 定价信息
'base_course_price': base_course_price,
'bonus_course_price': bonus_course_price,
'base_deduction': base_deduction,
'bonus_deduction_ratio': bonus_deduction_ratio,
# 收入明细
'base_income': base_income,
'bonus_income': bonus_income,
'room_income': room_income,
'total_course_income': total_course_income,
# 奖金明细
'sprint_bonus': sprint_bonus,
'top_rank_bonus': top_rank_bonus,
'recharge_commission': recharge_commission,
'other_bonus': other_bonus,
'total_bonus': total_bonus,
# 应发工资
'gross_salary': gross_salary,
# 假期
'vacation_days': vacation_days,
'vacation_unlimited': vacation_unlimited,
'calc_notes': self._build_calc_notes(summary, tier, sprint_bonus, top_rank_bonus),
}
def _build_calc_notes(
self,
summary: Dict[str, Any],
tier: Optional[Dict[str, Any]],
sprint_bonus: Decimal,
top_rank_bonus: Decimal
) -> Optional[str]:
"""
构建计算备注
"""
notes = []
if summary.get('is_new_hire'):
notes.append("新入职首月")
if tier:
notes.append(f"档位: {tier.get('tier_name', 'N/A')}")
if sprint_bonus > 0:
notes.append(f"冲刺奖金: {sprint_bonus}")
if top_rank_bonus > 0:
rank = summary.get('rank_with_ties')
notes.append(f"Top{rank}奖金: {top_rank_bonus}")
return "; ".join(notes) if notes else None
def _delete_by_month(
self,
context: TaskContext,
records: List[Dict[str, Any]]
) -> int:
"""
按月份删除已存在的数据
"""
months = set(r.get('salary_month') for r in records if r.get('salary_month'))
if not months:
return 0
target_table = self.get_target_table()
full_table = f"{self.DWS_SCHEMA}.{target_table}"
total_deleted = 0
with self.db.conn.cursor() as cur:
for month in months:
sql = f"""
DELETE FROM {full_table}
WHERE site_id = %s AND salary_month = %s
"""
cur.execute(sql, (context.store_id, month))
total_deleted += cur.rowcount
return total_deleted
# 便于外部导入
__all__ = ['AssistantSalaryTask']

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
DWS 层公共辅助函数
从多个 DWS 子类中提取的纯函数,不依赖实例状态。
"""
from __future__ import annotations
from datetime import date, datetime
from decimal import Decimal
from typing import Any, Optional
def mask_mobile(mobile: Optional[str]) -> Optional[str]:
"""手机号脱敏138****1234
短于 7 位的号码原样返回,避免截断后信息丢失。
"""
if not mobile or len(mobile) < 7:
return mobile
return mobile[:3] + "****" + mobile[-4:]
def calc_days_since(stat_date: date, last_date: Optional[date]) -> Optional[int]:
"""计算距离目标日期的天数
若 last_date 为 datetime 则自动取 .date()。
"""
if not last_date:
return None
if isinstance(last_date, datetime):
last_date = last_date.date()
return (stat_date - last_date).days
def parse_id_list(value: Any) -> set[int]:
"""解析逗号分隔的 ID 列表字符串为 int 集合
支持 str / list / tuple / set 输入;非法值静默跳过。
"""
if not value:
return set()
if isinstance(value, str):
items = [v.strip() for v in value.split(",") if v.strip()]
return {int(v) for v in items if v.isdigit()}
if isinstance(value, (list, tuple, set)):
result: set[int] = set()
for item in value:
if item is None:
continue
try:
result.add(int(item))
except (ValueError, TypeError):
continue
return result
return set()
def safe_division(
numerator: Any,
denominator: Any,
default: Decimal = Decimal("0"),
) -> Decimal:
"""安全除法,分母为零或 None 时返回默认值"""
try:
d = Decimal(str(denominator))
if d == 0:
return default
return Decimal(str(numerator)) / d
except (ValueError, TypeError, ArithmeticError):
return default

View File

@@ -0,0 +1,321 @@
# -*- coding: utf-8 -*-
# AI_CHANGELOG [2025-07-16] task 2.3: 从 FinanceDailyTask 提取共享提取方法,创建财务任务基类
"""
财务任务共享基类
提供财务类 DWS 任务的公共数据提取方法,避免 SQL 重复。
子类FinanceDailyTask、FinanceRechargeTask、FinanceIncomeStructureTask、FinanceDiscountDetailTask
共享方法:
- _extract_settlement_summary: 结账单日汇总
- _extract_recharge_summary: 充值日汇总
- _extract_groupbuy_summary: 团购核销日汇总
- _extract_platform_summary: 平台回款/服务费汇总
- _extract_big_customer_discounts: 大客户优惠明细
- _extract_gift_card_consume_summary: 赠送卡消费汇总
"""
from __future__ import annotations
import calendar
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List
from .base_dws_task import BaseDwsTask
from .dws_helpers import parse_id_list
class FinanceBaseTask(BaseDwsTask):
"""财务任务共享基类,提供公共数据提取方法。"""
# ------------------------------------------------------------------
# 结算汇总
# ------------------------------------------------------------------
def _extract_settlement_summary(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""结账单日汇总(结算头表按日聚合)"""
sql = """
SELECT
DATE(pay_time) AS stat_date,
COUNT(*) AS order_count,
COUNT(CASE WHEN member_id != 0 AND member_id IS NOT NULL THEN 1 END) AS member_order_count,
COUNT(CASE WHEN member_id = 0 OR member_id IS NULL THEN 1 END) AS guest_order_count,
-- 发生额(正价)
SUM(table_charge_money) AS table_fee_amount,
SUM(goods_money) AS goods_amount,
SUM(assistant_pd_money) AS assistant_pd_amount,
SUM(assistant_cx_money) AS assistant_cx_amount,
SUM(table_charge_money + goods_money + assistant_pd_money + assistant_cx_money) AS gross_amount,
-- 支付
SUM(pay_amount) AS cash_pay_amount,
SUM(recharge_card_amount) AS card_pay_amount,
SUM(balance_amount) AS balance_pay_amount,
-- 优惠
SUM(coupon_amount) AS coupon_amount,
SUM(adjust_amount) AS adjust_amount,
SUM(member_discount_amount) AS member_discount_amount,
SUM(rounding_amount) AS rounding_amount,
SUM(pl_coupon_sale_amount) AS pl_coupon_sale_amount,
-- 消费金额
SUM(consume_money) AS total_consume
FROM dwd.dwd_settlement_head
WHERE site_id = %s
AND DATE(pay_time) >= %s
AND DATE(pay_time) <= %s
GROUP BY DATE(pay_time)
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
# ------------------------------------------------------------------
# 充值汇总
# ------------------------------------------------------------------
def _extract_recharge_summary(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""充值日汇总(充值订单按日聚合)"""
sql = """
SELECT
DATE(pay_time) AS stat_date,
COUNT(*) AS recharge_count,
SUM(pay_money + gift_money) AS recharge_total,
SUM(pay_money) AS recharge_cash,
SUM(gift_money) AS recharge_gift,
COUNT(CASE WHEN is_first = 1 THEN 1 END) AS first_recharge_count,
SUM(CASE WHEN is_first = 1 THEN pay_money + gift_money ELSE 0 END) AS first_recharge_total,
SUM(CASE WHEN is_first = 1 THEN pay_money ELSE 0 END) AS first_recharge_cash,
SUM(CASE WHEN is_first = 1 THEN gift_money ELSE 0 END) AS first_recharge_gift,
COUNT(CASE WHEN is_first = 0 OR is_first IS NULL THEN 1 END) AS renewal_count,
SUM(CASE WHEN is_first = 0 OR is_first IS NULL THEN pay_money + gift_money ELSE 0 END) AS renewal_total,
SUM(CASE WHEN is_first = 0 OR is_first IS NULL THEN pay_money ELSE 0 END) AS renewal_cash,
SUM(CASE WHEN is_first = 0 OR is_first IS NULL THEN gift_money ELSE 0 END) AS renewal_gift,
COUNT(DISTINCT member_id) AS recharge_member_count
FROM dwd.dwd_recharge_order
WHERE site_id = %s
AND DATE(pay_time) >= %s
AND DATE(pay_time) <= %s
GROUP BY DATE(pay_time)
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
# ------------------------------------------------------------------
# 团购核销汇总
# ------------------------------------------------------------------
def _extract_groupbuy_summary(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""团购核销日汇总(结算头表 + 团购核销表联查)"""
sql = """
SELECT
sh.pay_time::DATE AS stat_date,
COUNT(CASE WHEN sh.coupon_amount > 0 THEN 1 END) AS groupbuy_count,
SUM(
CASE
WHEN sh.coupon_amount > 0 THEN
CASE
WHEN sh.pl_coupon_sale_amount > 0 THEN sh.pl_coupon_sale_amount
ELSE COALESCE(gr.ledger_unit_price, 0)
END
ELSE 0
END
) AS groupbuy_pay_total
FROM dwd.dwd_settlement_head sh
LEFT JOIN dwd.dwd_groupbuy_redemption gr
ON gr.order_settle_id = sh.order_settle_id
AND COALESCE(gr.is_delete, 0) = 0
WHERE sh.site_id = %s
AND sh.pay_time >= %s
AND sh.pay_time < %s + INTERVAL '1 day'
GROUP BY sh.pay_time::DATE
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
# ------------------------------------------------------------------
# 平台回款/服务费汇总
# ------------------------------------------------------------------
def _extract_platform_summary(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""平台回款/服务费汇总(来自导入表)"""
sql = """
SELECT
settlement_date AS stat_date,
SUM(settlement_amount) AS settlement_amount,
SUM(commission_amount) AS commission_amount,
SUM(service_fee) AS service_fee
FROM dws.dws_platform_settlement
WHERE site_id = %s
AND settlement_date >= %s
AND settlement_date <= %s
GROUP BY settlement_date
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
# ------------------------------------------------------------------
# 大客户优惠
# ------------------------------------------------------------------
def _extract_big_customer_discounts(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""大客户优惠明细(基于手动调整中标记的大客户订单)"""
member_ids = self._parse_id_list(
self.config.get("dws.discount.big_customer_member_ids")
)
order_ids = self._parse_id_list(
self.config.get("dws.discount.big_customer_order_ids")
)
if not member_ids and not order_ids:
return []
sql = """
SELECT
pay_time::DATE AS stat_date,
order_settle_id,
member_id,
adjust_amount
FROM dwd.dwd_settlement_head
WHERE site_id = %s
AND pay_time >= %s
AND pay_time < %s + INTERVAL '1 day'
AND adjust_amount != 0
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
if not rows:
return []
result: Dict[date, Dict[str, Any]] = {}
for row in rows:
row_dict = dict(row)
stat_date = row_dict.get("stat_date")
if not stat_date:
continue
order_id = row_dict.get("order_settle_id")
member_id = row_dict.get("member_id")
if order_id not in order_ids and member_id not in member_ids:
continue
amount = abs(self.safe_decimal(row_dict.get("adjust_amount", 0)))
entry = result.setdefault(
stat_date,
{"stat_date": stat_date, "big_customer_amount": Decimal("0"), "big_customer_count": 0},
)
entry["big_customer_amount"] += amount
entry["big_customer_count"] += 1
return list(result.values())
# ------------------------------------------------------------------
# 赠送卡消费汇总
# ------------------------------------------------------------------
# 赠送卡类型 ID 常量,供子类复用
GIFT_CARD_TYPE_IDS = (
2791990152417157, # 台费卡
2794699703437125, # 酒水卡
2793266846533445, # 活动抵用券
)
def _extract_gift_card_consume_summary(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""赠送卡消费汇总(余额变动按日聚合)"""
id_list = ", ".join(str(card_id) for card_id in self.GIFT_CARD_TYPE_IDS)
sql = f"""
SELECT
change_time::DATE AS stat_date,
SUM(ABS(change_amount)) AS gift_card_consume
FROM dwd.dwd_member_balance_change
WHERE site_id = %s
AND change_time >= %s
AND change_time < %s + INTERVAL '1 day'
AND from_type = 1
AND change_amount < 0
AND COALESCE(is_delete, 0) = 0
AND card_type_id IN ({id_list})
GROUP BY change_time::DATE
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
# ------------------------------------------------------------------
# 支出汇总(按月分摊到日)
# ------------------------------------------------------------------
def _extract_expense_summary(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""支出汇总(来自导入表,按月分摊到日)"""
if start_date > end_date:
return []
start_month = start_date.replace(day=1)
end_month = end_date.replace(day=1)
sql = """
SELECT
expense_month,
SUM(expense_amount) AS expense_amount
FROM dws.dws_finance_expense_summary
WHERE site_id = %s
AND expense_month >= %s
AND expense_month <= %s
GROUP BY expense_month
"""
rows = self.db.query(sql, (site_id, start_month, end_month))
if not rows:
return []
daily_totals: Dict[date, Decimal] = {}
for row in rows:
row_dict = dict(row)
month_date = row_dict.get("expense_month")
if not month_date:
continue
amount = self.safe_decimal(row_dict.get("expense_amount", 0))
days_in_month = calendar.monthrange(month_date.year, month_date.month)[1]
daily_amount = amount / Decimal(str(days_in_month)) if days_in_month > 0 else Decimal("0")
for day in range(1, days_in_month + 1):
stat_date = date(month_date.year, month_date.month, day)
if stat_date < start_date or stat_date > end_date:
continue
daily_totals[stat_date] = daily_totals.get(stat_date, Decimal("0")) + daily_amount
return [
{"stat_date": sd, "expense_amount": amt}
for sd, amt in sorted(daily_totals.items())
]
# ------------------------------------------------------------------
# 辅助
# ------------------------------------------------------------------
def _parse_id_list(self, value: Any) -> set:
"""解析逗号分隔的 ID 列表(委托 dws_helpers"""
return parse_id_list(value)
__all__ = ["FinanceBaseTask"]

View File

@@ -0,0 +1,322 @@
# -*- coding: utf-8 -*-
"""
财务日度汇总任务
功能说明:
"日期"为粒度,汇总当日财务数据
数据来源:
- dwd_settlement_head: 结账单头表
- dwd_groupbuy_redemption: 团购核销
- dwd_recharge_order: 充值订单
- dws_finance_expense_summary: 支出汇总Excel导入
- dws_platform_settlement: 平台回款/服务费Excel导入
目标表:
dws.dws_finance_daily_summary
更新策略:
- 更新频率:每小时更新当日数据
- 幂等方式delete-before-insert按日期
业务规则:
- 发生额table_charge_money + goods_money + assistant_pd_money + assistant_cx_money
- 团购优惠coupon_amount - 团购支付金额
- 团购支付pl_coupon_sale_amount 或关联 groupbuy_redemption.ledger_unit_price
- 首充/续充:通过 is_first 字段区分
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List
from .base_dws_task import TaskContext
from .finance_base_task import FinanceBaseTask
# CHANGE 2025-07-16 | task 2.3: 继承 FinanceBaseTask共享提取方法
class FinanceDailyTask(FinanceBaseTask):
"""
财务日度汇总任务
汇总每日的:
- 发生额(正价)
- 优惠拆分
- 确认收入
- 现金流(流入/流出)
- 充值统计(首充/续充)
- 订单统计
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列,供基类默认 load() 使用
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_FINANCE_DAILY"
def get_target_table(self) -> str:
return "dws_finance_daily_summary"
def get_primary_keys(self) -> List[str]:
return ["site_id", "stat_date"]
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
提取数据
"""
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
self.logger.info(
"%s: 提取数据,日期范围 %s ~ %s",
self.get_task_code(), start_date, end_date
)
# 1. 获取结账单汇总
settlement_summary = self._extract_settlement_summary(site_id, start_date, end_date)
# 2. 获取团购核销汇总
groupbuy_summary = self._extract_groupbuy_summary(site_id, start_date, end_date)
# 3. 获取充值汇总
recharge_summary = self._extract_recharge_summary(site_id, start_date, end_date)
# 3.1 获取赠送卡消费汇总(余额变动)
gift_card_summary = self._extract_gift_card_consume_summary(site_id, start_date, end_date)
# 4. 获取支出汇总(来自导入表)
expense_summary = self._extract_expense_summary(site_id, start_date, end_date)
# 5. 获取平台回款汇总(来自导入表)
platform_summary = self._extract_platform_summary(site_id, start_date, end_date)
# 6. 获取大客户优惠明细(用于拆分手动优惠)
big_customer_summary = self._extract_big_customer_discounts(site_id, start_date, end_date)
return {
'settlement_summary': settlement_summary,
'groupbuy_summary': groupbuy_summary,
'recharge_summary': recharge_summary,
'gift_card_summary': gift_card_summary,
'expense_summary': expense_summary,
'platform_summary': platform_summary,
'big_customer_summary': big_customer_summary,
'start_date': start_date,
'end_date': end_date,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据:按日期聚合
"""
settlement_summary = extracted['settlement_summary']
groupbuy_summary = extracted['groupbuy_summary']
recharge_summary = extracted['recharge_summary']
gift_card_summary = extracted['gift_card_summary']
expense_summary = extracted['expense_summary']
platform_summary = extracted['platform_summary']
big_customer_summary = extracted['big_customer_summary']
site_id = extracted['site_id']
self.logger.info(
"%s: 转换数据,%d 天结账数据,%d 天充值数据",
self.get_task_code(), len(settlement_summary), len(recharge_summary)
)
# 按日期合并数据
dates = set()
for item in settlement_summary + recharge_summary + gift_card_summary + expense_summary + platform_summary:
stat_date = item.get('stat_date')
if stat_date:
dates.add(stat_date)
# 构建索引
settle_index = {s['stat_date']: s for s in settlement_summary}
groupbuy_index = {g['stat_date']: g for g in groupbuy_summary}
recharge_index = {r['stat_date']: r for r in recharge_summary}
gift_card_index = {g['stat_date']: g for g in gift_card_summary}
expense_index = {e['stat_date']: e for e in expense_summary}
platform_index = {p['stat_date']: p for p in platform_summary}
big_customer_index = {b['stat_date']: b for b in big_customer_summary}
results = []
for stat_date in sorted(dates):
settle = settle_index.get(stat_date, {})
groupbuy = groupbuy_index.get(stat_date, {})
recharge = recharge_index.get(stat_date, {})
gift_card = gift_card_index.get(stat_date, {})
expense = expense_index.get(stat_date, {})
platform = platform_index.get(stat_date, {})
big_customer = big_customer_index.get(stat_date, {})
record = self._build_daily_record(
stat_date, settle, groupbuy, recharge, gift_card, expense, platform, big_customer, site_id
)
results.append(record)
return results
# load() 已移除——使用 BaseDwsTask 默认实现DATE_COL="stat_date"
# 提取方法已迁移至 FinanceBaseTasktask 2.3
# ==========================================================================
# 数据转换方法
# ==========================================================================
def _build_daily_record(
self,
stat_date: date,
settle: Dict[str, Any],
groupbuy: Dict[str, Any],
recharge: Dict[str, Any],
gift_card: Dict[str, Any],
expense: Dict[str, Any],
platform: Dict[str, Any],
big_customer: Dict[str, Any],
site_id: int
) -> Dict[str, Any]:
"""
构建日度财务记录
"""
# 发生额
gross_amount = self.safe_decimal(settle.get('gross_amount', 0))
table_fee_amount = self.safe_decimal(settle.get('table_fee_amount', 0))
goods_amount = self.safe_decimal(settle.get('goods_amount', 0))
assistant_pd_amount = self.safe_decimal(settle.get('assistant_pd_amount', 0))
assistant_cx_amount = self.safe_decimal(settle.get('assistant_cx_amount', 0))
# 支付
cash_pay_amount = self.safe_decimal(settle.get('cash_pay_amount', 0))
card_pay_amount = self.safe_decimal(settle.get('card_pay_amount', 0))
balance_pay_amount = self.safe_decimal(settle.get('balance_pay_amount', 0))
# 优惠
coupon_amount = self.safe_decimal(settle.get('coupon_amount', 0))
pl_coupon_sale = self.safe_decimal(settle.get('pl_coupon_sale_amount', 0))
groupbuy_pay = self.safe_decimal(groupbuy.get('groupbuy_pay_total', 0))
# 团购支付金额优先使用pl_coupon_sale_amount否则使用groupbuy核销金额
if pl_coupon_sale > 0:
groupbuy_pay_amount = pl_coupon_sale
else:
groupbuy_pay_amount = groupbuy_pay
# 团购优惠 = 团购抵消台费 - 团购支付金额
discount_groupbuy = coupon_amount - groupbuy_pay_amount
if discount_groupbuy < 0:
discount_groupbuy = Decimal('0')
adjust_amount = self.safe_decimal(settle.get('adjust_amount', 0))
member_discount = self.safe_decimal(settle.get('member_discount_amount', 0))
rounding_amount = self.safe_decimal(settle.get('rounding_amount', 0))
big_customer_amount = self.safe_decimal(big_customer.get('big_customer_amount', 0))
other_discount = adjust_amount - big_customer_amount
if other_discount < 0:
other_discount = Decimal('0')
# 赠送卡消费(来自余额变动)
gift_card_consume_amount = self.safe_decimal(gift_card.get('gift_card_consume', 0))
# 优惠合计
discount_total = discount_groupbuy + member_discount + gift_card_consume_amount + adjust_amount + rounding_amount
# 确认收入
confirmed_income = gross_amount - discount_total
# 现金流
platform_settlement_amount = self.safe_decimal(platform.get('settlement_amount', 0))
platform_fee_amount = (
self.safe_decimal(platform.get('commission_amount', 0))
+ self.safe_decimal(platform.get('service_fee', 0))
)
recharge_cash_inflow = self.safe_decimal(recharge.get('recharge_cash', 0))
platform_inflow = platform_settlement_amount if platform_settlement_amount > 0 else groupbuy_pay_amount
cash_inflow_total = cash_pay_amount + platform_inflow + recharge_cash_inflow
cash_outflow_total = self.safe_decimal(expense.get('expense_amount', 0)) + platform_fee_amount
cash_balance_change = cash_inflow_total - cash_outflow_total
# 卡消费
cash_card_consume = card_pay_amount + balance_pay_amount
gift_card_consume = gift_card_consume_amount
card_consume_total = cash_card_consume + gift_card_consume
# 充值统计
recharge_count = self.safe_int(recharge.get('recharge_count', 0))
recharge_total = self.safe_decimal(recharge.get('recharge_total', 0))
recharge_cash = self.safe_decimal(recharge.get('recharge_cash', 0))
recharge_gift = self.safe_decimal(recharge.get('recharge_gift', 0))
first_recharge_count = self.safe_int(recharge.get('first_recharge_count', 0))
first_recharge_amount = self.safe_decimal(recharge.get('first_recharge_total', 0))
renewal_count = self.safe_int(recharge.get('renewal_count', 0))
renewal_amount = self.safe_decimal(recharge.get('renewal_total', 0))
# 订单统计
order_count = self.safe_int(settle.get('order_count', 0))
member_order_count = self.safe_int(settle.get('member_order_count', 0))
guest_order_count = self.safe_int(settle.get('guest_order_count', 0))
avg_order_amount = gross_amount / order_count if order_count > 0 else Decimal('0')
return {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'stat_date': stat_date,
# 发生额
'gross_amount': gross_amount,
'table_fee_amount': table_fee_amount,
'goods_amount': goods_amount,
'assistant_pd_amount': assistant_pd_amount,
'assistant_cx_amount': assistant_cx_amount,
# 优惠
'discount_total': discount_total,
'discount_groupbuy': discount_groupbuy,
'discount_vip': member_discount,
'discount_gift_card': gift_card_consume_amount,
'discount_manual': adjust_amount,
'discount_rounding': rounding_amount,
'discount_other': other_discount,
# 确认收入
'confirmed_income': confirmed_income,
# 现金流
'cash_inflow_total': cash_inflow_total,
'cash_pay_amount': cash_pay_amount,
'groupbuy_pay_amount': groupbuy_pay_amount,
'platform_settlement_amount': platform_settlement_amount,
'platform_fee_amount': platform_fee_amount,
'recharge_cash_inflow': recharge_cash_inflow,
'card_consume_total': card_consume_total,
'cash_card_consume': cash_card_consume,
'gift_card_consume': gift_card_consume,
'cash_outflow_total': cash_outflow_total,
'cash_balance_change': cash_balance_change,
# 充值统计
'recharge_count': recharge_count,
'recharge_total': recharge_total,
'recharge_cash': recharge_cash,
'recharge_gift': recharge_gift,
'first_recharge_count': first_recharge_count,
'first_recharge_amount': first_recharge_amount,
'renewal_count': renewal_count,
'renewal_amount': renewal_amount,
# 订单统计
'order_count': order_count,
'member_order_count': member_order_count,
'guest_order_count': guest_order_count,
'avg_order_amount': avg_order_amount,
}
# 便于外部导入
__all__ = ['FinanceDailyTask']

View File

@@ -0,0 +1,424 @@
# -*- coding: utf-8 -*-
"""
优惠明细分析任务
功能说明:
"日期+优惠类型"为粒度,分析优惠构成
数据来源:
- dwd_settlement_head: 结账单头表(优惠字段)
- dwd_groupbuy_redemption: 团购核销(团购实付金额)
- dwd_member_balance_change: 余额变动(赠送卡消费)
目标表:
dws.dws_finance_discount_detail
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按日期
业务规则:
- 团购优惠 (GROUPBUY): coupon_amount - 团购实付金额
- 会员折扣 (VIP): member_discount_amount
- 赠送卡抵扣 (GIFT_CARD_*): dwd_member_balance_change台费卡/酒水卡/活动抵用券)
- 抹零 (ROUNDING): rounding_amount
- 大客户优惠 (BIG_CUSTOMER): 手动调整中标记的大客户订单
- 其他优惠 (OTHER): 手动调整中除大客户外的部分
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List
from .base_dws_task import TaskContext
from .finance_base_task import FinanceBaseTask
# CHANGE 2025-07-16 | task 2.3: 继承 FinanceBaseTask复用 _extract_big_customer_discounts
class FinanceDiscountDetailTask(FinanceBaseTask):
"""
优惠明细分析任务
分析各类优惠的使用情况:
- 团购优惠
- 会员折扣
- 赠送卡抵扣
- 手动调整
- 抹零
- 其他优惠
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列load() 保留自定义实现(非标准 bulk_insert 路径)
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_FINANCE_DISCOUNT_DETAIL"
def get_target_table(self) -> str:
return "dws_finance_discount_detail"
def get_primary_keys(self) -> List[str]:
return ["site_id", "stat_date", "discount_type_code"]
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
抽取优惠相关数据
数据来源:
1. settlement_head: 各类优惠字段
2. groupbuy_redemption: 团购实付金额
"""
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
# 从settlement_head抽取优惠数据
discount_summary = self._extract_discount_summary(site_id, start_date, end_date)
# 从groupbuy_redemption获取团购实付金额
groupbuy_payments = self._extract_groupbuy_payments(site_id, start_date, end_date)
# 提取大客户优惠(拆分手动调整)
big_customer_summary = self._extract_big_customer_discounts(site_id, start_date, end_date)
# 提取赠送卡消费(按卡类型拆分)
gift_card_consumes = self._extract_gift_card_consumes(site_id, start_date, end_date)
return {
'discount_summary': discount_summary,
'groupbuy_payments': groupbuy_payments,
'big_customer_summary': big_customer_summary,
'gift_card_consumes': gift_card_consumes,
}
def _extract_discount_summary(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
从结账单头表抽取优惠汇总
字段说明:
- coupon_amount: 团购抵消台费金额
- adjust_amount: 手动调整金额(台费打折)
- member_discount_amount: 会员折扣
- rounding_amount: 抹零金额
- pl_coupon_sale_amount: 平台券销售金额团购实付路径1
"""
sql = """
SELECT
pay_time::DATE AS stat_date,
-- 团购相关
COALESCE(SUM(coupon_amount), 0) AS coupon_amount_total,
COALESCE(SUM(pl_coupon_sale_amount), 0) AS pl_coupon_sale_total,
COUNT(CASE WHEN coupon_amount > 0 THEN 1 END) AS coupon_order_count,
-- 手动调整
COALESCE(SUM(adjust_amount), 0) AS adjust_amount_total,
COUNT(CASE WHEN adjust_amount != 0 THEN 1 END) AS adjust_order_count,
-- 会员折扣
COALESCE(SUM(member_discount_amount), 0) AS member_discount_total,
COUNT(CASE WHEN member_discount_amount > 0 THEN 1 END) AS member_discount_order_count,
-- 抹零
COALESCE(SUM(rounding_amount), 0) AS rounding_amount_total,
COUNT(CASE WHEN rounding_amount != 0 THEN 1 END) AS rounding_order_count,
-- 总订单数
COUNT(*) AS total_orders
FROM dwd.dwd_settlement_head
WHERE site_id = %(site_id)s
AND pay_time >= %(start_date)s
AND pay_time < %(end_date)s + INTERVAL '1 day'
AND settle_status = 1 -- 已结账
GROUP BY pay_time::DATE
ORDER BY stat_date
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def _extract_groupbuy_payments(
self,
site_id: int,
start_date: date,
end_date: date
) -> Dict[date, Decimal]:
"""
从团购核销表获取团购实付金额
团购实付金额计算:
- 若 pl_coupon_sale_amount > 0使用该值
- 否则使用 groupbuy_redemption.ledger_unit_price
返回:{日期: 团购实付总额}
"""
sql = """
SELECT
sh.pay_time::DATE AS stat_date,
SUM(
CASE
WHEN sh.pl_coupon_sale_amount > 0 THEN sh.pl_coupon_sale_amount
ELSE COALESCE(gr.ledger_unit_price, 0)
END
) AS groupbuy_payment
FROM dwd.dwd_settlement_head sh
LEFT JOIN dwd.dwd_groupbuy_redemption gr
ON gr.order_settle_id = sh.order_settle_id
AND COALESCE(gr.is_delete, 0) = 0
WHERE sh.site_id = %(site_id)s
AND sh.pay_time >= %(start_date)s
AND sh.pay_time < %(end_date)s + INTERVAL '1 day'
AND sh.settle_status = 1
AND sh.coupon_amount > 0 -- 只统计有团购的订单
GROUP BY sh.pay_time::DATE
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
result = {}
if rows:
for row in rows:
result[row['stat_date']] = self.safe_decimal(row.get('groupbuy_payment', 0))
return result
def _extract_gift_card_consumes(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取赠送卡消费(按卡类型)
"""
gift_card_type_ids = (
2791990152417157, # 台费卡
2794699703437125, # 酒水卡
2793266846533445, # 活动抵用券
)
id_list = ", ".join(str(card_id) for card_id in gift_card_type_ids)
sql = f"""
SELECT
change_time::DATE AS stat_date,
card_type_id,
COUNT(*) AS consume_count,
SUM(ABS(change_amount)) AS consume_amount
FROM dwd.dwd_member_balance_change
WHERE site_id = %(site_id)s
AND change_time >= %(start_date)s
AND change_time < %(end_date)s + INTERVAL '1 day'
AND from_type = 1
AND change_amount < 0
AND COALESCE(is_delete, 0) = 0
AND card_type_id IN ({id_list})
GROUP BY change_time::DATE, card_type_id
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def transform(self, data: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据
将抽取的数据转换为目标表格式:
- 每种优惠类型一条记录
- 计算团购优惠coupon_amount - 团购实付)
- 计算优惠占比
"""
site_id = context.store_id
tenant_id = self.config.get("app.tenant_id", site_id)
discount_summary = data.get('discount_summary', [])
groupbuy_payments = data.get('groupbuy_payments', {})
big_customer_summary = {r['stat_date']: r for r in data.get('big_customer_summary', [])}
gift_card_consumes = data.get('gift_card_consumes', [])
records = []
# 优惠类型定义
# (type_code, type_name, amount_field, count_field, special_calc)
discount_types = [
('GROUPBUY', '团购优惠', 'coupon_amount_total', 'coupon_order_count', True),
('VIP', '会员折扣', 'member_discount_total', 'member_discount_order_count', False),
('ROUNDING', '抹零', 'rounding_amount_total', 'rounding_order_count', False),
]
gift_card_type_map = {
2791990152417157: ('GIFT_CARD_TABLE', '台费卡抵扣'),
2794699703437125: ('GIFT_CARD_DRINK', '酒水卡抵扣'),
2793266846533445: ('GIFT_CARD_COUPON', '活动抵用券抵扣'),
}
# 赠送卡消费按日期+类型聚合
gift_card_by_date: Dict[date, Dict[str, Dict[str, Any]]] = {}
for row in gift_card_consumes:
stat_date = row.get('stat_date')
card_type_id = row.get('card_type_id')
type_info = gift_card_type_map.get(card_type_id)
if not stat_date or not type_info:
continue
type_code, type_name = type_info
daily = gift_card_by_date.setdefault(stat_date, {})
entry = daily.setdefault(type_code, {'type_name': type_name, 'amount': Decimal('0'), 'count': 0})
entry['amount'] += self.safe_decimal(row.get('consume_amount', 0))
entry['count'] += self.safe_int(row.get('consume_count', 0))
discount_summary_map = {row.get('stat_date'): row for row in discount_summary if row.get('stat_date')}
stat_dates = set(discount_summary_map.keys())
stat_dates.update(groupbuy_payments.keys())
stat_dates.update(big_customer_summary.keys())
stat_dates.update(gift_card_by_date.keys())
for stat_date in sorted(stat_dates):
daily_data = discount_summary_map.get(stat_date, {})
# 计算各类优惠金额
daily_discounts = {}
total_discount = Decimal('0')
for type_code, type_name, amount_field, count_field, special_calc in discount_types:
if special_calc and type_code == 'GROUPBUY':
# 团购优惠 = 团购抵消台费 - 团购实付
coupon_amount = self.safe_decimal(daily_data.get(amount_field, 0))
groupbuy_paid = groupbuy_payments.get(stat_date, Decimal('0'))
discount_amount = coupon_amount - groupbuy_paid
# 确保优惠金额为正数
discount_amount = max(discount_amount, Decimal('0'))
else:
discount_amount = abs(self.safe_decimal(daily_data.get(amount_field, 0)))
usage_count = daily_data.get(count_field, 0) or 0
daily_discounts[type_code] = {
'type_name': type_name,
'amount': discount_amount,
'count': usage_count,
}
total_discount += discount_amount
# 赠送卡拆分(台费卡/酒水卡/活动券)
gift_daily = gift_card_by_date.get(stat_date, {})
for type_code, type_name in gift_card_type_map.values():
info = gift_daily.get(type_code, {'amount': Decimal('0'), 'count': 0})
daily_discounts[type_code] = {
'type_name': type_name,
'amount': self.safe_decimal(info.get('amount', 0)),
'count': self.safe_int(info.get('count', 0)),
}
total_discount += self.safe_decimal(info.get('amount', 0))
# 拆分手动调整为大客户/其他
adjust_amount = abs(self.safe_decimal(daily_data.get('adjust_amount_total', 0)))
adjust_count = daily_data.get('adjust_order_count', 0) or 0
big_customer_info = big_customer_summary.get(stat_date, {})
big_customer_amount = self.safe_decimal(big_customer_info.get('big_customer_amount', 0))
big_customer_count = big_customer_info.get('big_customer_count', 0) or 0
other_amount = adjust_amount - big_customer_amount
if other_amount < 0:
other_amount = Decimal('0')
other_count = adjust_count - big_customer_count
if other_count < 0:
other_count = 0
daily_discounts['BIG_CUSTOMER'] = {
'type_name': '大客户优惠',
'amount': big_customer_amount,
'count': big_customer_count,
}
daily_discounts['OTHER'] = {
'type_name': '其他优惠',
'amount': other_amount,
'count': other_count,
}
total_discount += big_customer_amount + other_amount
# 为每种优惠类型生成记录
for type_code, discount_info in daily_discounts.items():
discount_amount = discount_info['amount']
usage_count = discount_info['count']
# 计算占比(避免除零)
discount_ratio = (discount_amount / total_discount) if total_discount > 0 else Decimal('0')
records.append({
'site_id': site_id,
'tenant_id': tenant_id,
'stat_date': stat_date,
'discount_type_code': type_code,
'discount_type_name': discount_info['type_name'],
'discount_amount': discount_amount,
'discount_ratio': round(discount_ratio, 4),
'usage_count': usage_count,
'affected_orders': usage_count, # 简化:使用次数=影响订单数
})
return records
# _parse_id_list / _extract_big_customer_discounts 已迁移至 FinanceBaseTasktask 2.3
def load(self, records: List[Dict[str, Any]], context: TaskContext) -> Dict[str, Any]:
"""
加载数据到目标表
使用幂等方式delete-before-insert按日期范围
"""
if not records:
return {'inserted': 0, 'deleted': 0}
site_id = context.store_id
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
# 删除窗口内的旧数据
delete_sql = """
DELETE FROM dws.dws_finance_discount_detail
WHERE site_id = %(site_id)s
AND stat_date >= %(start_date)s
AND stat_date <= %(end_date)s
"""
deleted = self.db.execute(delete_sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
# 批量插入新数据
insert_sql = """
INSERT INTO dws.dws_finance_discount_detail (
site_id, tenant_id, stat_date,
discount_type_code, discount_type_name,
discount_amount, discount_ratio,
usage_count, affected_orders,
created_at, updated_at
) VALUES (
%(site_id)s, %(tenant_id)s, %(stat_date)s,
%(discount_type_code)s, %(discount_type_name)s,
%(discount_amount)s, %(discount_ratio)s,
%(usage_count)s, %(affected_orders)s,
NOW(), NOW()
)
"""
inserted = 0
for record in records:
self.db.execute(insert_sql, record)
inserted += 1
return {
'deleted': deleted or 0,
'inserted': inserted,
}

View File

@@ -0,0 +1,417 @@
# -*- coding: utf-8 -*-
"""
收入结构分析任务
功能说明:
"日期+区域/类型"为粒度,分析收入结构
数据来源:
- dwd_settlement_head: 结账单头表(台费、商品、助教正价)
- dwd_table_fee_log: 台费流水(区域关联)
- dwd_assistant_service_log: 助教服务流水(区域关联)
- cfg_area_category: 区域分类映射
目标表:
dws.dws_finance_income_structure
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按日期+类型)
业务规则:
- 结构类型1INCOME_TYPE按收入类型分析台费/商品/助教基础课/助教附加课)
- 结构类型2AREA按区域分析普通台球区/VIP包厢/斯诺克/麻将/KTV等
- 区域映射使用cfg_area_category配置
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List, Optional
from .base_dws_task import TaskContext
from .finance_base_task import FinanceBaseTask
# CHANGE 2025-07-16 | task 2.3: 继承 FinanceBaseTask统一财务任务继承链
class FinanceIncomeStructureTask(FinanceBaseTask):
"""
收入结构分析任务
分析收入的两种维度:
1. INCOME_TYPE: 按收入类型(台费/商品/助教基础课/助教附加课)
2. AREA: 按区域使用cfg_area_category映射
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列load() 保留自定义实现(非标准 bulk_insert 路径)
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_FINANCE_INCOME_STRUCTURE"
def get_target_table(self) -> str:
return "dws_finance_income_structure"
def get_primary_keys(self) -> List[str]:
return ["site_id", "stat_date", "structure_type", "category_code"]
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
抽取数据
分两条路径抽取:
1. 按收入类型汇总来自settlement_head
2. 按区域汇总来自table_fee_log和assistant_service_log
"""
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
# 按收入类型汇总
income_by_type = self._extract_income_by_type(site_id, start_date, end_date)
# 按区域汇总
income_by_area = self._extract_income_by_area(site_id, start_date, end_date)
return {
'income_by_type': income_by_type,
'income_by_area': income_by_area,
}
def _extract_income_by_type(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
按收入类型汇总
收入类型分类:
- TABLE_FEE: 台费收入 (table_charge_money)
- GOODS: 商品收入 (goods_money)
- ASSISTANT_BASE: 助教基础课 (assistant_pd_money)
- ASSISTANT_BONUS: 助教附加课 (assistant_cx_money)
"""
sql = """
SELECT
pay_time::DATE AS stat_date,
-- 台费收入
COALESCE(SUM(table_charge_money), 0) AS table_fee_income,
COUNT(CASE WHEN table_charge_money > 0 THEN 1 END) AS table_fee_orders,
-- 商品收入
COALESCE(SUM(goods_money), 0) AS goods_income,
COUNT(CASE WHEN goods_money > 0 THEN 1 END) AS goods_orders,
-- 助教基础课收入PD=陪打)
COALESCE(SUM(assistant_pd_money), 0) AS assistant_base_income,
COUNT(CASE WHEN assistant_pd_money > 0 THEN 1 END) AS assistant_base_orders,
-- 助教附加课收入CX=超休/促销)
COALESCE(SUM(assistant_cx_money), 0) AS assistant_bonus_income,
COUNT(CASE WHEN assistant_cx_money > 0 THEN 1 END) AS assistant_bonus_orders,
-- 总订单数
COUNT(*) AS total_orders
FROM dwd.dwd_settlement_head
WHERE site_id = %(site_id)s
AND pay_time >= %(start_date)s
AND pay_time < %(end_date)s + INTERVAL '1 day'
AND settle_status = 1 -- 已结账
GROUP BY pay_time::DATE
ORDER BY stat_date
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def _extract_income_by_area(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
按区域汇总收入
关联dim_table获取区域名称再映射到cfg_area_category
"""
sql = """
WITH area_orders AS (
SELECT
tfl.pay_time::DATE AS stat_date,
dt.site_table_area_name AS area_name,
tfl.order_settle_id,
COALESCE(tfl.ledger_amount, 0) AS income_amount,
COALESCE(tfl.ledger_time_seconds, 0) AS duration_seconds
FROM dwd.dwd_table_fee_log tfl
LEFT JOIN dwd.dim_table dt
ON dt.site_table_id = tfl.site_table_id
WHERE tfl.site_id = %(site_id)s
AND tfl.pay_time >= %(start_date)s
AND tfl.pay_time < %(end_date)s + INTERVAL '1 day'
AND COALESCE(tfl.is_delete, 0) = 0
UNION ALL
SELECT
asl.start_use_time::DATE AS stat_date,
dt.site_table_area_name AS area_name,
asl.order_settle_id,
COALESCE(asl.ledger_amount, 0) AS income_amount,
COALESCE(asl.income_seconds, 0) AS duration_seconds
FROM dwd.dwd_assistant_service_log asl
LEFT JOIN dwd.dim_table dt
ON dt.site_table_id = asl.site_table_id
WHERE asl.site_id = %(site_id)s
AND asl.start_use_time >= %(start_date)s
AND asl.start_use_time < %(end_date)s + INTERVAL '1 day'
AND asl.is_delete = 0
)
SELECT
stat_date,
area_name,
COALESCE(SUM(income_amount), 0) AS income_amount,
COALESCE(SUM(duration_seconds), 0) AS duration_seconds,
COUNT(DISTINCT order_settle_id) AS order_count
FROM area_orders
GROUP BY stat_date, area_name
ORDER BY stat_date, area_name
"""
rows = self.db.query(sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
return [dict(row) for row in rows] if rows else []
def transform(self, data: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据
将抽取的数据转换为目标表格式:
1. 按收入类型展开(每种类型一条记录)
2. 按区域展开(每个区域一条记录)
3. 计算占比
"""
site_id = context.store_id
tenant_id = self.config.get("app.tenant_id", site_id)
records = []
# 处理按收入类型的数据
income_type_records = self._transform_income_by_type(
data.get('income_by_type', []),
site_id,
tenant_id
)
records.extend(income_type_records)
# 处理按区域的数据
area_records = self._transform_income_by_area(
data.get('income_by_area', []),
site_id,
tenant_id
)
records.extend(area_records)
return records
def _transform_income_by_type(
self,
income_data: List[Dict[str, Any]],
site_id: int,
tenant_id: int
) -> List[Dict[str, Any]]:
"""
转换按收入类型的数据
将每日汇总数据展开为4条记录台费/商品/基础课/附加课)
"""
# 收入类型定义
income_types = [
('TABLE_FEE', '台费收入', 'table_fee_income', 'table_fee_orders'),
('GOODS', '商品收入', 'goods_income', 'goods_orders'),
('ASSISTANT_BASE', '助教基础课', 'assistant_base_income', 'assistant_base_orders'),
('ASSISTANT_BONUS', '助教附加课', 'assistant_bonus_income', 'assistant_bonus_orders'),
]
records = []
for daily_data in income_data:
stat_date = daily_data.get('stat_date')
# 计算当日总收入(用于计算占比)
total_income = sum(
self.safe_decimal(daily_data.get(field, 0))
for _, _, field, _ in income_types
)
# 为每种收入类型生成一条记录
for type_code, type_name, income_field, order_field in income_types:
income_amount = self.safe_decimal(daily_data.get(income_field, 0))
order_count = daily_data.get(order_field, 0) or 0
# 计算占比(避免除零)
income_ratio = (income_amount / total_income) if total_income > 0 else Decimal('0')
records.append({
'site_id': site_id,
'tenant_id': tenant_id,
'stat_date': stat_date,
'structure_type': 'INCOME_TYPE',
'category_code': type_code,
'category_name': type_name,
'income_amount': income_amount,
'income_ratio': round(income_ratio, 4),
'order_count': order_count,
'duration_minutes': 0, # 收入类型维度不统计时长
})
return records
def _transform_income_by_area(
self,
area_data: List[Dict[str, Any]],
site_id: int,
tenant_id: int
) -> List[Dict[str, Any]]:
"""
转换按区域的数据
将区域名称映射到cfg_area_category的category_code
"""
records = []
# 加载区域分类配置
self.load_config_cache()
# 按日期分组计算总收入(用于计算占比)
daily_totals = {}
for row in area_data:
stat_date = row.get('stat_date')
income = self.safe_decimal(row.get('income_amount', 0))
daily_totals[stat_date] = daily_totals.get(stat_date, Decimal('0')) + income
# 按日期+区域聚合相同category_code需要合并
aggregated = {}
for row in area_data:
stat_date = row.get('stat_date')
area_name = row.get('area_name') or '未知区域'
income_amount = self.safe_decimal(row.get('income_amount', 0))
duration_seconds = row.get('duration_seconds', 0) or 0
order_count = row.get('order_count', 0) or 0
# 映射区域名称到分类代码
category = self.get_area_category(area_name)
category_code = category.get('category_code', 'OTHER')
category_name = category.get('category_name', '其他区域')
# 聚合键
key = (stat_date, category_code)
if key not in aggregated:
aggregated[key] = {
'stat_date': stat_date,
'category_code': category_code,
'category_name': category_name,
'income_amount': Decimal('0'),
'duration_seconds': 0,
'order_count': 0,
}
aggregated[key]['income_amount'] += income_amount
aggregated[key]['duration_seconds'] += duration_seconds
aggregated[key]['order_count'] += order_count
# 生成记录
for key, agg_data in aggregated.items():
stat_date = agg_data['stat_date']
total_income = daily_totals.get(stat_date, Decimal('1'))
income_amount = agg_data['income_amount']
# 计算占比
income_ratio = (income_amount / total_income) if total_income > 0 else Decimal('0')
records.append({
'site_id': site_id,
'tenant_id': tenant_id,
'stat_date': stat_date,
'structure_type': 'AREA',
'category_code': agg_data['category_code'],
'category_name': agg_data['category_name'],
'income_amount': income_amount,
'income_ratio': round(income_ratio, 4),
'order_count': agg_data['order_count'],
'duration_minutes': agg_data['duration_seconds'] // 60,
})
return records
def _map_area_to_category(
self,
area_name: str,
area_categories: Dict[str, Dict[str, Any]]
) -> Dict[str, Any]:
"""
兼容旧逻辑的映射方法(当前使用 get_area_category
"""
return self.get_area_category(area_name)
def load(self, records: List[Dict[str, Any]], context: TaskContext) -> Dict[str, Any]:
"""
加载数据到目标表
使用幂等方式delete-before-insert按日期范围
"""
if not records:
return {'inserted': 0, 'deleted': 0}
site_id = context.store_id
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
# 删除窗口内的旧数据
delete_sql = """
DELETE FROM dws.dws_finance_income_structure
WHERE site_id = %(site_id)s
AND stat_date >= %(start_date)s
AND stat_date <= %(end_date)s
"""
deleted = self.db.execute(delete_sql, {
'site_id': site_id,
'start_date': start_date,
'end_date': end_date,
})
# 批量插入新数据
insert_sql = """
INSERT INTO dws.dws_finance_income_structure (
site_id, tenant_id, stat_date,
structure_type, category_code, category_name,
income_amount, income_ratio,
order_count, duration_minutes,
created_at, updated_at
) VALUES (
%(site_id)s, %(tenant_id)s, %(stat_date)s,
%(structure_type)s, %(category_code)s, %(category_name)s,
%(income_amount)s, %(income_ratio)s,
%(order_count)s, %(duration_minutes)s,
NOW(), NOW()
)
"""
inserted = 0
for record in records:
self.db.execute(insert_sql, record)
inserted += 1
return {
'deleted': deleted or 0,
'inserted': inserted,
}

View File

@@ -0,0 +1,168 @@
# -*- coding: utf-8 -*-
"""
充值统计任务
功能说明:
"日期"为粒度,统计充值数据
数据来源:
- dwd_recharge_order: 充值订单
- dim_member_card_account: 会员卡账户(余额快照)
目标表:
dws.dws_finance_recharge_summary
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按日期
业务规则:
- 首充/续充:通过 is_first 字段区分
- 现金/赠送:通过 pay_money/gift_money 区分
- 卡余额:区分储值卡和赠送卡
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date
from decimal import Decimal
from typing import Any, Dict, List
from .base_dws_task import TaskContext
from .finance_base_task import FinanceBaseTask
# CHANGE 2025-07-16 | task 2.3: 继承 FinanceBaseTask复用 _extract_recharge_summary
class FinanceRechargeTask(FinanceBaseTask):
"""
充值统计任务
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列,供基类默认 load() 使用
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_FINANCE_RECHARGE"
def get_target_table(self) -> str:
return "dws_finance_recharge_summary"
def get_primary_keys(self) -> List[str]:
return ["site_id", "stat_date"]
def extract(self, context: TaskContext) -> Dict[str, Any]:
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
recharge_summary = self._extract_recharge_summary(site_id, start_date, end_date)
card_balances = self._extract_card_balances(site_id, end_date)
return {
'recharge_summary': recharge_summary,
'card_balances': card_balances,
'start_date': start_date,
'end_date': end_date,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
recharge_summary = extracted['recharge_summary']
card_balances = extracted['card_balances']
site_id = extracted['site_id']
results = []
for recharge in recharge_summary:
stat_date = recharge.get('stat_date')
# 仅有当前快照时统一写入避免窗口内其他日期为0
balance = card_balances
record = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'stat_date': stat_date,
'recharge_count': self.safe_int(recharge.get('recharge_count', 0)),
'recharge_total': self.safe_decimal(recharge.get('recharge_total', 0)),
'recharge_cash': self.safe_decimal(recharge.get('recharge_cash', 0)),
'recharge_gift': self.safe_decimal(recharge.get('recharge_gift', 0)),
'first_recharge_count': self.safe_int(recharge.get('first_recharge_count', 0)),
'first_recharge_cash': self.safe_decimal(recharge.get('first_recharge_cash', 0)),
'first_recharge_gift': self.safe_decimal(recharge.get('first_recharge_gift', 0)),
'first_recharge_total': self.safe_decimal(recharge.get('first_recharge_total', 0)),
'renewal_count': self.safe_int(recharge.get('renewal_count', 0)),
'renewal_cash': self.safe_decimal(recharge.get('renewal_cash', 0)),
'renewal_gift': self.safe_decimal(recharge.get('renewal_gift', 0)),
'renewal_total': self.safe_decimal(recharge.get('renewal_total', 0)),
'recharge_member_count': self.safe_int(recharge.get('recharge_member_count', 0)),
'new_member_count': self.safe_int(recharge.get('new_member_count', 0)),
'total_card_balance': self.safe_decimal(balance.get('total_balance', 0)),
'cash_card_balance': self.safe_decimal(balance.get('cash_balance', 0)),
'gift_card_balance': self.safe_decimal(balance.get('gift_balance', 0)),
}
results.append(record)
return results
# load() 已移除——使用 BaseDwsTask 默认实现DATE_COL="stat_date"
def _extract_recharge_summary(self, site_id: int, start_date: date, end_date: date) -> List[Dict[str, Any]]:
sql = """
SELECT
DATE(pay_time) AS stat_date,
COUNT(*) AS recharge_count,
SUM(pay_money + gift_money) AS recharge_total,
SUM(pay_money) AS recharge_cash,
SUM(gift_money) AS recharge_gift,
COUNT(CASE WHEN is_first = 1 THEN 1 END) AS first_recharge_count,
SUM(CASE WHEN is_first = 1 THEN pay_money ELSE 0 END) AS first_recharge_cash,
SUM(CASE WHEN is_first = 1 THEN gift_money ELSE 0 END) AS first_recharge_gift,
SUM(CASE WHEN is_first = 1 THEN pay_money + gift_money ELSE 0 END) AS first_recharge_total,
COUNT(CASE WHEN is_first != 1 OR is_first IS NULL THEN 1 END) AS renewal_count,
SUM(CASE WHEN is_first != 1 OR is_first IS NULL THEN pay_money ELSE 0 END) AS renewal_cash,
SUM(CASE WHEN is_first != 1 OR is_first IS NULL THEN gift_money ELSE 0 END) AS renewal_gift,
SUM(CASE WHEN is_first != 1 OR is_first IS NULL THEN pay_money + gift_money ELSE 0 END) AS renewal_total,
COUNT(DISTINCT member_id) AS recharge_member_count,
COUNT(DISTINCT CASE WHEN is_first = 1 THEN member_id END) AS new_member_count
FROM dwd.dwd_recharge_order
WHERE site_id = %s AND DATE(pay_time) >= %s AND DATE(pay_time) <= %s
GROUP BY DATE(pay_time)
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_card_balances(self, site_id: int, stat_date: date) -> Dict[str, Decimal]:
CASH_CARD_TYPE_ID = 2793249295533893
GIFT_CARD_TYPE_IDS = [2791990152417157, 2793266846533445, 2794699703437125]
sql = """
SELECT card_type_id, SUM(balance) AS total_balance
FROM dwd.dim_member_card_account
WHERE site_id = %s AND scd2_is_current = 1
AND COALESCE(is_delete, 0) = 0
GROUP BY card_type_id
"""
rows = self.db.query(sql, (site_id,))
cash_balance = Decimal('0')
gift_balance = Decimal('0')
for row in (rows or []):
card_type_id = row['card_type_id']
balance = self.safe_decimal(row['total_balance'])
if card_type_id == CASH_CARD_TYPE_ID:
cash_balance += balance
elif card_type_id in GIFT_CARD_TYPE_IDS:
gift_balance += balance
return {
'cash_balance': cash_balance,
'gift_balance': gift_balance,
'total_balance': cash_balance + gift_balance
}
__all__ = ['FinanceRechargeTask']

View File

@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# AI_CHANGELOG [2026-02-13] 移除 RecallIndexTask/IntimacyIndexTask 导出,仅保留 WBI/NCI/ML/Relation
"""
指数算法任务模块
包含:
- WinbackIndexTask: 老客挽回指数 (WBI)
- NewconvIndexTask: 新客转化指数 (NCI)
- MlManualImportTask: ML 人工台账导入任务
- RelationIndexTask: 关系指数计算任务RS/OS/MS/ML
"""
from .winback_index_task import WinbackIndexTask
from .newconv_index_task import NewconvIndexTask
from .ml_manual_import_task import MlManualImportTask
from .relation_index_task import RelationIndexTask
__all__ = [
'WinbackIndexTask',
'NewconvIndexTask',
'MlManualImportTask',
'RelationIndexTask',
]

View File

@@ -0,0 +1,572 @@
# -*- coding: utf-8 -*-
# AI_CHANGELOG [2026-02-13] 更新 docstring移除 RECALL/INTIMACY 引用反映当前指数体系WBI/NCI/RS/OS/MS/ML
"""
指数算法任务基类
功能说明:
- 提供半衰期时间衰减函数
- 提供分位数计算和分位截断
- 提供0-10映射方法
- 提供算法参数加载
- 提供分位点历史记录用于EWMA平滑
算法原理:
1. 时间衰减函数半衰期模型decay(d; h) = exp(-ln(2) * d / h)
当 d=h 时权重衰减到 0.5,越近权重越大
2. 0-10映射流程
Raw Score → Winsorize(P5, P95) → [可选Log/asinh压缩] → MinMax(0, 10)
作者ETL团队
创建日期2026-02-03
"""
from __future__ import annotations
import math
from abc import abstractmethod
from dataclasses import dataclass
from datetime import date, datetime
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from ..base_dws_task import BaseDwsTask, TaskContext
# =============================================================================
# 数据类定义
# =============================================================================
@dataclass
class IndexParameters:
"""指数算法参数数据类"""
params: Dict[str, float]
loaded_at: datetime
@dataclass
class PercentileHistory:
"""分位点历史记录"""
percentile_5: float
percentile_95: float
percentile_5_smoothed: float
percentile_95_smoothed: float
record_count: int
calc_time: datetime
# =============================================================================
# 指数任务基类
# =============================================================================
class BaseIndexTask(BaseDwsTask):
"""
指数算法任务基类
提供指数计算通用功能:
1. 半衰期时间衰减函数
2. 分位数计算与截断
3. 0-10归一化映射
4. 算法参数加载
5. 分位点历史管理EWMA平滑
"""
# 子类需要定义的指数类型
INDEX_TYPE: str = ""
# 参数缓存TTL
_index_params_ttl: int = 300
def __init__(self, config, db_connection, api_client, logger):
super().__init__(config, db_connection, api_client, logger)
# 参数缓存:按 index_type 隔离,避免单任务多指数串参
self._index_params_cache_by_type: Dict[str, IndexParameters] = {}
# 默认参数
DEFAULT_LOOKBACK_DAYS = 60
DEFAULT_PERCENTILE_LOWER = 5
DEFAULT_PERCENTILE_UPPER = 95
DEFAULT_EWMA_ALPHA = 0.2
# ==========================================================================
# 抽象方法(子类需实现)
# ==========================================================================
@abstractmethod
def get_index_type(self) -> str:
"""获取指数类型(如 WBI/NCI/RS/OS/MS/ML"""
raise NotImplementedError
# ==========================================================================
# 时间衰减函数
# ==========================================================================
def decay(self, days: float, halflife: float) -> float:
"""
半衰期衰减函数
公式: decay(d; h) = exp(-ln(2) * d / h)
解释:当 d=h 时权重衰减到 0.5;越近权重越大,符合"近期更重要"的直觉
Args:
days: 事件距今天数 (d >= 0)
halflife: 半衰期 (h > 0),单位:天
Returns:
衰减后的权重,范围 (0, 1]
Examples:
>>> decay(0, 7) # 今天,权重=1.0
1.0
>>> decay(7, 7) # 7天前半衰期=7权重=0.5
0.5
>>> decay(14, 7) # 14天前权重=0.25
0.25
"""
if halflife <= 0:
raise ValueError("半衰期必须大于0")
if days < 0:
days = 0
return math.exp(-math.log(2) * days / halflife)
# ==========================================================================
# 分位数计算
# ==========================================================================
def calculate_percentiles(
self,
scores: List[float],
lower: int = 5,
upper: int = 95
) -> Tuple[float, float]:
"""
计算分位点
Args:
scores: 分数列表
lower: 下分位点百分比默认5
upper: 上分位点百分比默认95
Returns:
(下分位值, 上分位值) 元组
"""
if not scores:
return 0.0, 0.0
sorted_scores = sorted(scores)
n = len(sorted_scores)
# 计算分位点索引
lower_idx = max(0, int(n * lower / 100) - 1)
upper_idx = min(n - 1, int(n * upper / 100))
return sorted_scores[lower_idx], sorted_scores[upper_idx]
def winsorize(self, value: float, lower: float, upper: float) -> float:
"""
分位截断Winsorize
将值限制在 [lower, upper] 范围内
Args:
value: 原始值
lower: 下限P5分位
upper: 上限P95分位
Returns:
截断后的值
"""
return min(max(value, lower), upper)
# ==========================================================================
# 0-10映射
# ==========================================================================
def normalize_to_display(
self,
value: float,
min_val: float,
max_val: float,
use_log: bool = False,
compression: Optional[str] = None,
epsilon: float = 1e-6
) -> float:
"""
归一化到0-10分
映射流程:
1. [可选] 压缩y = ln(1 + x) / asinh(x)
2. MinMax映射score = 10 * (y - min) / (max - min)
Args:
value: 原始值已Winsorize
min_val: 最小值通常为P5
max_val: 最大值通常为P95
use_log: 是否使用log1p压缩兼容历史参数
compression: 压缩方式none/log1p/asinh优先级高于use_log
epsilon: 防除零小量
Returns:
0-10范围的分数
"""
compression_mode = self._resolve_compression(compression, use_log)
if compression_mode == "log1p":
value = math.log1p(value)
min_val = math.log1p(min_val)
max_val = math.log1p(max_val)
elif compression_mode == "asinh":
value = math.asinh(value)
min_val = math.asinh(min_val)
max_val = math.asinh(max_val)
# 防止分母为0
range_val = max_val - min_val
if range_val < epsilon:
return 5.0 # 几乎全员相同时返回中间值
score = 10.0 * (value - min_val) / range_val
# 确保在0-10范围内
return max(0.0, min(10.0, score))
def batch_normalize_to_display(
self,
raw_scores: List[Tuple[Any, float]], # [(entity_id, raw_score), ...]
use_log: bool = False,
compression: Optional[str] = None,
percentile_lower: int = 5,
percentile_upper: int = 95,
use_smoothing: bool = False,
site_id: Optional[int] = None,
index_type: Optional[str] = None,
) -> List[Tuple[Any, float, float]]:
"""
批量归一化Raw Score到Display Score
流程:
1. 提取所有raw_score
2. 计算分位点可选EWMA平滑
3. Winsorize截断
4. MinMax映射到0-10
Args:
raw_scores: (entity_id, raw_score) 元组列表
use_log: 是否使用log1p压缩兼容历史参数
compression: 压缩方式none/log1p/asinh优先级高于use_log
percentile_lower: 下分位百分比
percentile_upper: 上分位百分比
use_smoothing: 是否使用EWMA平滑分位点
site_id: 门店ID平滑时需要
index_type: 指数类型(平滑时用于分位历史隔离)
Returns:
(entity_id, raw_score, display_score) 元组列表
"""
if not raw_scores:
return []
# 提取raw_score
scores = [s for _, s in raw_scores]
# 计算分位点
q_l, q_u = self.calculate_percentiles(scores, percentile_lower, percentile_upper)
# EWMA平滑
if use_smoothing and site_id is not None:
q_l, q_u = self._apply_ewma_smoothing(
site_id=site_id,
current_p5=q_l,
current_p95=q_u,
index_type=index_type,
)
# 映射
results = []
compression_mode = self._resolve_compression(compression, use_log)
for entity_id, raw_score in raw_scores:
clipped = self.winsorize(raw_score, q_l, q_u)
display = self.normalize_to_display(
clipped,
q_l,
q_u,
compression=compression_mode,
)
results.append((entity_id, raw_score, round(display, 2)))
return results
# ==========================================================================
# 算法参数加载
# ==========================================================================
def load_index_parameters(
self,
index_type: Optional[str] = None,
force_reload: bool = False
) -> Dict[str, float]:
"""
加载指数算法参数
Args:
index_type: 指数类型默认使用子类定义的INDEX_TYPE
force_reload: 是否强制重新加载
Returns:
参数名到参数值的字典
"""
if index_type is None:
index_type = self.get_index_type()
now = datetime.now(self.tz)
cache_key = str(index_type).upper()
cache_item = self._index_params_cache_by_type.get(cache_key)
# 检查缓存
if (
not force_reload
and cache_item is not None
and (now - cache_item.loaded_at).total_seconds() < self._index_params_ttl
):
return cache_item.params
self.logger.debug("加载指数算法参数: %s", index_type)
sql = """
SELECT param_name, param_value
FROM dws.cfg_index_parameters
WHERE index_type = %s
AND effective_from <= CURRENT_DATE
AND (effective_to IS NULL OR effective_to >= CURRENT_DATE)
ORDER BY effective_from DESC
"""
rows = self.db.query(sql, (index_type,))
params = {}
seen = set()
for row in (rows or []):
row_dict = dict(row)
name = row_dict['param_name']
if name not in seen:
params[name] = float(row_dict['param_value'])
seen.add(name)
self._index_params_cache_by_type[cache_key] = IndexParameters(
params=params,
loaded_at=now
)
return params
def get_param(
self,
name: str,
default: float = 0.0,
index_type: Optional[str] = None,
) -> float:
"""
获取单个参数值
Args:
name: 参数名
default: 默认值
Returns:
参数值
"""
params = self.load_index_parameters(index_type=index_type)
return params.get(name, default)
# ==========================================================================
# 分位点历史管理EWMA平滑
# ==========================================================================
def get_last_percentile_history(
self,
site_id: int,
index_type: Optional[str] = None
) -> Optional[PercentileHistory]:
"""
获取最近一次分位点历史
Args:
site_id: 门店ID
index_type: 指数类型
Returns:
PercentileHistory 或 None
"""
if index_type is None:
index_type = self.get_index_type()
sql = """
SELECT
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, calc_time
FROM dws.dws_index_percentile_history
WHERE site_id = %s AND index_type = %s
ORDER BY calc_time DESC
LIMIT 1
"""
rows = self.db.query(sql, (site_id, index_type))
if not rows:
return None
row = dict(rows[0])
return PercentileHistory(
percentile_5=float(row['percentile_5'] or 0),
percentile_95=float(row['percentile_95'] or 0),
percentile_5_smoothed=float(row['percentile_5_smoothed'] or 0),
percentile_95_smoothed=float(row['percentile_95_smoothed'] or 0),
record_count=int(row['record_count'] or 0),
calc_time=row['calc_time']
)
def save_percentile_history(
self,
site_id: int,
percentile_5: float,
percentile_95: float,
percentile_5_smoothed: float,
percentile_95_smoothed: float,
record_count: int,
min_raw: float,
max_raw: float,
avg_raw: float,
index_type: Optional[str] = None
) -> None:
"""
保存分位点历史
Args:
site_id: 门店ID
percentile_5: 原始5分位
percentile_95: 原始95分位
percentile_5_smoothed: 平滑后5分位
percentile_95_smoothed: 平滑后95分位
record_count: 记录数
min_raw: 最小Raw Score
max_raw: 最大Raw Score
avg_raw: 平均Raw Score
index_type: 指数类型
"""
if index_type is None:
index_type = self.get_index_type()
sql = """
INSERT INTO dws.dws_index_percentile_history (
site_id, index_type, calc_time,
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, min_raw_score, max_raw_score, avg_raw_score
) VALUES (%s, %s, NOW(), %s, %s, %s, %s, %s, %s, %s, %s)
"""
with self.db.conn.cursor() as cur:
cur.execute(sql, (
site_id, index_type,
percentile_5, percentile_95,
percentile_5_smoothed, percentile_95_smoothed,
record_count, min_raw, max_raw, avg_raw
))
self.db.conn.commit()
def _apply_ewma_smoothing(
self,
site_id: int,
current_p5: float,
current_p95: float,
alpha: Optional[float] = None,
index_type: Optional[str] = None,
) -> Tuple[float, float]:
"""
应用EWMA平滑到分位点
公式: Q_t = (1 - α) * Q_{t-1} + α * Q_now
Args:
site_id: 门店ID
current_p5: 当前5分位
current_p95: 当前95分位
alpha: 平滑系数默认0.2
index_type: 指数类型(用于参数和历史隔离)
Returns:
(平滑后的P5, 平滑后的P95)
"""
if index_type is None:
index_type = self.get_index_type()
if alpha is None:
alpha = self.get_param(
'ewma_alpha',
self.DEFAULT_EWMA_ALPHA,
index_type=index_type,
)
history = self.get_last_percentile_history(site_id, index_type=index_type)
if history is None:
# 首次计算,不平滑
return current_p5, current_p95
smoothed_p5 = (1 - alpha) * history.percentile_5_smoothed + alpha * current_p5
smoothed_p95 = (1 - alpha) * history.percentile_95_smoothed + alpha * current_p95
return smoothed_p5, smoothed_p95
# ==========================================================================
# 统计工具方法
# ==========================================================================
def calculate_median(self, values: List[float]) -> float:
"""计算中位数"""
if not values:
return 0.0
sorted_vals = sorted(values)
n = len(sorted_vals)
mid = n // 2
if n % 2 == 0:
return (sorted_vals[mid - 1] + sorted_vals[mid]) / 2
return sorted_vals[mid]
def calculate_mad(self, values: List[float]) -> float:
"""
计算MAD中位绝对偏差
MAD = median(|x - median(x)|)
MAD是比标准差更稳健的离散度度量不受极端值影响
"""
if not values:
return 0.0
median_val = self.calculate_median(values)
deviations = [abs(v - median_val) for v in values]
return self.calculate_median(deviations)
def safe_log(self, value: float, default: float = 0.0) -> float:
"""安全的对数运算"""
if value <= 0:
return default
return math.log(value)
def safe_ln1p(self, value: float) -> float:
"""安全的ln(1+x)运算"""
if value < -1:
return 0.0
return math.log1p(value)
def _resolve_compression(self, compression: Optional[str], use_log: bool) -> str:
"""规范化压缩方式"""
if compression is None:
return "log1p" if use_log else "none"
compression_key = str(compression).strip().lower()
if compression_key in ("none", "log1p", "asinh"):
return compression_key
if hasattr(self, "logger"):
self.logger.warning("未知压缩方式: %s,已降级为 none", compression)
return "none"

View File

@@ -0,0 +1,545 @@
# -*- coding: utf-8 -*-
"""
会员层召回/转化指数共享逻辑
"""
from __future__ import annotations
from dataclasses import dataclass, field
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask
from ..base_dws_task import TaskContext
@dataclass
class MemberActivityData:
"""Shared member activity features for WBI/NCI."""
member_id: int
site_id: int
tenant_id: int
member_create_time: Optional[datetime] = None
first_visit_time: Optional[datetime] = None
last_visit_time: Optional[datetime] = None
last_recharge_time: Optional[datetime] = None
t_v: float = 60.0
t_r: float = 60.0
t_a: float = 60.0
days_since_first_visit: Optional[int] = None
days_since_last_visit: Optional[int] = None
days_since_last_recharge: Optional[int] = None
visits_14d: int = 0
visits_60d: int = 0
visits_total: int = 0
spend_30d: float = 0.0
spend_180d: float = 0.0
sv_balance: float = 0.0
recharge_60d_amt: float = 0.0
interval_count: int = 0
intervals: List[float] = field(default_factory=list)
interval_ages_days: List[int] = field(default_factory=list)
recharge_unconsumed: int = 0
class MemberIndexBaseTask(BaseIndexTask):
"""会员指数共享基类,提供 execute() 模板方法。
子类只需实现:
- _calculate_scores(): 对会员活动数据计算原始评分
- _save_results(): 将归一化后的评分持久化到数据库
模板方法 execute() 按顺序执行:
获取站点信息 → 加载参数 → 构建会员活动数据 → _calculate_scores()
→ 归一化 → _save_results()
"""
DEFAULT_VISIT_LOOKBACK_DAYS = 180
DEFAULT_RECENCY_LOOKBACK_DAYS = 60
CASH_CARD_TYPE_ID = 2793249295533893
def execute(self, context: Optional[TaskContext] = None) -> Dict[str, Any]:
"""模板方法:编排会员指数计算的完整流程。
流程:
1. 获取 site_id / tenant_id
2. 加载算法参数
3. 构建会员活动特征
4. 调用子类 _calculate_scores() 计算原始评分
5. 调用子类 _save_results() 完成归一化与持久化
Returns:
与原子类 execute() 相同结构的结果字典
"""
self.logger.info("开始计算 %s", self.get_task_code())
site_id = self._get_site_id(context)
tenant_id = self._get_tenant_id()
params = self._load_params()
activities = self._build_member_activity(site_id, tenant_id, params)
if not activities:
self.logger.warning("No member activity data available; skip calculation")
return {'status': 'skipped', 'reason': 'no_data'}
raw_scores = self._calculate_scores(activities, params, site_id, tenant_id)
result = self._save_results(raw_scores, site_id, tenant_id, params, context)
return result
def _calculate_scores(
self,
activities: Dict[int, 'MemberActivityData'],
params: Dict[str, float],
site_id: int,
tenant_id: int,
) -> Any:
"""子类实现:对会员活动数据计算原始评分。
Args:
activities: member_id → MemberActivityData 映射
params: 算法参数字典
site_id: 门店ID
tenant_id: 租户ID
Returns:
原始评分数据(具体结构由子类定义)
"""
raise NotImplementedError(
f"{self.__class__.__name__} 必须实现 _calculate_scores()"
)
def _save_results(
self,
raw_scores: Any,
site_id: int,
tenant_id: int,
params: Dict[str, float],
context: Optional[TaskContext],
) -> Dict[str, Any]:
"""子类实现:将评分数据归一化并持久化。
Args:
raw_scores: _calculate_scores() 的返回值
site_id: 门店ID
tenant_id: 租户ID
params: 算法参数字典
context: 任务上下文
Returns:
包含 status、member_count、records_inserted 等键的结果字典
"""
raise NotImplementedError(
f"{self.__class__.__name__} 必须实现 _save_results()"
)
def _get_site_id(self, context: Optional[TaskContext]) -> int:
"""获取门店ID"""
if context and hasattr(context, 'store_id') and context.store_id:
return context.store_id
site_id = self.config.get('app.default_site_id') or self.config.get('app.store_id')
if site_id is not None:
return int(site_id)
sql = "SELECT DISTINCT site_id FROM dwd.dwd_settlement_head WHERE site_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
value = dict(rows[0]).get('site_id')
if value is not None:
return int(value)
self.logger.warning("无法确定门店ID使用 0 继续执行")
return 0
def _get_tenant_id(self) -> int:
"""获取租户ID"""
tenant_id = self.config.get('app.tenant_id')
if tenant_id is not None:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM dwd.dwd_settlement_head WHERE tenant_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
value = dict(rows[0]).get('tenant_id')
if value is not None:
return int(value)
self.logger.warning("无法确定租户ID使用 0 继续执行")
return 0
def _load_params(self) -> Dict[str, float]:
"""Load index parameters with defaults and runtime overrides."""
params = self.load_index_parameters()
result = dict(self.DEFAULT_PARAMS)
result.update(params)
# GUI/环境变量可通过 run.index_lookback_days 覆盖 recency 窗口
override_days = self.config.get('run.index_lookback_days')
if override_days is not None:
try:
override_days_int = int(override_days)
if override_days_int < 7 or override_days_int > 180:
self.logger.warning(
"%s: run.index_lookback_days=%s 超出建议范围[7,180],已自动截断",
self.get_task_code(),
override_days,
)
override_days_int = max(7, min(180, override_days_int))
result['lookback_days_recency'] = float(override_days_int)
self.logger.info(
"%s: 使用回溯天数覆盖 lookback_days_recency=%d",
self.get_task_code(),
override_days_int,
)
except (TypeError, ValueError):
self.logger.warning(
"%s: run.index_lookback_days=%s is invalid; ignore override and use parameter table value",
self.get_task_code(),
override_days,
)
return result
def _build_visit_condition_sql(self) -> str:
"""Build visit-scope condition SQL."""
return """
(
s.settle_type = 1
OR (
s.settle_type = 3
AND EXISTS (
SELECT 1
FROM dwd.dwd_assistant_service_log asl
JOIN dws.cfg_skill_type st
ON asl.skill_id = st.skill_id
AND st.course_type_code = 'BONUS'
AND st.is_active = TRUE
WHERE asl.order_settle_id = s.order_settle_id
AND asl.site_id = s.site_id
AND asl.tenant_member_id = s.member_id
AND asl.is_delete = 0
)
)
)
"""
def _extract_visit_day_rows(
self,
site_id: int,
start_date: date,
end_date: date,
) -> List[Dict[str, Any]]:
"""提取到店记录(按天去重)"""
condition_sql = self._build_visit_condition_sql()
sql = f"""
WITH visit_source AS (
SELECT
COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
s.pay_time,
s.pay_amount
FROM dwd.dwd_settlement_head s
LEFT JOIN dwd.dim_member_card_account mca
ON s.member_card_account_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = s.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE s.site_id = %s
AND s.pay_time >= %s
AND s.pay_time < %s + INTERVAL '1 day'
AND {condition_sql}
)
SELECT
canonical_member_id AS member_id,
DATE(pay_time) AS visit_date,
MAX(pay_time) AS last_visit_time,
SUM(COALESCE(pay_amount, 0)) AS day_pay_amount
FROM visit_source
WHERE canonical_member_id > 0
GROUP BY canonical_member_id, DATE(pay_time)
ORDER BY canonical_member_id, visit_date
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in (rows or [])]
def _extract_recharge_rows(
self,
site_id: int,
start_date: date,
end_date: date,
) -> Dict[int, Dict[str, Any]]:
"""提取充值记录近60天"""
sql = """
WITH recharge_source AS (
SELECT
COALESCE(NULLIF(r.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
r.pay_time,
r.pay_amount
FROM dwd.dwd_recharge_order r
LEFT JOIN dwd.dim_member_card_account mca
ON r.tenant_member_card_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = r.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE r.site_id = %s
AND r.settle_type = 5
AND r.pay_time >= %s
AND r.pay_time < %s + INTERVAL '1 day'
)
SELECT
canonical_member_id AS member_id,
MAX(pay_time) AS last_recharge_time,
SUM(COALESCE(pay_amount, 0)) AS recharge_60d_amt
FROM recharge_source
WHERE canonical_member_id > 0
GROUP BY canonical_member_id
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
result: Dict[int, Dict[str, Any]] = {}
for row in (rows or []):
row_dict = dict(row)
result[int(row_dict['member_id'])] = row_dict
return result
def _extract_member_create_times(self, member_ids: List[int]) -> Dict[int, datetime]:
"""提取会员建档时间"""
if not member_ids:
return {}
member_ids_str = ','.join(str(m) for m in member_ids)
sql = f"""
SELECT
member_id,
create_time
FROM dwd.dim_member
WHERE member_id IN ({member_ids_str})
AND scd2_is_current = 1
"""
rows = self.db.query(sql)
result = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
create_time = row_dict.get('create_time')
if create_time:
result[member_id] = create_time
return result
def _extract_first_visit_times(self, site_id: int, member_ids: List[int]) -> Dict[int, datetime]:
"""提取首次到店时间(全量)"""
if not member_ids:
return {}
member_ids_str = ','.join(str(m) for m in member_ids)
condition_sql = self._build_visit_condition_sql()
sql = f"""
WITH visit_source AS (
SELECT
COALESCE(NULLIF(s.member_id, 0), mca.tenant_member_id) AS canonical_member_id,
s.pay_time
FROM dwd.dwd_settlement_head s
LEFT JOIN dwd.dim_member_card_account mca
ON s.member_card_account_id = mca.member_card_id
AND mca.scd2_is_current = 1
AND mca.register_site_id = s.site_id
AND COALESCE(mca.is_delete, 0) = 0
WHERE s.site_id = %s
AND {condition_sql}
)
SELECT
canonical_member_id AS member_id,
MIN(pay_time) AS first_visit_time
FROM visit_source
WHERE canonical_member_id IN ({member_ids_str})
GROUP BY canonical_member_id
"""
rows = self.db.query(sql, (site_id,))
result = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
first_visit_time = row_dict.get('first_visit_time')
if first_visit_time:
result[member_id] = first_visit_time
return result
def _extract_sv_balances(self, site_id: int, tenant_id: int, member_ids: List[int]) -> Dict[int, Decimal]:
"""Fetch member stored-value card balances."""
if not member_ids:
return {}
member_ids_str = ','.join(str(m) for m in member_ids)
sql = f"""
SELECT
tenant_member_id AS member_id,
SUM(CASE WHEN card_type_id = %s THEN balance ELSE 0 END) AS sv_balance
FROM dwd.dim_member_card_account
WHERE tenant_id = %s
AND register_site_id = %s
AND scd2_is_current = 1
AND COALESCE(is_delete, 0) = 0
AND tenant_member_id IN ({member_ids_str})
GROUP BY tenant_member_id
"""
rows = self.db.query(sql, (self.CASH_CARD_TYPE_ID, tenant_id, site_id))
result: Dict[int, Decimal] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = int(row_dict['member_id'])
result[member_id] = row_dict.get('sv_balance') or Decimal('0')
return result
def _build_member_activity(
self,
site_id: int,
tenant_id: int,
params: Dict[str, float],
) -> Dict[int, MemberActivityData]:
"""构建会员活动特征"""
now = datetime.now(self.tz)
base_date = now.date()
visit_lookback_days = int(params.get('visit_lookback_days', self.DEFAULT_VISIT_LOOKBACK_DAYS))
recency_days = int(params.get('lookback_days_recency', self.DEFAULT_RECENCY_LOOKBACK_DAYS))
visit_start_date = base_date - timedelta(days=visit_lookback_days)
visit_rows = self._extract_visit_day_rows(site_id, visit_start_date, base_date)
member_day_rows: Dict[int, List[Dict[str, Any]]] = {}
for row in (visit_rows or []):
member_id = int(row['member_id'])
member_day_rows.setdefault(member_id, []).append(row)
recharge_start_date = base_date - timedelta(days=recency_days)
recharge_rows = self._extract_recharge_rows(site_id, recharge_start_date, base_date)
member_ids = set(member_day_rows.keys()) | set(recharge_rows.keys())
if not member_ids:
return {}
member_id_list = list(member_ids)
member_create_times = self._extract_member_create_times(member_id_list)
first_visit_times = self._extract_first_visit_times(site_id, member_id_list)
sv_balances = self._extract_sv_balances(site_id, tenant_id, member_id_list)
results: Dict[int, MemberActivityData] = {}
for member_id in member_ids:
data = MemberActivityData(
member_id=member_id,
site_id=site_id,
tenant_id=tenant_id,
)
day_rows = member_day_rows.get(member_id, [])
if day_rows:
day_rows_sorted = sorted(day_rows, key=lambda x: x['visit_date'])
data.visits_total = len(day_rows_sorted)
last_visit_time = max(r.get('last_visit_time') for r in day_rows_sorted)
data.last_visit_time = last_visit_time
# 近14/60天到店次数
days_14_ago = base_date - timedelta(days=14)
days_60_ago = base_date - timedelta(days=60)
for r in day_rows_sorted:
visit_date = r.get('visit_date')
if visit_date is None:
continue
if visit_date >= days_14_ago:
data.visits_14d += 1
if visit_date >= days_60_ago:
data.visits_60d += 1
# 消费金额
days_30_ago = base_date - timedelta(days=30)
for r in day_rows_sorted:
visit_date = r.get('visit_date')
day_pay = float(r.get('day_pay_amount') or 0)
data.spend_180d += day_pay
if visit_date and visit_date >= days_30_ago:
data.spend_30d += day_pay
# 计算到店间隔(按天)
visit_dates = [r.get('visit_date') for r in day_rows_sorted if r.get('visit_date')]
intervals: List[float] = []
interval_ages_days: List[int] = []
for i in range(1, len(visit_dates)):
interval = (visit_dates[i] - visit_dates[i - 1]).days
intervals.append(float(min(recency_days, interval)))
interval_ages_days.append(max(0, (base_date - visit_dates[i]).days))
data.intervals = intervals
data.interval_ages_days = interval_ages_days
data.interval_count = len(intervals)
recharge_info = recharge_rows.get(member_id)
if recharge_info:
data.last_recharge_time = recharge_info.get('last_recharge_time')
data.recharge_60d_amt = float(recharge_info.get('recharge_60d_amt') or 0)
data.member_create_time = member_create_times.get(member_id)
data.first_visit_time = first_visit_times.get(member_id)
sv_balance = sv_balances.get(member_id)
if sv_balance is not None:
data.sv_balance = float(sv_balance)
# 时间差计算
if data.first_visit_time:
data.days_since_first_visit = (base_date - data.first_visit_time.date()).days
if data.last_visit_time:
data.days_since_last_visit = (base_date - data.last_visit_time.date()).days
if data.last_recharge_time:
data.days_since_last_recharge = (base_date - data.last_recharge_time.date()).days
# tV/tR/tA
data.t_v = float(min(recency_days, data.days_since_last_visit)) if data.days_since_last_visit is not None else float(recency_days)
data.t_r = float(min(recency_days, data.days_since_last_recharge)) if data.days_since_last_recharge is not None else float(recency_days)
data.t_a = float(min(data.t_v, data.t_r))
# 充值是否未回访
if data.last_recharge_time and (data.last_visit_time is None or data.last_recharge_time > data.last_visit_time):
data.recharge_unconsumed = 1
results[member_id] = data
return results
def classify_segment(
self,
data: MemberActivityData,
params: Dict[str, float],
) -> Tuple[str, str, bool]:
"""Classify member into NEW/OLD/STOP buckets."""
recency_days = int(params.get('lookback_days_recency', self.DEFAULT_RECENCY_LOOKBACK_DAYS))
enable_stop_exception = int(params.get('enable_stop_high_balance_exception', 0)) == 1
high_balance_threshold = float(params.get('high_balance_threshold', 1000))
if data.t_a >= recency_days:
if enable_stop_exception and data.sv_balance >= high_balance_threshold:
return "STOP", "STOP_HIGH_BALANCE", True
return "STOP", "STOP", False
new_visit_threshold = int(params.get('new_visit_threshold', 2))
new_days_threshold = int(params.get('new_days_threshold', 30))
recharge_recent_days = int(params.get('recharge_recent_days', 14))
new_recharge_max_visits = int(params.get('new_recharge_max_visits', 10))
is_new_by_visits = data.visits_total <= new_visit_threshold
is_new_by_first_visit = data.days_since_first_visit is not None and data.days_since_first_visit <= new_days_threshold
is_new_by_recharge = (
data.recharge_unconsumed == 1
and data.days_since_last_recharge is not None
and data.days_since_last_recharge <= recharge_recent_days
and data.visits_total <= new_recharge_max_visits
)
if is_new_by_visits or is_new_by_first_visit or is_new_by_recharge:
return "NEW", "NEW", True
return "OLD", "OLD", True

View File

@@ -0,0 +1,623 @@
# -*- coding: utf-8 -*-
"""
ML 人工台账导入任务。
设计目标:
1. 人工台账作为 ML 唯一真源;
2. 同一订单支持多助教归因,默认均分;
3. 覆盖策略:
- 近 30 天:按 site_id + biz_date 日覆盖;
- 超过 30 天按固定纪元2026-01-01切 30 天批次覆盖。
"""
from __future__ import annotations
import os
import uuid
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from decimal import Decimal
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
from .base_index_task import BaseIndexTask
from ..base_dws_task import TaskContext
@dataclass(frozen=True)
class ImportScope:
"""导入覆盖范围定义。"""
site_id: int
scope_type: str # DAY / P30
start_date: date
end_date: date
@property
def scope_key(self) -> str:
if self.scope_type == "DAY":
return f"DAY:{self.site_id}:{self.start_date.isoformat()}"
return (
f"P30:{self.site_id}:{self.start_date.isoformat()}:{self.end_date.isoformat()}"
)
class MlManualImportTask(BaseIndexTask):
"""导入并拆分 ML 人工台账(订单宽表 + 助教分摊窄表)。"""
INDEX_TYPE = "ML"
EPOCH_ANCHOR = date(2026, 1, 1)
HISTORICAL_BUCKET_DAYS = 30
ASSISTANT_SLOT_COUNT = 5
# Excel 模板字段(按列顺序)
TEMPLATE_COLUMNS = [
"site_id",
"biz_date",
"external_id",
"member_id",
"pay_time",
"order_amount",
"currency",
"assistant_id_1",
"assistant_name_1",
"assistant_id_2",
"assistant_name_2",
"assistant_id_3",
"assistant_name_3",
"assistant_id_4",
"assistant_name_4",
"assistant_id_5",
"assistant_name_5",
"remark",
]
def get_task_code(self) -> str:
return "DWS_ML_MANUAL_IMPORT"
def get_target_table(self) -> str:
return "dws_ml_manual_order_source"
def get_primary_keys(self) -> List[str]:
return ["site_id", "external_id", "import_scope_key", "row_no"]
def get_index_type(self) -> str:
return self.INDEX_TYPE
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
"""
执行导入。
说明:该任务按“文件”运行,不依赖时间窗口。调度器会以工具任务方式直接触发。
"""
file_path = self._resolve_file_path()
if not file_path:
raise ValueError(
"未找到 ML 台账文件,请通过环境变量 ML_MANUAL_LEDGER_FILE 或配置 run.ml_manual_ledger_file 指定"
)
rows = self._read_excel_rows(file_path)
if not rows:
self.logger.warning("台账文件为空:%s", file_path)
return {
"status": "SUCCESS",
"counts": {
"source_rows": 0,
"alloc_rows": 0,
"deleted_source_rows": 0,
"deleted_alloc_rows": 0,
"scopes": 0,
},
}
now = datetime.now(self.tz)
today = now.date()
import_batch_no = self._build_import_batch_no(now)
import_file_name = Path(file_path).name
import_user = self._resolve_import_user()
source_rows: List[Dict[str, Any]] = []
alloc_rows: List[Dict[str, Any]] = []
scope_set: Dict[Tuple[int, str, date, date], ImportScope] = {}
for idx, raw in enumerate(rows, start=2):
normalized = self._normalize_row(raw, row_no=idx, file_path=file_path)
row_scope = self.resolve_scope(
site_id=normalized["site_id"],
biz_date=normalized["biz_date"],
today=today,
)
scope_set[(row_scope.site_id, row_scope.scope_type, row_scope.start_date, row_scope.end_date)] = row_scope
source_row = self._build_source_row(
normalized=normalized,
scope=row_scope,
import_batch_no=import_batch_no,
import_file_name=import_file_name,
import_user=import_user,
import_time=now,
)
source_rows.append(source_row)
alloc_rows.extend(
self._build_alloc_rows(
normalized=normalized,
scope=row_scope,
import_batch_no=import_batch_no,
import_file_name=import_file_name,
import_user=import_user,
import_time=now,
)
)
scopes = list(scope_set.values())
deleted_source_rows, deleted_alloc_rows = self._delete_by_scopes(scopes)
inserted_source = self._insert_source_rows(source_rows)
upserted_alloc = self._upsert_alloc_rows(alloc_rows)
self.db.conn.commit()
self.logger.info(
"ML 人工台账导入完成: file=%s source=%d alloc=%d scopes=%d",
file_path,
inserted_source,
upserted_alloc,
len(scopes),
)
return {
"status": "SUCCESS",
"counts": {
"source_rows": inserted_source,
"alloc_rows": upserted_alloc,
"deleted_source_rows": deleted_source_rows,
"deleted_alloc_rows": deleted_alloc_rows,
"scopes": len(scopes),
},
}
def _resolve_file_path(self) -> Optional[str]:
"""解析台账文件路径。"""
raw_path = (
self.config.get("run.ml_manual_ledger_file")
or self.config.get("run.ml_manual_file")
or os.getenv("ML_MANUAL_LEDGER_FILE")
)
if not raw_path:
return None
candidate = Path(str(raw_path)).expanduser()
if not candidate.is_absolute():
candidate = Path.cwd() / candidate
if not candidate.exists():
raise FileNotFoundError(f"台账文件不存在: {candidate}")
return str(candidate)
def _read_excel_rows(self, file_path: str) -> List[Dict[str, Any]]:
"""读取 Excel 为行字典列表。"""
try:
from openpyxl import load_workbook
except Exception as exc: # noqa: BLE001
raise RuntimeError(
"缺少 openpyxl 依赖,无法读取 Excel请先安装 openpyxl"
) from exc
wb = load_workbook(file_path, data_only=True)
ws = wb.active
header_row = next(ws.iter_rows(min_row=1, max_row=1, values_only=True), None)
if not header_row:
return []
headers = [str(col).strip() if col is not None else "" for col in header_row]
if not headers:
return []
rows: List[Dict[str, Any]] = []
for values in ws.iter_rows(min_row=2, values_only=True):
if values is None:
continue
row_dict = {headers[i]: values[i] for i in range(min(len(headers), len(values)))}
if self._is_empty_row(row_dict):
continue
rows.append(row_dict)
return rows
@staticmethod
def _is_empty_row(row: Dict[str, Any]) -> bool:
for value in row.values():
if value is None:
continue
if isinstance(value, str) and not value.strip():
continue
return False
return True
def _normalize_row(
self,
raw: Dict[str, Any],
row_no: int,
file_path: str,
) -> Dict[str, Any]:
"""规范化单行字段。"""
site_id = self._to_int(raw.get("site_id"), fallback=self.config.get("app.store_id"))
biz_date = self._to_date(raw.get("biz_date"))
pay_time = self._to_datetime(raw.get("pay_time"), fallback_date=biz_date)
external_id = str(raw.get("external_id") or "").strip()
if not external_id:
raise ValueError(f"台账行 {row_no} 缺少 external_id订单ID: {file_path}")
member_id = self._to_int(raw.get("member_id"), fallback=0)
order_amount = self._to_decimal(raw.get("order_amount"))
currency = str(raw.get("currency") or "CNY").strip().upper() or "CNY"
remark = str(raw.get("remark") or "").strip()
assistants: List[Tuple[int, str]] = []
for idx in range(1, self.ASSISTANT_SLOT_COUNT + 1):
aid = self._to_int(raw.get(f"assistant_id_{idx}"), fallback=None)
name = str(raw.get(f"assistant_name_{idx}") or "").strip()
if aid is None:
continue
assistants.append((aid, name))
return {
"site_id": site_id,
"biz_date": biz_date,
"external_id": external_id,
"member_id": member_id,
"pay_time": pay_time,
"order_amount": order_amount,
"currency": currency,
"assistants": assistants,
"remark": remark,
"row_no": row_no,
}
def _build_source_row(
self,
*,
normalized: Dict[str, Any],
scope: ImportScope,
import_batch_no: str,
import_file_name: str,
import_user: str,
import_time: datetime,
) -> Dict[str, Any]:
"""构造宽表入库行。"""
assistants: Sequence[Tuple[int, str]] = normalized["assistants"]
row = {
"site_id": normalized["site_id"],
"biz_date": normalized["biz_date"],
"external_id": normalized["external_id"],
"member_id": normalized["member_id"],
"pay_time": normalized["pay_time"],
"order_amount": normalized["order_amount"],
"currency": normalized["currency"],
"import_batch_no": import_batch_no,
"import_file_name": import_file_name,
"import_scope_key": scope.scope_key,
"import_time": import_time,
"import_user": import_user,
"row_no": normalized["row_no"],
"remark": normalized["remark"],
}
for idx in range(1, self.ASSISTANT_SLOT_COUNT + 1):
aid, aname = (assistants[idx - 1] if idx - 1 < len(assistants) else (None, None))
row[f"assistant_id_{idx}"] = aid
row[f"assistant_name_{idx}"] = aname
return row
def _build_alloc_rows(
self,
*,
normalized: Dict[str, Any],
scope: ImportScope,
import_batch_no: str,
import_file_name: str,
import_user: str,
import_time: datetime,
) -> List[Dict[str, Any]]:
"""构造窄表分摊行。"""
assistants: Sequence[Tuple[int, str]] = normalized["assistants"]
if not assistants:
return []
n = Decimal(str(len(assistants)))
share_ratio = Decimal("1") / n
rows: List[Dict[str, Any]] = []
for assistant_id, assistant_name in assistants:
allocated_amount = normalized["order_amount"] * share_ratio
rows.append(
{
"site_id": normalized["site_id"],
"biz_date": normalized["biz_date"],
"external_id": normalized["external_id"],
"member_id": normalized["member_id"],
"pay_time": normalized["pay_time"],
"order_amount": normalized["order_amount"],
"assistant_id": assistant_id,
"assistant_name": assistant_name,
"share_ratio": share_ratio,
"allocated_amount": allocated_amount,
"currency": normalized["currency"],
"import_scope_key": scope.scope_key,
"import_batch_no": import_batch_no,
"import_file_name": import_file_name,
"import_time": import_time,
"import_user": import_user,
}
)
return rows
@classmethod
def resolve_scope(cls, site_id: int, biz_date: date, today: date) -> ImportScope:
"""按规则解析覆盖范围。"""
day_diff = (today - biz_date).days
if day_diff <= cls.HISTORICAL_BUCKET_DAYS:
return ImportScope(
site_id=site_id,
scope_type="DAY",
start_date=biz_date,
end_date=biz_date,
)
bucket_start, bucket_end = cls.resolve_p30_bucket(biz_date)
return ImportScope(
site_id=site_id,
scope_type="P30",
start_date=bucket_start,
end_date=bucket_end,
)
@classmethod
def resolve_p30_bucket(cls, biz_date: date) -> Tuple[date, date]:
"""固定纪元 30 天分桶。"""
delta_days = (biz_date - cls.EPOCH_ANCHOR).days
bucket_index = delta_days // cls.HISTORICAL_BUCKET_DAYS
bucket_start = cls.EPOCH_ANCHOR + timedelta(days=bucket_index * cls.HISTORICAL_BUCKET_DAYS)
bucket_end = bucket_start + timedelta(days=cls.HISTORICAL_BUCKET_DAYS - 1)
return bucket_start, bucket_end
def _delete_by_scopes(self, scopes: Iterable[ImportScope]) -> Tuple[int, int]:
"""按 scope 先删后写,保证整批覆盖。"""
deleted_source = 0
deleted_alloc = 0
with self.db.conn.cursor() as cur:
for scope in scopes:
if scope.scope_type == "DAY":
cur.execute(
"""
DELETE FROM dws.dws_ml_manual_order_source
WHERE site_id = %s AND biz_date = %s
""",
(scope.site_id, scope.start_date),
)
deleted_source += max(cur.rowcount, 0)
cur.execute(
"""
DELETE FROM dws.dws_ml_manual_order_alloc
WHERE site_id = %s AND biz_date = %s
""",
(scope.site_id, scope.start_date),
)
deleted_alloc += max(cur.rowcount, 0)
else:
cur.execute(
"""
DELETE FROM dws.dws_ml_manual_order_source
WHERE site_id = %s AND biz_date >= %s AND biz_date <= %s
""",
(scope.site_id, scope.start_date, scope.end_date),
)
deleted_source += max(cur.rowcount, 0)
cur.execute(
"""
DELETE FROM dws.dws_ml_manual_order_alloc
WHERE site_id = %s AND biz_date >= %s AND biz_date <= %s
""",
(scope.site_id, scope.start_date, scope.end_date),
)
deleted_alloc += max(cur.rowcount, 0)
return deleted_source, deleted_alloc
def _insert_source_rows(self, rows: List[Dict[str, Any]]) -> int:
if not rows:
return 0
columns = [
"site_id",
"biz_date",
"external_id",
"member_id",
"pay_time",
"order_amount",
"currency",
"assistant_id_1",
"assistant_name_1",
"assistant_id_2",
"assistant_name_2",
"assistant_id_3",
"assistant_name_3",
"assistant_id_4",
"assistant_name_4",
"assistant_id_5",
"assistant_name_5",
"import_batch_no",
"import_file_name",
"import_scope_key",
"import_time",
"import_user",
"row_no",
"remark",
"created_at",
"updated_at",
]
sql = f"""
INSERT INTO dws.dws_ml_manual_order_source ({", ".join(columns)})
VALUES ({", ".join(["%s"] * len(columns))})
"""
inserted = 0
with self.db.conn.cursor() as cur:
for row in rows:
values = [
row.get("site_id"),
row.get("biz_date"),
row.get("external_id"),
row.get("member_id"),
row.get("pay_time"),
row.get("order_amount"),
row.get("currency"),
row.get("assistant_id_1"),
row.get("assistant_name_1"),
row.get("assistant_id_2"),
row.get("assistant_name_2"),
row.get("assistant_id_3"),
row.get("assistant_name_3"),
row.get("assistant_id_4"),
row.get("assistant_name_4"),
row.get("assistant_id_5"),
row.get("assistant_name_5"),
row.get("import_batch_no"),
row.get("import_file_name"),
row.get("import_scope_key"),
row.get("import_time"),
row.get("import_user"),
row.get("row_no"),
row.get("remark"),
row.get("import_time"),
row.get("import_time"),
]
cur.execute(sql, values)
inserted += max(cur.rowcount, 0)
return inserted
def _upsert_alloc_rows(self, rows: List[Dict[str, Any]]) -> int:
if not rows:
return 0
columns = [
"site_id",
"biz_date",
"external_id",
"member_id",
"pay_time",
"order_amount",
"assistant_id",
"assistant_name",
"share_ratio",
"allocated_amount",
"currency",
"import_scope_key",
"import_batch_no",
"import_file_name",
"import_time",
"import_user",
"created_at",
"updated_at",
]
sql = f"""
INSERT INTO dws.dws_ml_manual_order_alloc ({", ".join(columns)})
VALUES ({", ".join(["%s"] * len(columns))})
ON CONFLICT (site_id, external_id, assistant_id)
DO UPDATE SET
biz_date = EXCLUDED.biz_date,
member_id = EXCLUDED.member_id,
pay_time = EXCLUDED.pay_time,
order_amount = EXCLUDED.order_amount,
assistant_name = EXCLUDED.assistant_name,
share_ratio = EXCLUDED.share_ratio,
allocated_amount = EXCLUDED.allocated_amount,
currency = EXCLUDED.currency,
import_scope_key = EXCLUDED.import_scope_key,
import_batch_no = EXCLUDED.import_batch_no,
import_file_name = EXCLUDED.import_file_name,
import_time = EXCLUDED.import_time,
import_user = EXCLUDED.import_user,
updated_at = NOW()
"""
affected = 0
with self.db.conn.cursor() as cur:
for row in rows:
values = [
row.get("site_id"),
row.get("biz_date"),
row.get("external_id"),
row.get("member_id"),
row.get("pay_time"),
row.get("order_amount"),
row.get("assistant_id"),
row.get("assistant_name"),
row.get("share_ratio"),
row.get("allocated_amount"),
row.get("currency"),
row.get("import_scope_key"),
row.get("import_batch_no"),
row.get("import_file_name"),
row.get("import_time"),
row.get("import_user"),
row.get("import_time"),
row.get("import_time"),
]
cur.execute(sql, values)
affected += max(cur.rowcount, 0)
return affected
@staticmethod
def _to_int(value: Any, fallback: Optional[int] = None) -> Optional[int]:
if value is None:
return fallback
if isinstance(value, str) and not value.strip():
return fallback
try:
return int(value)
except Exception: # noqa: BLE001
return fallback
@staticmethod
def _to_decimal(value: Any) -> Decimal:
if value is None or value == "":
return Decimal("0")
return Decimal(str(value))
@staticmethod
def _to_date(value: Any) -> date:
if isinstance(value, datetime):
return value.date()
if isinstance(value, date):
return value
if isinstance(value, str):
text = value.strip()
if not text:
raise ValueError("biz_date 不能为空")
if len(text) >= 10:
return datetime.fromisoformat(text[:10]).date()
return datetime.fromisoformat(text).date()
raise ValueError(f"无法解析 biz_date: {value}")
@staticmethod
def _to_datetime(value: Any, fallback_date: date) -> datetime:
if isinstance(value, datetime):
return value
if isinstance(value, date):
return datetime.combine(value, datetime.min.time())
if isinstance(value, str):
text = value.strip()
if text:
text = text.replace("/", "-")
try:
return datetime.fromisoformat(text)
except Exception: # noqa: BLE001
if len(text) >= 19:
return datetime.strptime(text[:19], "%Y-%m-%d %H:%M:%S")
return datetime.fromisoformat(text[:10])
return datetime.combine(fallback_date, datetime.min.time())
@staticmethod
def _build_import_batch_no(now: datetime) -> str:
return f"MLM_{now.strftime('%Y%m%d%H%M%S')}_{str(uuid.uuid4())[:8]}"
@staticmethod
def _resolve_import_user() -> str:
return (
os.getenv("ETL_OPERATOR")
or os.getenv("USERNAME")
or os.getenv("USER")
or "system"
)
__all__ = ["MlManualImportTask", "ImportScope"]

View File

@@ -0,0 +1,397 @@
# -*- coding: utf-8 -*-
"""
新客转化指数NCI计算任务。"""
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from .member_index_base import MemberActivityData, MemberIndexBaseTask
from ..base_dws_task import TaskContext
@dataclass
class MemberNewconvData:
activity: MemberActivityData
status: str
segment: str
need_new: float = 0.0
salvage_new: float = 0.0
recharge_new: float = 0.0
value_new: float = 0.0
welcome_new: float = 0.0
raw_score_welcome: Optional[float] = None
raw_score_convert: Optional[float] = None
raw_score: Optional[float] = None
display_score_welcome: Optional[float] = None
display_score_convert: Optional[float] = None
display_score: Optional[float] = None
class NewconvIndexTask(MemberIndexBaseTask):
"""新客转化指数NCI计算任务。"""
INDEX_TYPE = "NCI"
DEFAULT_PARAMS = {
# 通用参数
'lookback_days_recency': 60,
'visit_lookback_days': 180,
'percentile_lower': 5,
'percentile_upper': 95,
'compression_mode': 0,
'use_smoothing': 1,
'ewma_alpha': 0.2,
# 分流参数
'new_visit_threshold': 2,
'new_days_threshold': 30,
'recharge_recent_days': 14,
'new_recharge_max_visits': 10,
# NCI参数
'no_touch_days_new': 3,
't2_target_days': 7,
'salvage_start': 30,
'salvage_end': 60,
'welcome_window_days': 3,
'active_new_visit_threshold_14d': 2,
'active_new_recency_days': 7,
'active_new_penalty': 0.2,
'h_recharge': 7,
'amount_base_M0': 300,
'balance_base_B0': 500,
'value_w_spend': 1.0,
'value_w_bal': 0.8,
'w_welcome': 1.0,
'w_need': 1.6,
'w_re': 0.8,
'w_value': 1.0,
# STOP高余额例外默认关闭
'enable_stop_high_balance_exception': 0,
'high_balance_threshold': 1000,
}
def get_task_code(self) -> str:
return "DWS_NEWCONV_INDEX"
def get_target_table(self) -> str:
return "dws_member_newconv_index"
def get_primary_keys(self) -> List[str]:
return ['site_id', 'member_id']
def get_index_type(self) -> str:
return self.INDEX_TYPE
# CHANGE 2026-07-25 | intent: 迁移至 MemberIndexBaseTask 模板方法
# 原 execute() 拆分为 _calculate_scores() + _save_results()
# 由基类 execute() 统一编排。
def _calculate_scores(
self,
activities: Dict[int, MemberActivityData],
params: Dict[str, float],
site_id: int,
tenant_id: int,
) -> List[MemberNewconvData]:
"""对会员活动数据计算 NCI 原始评分。
Returns:
评分后的 MemberNewconvData 列表
"""
newconv_list: List[MemberNewconvData] = []
for activity in activities.values():
segment, status, in_scope = self.classify_segment(activity, params)
if not in_scope:
continue
if segment != "NEW":
continue
data = MemberNewconvData(activity=activity, status=status, segment=segment)
self._calculate_nci_scores(data, params)
newconv_list.append(data)
return newconv_list
def _save_results(
self,
raw_scores: List[MemberNewconvData],
site_id: int,
tenant_id: int,
params: Dict[str, float],
context: Optional[TaskContext],
) -> Dict[str, Any]:
"""归一化 NCI 评分并持久化到数据库。"""
newconv_list = raw_scores
if not newconv_list:
self.logger.warning("No new-member rows to calculate")
return {'status': 'skipped', 'reason': 'no_new_members'}
# 归一化 Display Score
score_pairs = [
(d.activity.member_id, d.raw_score)
for d in newconv_list
if d.raw_score is not None
]
if score_pairs:
use_smoothing = int(params.get('use_smoothing', 1)) == 1
total_score_map = self._normalize_score_pairs(
score_pairs,
params=params,
site_id=site_id,
use_smoothing=use_smoothing,
)
for data in newconv_list:
if data.activity.member_id in total_score_map:
data.display_score = total_score_map[data.activity.member_id]
raw_scores_welcome = [
(d.activity.member_id, d.raw_score_welcome)
for d in newconv_list
if d.raw_score_welcome is not None
]
welcome_score_map = self._normalize_score_pairs(
raw_scores_welcome,
params=params,
site_id=site_id,
use_smoothing=False,
)
for data in newconv_list:
if data.activity.member_id in welcome_score_map:
data.display_score_welcome = welcome_score_map[data.activity.member_id]
raw_scores_convert = [
(d.activity.member_id, d.raw_score_convert)
for d in newconv_list
if d.raw_score_convert is not None
]
convert_score_map = self._normalize_score_pairs(
raw_scores_convert,
params=params,
site_id=site_id,
use_smoothing=False,
)
for data in newconv_list:
if data.activity.member_id in convert_score_map:
data.display_score_convert = convert_score_map[data.activity.member_id]
# 保存分位点历史
all_raw = [float(score) for _, score in score_pairs]
q_l, q_u = self.calculate_percentiles(
all_raw,
int(params['percentile_lower']),
int(params['percentile_upper'])
)
if use_smoothing:
smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
else:
smoothed_l, smoothed_u = q_l, q_u
self.save_percentile_history(
site_id=site_id,
percentile_5=q_l,
percentile_95=q_u,
percentile_5_smoothed=smoothed_l,
percentile_95_smoothed=smoothed_u,
record_count=len(all_raw),
min_raw=min(all_raw),
max_raw=max(all_raw),
avg_raw=sum(all_raw) / len(all_raw)
)
inserted = self._save_newconv_data(newconv_list)
self.logger.info("NCI calculation finished, inserted %d rows", inserted)
return {
'status': 'success',
'member_count': len(newconv_list),
'records_inserted': inserted
}
def _calculate_nci_scores(self, data: MemberNewconvData, params: Dict[str, float]) -> None:
"""计算 NCI 分项与 Raw Score"""
activity = data.activity
# 1) 紧迫度
no_touch_days = float(params['no_touch_days_new'])
t2_target_days = float(params['t2_target_days'])
t2_max_days = t2_target_days * 2.0
if t2_max_days <= no_touch_days:
data.need_new = 0.0
else:
data.need_new = self._clip(
(activity.t_v - no_touch_days) / (t2_max_days - no_touch_days),
0.0, 1.0
)
# 2) Salvage30-60天线性衰减
salvage_start = float(params['salvage_start'])
salvage_end = float(params['salvage_end'])
if salvage_end <= salvage_start:
data.salvage_new = 0.0
elif activity.t_a <= salvage_start:
data.salvage_new = 1.0
elif activity.t_a >= salvage_end:
data.salvage_new = 0.0
else:
data.salvage_new = (salvage_end - activity.t_a) / (salvage_end - salvage_start)
# 3) 充值未回访压力
if activity.recharge_unconsumed == 1:
data.recharge_new = self.decay(activity.t_r, params['h_recharge'])
else:
data.recharge_new = 0.0
# 4) 价值分
m0 = float(params['amount_base_M0'])
b0 = float(params['balance_base_B0'])
spend_score = math.log1p(activity.spend_180d / m0) if m0 > 0 else 0.0
bal_score = math.log1p(activity.sv_balance / b0) if b0 > 0 else 0.0
data.value_new = float(params['value_w_spend']) * spend_score + float(params['value_w_bal']) * bal_score
# 5) 欢迎建联分:优先首访后立即触达
welcome_window_days = float(params.get('welcome_window_days', 3))
data.welcome_new = 0.0
if welcome_window_days > 0 and activity.visits_total <= 1 and activity.t_v <= welcome_window_days:
data.welcome_new = self._clip(1.0 - (activity.t_v / welcome_window_days), 0.0, 1.0)
# 6) 抑制高活跃新客在转化召回排名中的权重
active_visit_threshold = int(params.get('active_new_visit_threshold_14d', 2))
active_recency_days = float(params.get('active_new_recency_days', 7))
active_penalty = float(params.get('active_new_penalty', 0.2))
if activity.visits_14d >= active_visit_threshold and activity.t_v <= active_recency_days:
active_multiplier = self._clip(active_penalty, 0.0, 1.0)
else:
active_multiplier = 1.0
# 7) 价值/充值分主要在进入免打扰窗口后生效
if no_touch_days > 0:
touch_multiplier = self._clip(activity.t_v / no_touch_days, 0.0, 1.0)
else:
touch_multiplier = 1.0
data.raw_score_welcome = float(params.get('w_welcome', 1.0)) * data.welcome_new
data.raw_score_convert = active_multiplier * (
float(params['w_need']) * (data.need_new * data.salvage_new)
+ float(params['w_re']) * data.recharge_new * touch_multiplier
+ float(params['w_value']) * data.value_new * touch_multiplier
)
data.raw_score_welcome = max(0.0, data.raw_score_welcome)
data.raw_score_convert = max(0.0, data.raw_score_convert)
data.raw_score = data.raw_score_welcome + data.raw_score_convert
if data.raw_score < 0:
data.raw_score = 0.0
def _save_newconv_data(self, data_list: List[MemberNewconvData]) -> int:
"""保存 NCI 数据"""
if not data_list:
return 0
site_id = data_list[0].activity.site_id
# 按门店全量刷新,避免因分群变化导致过期数据残留。
delete_sql = """
DELETE FROM dws.dws_member_newconv_index
WHERE site_id = %s
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
insert_sql = """
INSERT INTO dws.dws_member_newconv_index (
site_id, tenant_id, member_id,
status, segment,
member_create_time, first_visit_time, last_visit_time, last_recharge_time,
t_v, t_r, t_a,
visits_14d, visits_60d, visits_total,
spend_30d, spend_180d, sv_balance, recharge_60d_amt,
interval_count,
need_new, salvage_new, recharge_new, value_new,
welcome_new,
raw_score_welcome, raw_score_convert, raw_score,
display_score_welcome, display_score_convert, display_score,
last_wechat_touch_time,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
%s, %s, %s,
%s, %s, %s, %s,
%s,
%s, %s, %s, %s,
%s,
%s, %s, %s,
%s, %s, %s,
%s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
activity = data.activity
cur.execute(insert_sql, (
activity.site_id, activity.tenant_id, activity.member_id,
data.status, data.segment,
activity.member_create_time, activity.first_visit_time, activity.last_visit_time, activity.last_recharge_time,
activity.t_v, activity.t_r, activity.t_a,
activity.visits_14d, activity.visits_60d, activity.visits_total,
activity.spend_30d, activity.spend_180d, activity.sv_balance, activity.recharge_60d_amt,
activity.interval_count,
data.need_new, data.salvage_new, data.recharge_new, data.value_new,
data.welcome_new,
data.raw_score_welcome, data.raw_score_convert, data.raw_score,
data.display_score_welcome, data.display_score_convert, data.display_score,
None,
))
inserted += cur.rowcount
self.db.conn.commit()
return inserted
def _clip(self, value: float, low: float, high: float) -> float:
return max(low, min(high, value))
def _map_compression(self, params: Dict[str, float]) -> str:
mode = int(params.get('compression_mode', 0))
if mode == 1:
return "log1p"
if mode == 2:
return "asinh"
return "none"
def _normalize_score_pairs(
self,
raw_scores: List[tuple[int, Optional[float]]],
params: Dict[str, float],
site_id: int,
use_smoothing: bool,
) -> Dict[int, float]:
valid_scores = [(member_id, float(score)) for member_id, score in raw_scores if score is not None]
if not valid_scores:
return {}
# 全为0时直接返回避免 MinMax 归一化退化
if all(abs(score) <= 1e-9 for _, score in valid_scores):
return {member_id: 0.0 for member_id, _ in valid_scores}
compression = self._map_compression(params)
normalized = self.batch_normalize_to_display(
valid_scores,
compression=compression,
percentile_lower=int(params['percentile_lower']),
percentile_upper=int(params['percentile_upper']),
use_smoothing=use_smoothing,
site_id=site_id
)
return {member_id: display for member_id, _, display in normalized}
__all__ = ['NewconvIndexTask']

View File

@@ -0,0 +1,695 @@
# -*- coding: utf-8 -*-
# AI_CHANGELOG [2026-02-13] 删除 _apply_last_touch_ml 方法及 source_mode/recharge_attribute_hours 参数;
# 更新 docstring 移除 last-touch 备用路径描述;
# Prompt: "ML 只用人工台账,删除所有 last-touch 备用路径"
"""
关系指数任务RS/OS/MS/ML
设计说明:
1. 单任务一次产出 RS / OS / MS / ML写入统一关系表
2. RS/MS 复用服务日志 + 会话合并口径;
3. ML 以人工台账窄表为唯一真源;
4. RS/MS/ML 的 display 映射按 index_type 隔离分位历史。
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Tuple
from .base_index_task import BaseIndexTask
from ..base_dws_task import CourseType, TaskContext
@dataclass
class ServiceSession:
"""合并后的服务会话。"""
session_start: datetime
session_end: datetime
total_duration_minutes: int
course_weight: float
is_incentive: bool
@dataclass
class RelationPairMetrics:
"""单个 member-assistant 关系对的计算指标。"""
site_id: int
tenant_id: int
member_id: int
assistant_id: int
sessions: List[ServiceSession] = field(default_factory=list)
days_since_last_session: Optional[int] = None
session_count: int = 0
total_duration_minutes: int = 0
basic_session_count: int = 0
incentive_session_count: int = 0
rs_f: float = 0.0
rs_d: float = 0.0
rs_r: float = 0.0
rs_raw: float = 0.0
rs_display: float = 0.0
ms_f_short: float = 0.0
ms_f_long: float = 0.0
ms_raw: float = 0.0
ms_display: float = 0.0
ml_raw: float = 0.0
ml_display: float = 0.0
ml_order_count: int = 0
ml_allocated_amount: float = 0.0
os_share: float = 0.0
os_label: str = "POOL"
os_rank: Optional[int] = None
class RelationIndexTask(BaseIndexTask):
"""关系指数任务:单任务产出 RS / OS / MS / ML。"""
INDEX_TYPE = "RS"
DEFAULT_PARAMS_RS: Dict[str, float] = {
"lookback_days": 60,
"session_merge_hours": 4,
"incentive_weight": 1.5,
"halflife_session": 14.0,
"halflife_last": 10.0,
"weight_f": 1.0,
"weight_d": 0.7,
"gate_alpha": 0.6,
"percentile_lower": 5.0,
"percentile_upper": 95.0,
"compression_mode": 1.0,
"use_smoothing": 1.0,
"ewma_alpha": 0.2,
}
DEFAULT_PARAMS_OS: Dict[str, float] = {
"min_rs_raw_for_ownership": 0.05,
"min_total_rs_raw": 0.10,
"ownership_main_threshold": 0.60,
"ownership_comanage_threshold": 0.35,
"ownership_gap_threshold": 0.15,
"eps": 1e-6,
}
DEFAULT_PARAMS_MS: Dict[str, float] = {
"lookback_days": 60,
"session_merge_hours": 4,
"incentive_weight": 1.5,
"halflife_short": 7.0,
"halflife_long": 30.0,
"eps": 1e-6,
"percentile_lower": 5.0,
"percentile_upper": 95.0,
"compression_mode": 1.0,
"use_smoothing": 1.0,
"ewma_alpha": 0.2,
}
# CHANGE 2026-02-13 | intent: ML 仅使用人工台账,移除 source_mode / recharge_attribute_hours
DEFAULT_PARAMS_ML: Dict[str, float] = {
"lookback_days": 60,
"amount_base": 500.0,
"halflife_recharge": 21.0,
"percentile_lower": 5.0,
"percentile_upper": 95.0,
"compression_mode": 1.0,
"use_smoothing": 1.0,
"ewma_alpha": 0.2,
}
def get_task_code(self) -> str:
return "DWS_RELATION_INDEX"
def get_target_table(self) -> str:
return "dws_member_assistant_relation_index"
def get_primary_keys(self) -> List[str]:
return ["site_id", "member_id", "assistant_id"]
def get_index_type(self) -> str:
# 多指数任务保留一个默认 index_type调用处应显式传 RS/MS/ML
return self.INDEX_TYPE
def execute(self, context: Optional[TaskContext]) -> Dict[str, Any]:
self.logger.info("开始计算关系指数RS/OS/MS/ML")
site_id = self._get_site_id(context)
tenant_id = self._get_tenant_id()
now = datetime.now(self.tz)
params_rs = self._load_params("RS", self.DEFAULT_PARAMS_RS)
params_os = self._load_params("OS", self.DEFAULT_PARAMS_OS)
params_ms = self._load_params("MS", self.DEFAULT_PARAMS_MS)
params_ml = self._load_params("ML", self.DEFAULT_PARAMS_ML)
service_lookback_days = max(
int(params_rs.get("lookback_days", 60)),
int(params_ms.get("lookback_days", 60)),
)
service_start = now - timedelta(days=service_lookback_days)
merge_hours = max(
int(params_rs.get("session_merge_hours", 4)),
int(params_ms.get("session_merge_hours", 4)),
)
raw_services = self._extract_service_records(site_id, service_start, now)
pair_map = self._group_and_merge_sessions(
raw_services=raw_services,
merge_hours=merge_hours,
incentive_weight=max(
float(params_rs.get("incentive_weight", 1.5)),
float(params_ms.get("incentive_weight", 1.5)),
),
now=now,
site_id=site_id,
tenant_id=tenant_id,
)
self.logger.info("服务关系对数量: %d", len(pair_map))
self._calculate_rs(pair_map, params_rs, now)
self._calculate_ms(pair_map, params_ms, now)
self._calculate_ml(pair_map, params_ml, site_id, now)
self._calculate_os(pair_map, params_os)
self._apply_display_scores(pair_map, params_rs, params_ms, params_ml, site_id)
inserted = self._save_relation_rows(site_id, list(pair_map.values()))
self.logger.info("关系指数计算完成,写入 %d 条记录", inserted)
return {
"status": "SUCCESS",
"records_inserted": inserted,
"pair_count": len(pair_map),
}
def _load_params(self, index_type: str, defaults: Dict[str, float]) -> Dict[str, float]:
params = dict(defaults)
params.update(self.load_index_parameters(index_type=index_type))
return params
def _extract_service_records(
self,
site_id: int,
start_datetime: datetime,
end_datetime: datetime,
) -> List[Dict[str, Any]]:
"""提取服务记录。"""
sql = """
SELECT
s.tenant_member_id AS member_id,
d.assistant_id AS assistant_id,
s.start_use_time AS start_time,
s.last_use_time AS end_time,
COALESCE(s.income_seconds, 0) / 60 AS duration_minutes,
s.skill_id
FROM dwd.dwd_assistant_service_log s
JOIN dwd.dim_assistant d
ON s.user_id = d.user_id
AND d.scd2_is_current = 1
AND COALESCE(d.is_delete, 0) = 0
WHERE s.site_id = %s
AND s.tenant_member_id > 0
AND s.user_id > 0
AND s.is_delete = 0
AND s.last_use_time >= %s
AND s.last_use_time < %s
ORDER BY s.tenant_member_id, d.assistant_id, s.start_use_time
"""
rows = self.db.query(sql, (site_id, start_datetime, end_datetime))
return [dict(row) for row in (rows or [])]
def _group_and_merge_sessions(
self,
*,
raw_services: List[Dict[str, Any]],
merge_hours: int,
incentive_weight: float,
now: datetime,
site_id: int,
tenant_id: int,
) -> Dict[Tuple[int, int], RelationPairMetrics]:
"""按 (member_id, assistant_id) 分组并合并会话。"""
result: Dict[Tuple[int, int], RelationPairMetrics] = {}
if not raw_services:
return result
merge_threshold = timedelta(hours=max(0, merge_hours))
grouped: Dict[Tuple[int, int], List[Dict[str, Any]]] = {}
for row in raw_services:
member_id = int(row["member_id"])
assistant_id = int(row["assistant_id"])
grouped.setdefault((member_id, assistant_id), []).append(row)
for (member_id, assistant_id), records in grouped.items():
metrics = RelationPairMetrics(
site_id=site_id,
tenant_id=tenant_id,
member_id=member_id,
assistant_id=assistant_id,
)
sorted_records = sorted(records, key=lambda r: r["start_time"])
current: Optional[ServiceSession] = None
for svc in sorted_records:
start_time = svc["start_time"]
end_time = svc["end_time"]
duration = int(svc.get("duration_minutes") or 0)
skill_id = int(svc.get("skill_id") or 0)
course_type = self.get_course_type(skill_id)
is_incentive = course_type == CourseType.BONUS
weight = incentive_weight if is_incentive else 1.0
if current is None:
current = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive,
)
continue
if start_time - current.session_end <= merge_threshold:
current.session_end = max(current.session_end, end_time)
current.total_duration_minutes += duration
current.course_weight = max(current.course_weight, weight)
current.is_incentive = current.is_incentive or is_incentive
else:
metrics.sessions.append(current)
current = ServiceSession(
session_start=start_time,
session_end=end_time,
total_duration_minutes=duration,
course_weight=weight,
is_incentive=is_incentive,
)
if current is not None:
metrics.sessions.append(current)
metrics.session_count = len(metrics.sessions)
metrics.total_duration_minutes = sum(s.total_duration_minutes for s in metrics.sessions)
metrics.basic_session_count = sum(1 for s in metrics.sessions if not s.is_incentive)
metrics.incentive_session_count = sum(1 for s in metrics.sessions if s.is_incentive)
if metrics.sessions:
last_session = max(metrics.sessions, key=lambda s: s.session_end)
metrics.days_since_last_session = (now - last_session.session_end).days
result[(member_id, assistant_id)] = metrics
return result
def _calculate_rs(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
now: datetime,
) -> None:
lookback_days = int(params.get("lookback_days", 60))
halflife_session = float(params.get("halflife_session", 14.0))
halflife_last = float(params.get("halflife_last", 10.0))
weight_f = float(params.get("weight_f", 1.0))
weight_d = float(params.get("weight_d", 0.7))
gate_alpha = max(0.0, float(params.get("gate_alpha", 0.6)))
for metrics in pair_map.values():
f_score = 0.0
d_score = 0.0
for session in metrics.sessions:
days_ago = min(
lookback_days,
max(0.0, (now - session.session_end).total_seconds() / 86400.0),
)
decay_factor = self.decay(days_ago, halflife_session)
f_score += session.course_weight * decay_factor
d_score += (
math.sqrt(max(session.total_duration_minutes, 0) / 60.0)
* session.course_weight
* decay_factor
)
if metrics.days_since_last_session is None:
r_score = 0.0
else:
r_score = self.decay(min(lookback_days, metrics.days_since_last_session), halflife_last)
base = weight_f * f_score + weight_d * d_score
gate = math.pow(r_score, gate_alpha) if r_score > 0 else 0.0
metrics.rs_f = f_score
metrics.rs_d = d_score
metrics.rs_r = r_score
metrics.rs_raw = max(0.0, base * gate)
def _calculate_ms(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
now: datetime,
) -> None:
lookback_days = int(params.get("lookback_days", 60))
halflife_short = float(params.get("halflife_short", 7.0))
halflife_long = float(params.get("halflife_long", 30.0))
eps = float(params.get("eps", 1e-6))
for metrics in pair_map.values():
f_short = 0.0
f_long = 0.0
for session in metrics.sessions:
days_ago = min(
lookback_days,
max(0.0, (now - session.session_end).total_seconds() / 86400.0),
)
f_short += session.course_weight * self.decay(days_ago, halflife_short)
f_long += session.course_weight * self.decay(days_ago, halflife_long)
ratio = (f_short + eps) / (f_long + eps)
metrics.ms_f_short = f_short
metrics.ms_f_long = f_long
metrics.ms_raw = max(0.0, self.safe_log(ratio, 0.0))
def _calculate_ml(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
site_id: int,
now: datetime,
) -> None:
lookback_days = int(params.get("lookback_days", 60))
amount_base = float(params.get("amount_base", 500.0))
halflife_recharge = float(params.get("halflife_recharge", 21.0))
start_time = now - timedelta(days=lookback_days)
# CHANGE 2026-02-13 | intent: ML 仅使用人工台账,移除 last-touch 备用路径
manual_rows = self._extract_manual_alloc(site_id, start_time, now)
for row in manual_rows:
member_id = int(row["member_id"])
assistant_id = int(row["assistant_id"])
key = (member_id, assistant_id)
if key not in pair_map:
pair_map[key] = RelationPairMetrics(
site_id=site_id,
tenant_id=pair_map[next(iter(pair_map))].tenant_id if pair_map else self._get_tenant_id(),
member_id=member_id,
assistant_id=assistant_id,
)
metrics = pair_map[key]
amount = float(row.get("allocated_amount") or 0.0)
pay_time = row.get("pay_time")
if amount <= 0 or pay_time is None:
continue
days_ago = min(lookback_days, max(0.0, (now - pay_time).total_seconds() / 86400.0))
metrics.ml_raw += math.log1p(amount / max(amount_base, 1e-6)) * self.decay(
days_ago,
halflife_recharge,
)
metrics.ml_order_count += 1
metrics.ml_allocated_amount += amount
def _extract_manual_alloc(
self,
site_id: int,
start_time: datetime,
end_time: datetime,
) -> List[Dict[str, Any]]:
sql = """
SELECT
member_id,
assistant_id,
pay_time,
allocated_amount
FROM dws.dws_ml_manual_order_alloc
WHERE site_id = %s
AND pay_time >= %s
AND pay_time < %s
"""
rows = self.db.query(sql, (site_id, start_time, end_time))
return [dict(row) for row in (rows or [])]
def _calculate_os(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params: Dict[str, float],
) -> None:
min_rs = float(params.get("min_rs_raw_for_ownership", 0.05))
min_total = float(params.get("min_total_rs_raw", 0.10))
main_threshold = float(params.get("ownership_main_threshold", 0.60))
comanage_threshold = float(params.get("ownership_comanage_threshold", 0.35))
gap_threshold = float(params.get("ownership_gap_threshold", 0.15))
member_groups: Dict[int, List[RelationPairMetrics]] = {}
for metrics in pair_map.values():
member_groups.setdefault(metrics.member_id, []).append(metrics)
for _, rows in member_groups.items():
eligible = [row for row in rows if row.rs_raw >= min_rs]
sum_rs = sum(row.rs_raw for row in eligible)
if sum_rs < min_total:
for row in rows:
row.os_share = 0.0
row.os_label = "UNASSIGNED"
row.os_rank = None
continue
for row in rows:
if row.rs_raw >= min_rs:
row.os_share = row.rs_raw / sum_rs
else:
row.os_share = 0.0
sorted_eligible = sorted(
eligible,
key=lambda item: (
-item.os_share,
-item.rs_raw,
item.days_since_last_session if item.days_since_last_session is not None else 10**9,
item.assistant_id,
),
)
for idx, row in enumerate(sorted_eligible, start=1):
row.os_rank = idx
top1 = sorted_eligible[0]
top2_share = sorted_eligible[1].os_share if len(sorted_eligible) > 1 else 0.0
gap = top1.os_share - top2_share
has_main = top1.os_share >= main_threshold and gap >= gap_threshold
if has_main:
for row in rows:
if row is top1:
row.os_label = "MAIN"
elif row.os_share >= comanage_threshold:
row.os_label = "COMANAGE"
else:
row.os_label = "POOL"
else:
for row in rows:
if row.os_share >= comanage_threshold and row.rs_raw >= min_rs:
row.os_label = "COMANAGE"
else:
row.os_label = "POOL"
# 非 eligible 不赋 rank
for row in rows:
if row.rs_raw < min_rs:
row.os_rank = None
def _apply_display_scores(
self,
pair_map: Dict[Tuple[int, int], RelationPairMetrics],
params_rs: Dict[str, float],
params_ms: Dict[str, float],
params_ml: Dict[str, float],
site_id: int,
) -> None:
pair_items = list(pair_map.items())
rs_map = self._normalize_and_record(
raw_pairs=[(key, item.rs_raw) for key, item in pair_items],
params=params_rs,
index_type="RS",
site_id=site_id,
)
ms_map = self._normalize_and_record(
raw_pairs=[(key, item.ms_raw) for key, item in pair_items],
params=params_ms,
index_type="MS",
site_id=site_id,
)
ml_map = self._normalize_and_record(
raw_pairs=[(key, item.ml_raw) for key, item in pair_items],
params=params_ml,
index_type="ML",
site_id=site_id,
)
for key, item in pair_items:
item.rs_display = rs_map.get(key, 0.0)
item.ms_display = ms_map.get(key, 0.0)
item.ml_display = ml_map.get(key, 0.0)
def _normalize_and_record(
self,
*,
raw_pairs: List[Tuple[Any, float]],
params: Dict[str, float],
index_type: str,
site_id: int,
) -> Dict[Any, float]:
if not raw_pairs:
return {}
if all(abs(score) <= 1e-9 for _, score in raw_pairs):
return {entity: 0.0 for entity, _ in raw_pairs}
percentile_lower = int(params.get("percentile_lower", 5))
percentile_upper = int(params.get("percentile_upper", 95))
use_smoothing = int(params.get("use_smoothing", 1)) == 1
compression = self._map_compression(params)
normalized = self.batch_normalize_to_display(
raw_scores=raw_pairs,
compression=compression,
percentile_lower=percentile_lower,
percentile_upper=percentile_upper,
use_smoothing=use_smoothing,
site_id=site_id,
index_type=index_type,
)
display_map = {entity: display for entity, _, display in normalized}
raw_values = [float(score) for _, score in raw_pairs]
q_l, q_u = self.calculate_percentiles(raw_values, percentile_lower, percentile_upper)
if use_smoothing:
smoothed_l, smoothed_u = self._apply_ewma_smoothing(
site_id=site_id,
current_p5=q_l,
current_p95=q_u,
index_type=index_type,
)
else:
smoothed_l, smoothed_u = q_l, q_u
self.save_percentile_history(
site_id=site_id,
percentile_5=q_l,
percentile_95=q_u,
percentile_5_smoothed=smoothed_l,
percentile_95_smoothed=smoothed_u,
record_count=len(raw_values),
min_raw=min(raw_values),
max_raw=max(raw_values),
avg_raw=sum(raw_values) / len(raw_values),
index_type=index_type,
)
return display_map
@staticmethod
def _map_compression(params: Dict[str, float]) -> str:
mode = int(params.get("compression_mode", 0))
if mode == 1:
return "log1p"
if mode == 2:
return "asinh"
return "none"
def _save_relation_rows(self, site_id: int, rows: List[RelationPairMetrics]) -> int:
with self.db.conn.cursor() as cur:
cur.execute(
"DELETE FROM dws.dws_member_assistant_relation_index WHERE site_id = %s",
(site_id,),
)
if not rows:
self.db.conn.commit()
return 0
insert_sql = """
INSERT INTO dws.dws_member_assistant_relation_index (
site_id, tenant_id, member_id, assistant_id,
session_count, total_duration_minutes, basic_session_count, incentive_session_count,
days_since_last_session,
rs_f, rs_d, rs_r, rs_raw, rs_display,
os_share, os_label, os_rank,
ms_f_short, ms_f_long, ms_raw, ms_display,
ml_order_count, ml_allocated_amount, ml_raw, ml_display,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s, %s,
%s, %s, %s, %s,
%s,
%s, %s, %s, %s, %s,
%s, %s, %s,
%s, %s, %s, %s,
%s, %s, %s, %s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
for row in rows:
cur.execute(
insert_sql,
(
row.site_id,
row.tenant_id,
row.member_id,
row.assistant_id,
row.session_count,
row.total_duration_minutes,
row.basic_session_count,
row.incentive_session_count,
row.days_since_last_session,
row.rs_f,
row.rs_d,
row.rs_r,
row.rs_raw,
row.rs_display,
row.os_share,
row.os_label,
row.os_rank,
row.ms_f_short,
row.ms_f_long,
row.ms_raw,
row.ms_display,
row.ml_order_count,
row.ml_allocated_amount,
row.ml_raw,
row.ml_display,
),
)
inserted += max(cur.rowcount, 0)
self.db.conn.commit()
return inserted
def _get_site_id(self, context: Optional[TaskContext]) -> int:
if context and getattr(context, "store_id", None):
return int(context.store_id)
site_id = self.config.get("app.default_site_id") or self.config.get("app.store_id")
if site_id is not None:
return int(site_id)
sql = "SELECT DISTINCT site_id FROM dwd.dwd_assistant_service_log WHERE site_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0]).get("site_id") or 0)
self.logger.warning("无法确定门店ID使用 0 继续执行")
return 0
def _get_tenant_id(self) -> int:
tenant_id = self.config.get("app.tenant_id")
if tenant_id is not None:
return int(tenant_id)
sql = "SELECT DISTINCT tenant_id FROM dwd.dwd_assistant_service_log WHERE tenant_id IS NOT NULL LIMIT 1"
rows = self.db.query(sql)
if rows:
return int(dict(rows[0]).get("tenant_id") or 0)
self.logger.warning("无法确定租户ID使用 0 继续执行")
return 0
__all__ = ["RelationIndexTask", "RelationPairMetrics", "ServiceSession"]

View File

@@ -0,0 +1,421 @@
# -*- coding: utf-8 -*-
# AI_CHANGELOG [2026-02-13] 修复 STOP_HIGH_BALANCE 会员不参与评分的逻辑缺陷;
# Prompt: "STOP_HIGH_BALANCE 应该参与 WBI 评分"
"""
老客挽回指数WBI计算任务。"""
from __future__ import annotations
import math
from dataclasses import dataclass
from datetime import date, timedelta
from typing import Any, Dict, List, Optional, Tuple
from .member_index_base import MemberActivityData, MemberIndexBaseTask
from ..base_dws_task import TaskContext
@dataclass
class MemberWinbackData:
activity: MemberActivityData
status: str
segment: str
overdue_old: float = 0.0
overdue_cdf_p: float = 0.0
drop_old: float = 0.0
recharge_old: float = 0.0
value_old: float = 0.0
ideal_interval_days: Optional[float] = None
ideal_next_visit_date: Optional[date] = None
raw_score: Optional[float] = None
display_score: Optional[float] = None
class WinbackIndexTask(MemberIndexBaseTask):
"""老客挽回指数WBI计算任务。"""
INDEX_TYPE = "WBI"
DEFAULT_PARAMS = {
# 通用参数
'lookback_days_recency': 60,
'visit_lookback_days': 180,
'percentile_lower': 5,
'percentile_upper': 95,
'compression_mode': 0,
'use_smoothing': 1,
'ewma_alpha': 0.2,
# 分流参数
'new_visit_threshold': 2,
'new_days_threshold': 30,
'recharge_recent_days': 14,
'new_recharge_max_visits': 10,
'recency_hard_floor_days': 14,
'recency_gate_days': 14,
'recency_gate_slope_days': 3,
# WBI参数
'overdue_alpha': 2.0,
'overdue_weight_halflife_days': 30,
'overdue_weight_blend_min_samples': 8,
'h_recharge': 7,
'amount_base_M0': 300,
'balance_base_B0': 500,
'value_w_spend': 1.0,
'value_w_bal': 1.0,
'w_over': 2.0,
'w_drop': 1.0,
'w_re': 0.4,
'w_value': 1.2,
# STOP高余额例外默认关闭
'enable_stop_high_balance_exception': 0,
'high_balance_threshold': 1000,
}
def get_task_code(self) -> str:
return "DWS_WINBACK_INDEX"
def get_target_table(self) -> str:
return "dws_member_winback_index"
def get_primary_keys(self) -> List[str]:
return ['site_id', 'member_id']
def get_index_type(self) -> str:
return self.INDEX_TYPE
# CHANGE 2026-07-25 | intent: 迁移至 MemberIndexBaseTask 模板方法
# 原 execute() 拆分为 _calculate_scores() + _save_results()
# 由基类 execute() 统一编排。
def _calculate_scores(
self,
activities: Dict[int, MemberActivityData],
params: Dict[str, float],
site_id: int,
tenant_id: int,
) -> List[MemberWinbackData]:
"""对会员活动数据计算 WBI 原始评分。
Returns:
评分后的 MemberWinbackData 列表
"""
winback_list: List[MemberWinbackData] = []
for activity in activities.values():
segment, status, in_scope = self.classify_segment(activity, params)
if not in_scope:
continue
if segment != "OLD" and status != "STOP_HIGH_BALANCE":
continue
data = MemberWinbackData(activity=activity, status=status, segment=segment)
# CHANGE 2026-02-13 | intent: STOP_HIGH_BALANCE 也参与评分
if segment == "OLD" or status == "STOP_HIGH_BALANCE":
self._calculate_wbi_scores(data, params)
winback_list.append(data)
return winback_list
def _save_results(
self,
raw_scores: List[MemberWinbackData],
site_id: int,
tenant_id: int,
params: Dict[str, float],
context: Optional[TaskContext],
) -> Dict[str, Any]:
"""归一化 WBI 评分并持久化到数据库。"""
winback_list = raw_scores
if not winback_list:
self.logger.warning("No old-member rows to calculate")
return {'status': 'skipped', 'reason': 'no_old_members'}
# 归一化 Display Score
score_pairs = [
(d.activity.member_id, d.raw_score)
for d in winback_list
if d.raw_score is not None
]
if score_pairs:
compression = self._map_compression(params)
use_smoothing = int(params.get('use_smoothing', 1)) == 1
normalized = self.batch_normalize_to_display(
score_pairs,
compression=compression,
percentile_lower=int(params['percentile_lower']),
percentile_upper=int(params['percentile_upper']),
use_smoothing=use_smoothing,
site_id=site_id
)
score_map = {member_id: display for member_id, _, display in normalized}
for data in winback_list:
if data.activity.member_id in score_map:
data.display_score = score_map[data.activity.member_id]
# 保存分位点历史
all_raw = [float(score) for _, score in score_pairs]
q_l, q_u = self.calculate_percentiles(
all_raw,
int(params['percentile_lower']),
int(params['percentile_upper'])
)
if use_smoothing:
smoothed_l, smoothed_u = self._apply_ewma_smoothing(site_id, q_l, q_u)
else:
smoothed_l, smoothed_u = q_l, q_u
self.save_percentile_history(
site_id=site_id,
percentile_5=q_l,
percentile_95=q_u,
percentile_5_smoothed=smoothed_l,
percentile_95_smoothed=smoothed_u,
record_count=len(all_raw),
min_raw=min(all_raw),
max_raw=max(all_raw),
avg_raw=sum(all_raw) / len(all_raw)
)
inserted = self._save_winback_data(winback_list)
self.logger.info("WBI calculation finished, inserted %d rows", inserted)
return {
'status': 'success',
'member_count': len(winback_list),
'records_inserted': inserted
}
def _weighted_cdf(
self,
samples: List[Tuple[float, int]],
t_v: float,
halflife_days: float,
blend_min_samples: int,
) -> float:
if not samples:
return 0.5
if halflife_days <= 0:
p_equal = sum(1.0 for interval, _ in samples if interval <= t_v) / len(samples)
return self._clip(p_equal, 0.0, 1.0)
ln2 = math.log(2.0)
weighted_hit = 0.0
weight_sum = 0.0
equal_hit = 0.0
for interval, age_days in samples:
weight = math.exp(-ln2 * float(age_days) / halflife_days)
indicator = 1.0 if interval <= t_v else 0.0
weighted_hit += weight * indicator
weight_sum += weight
equal_hit += indicator
p_weighted = 0.5 if weight_sum <= 0 else (weighted_hit / weight_sum)
p_equal = equal_hit / len(samples)
lam = min(1.0, float(len(samples)) / float(max(1, blend_min_samples)))
p_final = lam * p_weighted + (1.0 - lam) * p_equal
return self._clip(p_final, 0.0, 1.0)
def _weighted_quantile(
self,
samples: List[Tuple[float, int]],
quantile: float,
halflife_days: float,
blend_min_samples: int,
) -> Optional[float]:
if not samples:
return None
q = self._clip(quantile, 0.0, 1.0)
equal_weight = 1.0 / float(len(samples))
if halflife_days <= 0:
weighted = [(interval, equal_weight) for interval, _ in samples]
else:
ln2 = math.log(2.0)
raw_weighted: List[Tuple[float, float]] = []
total = 0.0
for interval, age_days in samples:
w = math.exp(-ln2 * float(age_days) / halflife_days)
raw_weighted.append((interval, w))
total += w
if total <= 0:
weighted = [(interval, equal_weight) for interval, _ in samples]
else:
weighted = [(interval, w / total) for interval, w in raw_weighted]
# 对小样本混合加权分布与等权分布。
lam = min(1.0, float(len(samples)) / float(max(1, blend_min_samples)))
blended: List[Tuple[float, float]] = []
for (interval_w, w), (interval_e, _) in zip(weighted, samples):
_ = interval_e # keep tuple alignment explicit
blended_weight = lam * w + (1.0 - lam) * equal_weight
blended.append((interval_w, blended_weight))
blended.sort(key=lambda item: item[0])
cumulative = 0.0
for interval, weight in blended:
cumulative += weight
if cumulative >= q:
return float(interval)
return float(blended[-1][0])
def _calculate_wbi_scores(self, data: MemberWinbackData, params: Dict[str, float]) -> None:
"""计算 WBI 分项与 Raw Score"""
activity = data.activity
# 1) 超期紧急性基于近期加权经验CDF
overdue_alpha = float(params['overdue_alpha'])
half_life_days = float(params.get('overdue_weight_halflife_days', 30))
blend_min_samples = int(params.get('overdue_weight_blend_min_samples', 8))
if activity.interval_count <= 0:
p = 0.5
ideal_interval = None
else:
if len(activity.interval_ages_days) == activity.interval_count:
samples = list(zip(activity.intervals, activity.interval_ages_days))
else:
samples = [(interval, 0) for interval in activity.intervals]
p = self._weighted_cdf(
samples=samples,
t_v=activity.t_v,
halflife_days=half_life_days,
blend_min_samples=blend_min_samples,
)
ideal_interval = self._weighted_quantile(
samples=samples,
quantile=0.5,
halflife_days=half_life_days,
blend_min_samples=blend_min_samples,
)
data.overdue_cdf_p = p
data.overdue_old = math.pow(p, overdue_alpha)
data.ideal_interval_days = ideal_interval
if ideal_interval is not None and activity.last_visit_time is not None:
ideal_days = max(0, int(round(ideal_interval)))
data.ideal_next_visit_date = activity.last_visit_time.date() + timedelta(days=ideal_days)
else:
data.ideal_next_visit_date = None
# 2) 降频分
expected14 = activity.visits_60d * 14.0 / 60.0
data.drop_old = self._clip((expected14 - activity.visits_14d) / (expected14 + 1), 0.0, 1.0)
# 3) 充值未回访压力
if activity.recharge_unconsumed == 1:
data.recharge_old = self.decay(activity.t_r, params['h_recharge'])
else:
data.recharge_old = 0.0
# 4) 价值分
m0 = float(params['amount_base_M0'])
b0 = float(params['balance_base_B0'])
spend_score = math.log1p(activity.spend_180d / m0) if m0 > 0 else 0.0
bal_score = math.log1p(activity.sv_balance / b0) if b0 > 0 else 0.0
data.value_old = float(params['value_w_spend']) * spend_score + float(params['value_w_bal']) * bal_score
data.raw_score = (
float(params['w_over']) * data.overdue_old
+ float(params['w_drop']) * data.drop_old
+ float(params['w_re']) * data.recharge_old
+ float(params['w_value']) * data.value_old
)
hard_floor_days = float(params.get('recency_hard_floor_days', 0))
gate_days = float(params.get('recency_gate_days', 14))
slope_days = float(params.get('recency_gate_slope_days', 3))
if hard_floor_days > 0 and activity.t_v < hard_floor_days:
suppression = 0.0
elif slope_days <= 0:
suppression = 1.0 if activity.t_v >= gate_days else 0.0
else:
x = (activity.t_v - gate_days) / slope_days
x = self._clip(x, -60.0, 60.0)
suppression = 1.0 / (1.0 + math.exp(-x))
data.raw_score *= suppression
# 限制在 0 以上
if data.raw_score < 0:
data.raw_score = 0.0
def _save_winback_data(self, data_list: List[MemberWinbackData]) -> int:
"""保存 WBI 数据"""
if not data_list:
return 0
site_id = data_list[0].activity.site_id
# 按门店全量刷新,避免因分群变化导致过期数据残留。
delete_sql = """
DELETE FROM dws.dws_member_winback_index
WHERE site_id = %s
"""
with self.db.conn.cursor() as cur:
cur.execute(delete_sql, (site_id,))
insert_sql = """
INSERT INTO dws.dws_member_winback_index (
site_id, tenant_id, member_id,
status, segment,
member_create_time, first_visit_time, last_visit_time, last_recharge_time,
t_v, t_r, t_a,
visits_14d, visits_60d, visits_total,
spend_30d, spend_180d, sv_balance, recharge_60d_amt,
interval_count,
overdue_old, overdue_cdf_p, drop_old, recharge_old, value_old,
ideal_interval_days, ideal_next_visit_date,
raw_score, display_score,
last_wechat_touch_time,
calc_time, created_at, updated_at
) VALUES (
%s, %s, %s,
%s, %s,
%s, %s, %s, %s,
%s, %s, %s,
%s, %s, %s,
%s, %s, %s, %s,
%s,
%s, %s, %s, %s, %s,
%s, %s,
%s, %s,
%s,
NOW(), NOW(), NOW()
)
"""
inserted = 0
with self.db.conn.cursor() as cur:
for data in data_list:
activity = data.activity
cur.execute(insert_sql, (
activity.site_id, activity.tenant_id, activity.member_id,
data.status, data.segment,
activity.member_create_time, activity.first_visit_time, activity.last_visit_time, activity.last_recharge_time,
activity.t_v, activity.t_r, activity.t_a,
activity.visits_14d, activity.visits_60d, activity.visits_total,
activity.spend_30d, activity.spend_180d, activity.sv_balance, activity.recharge_60d_amt,
activity.interval_count,
data.overdue_old, data.overdue_cdf_p, data.drop_old, data.recharge_old, data.value_old,
data.ideal_interval_days, data.ideal_next_visit_date,
data.raw_score, data.display_score,
None,
))
inserted += cur.rowcount
self.db.conn.commit()
return inserted
def _clip(self, value: float, low: float, high: float) -> float:
return max(low, min(high, value))
def _map_compression(self, params: Dict[str, float]) -> str:
mode = int(params.get('compression_mode', 0))
if mode == 1:
return "log1p"
if mode == 2:
return "asinh"
return "none"
__all__ = ['WinbackIndexTask']

View File

@@ -0,0 +1,352 @@
# -*- coding: utf-8 -*-
"""
DWS 统一维护任务
合并原 BaseMvRefreshTask物化视图刷新和 DwsRetentionCleanupTask历史数据清理
为单一维护任务 DWS_MAINTENANCE简化调度配置。
执行顺序:先刷新物化视图,再清理历史数据。
任一子功能可通过配置独立禁用。
"""
from __future__ import annotations
import json
from datetime import date
from typing import Any, Dict, List, Optional
from .base_dws_task import BaseDwsTask, TaskContext, TimeLayer
class DwsMaintenanceTask(BaseDwsTask):
"""合并 MV 刷新 + 数据清理为单一维护任务。
配置项:
dws.mv.enabled — 是否启用物化视图刷新(默认 False
dws.retention.enabled — 是否启用历史数据清理(默认 False
dws.mv.tables — 需要刷新的表列表(逗号分隔)
dws.retention.tables — 需要清理的表列表(逗号分隔)
dws.retention.layer — 默认清理层级
dws.retention.table_layers — 表级清理层级覆盖JSON dict
"""
# ── MV 刷新相关常量 ──────────────────────────────────────
VIEW_PREFIX = "mv_"
LAYER_ORDER = [
TimeLayer.LAST_2_DAYS,
TimeLayer.LAST_1_MONTH,
TimeLayer.LAST_3_MONTHS,
TimeLayer.LAST_6_MONTHS,
]
LAYER_SUFFIX = {
TimeLayer.LAST_2_DAYS: "l1",
TimeLayer.LAST_1_MONTH: "l2",
TimeLayer.LAST_3_MONTHS: "l3",
TimeLayer.LAST_6_MONTHS: "l4",
}
# 需要刷新物化视图的基表列表
MV_BASE_TABLES = [
"dws_finance_daily_summary",
"dws_assistant_daily_detail",
]
# ── 数据清理相关常量 ─────────────────────────────────────
# 默认需要清理的表及其日期列
DEFAULT_RETENTION_TABLES = [
{"table": "dws_assistant_daily_detail", "date_col": "stat_date"},
{"table": "dws_assistant_monthly_summary", "date_col": "stat_month"},
{"table": "dws_assistant_customer_stats", "date_col": "stat_date"},
{"table": "dws_assistant_salary_calc", "date_col": "salary_month"},
{"table": "dws_assistant_recharge_commission", "date_col": "commission_month"},
{"table": "dws_assistant_finance_analysis", "date_col": "stat_date"},
{"table": "dws_member_consumption_summary", "date_col": "stat_date"},
{"table": "dws_member_visit_detail", "date_col": "visit_date"},
{"table": "dws_finance_daily_summary", "date_col": "stat_date"},
{"table": "dws_finance_income_structure", "date_col": "stat_date"},
{"table": "dws_finance_discount_detail", "date_col": "stat_date"},
{"table": "dws_finance_recharge_summary", "date_col": "stat_date"},
{"table": "dws_finance_expense_summary", "date_col": "expense_month"},
{"table": "dws_platform_settlement", "date_col": "settlement_date"},
]
def get_task_code(self) -> str:
return "DWS_MAINTENANCE"
def get_target_table(self) -> str:
# 维护任务不写入单一目标表
return "dws_maintenance"
def get_primary_keys(self) -> List[str]:
return []
# ── E/T/L 实现 ───────────────────────────────────────────
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""维护任务无需提取业务数据。"""
return {"site_id": context.store_id}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
"""维护任务无需转换。"""
return extracted
def load(self, transformed: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
"""按顺序执行:① 物化视图刷新 → ② 历史数据清理。
任一子功能可通过配置独立禁用。
"""
stats: Dict[str, Any] = {"refreshed": 0, "cleaned": 0}
# ① 物化视图刷新(先执行,满足需求 4.2
if self._is_mv_enabled():
stats["refreshed"] = self._refresh_all_views()
else:
self.logger.info("DWS_MAINTENANCE: 物化视图刷新未启用,跳过")
# ② 历史数据清理
if self._is_retention_enabled():
stats["cleaned"] = self._cleanup_all_tables(context)
else:
self.logger.info("DWS_MAINTENANCE: 历史数据清理未启用,跳过")
return {"counts": stats}
# ══════════════════════════════════════════════════════════
# 物化视图刷新逻辑(源自 BaseMvRefreshTask
# ══════════════════════════════════════════════════════════
def _is_mv_enabled(self) -> bool:
"""检查物化视图刷新是否启用。"""
return bool(self.config.get("dws.mv.enabled", False))
def _refresh_all_views(self) -> int:
"""刷新所有已配置基表的物化视图,返回刷新视图数。"""
refreshed = 0
for base_table in self.MV_BASE_TABLES:
if not self._is_table_in_mv_scope(base_table):
continue
layers = self._resolve_mv_layers(base_table)
for layer in layers:
view_name = self._get_view_name(base_table, layer)
if not view_name:
continue
if not self._view_exists(view_name):
self.logger.warning(
"DWS_MAINTENANCE: 物化视图不存在,跳过 %s", view_name
)
continue
self._refresh_view(view_name)
refreshed += 1
self.logger.info("DWS_MAINTENANCE: 物化视图刷新完成,数量=%d", refreshed)
return refreshed
def _is_table_in_mv_scope(self, base_table: str) -> bool:
"""判断基表是否在 MV 刷新范围内。"""
tables = self._parse_list(self.config.get("dws.mv.tables"))
if not tables:
tables = self._parse_list(self.config.get("dws.retention.tables"))
# 未配置表列表时默认刷新所有
if not tables:
return True
return base_table in tables
def _resolve_mv_layers(self, base_table: str) -> List[TimeLayer]:
"""解析某基表需要刷新的时间分层列表。"""
# 显式配置优先
configured = self._parse_time_layers(self.config.get("dws.mv.layers"))
if configured:
return configured
# 表级覆盖
table_layers = self._resolve_layer_map(
self.config.get("dws.mv.table_layers")
or self.config.get("dws.retention.table_layers")
)
layer_name = table_layers.get(base_table)
if layer_name:
layer = self._get_layer(layer_name)
if layer and layer != TimeLayer.ALL:
return self._layers_up_to(layer)
# 默认使用 retention.layer
retention_layer = self._get_layer(self.config.get("dws.retention.layer"))
if retention_layer and retention_layer != TimeLayer.ALL:
return self._layers_up_to(retention_layer)
return list(self.LAYER_ORDER)
def _layers_up_to(self, target: TimeLayer) -> List[TimeLayer]:
"""返回从 L1 到 target的分层列表。"""
layers = []
for layer in self.LAYER_ORDER:
layers.append(layer)
if layer == target:
break
return layers
def _get_view_name(self, base_table: str, layer: TimeLayer) -> Optional[str]:
suffix = self.LAYER_SUFFIX.get(layer)
if not suffix or not base_table:
return None
return f"{self.VIEW_PREFIX}{base_table}_{suffix}"
def _view_exists(self, view_name: str) -> bool:
sql = "SELECT to_regclass(%s) AS reg"
rows = self.db.query(sql, (f"{self.DWS_SCHEMA}.{view_name}",))
return bool(rows and rows[0].get("reg"))
def _refresh_view(self, view_name: str) -> None:
concurrently = bool(self.config.get("dws.mv.refresh_concurrently", False))
keyword = "CONCURRENTLY " if concurrently else ""
sql = f"REFRESH MATERIALIZED VIEW {keyword}{self.DWS_SCHEMA}.{view_name}"
self.db.execute(sql)
# ══════════════════════════════════════════════════════════
# 历史数据清理逻辑(源自 DwsRetentionCleanupTask
# ══════════════════════════════════════════════════════════
def _is_retention_enabled(self) -> bool:
"""检查历史数据清理是否启用。"""
return bool(self.config.get("dws.retention.enabled", False))
def _cleanup_all_tables(self, context: TaskContext) -> int:
"""清理所有已配置表的历史数据,返回总删除行数。"""
base_date = (
context.window_end.date()
if hasattr(context.window_end, "date")
else context.window_end
)
default_layer = self._get_layer(
self.config.get("dws.retention.layer", "ALL")
)
if default_layer is None:
self.logger.warning("DWS_MAINTENANCE: 未识别的清理层级,跳过清理")
return 0
target_tables = self._resolve_retention_tables()
if not target_tables:
self.logger.info("DWS_MAINTENANCE: 未配置需要清理的表,跳过清理")
return 0
table_layers = self._resolve_table_layers()
total_deleted = 0
for item in target_tables:
table = item["table"]
date_col = item["date_col"]
layer_name = table_layers.get(table, default_layer.value)
layer = self._get_layer(layer_name)
if layer is None or layer == TimeLayer.ALL:
continue
time_range = self.get_time_layer_range(layer, base_date)
cutoff = self._normalize_cutoff(date_col, time_range.start)
deleted = self._cleanup_table(table, date_col, cutoff, context.store_id)
total_deleted += deleted
self.logger.info("DWS_MAINTENANCE: 数据清理完成,总删除 %d", total_deleted)
return total_deleted
def _resolve_retention_tables(self) -> List[Dict[str, str]]:
"""解析需要清理的目标表列表。"""
table_list = self.config.get("dws.retention.tables")
if not table_list:
return self.DEFAULT_RETENTION_TABLES
if isinstance(table_list, str):
names = [t.strip() for t in table_list.split(",") if t.strip()]
else:
names = list(table_list)
return [
item for item in self.DEFAULT_RETENTION_TABLES
if item["table"] in names
]
def _resolve_table_layers(self) -> Dict[str, str]:
"""解析表级清理层级覆盖配置。"""
raw = self.config.get("dws.retention.table_layers")
if not raw:
return {}
if isinstance(raw, dict):
return {str(k): str(v) for k, v in raw.items()}
if isinstance(raw, str):
try:
parsed = json.loads(raw)
if isinstance(parsed, dict):
return {str(k): str(v) for k, v in parsed.items()}
except json.JSONDecodeError:
return {}
return {}
def _normalize_cutoff(self, date_col: str, cutoff: date) -> date:
"""月度列的截止日期归一化到月初。"""
monthly_cols = {"stat_month", "salary_month", "commission_month", "expense_month"}
if date_col in monthly_cols:
return cutoff.replace(day=1)
return cutoff
def _cleanup_table(self, table: str, date_col: str, cutoff: date, site_id: int) -> int:
"""删除指定表中早于截止日期的历史数据。"""
full_table = f"{self.DWS_SCHEMA}.{table}"
sql = f"DELETE FROM {full_table} WHERE site_id = %s AND {date_col} < %s"
with self.db.conn.cursor() as cur:
cur.execute(sql, (site_id, cutoff))
return cur.rowcount
# ══════════════════════════════════════════════════════════
# 共享工具方法
# ══════════════════════════════════════════════════════════
def _get_layer(self, layer_name: Optional[str]) -> Optional[TimeLayer]:
if not layer_name:
return None
name = str(layer_name).upper()
try:
return TimeLayer[name]
except KeyError:
return None
def _resolve_layer_map(self, raw: Any) -> Dict[str, str]:
if not raw:
return {}
if isinstance(raw, dict):
return {str(k): str(v) for k, v in raw.items()}
if isinstance(raw, str):
try:
parsed = json.loads(raw)
if isinstance(parsed, dict):
return {str(k): str(v) for k, v in parsed.items()}
except json.JSONDecodeError:
return {}
return {}
def _parse_time_layers(self, raw: Any) -> List[TimeLayer]:
"""解析逗号分隔的时间分层配置。"""
if not raw:
return []
if isinstance(raw, str):
items = [v.strip() for v in raw.split(",") if v.strip()]
elif isinstance(raw, (list, tuple, set)):
items = [str(v).strip() for v in raw if str(v).strip()]
else:
return []
layers = []
for item in items:
layer = self._get_layer(item)
if layer and layer not in layers:
layers.append(layer)
return layers
def _parse_list(self, raw: Any) -> List[str]:
"""解析逗号分隔的字符串列表。"""
if not raw:
return []
if isinstance(raw, str):
return [v.strip() for v in raw.split(",") if v.strip()]
if isinstance(raw, (list, tuple, set)):
return [str(v).strip() for v in raw if str(v).strip()]
return []
__all__ = ["DwsMaintenanceTask"]

View File

@@ -0,0 +1,341 @@
# -*- coding: utf-8 -*-
"""
会员消费汇总任务
功能说明:
"会员"为粒度,统计消费行为和滚动窗口指标
数据来源:
- dwd_settlement_head: 结账单头表
- dim_member: 会员维度
- dim_member_card_account: 会员卡账户
目标表:
dws.dws_member_consumption_summary
更新策略:
- 更新频率:每日更新
- 幂等方式delete-before-insert按统计日期
业务规则:
- 散客处理member_id=0 不进入此表
- 滚动窗口7/10/15/30/60/90天
- 卡余额:区分储值卡(现金卡)和赠送卡
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
from .dws_helpers import mask_mobile, calc_days_since
class MemberConsumptionTask(BaseDwsTask):
"""
会员消费汇总任务
统计每个会员的:
- 首次/最近消费日期
- 累计消费统计
- 滚动窗口统计7/10/15/30/60/90天
- 卡余额快照
- 活跃度指标和客户分层
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列,供基类默认 load() 使用
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_MEMBER_CONSUMPTION"
def get_target_table(self) -> str:
return "dws_member_consumption_summary"
def get_primary_keys(self) -> List[str]:
return ["site_id", "member_id", "stat_date"]
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
提取数据
"""
stat_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
self.logger.info(
"%s: 提取数据,统计日期 %s",
self.get_task_code(), stat_date
)
# 1. 获取会员消费统计(含滚动窗口)
consumption_stats = self._extract_consumption_stats(site_id, stat_date)
# 2. 获取会员信息
member_info = self._extract_member_info(site_id)
# 3. 获取会员卡余额
card_balances = self._extract_card_balances(site_id)
return {
'consumption_stats': consumption_stats,
'member_info': member_info,
'card_balances': card_balances,
'stat_date': stat_date,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据
"""
consumption_stats = extracted['consumption_stats']
member_info = extracted['member_info']
card_balances = extracted['card_balances']
stat_date = extracted['stat_date']
site_id = extracted['site_id']
self.logger.info(
"%s: 转换数据,%d 条会员消费记录",
self.get_task_code(), len(consumption_stats)
)
results = []
for stats in consumption_stats:
member_id = stats.get('member_id')
# 跳过散客
if self.is_guest(member_id):
continue
memb_info = member_info.get(member_id, {})
balance = card_balances.get(member_id, {})
# 计算活跃度和客户分层
days_since_last = self._calc_days_since(stat_date, stats.get('last_consume_date'))
customer_tier = self._calculate_customer_tier(stats, days_since_last)
record = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'member_id': member_id,
'stat_date': stat_date,
# 会员基本信息
'member_nickname': memb_info.get('nickname'),
'member_mobile': self._mask_mobile(memb_info.get('mobile')),
'card_grade_name': memb_info.get('member_card_grade_name'),
'register_date': memb_info.get('register_date'),
# 全量累计统计
'first_consume_date': stats.get('first_consume_date'),
'last_consume_date': stats.get('last_consume_date'),
'total_visit_count': self.safe_int(stats.get('total_visit_count', 0)),
'total_consume_amount': self.safe_decimal(stats.get('total_consume_amount', 0)),
'total_recharge_amount': self.safe_decimal(memb_info.get('recharge_money_sum', 0)),
'total_table_fee': self.safe_decimal(stats.get('total_table_fee', 0)),
'total_goods_amount': self.safe_decimal(stats.get('total_goods_amount', 0)),
'total_assistant_amount': self.safe_decimal(stats.get('total_assistant_amount', 0)),
# 滚动窗口统计
'visit_count_7d': self.safe_int(stats.get('visit_count_7d', 0)),
'visit_count_10d': self.safe_int(stats.get('visit_count_10d', 0)),
'visit_count_15d': self.safe_int(stats.get('visit_count_15d', 0)),
'visit_count_30d': self.safe_int(stats.get('visit_count_30d', 0)),
'visit_count_60d': self.safe_int(stats.get('visit_count_60d', 0)),
'visit_count_90d': self.safe_int(stats.get('visit_count_90d', 0)),
'consume_amount_7d': self.safe_decimal(stats.get('consume_amount_7d', 0)),
'consume_amount_10d': self.safe_decimal(stats.get('consume_amount_10d', 0)),
'consume_amount_15d': self.safe_decimal(stats.get('consume_amount_15d', 0)),
'consume_amount_30d': self.safe_decimal(stats.get('consume_amount_30d', 0)),
'consume_amount_60d': self.safe_decimal(stats.get('consume_amount_60d', 0)),
'consume_amount_90d': self.safe_decimal(stats.get('consume_amount_90d', 0)),
# 卡余额
'cash_card_balance': self.safe_decimal(balance.get('cash_balance', 0)),
'gift_card_balance': self.safe_decimal(balance.get('gift_balance', 0)),
'total_card_balance': self.safe_decimal(balance.get('total_balance', 0)),
# 活跃度指标
'days_since_last': days_since_last,
'is_active_7d': self.safe_int(stats.get('visit_count_7d', 0)) > 0,
'is_active_30d': self.safe_int(stats.get('visit_count_30d', 0)) > 0,
'is_active_90d': self.safe_int(stats.get('visit_count_90d', 0)) > 0,
# 客户分层
'customer_tier': customer_tier,
}
results.append(record)
return results
# load() 已移除——使用 BaseDwsTask 默认实现DATE_COL="stat_date"
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_consumption_stats(
self,
site_id: int,
stat_date: date
) -> List[Dict[str, Any]]:
"""
提取会员消费统计(含滚动窗口)
"""
sql = """
WITH consume_base AS (
SELECT
member_id,
DATE(pay_time) AS consume_date,
consume_money,
table_charge_money,
goods_money,
assistant_pd_money + assistant_cx_money AS assistant_amount
FROM dwd.dwd_settlement_head
WHERE site_id = %s
AND member_id IS NOT NULL
AND member_id != 0
)
SELECT
member_id,
MIN(consume_date) AS first_consume_date,
MAX(consume_date) AS last_consume_date,
-- 全量累计
COUNT(*) AS total_visit_count,
SUM(consume_money) AS total_consume_amount,
SUM(table_charge_money) AS total_table_fee,
SUM(goods_money) AS total_goods_amount,
SUM(assistant_amount) AS total_assistant_amount,
-- 滚动窗口
COUNT(CASE WHEN consume_date >= %s - INTERVAL '6 days' THEN 1 END) AS visit_count_7d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '9 days' THEN 1 END) AS visit_count_10d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '14 days' THEN 1 END) AS visit_count_15d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '29 days' THEN 1 END) AS visit_count_30d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '59 days' THEN 1 END) AS visit_count_60d,
COUNT(CASE WHEN consume_date >= %s - INTERVAL '89 days' THEN 1 END) AS visit_count_90d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '6 days' THEN consume_money ELSE 0 END) AS consume_amount_7d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '9 days' THEN consume_money ELSE 0 END) AS consume_amount_10d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '14 days' THEN consume_money ELSE 0 END) AS consume_amount_15d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '29 days' THEN consume_money ELSE 0 END) AS consume_amount_30d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '59 days' THEN consume_money ELSE 0 END) AS consume_amount_60d,
SUM(CASE WHEN consume_date >= %s - INTERVAL '89 days' THEN consume_money ELSE 0 END) AS consume_amount_90d
FROM consume_base
GROUP BY member_id
"""
params = [site_id] + [stat_date] * 12
rows = self.db.query(sql, tuple(params))
return [dict(row) for row in rows] if rows else []
def _extract_member_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取会员信息
"""
sql = """
SELECT
member_id,
nickname,
mobile,
member_card_grade_name,
DATE(create_time) AS register_date,
recharge_money_sum
FROM dwd.dim_member
WHERE site_id = %s
AND scd2_is_current = 1
"""
rows = self.db.query(sql, (site_id,))
result = {}
for row in (rows or []):
row_dict = dict(row)
result[row_dict['member_id']] = row_dict
return result
def _extract_card_balances(self, site_id: int) -> Dict[int, Dict[str, Decimal]]:
"""
提取会员卡余额
"""
# 卡类型ID
CASH_CARD_TYPE_ID = 2793249295533893
GIFT_CARD_TYPE_IDS = [2791990152417157, 2793266846533445, 2794699703437125]
sql = """
SELECT
tenant_member_id AS member_id,
card_type_id,
balance
FROM dwd.dim_member_card_account
WHERE site_id = %s
AND scd2_is_current = 1
AND COALESCE(is_delete, 0) = 0
"""
rows = self.db.query(sql, (site_id,))
result: Dict[int, Dict[str, Decimal]] = {}
for row in (rows or []):
row_dict = dict(row)
member_id = row_dict.get('member_id')
card_type_id = row_dict.get('card_type_id')
balance = self.safe_decimal(row_dict.get('balance', 0))
if member_id not in result:
result[member_id] = {
'cash_balance': Decimal('0'),
'gift_balance': Decimal('0'),
'total_balance': Decimal('0')
}
if card_type_id == CASH_CARD_TYPE_ID:
result[member_id]['cash_balance'] += balance
elif card_type_id in GIFT_CARD_TYPE_IDS:
result[member_id]['gift_balance'] += balance
result[member_id]['total_balance'] = (
result[member_id]['cash_balance'] + result[member_id]['gift_balance']
)
return result
# ==========================================================================
# 工具方法
# ==========================================================================
def _mask_mobile(self, mobile: Optional[str]) -> Optional[str]:
return mask_mobile(mobile)
def _calc_days_since(self, stat_date: date, last_date: Optional[date]) -> Optional[int]:
return calc_days_since(stat_date, last_date)
def _calculate_customer_tier(
self,
stats: Dict[str, Any],
days_since_last: Optional[int]
) -> str:
"""
计算客户分层
分层规则:
- 高价值90天内消费>=3次 且 消费金额>=1000
- 中等30天内有消费
- 低活跃90天内有消费但30天内无消费
- 流失90天内无消费
"""
visit_90d = self.safe_int(stats.get('visit_count_90d', 0))
visit_30d = self.safe_int(stats.get('visit_count_30d', 0))
amount_90d = self.safe_decimal(stats.get('consume_amount_90d', 0))
if visit_90d >= 3 and amount_90d >= 1000:
return "高价值"
elif visit_30d > 0:
return "中等"
elif visit_90d > 0:
return "低活跃"
else:
return "流失"
# 便于外部导入
__all__ = ['MemberConsumptionTask']

View File

@@ -0,0 +1,399 @@
# -*- coding: utf-8 -*-
"""
会员来店明细任务
功能说明:
"会员+订单"为粒度,记录每次来店消费明细
数据来源:
- dwd_settlement_head: 结账单头表
- dwd_assistant_service_log: 助教服务流水
- dim_member: 会员维度
- dim_table: 台桌维度
- cfg_area_category: 区域分类映射
目标表:
dws.dws_member_visit_detail
更新策略:
- 更新频率:每日增量更新
- 幂等方式delete-before-insert按日期窗口
业务规则:
- 散客处理member_id=0 不进入此表
- 区域分类使用cfg_area_category映射
- 助教服务以JSON格式存储多个助教的服务明细
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
import json
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
from .dws_helpers import mask_mobile
class MemberVisitTask(BaseDwsTask):
"""
会员来店明细任务
记录每个会员每次来店的:
- 台桌信息和区域分类
- 消费金额明细
- 支付方式明细
- 助教服务明细JSON格式
"""
# CHANGE 2025-07-15 | task 1.3: 声明日期列,供基类默认 load() 使用
DATE_COL = "visit_date"
def get_task_code(self) -> str:
return "DWS_MEMBER_VISIT"
def get_target_table(self) -> str:
return "dws_member_visit_detail"
def get_primary_keys(self) -> List[str]:
return ["site_id", "member_id", "order_settle_id"]
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
"""
提取数据
"""
start_date = context.window_start.date() if hasattr(context.window_start, 'date') else context.window_start
end_date = context.window_end.date() if hasattr(context.window_end, 'date') else context.window_end
site_id = context.store_id
self.logger.info(
"%s: 提取数据,日期范围 %s ~ %s",
self.get_task_code(), start_date, end_date
)
# 1. 获取结账单
settlements = self._extract_settlements(site_id, start_date, end_date)
# 2. 获取助教服务明细
assistant_services = self._extract_assistant_services(site_id, start_date, end_date)
# 2.1 获取台费时长(真实秒数)
table_fee_durations = self._extract_table_fee_durations(site_id, start_date, end_date)
# 3. 获取会员信息
member_info = self._extract_member_info(site_id)
# 4. 获取台桌信息
table_info = self._extract_table_info(site_id)
# 5. 加载配置
self.load_config_cache()
return {
'settlements': settlements,
'assistant_services': assistant_services,
'member_info': member_info,
'table_info': table_info,
'table_fee_durations': table_fee_durations,
'start_date': start_date,
'end_date': end_date,
'site_id': site_id
}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
"""
转换数据
"""
settlements = extracted['settlements']
assistant_services = extracted['assistant_services']
member_info = extracted['member_info']
table_info = extracted['table_info']
table_fee_durations = extracted['table_fee_durations']
site_id = extracted['site_id']
self.logger.info(
"%s: 转换数据,%d 条结账单",
self.get_task_code(), len(settlements)
)
# 构建助教服务索引order_settle_id -> [services]
service_index = self._build_service_index(assistant_services)
# 构建台费时长索引order_settle_id -> total_seconds
table_duration_index = {
row.get('order_settle_id'): self.safe_int(row.get('table_use_seconds', 0))
for row in (table_fee_durations or [])
if row.get('order_settle_id')
}
results = []
for settle in settlements:
member_id = settle.get('member_id')
# 跳过散客
if self.is_guest(member_id):
continue
order_settle_id = settle.get('order_settle_id')
table_id = settle.get('table_id')
memb_info = member_info.get(member_id, {})
tbl_info = table_info.get(table_id, {})
services = service_index.get(order_settle_id, [])
# 获取区域分类
area_name = tbl_info.get('area_name')
area_cat = self.get_area_category(area_name)
# 构建助教服务JSON
assistant_services_json = self._build_assistant_services_json(services)
# 计算时长
table_seconds = table_duration_index.get(order_settle_id, 0)
table_duration = self._calc_table_duration(table_seconds)
assistant_duration = sum(
self.safe_int(s.get('income_seconds', 0))
for s in services
) // 60 # 转为分钟
record = {
'site_id': site_id,
'tenant_id': self.config.get("app.tenant_id", site_id),
'member_id': member_id,
'order_settle_id': order_settle_id,
'visit_date': settle.get('visit_date'),
'visit_time': settle.get('create_time'),
# 会员信息
'member_nickname': memb_info.get('nickname'),
'member_mobile': self._mask_mobile(memb_info.get('mobile')),
'member_birthday': memb_info.get('birthday'),
# 台桌信息
'table_id': table_id,
'table_name': tbl_info.get('table_name'),
'area_name': area_name,
'area_category': area_cat.get('category_name'),
# 消费金额
'table_fee': self.safe_decimal(settle.get('table_charge_money', 0)),
'goods_amount': self.safe_decimal(settle.get('goods_money', 0)),
'assistant_amount': self.safe_decimal(settle.get('assistant_pd_money', 0)) + \
self.safe_decimal(settle.get('assistant_cx_money', 0)),
'total_consume': self.safe_decimal(settle.get('consume_money', 0)),
'total_discount': self._calc_total_discount(settle),
'actual_pay': self.safe_decimal(settle.get('pay_amount', 0)),
# 支付方式
'cash_pay': self.safe_decimal(settle.get('pay_amount', 0)),
'cash_card_pay': self.safe_decimal(settle.get('balance_amount', 0)),
'gift_card_pay': self.safe_decimal(settle.get('gift_card_amount', 0)),
'groupbuy_pay': self.safe_decimal(settle.get('coupon_amount', 0)),
# 时长
'table_duration_min': table_duration,
'assistant_duration_min': assistant_duration,
# 助教服务明细
'assistant_services': assistant_services_json,
}
results.append(record)
return results
# load() 已移除——使用 BaseDwsTask 默认实现DATE_COL="visit_date"
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _extract_settlements(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取结账单
"""
sql = """
SELECT
order_settle_id,
order_trade_no,
table_id,
member_id,
create_time,
pay_time,
DATE(pay_time) AS visit_date,
consume_money,
pay_amount,
table_charge_money,
goods_money,
assistant_pd_money,
assistant_cx_money,
coupon_amount,
adjust_amount,
member_discount_amount,
rounding_amount,
gift_card_amount,
balance_amount,
recharge_card_amount
FROM dwd.dwd_settlement_head
WHERE site_id = %s
AND DATE(pay_time) >= %s
AND DATE(pay_time) <= %s
AND member_id IS NOT NULL
AND member_id != 0
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_assistant_services(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取助教服务明细
"""
sql = """
SELECT
order_settle_id,
site_assistant_id AS assistant_id,
nickname AS assistant_nickname,
income_seconds,
ledger_amount
FROM dwd.dwd_assistant_service_log
WHERE site_id = %s
AND DATE(start_use_time) >= %s
AND DATE(start_use_time) <= %s
AND is_delete = 0
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_table_fee_durations(
self,
site_id: int,
start_date: date,
end_date: date
) -> List[Dict[str, Any]]:
"""
提取台费时长(真实秒数)
"""
sql = """
SELECT
order_settle_id,
SUM(COALESCE(real_table_use_seconds, 0)) AS table_use_seconds
FROM dwd.dwd_table_fee_log
WHERE site_id = %s
AND DATE(ledger_end_time) >= %s
AND DATE(ledger_end_time) <= %s
AND COALESCE(is_delete, 0) = 0
GROUP BY order_settle_id
"""
rows = self.db.query(sql, (site_id, start_date, end_date))
return [dict(row) for row in rows] if rows else []
def _extract_member_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取会员信息
"""
sql = """
SELECT
member_id,
nickname,
mobile,
birthday
FROM dwd.dim_member
WHERE site_id = %s
AND scd2_is_current = 1
"""
rows = self.db.query(sql, (site_id,))
return {r['member_id']: dict(r) for r in (rows or [])}
def _extract_table_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取台桌信息
"""
sql = """
SELECT
site_table_id AS table_id,
site_table_name AS table_name,
site_table_area_name AS area_name
FROM dwd.dim_table
WHERE site_id = %s
AND scd2_is_current = 1
"""
rows = self.db.query(sql, (site_id,))
return {r['table_id']: dict(r) for r in (rows or [])}
# ==========================================================================
# 工具方法
# ==========================================================================
def _build_service_index(
self,
services: List[Dict[str, Any]]
) -> Dict[int, List[Dict[str, Any]]]:
"""
构建助教服务索引
"""
index: Dict[int, List[Dict[str, Any]]] = {}
for service in services:
order_id = service.get('order_settle_id')
if order_id:
if order_id not in index:
index[order_id] = []
index[order_id].append(service)
return index
def _build_assistant_services_json(
self,
services: List[Dict[str, Any]]
) -> Optional[str]:
"""
构建助教服务JSON
"""
if not services:
return None
json_data = []
for s in services:
json_data.append({
'assistant_id': s.get('assistant_id'),
'nickname': s.get('assistant_nickname'),
'duration_min': self.safe_int(s.get('income_seconds', 0)) // 60,
'amount': float(self.safe_decimal(s.get('ledger_amount', 0)))
})
return json.dumps(json_data, ensure_ascii=False)
def _calc_table_duration(self, table_use_seconds: int) -> int:
"""
计算台桌使用时长(分钟)
使用真实台费流水秒数
"""
if not table_use_seconds or table_use_seconds <= 0:
return 0
return int(table_use_seconds // 60)
def _calc_total_discount(self, settle: Dict[str, Any]) -> Decimal:
"""
计算总优惠
"""
adjust = self.safe_decimal(settle.get('adjust_amount', 0))
member_discount = self.safe_decimal(settle.get('member_discount_amount', 0))
rounding = self.safe_decimal(settle.get('rounding_amount', 0))
return adjust + member_discount + rounding
def _mask_mobile(self, mobile: Optional[str]) -> Optional[str]:
return mask_mobile(mobile)
# 便于外部导入
__all__ = ['MemberVisitTask']

View File

@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-
"""
DWS 物化视图刷新任务
说明:
- 按 L1/L2/L3/L4 时间分层刷新物化视图
- 默认受 dws.mv.enabled 与 dws.retention.* 配置联动控制
"""
from __future__ import annotations
import json
from typing import Any, Dict, List, Optional
from .base_dws_task import BaseDwsTask, TaskContext, TimeLayer
class BaseMvRefreshTask(BaseDwsTask):
"""物化视图刷新基类"""
BASE_TABLE: str = ""
DATE_COL: str = ""
VIEW_PREFIX = "mv_"
LAYER_ORDER = [
TimeLayer.LAST_2_DAYS,
TimeLayer.LAST_1_MONTH,
TimeLayer.LAST_3_MONTHS,
TimeLayer.LAST_6_MONTHS,
]
LAYER_SUFFIX = {
TimeLayer.LAST_2_DAYS: "l1",
TimeLayer.LAST_1_MONTH: "l2",
TimeLayer.LAST_3_MONTHS: "l3",
TimeLayer.LAST_6_MONTHS: "l4",
}
def get_target_table(self) -> str:
return self.BASE_TABLE
def get_primary_keys(self) -> List[str]:
return []
def extract(self, context: TaskContext) -> Dict[str, Any]:
return {"site_id": context.store_id}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
return extracted
def load(self, transformed: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
if not self._is_enabled():
self.logger.info("%s: 未启用物化刷新,跳过", self.get_task_code())
return {"counts": {"refreshed": 0}}
layers = self._resolve_layers()
refreshed = 0
details = []
for layer in layers:
view_name = self._get_view_name(layer)
if not view_name:
continue
if not self._view_exists(view_name):
self.logger.warning("%s: 物化视图不存在,跳过 %s", self.get_task_code(), view_name)
continue
self._refresh_view(view_name)
refreshed += 1
details.append({"view": view_name, "layer": layer.value})
self.logger.info("%s: 刷新完成,物化视图数=%d", self.get_task_code(), refreshed)
return {"counts": {"refreshed": refreshed}, "extra": {"details": details}}
def _is_enabled(self) -> bool:
enabled = bool(self.config.get("dws.mv.enabled", False))
if not enabled:
return False
tables = self._parse_list(self.config.get("dws.mv.tables"))
if not tables:
tables = self._parse_list(self.config.get("dws.retention.tables"))
if tables and self.BASE_TABLE not in tables:
return False
return True
def _resolve_layers(self) -> List[TimeLayer]:
# 显式配置优先
configured = self._parse_layers(self.config.get("dws.mv.layers"))
if configured:
return configured
# 表级覆盖:优先 mv.table_layers其次 retention.table_layers
table_layers = self._resolve_layer_map(
self.config.get("dws.mv.table_layers") or self.config.get("dws.retention.table_layers")
)
layer_name = table_layers.get(self.BASE_TABLE)
if layer_name:
layer = self._get_layer(layer_name)
if layer and layer != TimeLayer.ALL:
return self._layers_up_to(layer)
# 默认使用 retention.layer
retention_layer = self._get_layer(self.config.get("dws.retention.layer"))
if retention_layer and retention_layer != TimeLayer.ALL:
return self._layers_up_to(retention_layer)
return list(self.LAYER_ORDER)
def _layers_up_to(self, target: TimeLayer) -> List[TimeLayer]:
layers = []
for layer in self.LAYER_ORDER:
layers.append(layer)
if layer == target:
break
return layers
def _get_view_name(self, layer: TimeLayer) -> Optional[str]:
suffix = self.LAYER_SUFFIX.get(layer)
if not suffix or not self.BASE_TABLE:
return None
return f"{self.VIEW_PREFIX}{self.BASE_TABLE}_{suffix}"
def _view_exists(self, view_name: str) -> bool:
sql = "SELECT to_regclass(%s) AS reg"
rows = self.db.query(sql, (f"{self.DWS_SCHEMA}.{view_name}",))
return bool(rows and rows[0].get("reg"))
def _refresh_view(self, view_name: str) -> None:
concurrently = bool(self.config.get("dws.mv.refresh_concurrently", False))
keyword = "CONCURRENTLY " if concurrently else ""
sql = f"REFRESH MATERIALIZED VIEW {keyword}{self.DWS_SCHEMA}.{view_name}"
self.db.execute(sql)
def _get_layer(self, layer_name: Optional[str]) -> Optional[TimeLayer]:
if not layer_name:
return None
name = str(layer_name).upper()
try:
return TimeLayer[name]
except KeyError:
return None
def _resolve_layer_map(self, raw: Any) -> Dict[str, str]:
if not raw:
return {}
if isinstance(raw, dict):
return {str(k): str(v) for k, v in raw.items()}
if isinstance(raw, str):
try:
parsed = json.loads(raw)
if isinstance(parsed, dict):
return {str(k): str(v) for k, v in parsed.items()}
except json.JSONDecodeError:
return {}
return {}
def _parse_layers(self, raw: Any) -> List[TimeLayer]:
if not raw:
return []
if isinstance(raw, str):
items = [v.strip() for v in raw.split(",") if v.strip()]
elif isinstance(raw, (list, tuple, set)):
items = [str(v).strip() for v in raw if str(v).strip()]
else:
return []
layers = []
for item in items:
layer = self._get_layer(item)
if layer and layer not in layers:
layers.append(layer)
return layers
def _parse_list(self, raw: Any) -> List[str]:
if not raw:
return []
if isinstance(raw, str):
return [v.strip() for v in raw.split(",") if v.strip()]
if isinstance(raw, (list, tuple, set)):
return [str(v).strip() for v in raw if str(v).strip()]
return []
class DwsMvRefreshFinanceDailyTask(BaseMvRefreshTask):
BASE_TABLE = "dws_finance_daily_summary"
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_MV_REFRESH_FINANCE_DAILY"
class DwsMvRefreshAssistantDailyTask(BaseMvRefreshTask):
BASE_TABLE = "dws_assistant_daily_detail"
DATE_COL = "stat_date"
def get_task_code(self) -> str:
return "DWS_MV_REFRESH_ASSISTANT_DAILY"
__all__ = ["DwsMvRefreshFinanceDailyTask", "DwsMvRefreshAssistantDailyTask"]

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""
DWS 时间分层清理任务
功能说明:
按配置的时间分层范围,对 DWS 表执行历史数据清理。
该任务默认不启用,需通过配置显式开启。
配置示例(.env / settings
DWS_RETENTION_ENABLED=true
DWS_RETENTION_LAYER=LAST_3_MONTHS
DWS_RETENTION_TABLES=dws_finance_daily_summary,dws_assistant_daily_detail
DWS_RETENTION_TABLE_LAYERS={"dws_finance_expense_summary":"ALL"}
作者ETL团队
创建日期2026-02-03
"""
from __future__ import annotations
import json
from datetime import date
from typing import Any, Dict, List, Optional
from .base_dws_task import BaseDwsTask, TaskContext, TimeLayer
class DwsRetentionCleanupTask(BaseDwsTask):
"""
DWS 时间分层清理任务
"""
DEFAULT_TABLES = [
{"table": "dws_assistant_daily_detail", "date_col": "stat_date"},
{"table": "dws_assistant_monthly_summary", "date_col": "stat_month"},
{"table": "dws_assistant_customer_stats", "date_col": "stat_date"},
{"table": "dws_assistant_salary_calc", "date_col": "salary_month"},
{"table": "dws_assistant_recharge_commission", "date_col": "commission_month"},
{"table": "dws_assistant_finance_analysis", "date_col": "stat_date"},
{"table": "dws_member_consumption_summary", "date_col": "stat_date"},
{"table": "dws_member_visit_detail", "date_col": "visit_date"},
{"table": "dws_finance_daily_summary", "date_col": "stat_date"},
{"table": "dws_finance_income_structure", "date_col": "stat_date"},
{"table": "dws_finance_discount_detail", "date_col": "stat_date"},
{"table": "dws_finance_recharge_summary", "date_col": "stat_date"},
{"table": "dws_finance_expense_summary", "date_col": "expense_month"},
{"table": "dws_platform_settlement", "date_col": "settlement_date"},
]
def get_task_code(self) -> str:
return "DWS_RETENTION_CLEANUP"
def get_target_table(self) -> str:
return "dws_finance_daily_summary"
def get_primary_keys(self) -> List[str]:
return []
def extract(self, context: TaskContext) -> Dict[str, Any]:
return {"site_id": context.store_id}
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> Dict[str, Any]:
return extracted
def load(self, transformed: Dict[str, Any], context: TaskContext) -> Dict:
"""
执行清理逻辑
"""
if not self._is_retention_enabled():
self.logger.info("%s: 未启用清理配置,跳过", self.get_task_code())
return {"counts": {"cleaned": 0}}
base_date = context.window_end.date() if hasattr(context.window_end, "date") else context.window_end
default_layer = self._get_retention_layer(self.config.get("dws.retention.layer", "ALL"))
if default_layer is None:
self.logger.warning("%s: 未识别的清理层级,跳过", self.get_task_code())
return {"counts": {"cleaned": 0}}
target_tables = self._resolve_target_tables()
if not target_tables:
self.logger.info("%s: 未配置需要清理的表,跳过", self.get_task_code())
return {"counts": {"cleaned": 0}}
table_layers = self._resolve_table_layers()
total_deleted = 0
details = []
for item in target_tables:
table = item["table"]
date_col = item["date_col"]
layer_name = table_layers.get(table, default_layer.value)
layer = self._get_retention_layer(layer_name)
if layer is None or layer == TimeLayer.ALL:
continue
time_range = self.get_time_layer_range(layer, base_date)
cutoff = self._normalize_cutoff(date_col, time_range.start)
deleted = self._cleanup_table(table, date_col, cutoff, context.store_id)
total_deleted += deleted
details.append({"table": table, "deleted": deleted, "cutoff": str(cutoff)})
self.logger.info("%s: 清理完成,总删除 %d", self.get_task_code(), total_deleted)
return {"counts": {"cleaned": total_deleted}, "extra": {"details": details}}
def _is_retention_enabled(self) -> bool:
return bool(self.config.get("dws.retention.enabled", False))
def _get_retention_layer(self, layer_name: Optional[str]) -> Optional[TimeLayer]:
if not layer_name:
return None
name = str(layer_name).upper()
try:
return TimeLayer[name]
except KeyError:
return None
def _resolve_target_tables(self) -> List[Dict[str, str]]:
table_list = self.config.get("dws.retention.tables")
if not table_list:
return self.DEFAULT_TABLES
if isinstance(table_list, str):
names = [t.strip() for t in table_list.split(",") if t.strip()]
else:
names = list(table_list)
selected = []
for item in self.DEFAULT_TABLES:
if item["table"] in names:
selected.append(item)
return selected
def _resolve_table_layers(self) -> Dict[str, str]:
raw = self.config.get("dws.retention.table_layers")
if not raw:
return {}
if isinstance(raw, dict):
return {str(k): str(v) for k, v in raw.items()}
if isinstance(raw, str):
try:
parsed = json.loads(raw)
if isinstance(parsed, dict):
return {str(k): str(v) for k, v in parsed.items()}
except json.JSONDecodeError:
return {}
return {}
def _normalize_cutoff(self, date_col: str, cutoff: date) -> date:
monthly_cols = {"stat_month", "salary_month", "commission_month", "expense_month"}
if date_col in monthly_cols:
return cutoff.replace(day=1)
return cutoff
def _cleanup_table(self, table: str, date_col: str, cutoff: date, site_id: int) -> int:
full_table = f"{self.DWS_SCHEMA}.{table}"
sql = f"DELETE FROM {full_table} WHERE site_id = %s AND {date_col} < %s"
with self.db.conn.cursor() as cur:
cur.execute(sql, (site_id, cutoff))
return cur.rowcount
__all__ = ["DwsRetentionCleanupTask"]