初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

View File

@@ -0,0 +1,600 @@
# -*- coding: utf-8 -*-
"""
助教月度业绩汇总任务
功能说明:
"助教+月份"为粒度,汇总月度业绩及档位计算
数据来源:
- dws_assistant_daily_detail: 日度明细(聚合)
- dim_assistant: 助教维度(入职日期、等级)
- cfg_performance_tier: 绩效档位配置
目标表:
billiards_dws.dws_assistant_monthly_summary
更新策略:
- 更新频率:每日更新当月数据
- 幂等方式delete-before-insert按月份
业务规则:
- 新入职判断入职日期在月1日0点之后则为新入职
- 有效业绩total_hours - trashed_hours
- 档位匹配根据有效业绩小时数匹配cfg_performance_tier
- 排名计算按有效业绩小时数降序考虑并列如2个第一则都是1下一个是3
作者ETL团队
创建日期2026-02-01
"""
from __future__ import annotations
from datetime import date, datetime, timedelta
from decimal import Decimal
from typing import Any, Dict, List, Optional, Set, Tuple
from .base_dws_task import BaseDwsTask, TaskContext
class AssistantMonthlyTask(BaseDwsTask):
"""
助教月度业绩汇总任务
汇总每个助教每月的:
- 工作天数、服务次数、时长
- 有效业绩(扣除废除记录后)
- 档位匹配
- 月度排名用于Top3奖金
"""
def get_task_code(self) -> str:
    """Return the task code that identifies this task."""
    task_code = "DWS_ASSISTANT_MONTHLY"
    return task_code
def get_target_table(self) -> str:
    """Return the unqualified name of the table this task writes to."""
    table_name = "dws_assistant_monthly_summary"
    return table_name
def get_primary_keys(self) -> List[str]:
    """Return the column names forming the target table's primary key."""
    key_columns = ["site_id", "assistant_id", "stat_month"]
    return key_columns
# ==========================================================================
# ETL主流程
# ==========================================================================
def extract(self, context: TaskContext) -> Dict[str, Any]:
    """
    Gather everything the transform step needs for the scheduled months.

    Args:
        context: Task context. ``window_start`` / ``window_end`` bound the
            period and ``store_id`` is used as the site id.

    Returns:
        Dict with keys ``daily_aggregates``, ``monthly_uniques``,
        ``assistant_info``, ``months`` and ``site_id``. When no month is left
        after schedule filtering, the first four are empty.
    """
    def _as_date(value):
        # Values exposing ``.date`` (e.g. datetime) are reduced to a date;
        # anything else is passed through unchanged.
        return value.date() if hasattr(value, 'date') else value

    start_date = _as_date(context.window_start)
    end_date = _as_date(context.window_end)
    site_id = context.store_id

    # Months touched by the window, then restricted by the schedule rules.
    months = self._filter_months_for_schedule(
        self._get_months_in_range(start_date, end_date),
        end_date,
    )
    self.logger.info(
        "%s: 提取数据,月份范围 %s",
        self.get_task_code(), [str(m) for m in months]
    )

    payload: Dict[str, Any] = {
        'daily_aggregates': [],
        'monthly_uniques': [],
        'assistant_info': {},
        'months': [],
        'site_id': site_id
    }
    if not months:
        self.logger.info("%s: 无需处理月份,跳过", self.get_task_code())
        return payload

    # 1. Daily detail rows aggregated per month.
    payload['daily_aggregates'] = self._extract_daily_aggregates(site_id, months)
    # 1.1 Monthly distinct customers / tables, de-duplicated directly in DWD.
    payload['monthly_uniques'] = self._extract_monthly_uniques(site_id, months)
    # 2. Basic assistant information.
    payload['assistant_info'] = self._extract_assistant_info(site_id)
    # 3. Load the configuration cache.
    self.load_config_cache()
    payload['months'] = months
    return payload
def transform(self, extracted: Dict[str, Any], context: TaskContext) -> List[Dict[str, Any]]:
    """
    Turn the extracted data into monthly summary records.

    Args:
        extracted: Dict produced by ``extract`` (keys ``daily_aggregates``,
            ``monthly_uniques``, ``assistant_info``, ``months``, ``site_id``).
        context: Task context; not used by this method.

    Returns:
        The records returned by ``_process_month`` for every month,
        concatenated in the order of ``extracted['months']``.
    """
    daily_aggregates = extracted['daily_aggregates']
    monthly_uniques = extracted['monthly_uniques']
    assistant_info = extracted['assistant_info']
    months = extracted['months']
    site_id = extracted['site_id']

    self.logger.info(
        "%s: 转换数据,%d 个月份,%d 条聚合记录",
        self.get_task_code(), len(months), len(daily_aggregates)
    )

    # Index the monthly distinct counts by (assistant_id, stat_month);
    # rows with a falsy assistant id or month are left out.
    unique_index = {}
    for unique_row in (monthly_uniques or []):
        assistant_key = unique_row.get('assistant_id')
        month_key = unique_row.get('stat_month')
        if assistant_key and month_key:
            unique_index[(assistant_key, month_key)] = unique_row

    # Process month by month and flatten the per-month results.
    return [
        record
        for month in months
        for record in self._process_month(
            daily_aggregates,
            assistant_info,
            unique_index,
            month,
            site_id
        )
    ]
def load(self, transformed: List[Dict[str, Any]], context: TaskContext) -> Dict:
    """
    Write the transformed records into the DWS table.

    Rows already stored for the affected months are deleted first, then the
    new records are bulk-inserted (delete-before-insert).

    Args:
        transformed: Records produced by ``transform``.
        context: Task context, forwarded to ``_delete_by_months``.

    Returns:
        Dict with a ``counts`` entry (fetched / inserted / updated / skipped /
        errors) and, when anything was written, an ``extra`` entry holding
        the number of deleted rows.
    """
    counts = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
    if not transformed:
        self.logger.info("%s: 无数据需要写入", self.get_task_code())
        return {"counts": counts}

    # Remove the existing rows of the affected months before inserting.
    deleted = self._delete_by_months(context, transformed)
    inserted = self.bulk_insert(transformed)
    self.logger.info(
        "%s: 加载完成,删除 %d 行,插入 %d",
        self.get_task_code(), deleted, inserted
    )

    counts["fetched"] = len(transformed)
    counts["inserted"] = inserted
    return {"counts": counts, "extra": {"deleted": deleted}}
# ==========================================================================
# 数据提取方法
# ==========================================================================
def _get_months_in_range(self, start_date: date, end_date: date) -> List[date]:
"""
获取日期范围内的所有月份(月第一天)
"""
months = []
current = start_date.replace(day=1)
end_month = end_date.replace(day=1)
while current <= end_month:
months.append(current)
# 下个月
if current.month == 12:
current = current.replace(year=current.year + 1, month=1)
else:
current = current.replace(month=current.month + 1)
return months
def _filter_months_for_schedule(self, months: List[date], end_date: date) -> List[date]:
"""
按调度口径过滤历史月份(默认仅当月,月初可包含上月)
"""
if not months:
return []
history_months = self.safe_int(self.config.get("dws.monthly.history_months", 0))
if history_months > 0:
current_month = self.get_month_first_day(end_date)
allowed = {current_month}
for offset in range(1, history_months + 1):
allowed.add(self.get_month_first_day(self._shift_months(current_month, -offset)))
filtered = [m for m in months if m in allowed]
skipped = [m for m in months if m not in allowed]
if skipped:
self.logger.info(
"%s: 跳过历史月份 %s",
self.get_task_code(),
[str(m) for m in skipped]
)
return filtered
allow_history = bool(self.config.get("dws.monthly.allow_history", False))
if allow_history:
return months
current_month = self.get_month_first_day(end_date)
allowed = {current_month}
grace_days = self.safe_int(self.config.get("dws.monthly.prev_month_grace_days", 5))
if grace_days > 0 and end_date.day <= grace_days:
prev_month = self.get_month_first_day(self._shift_months(current_month, -1))
allowed.add(prev_month)
filtered = [m for m in months if m in allowed]
skipped = [m for m in months if m not in allowed]
if skipped:
self.logger.info(
"%s: 跳过历史月份 %s",
self.get_task_code(),
[str(m) for m in skipped]
)
return filtered
def _extract_daily_aggregates(
self,
site_id: int,
months: List[date]
) -> List[Dict[str, Any]]:
"""
从日度明细表提取并按月聚合
"""
if not months:
return []
# 构建月份条件
month_conditions = []
for month in months:
next_month = (month.replace(day=28) + timedelta(days=4)).replace(day=1)
month_conditions.append(f"(stat_date >= '{month}' AND stat_date < '{next_month}')")
month_where = " OR ".join(month_conditions)
sql = f"""
SELECT
assistant_id,
assistant_nickname,
assistant_level_code,
assistant_level_name,
DATE_TRUNC('month', stat_date)::DATE AS stat_month,
COUNT(DISTINCT stat_date) AS work_days,
SUM(total_service_count) AS total_service_count,
SUM(base_service_count) AS base_service_count,
SUM(bonus_service_count) AS bonus_service_count,
SUM(room_service_count) AS room_service_count,
SUM(total_hours) AS total_hours,
SUM(base_hours) AS base_hours,
SUM(bonus_hours) AS bonus_hours,
SUM(room_hours) AS room_hours,
SUM(total_ledger_amount) AS total_ledger_amount,
SUM(base_ledger_amount) AS base_ledger_amount,
SUM(bonus_ledger_amount) AS bonus_ledger_amount,
SUM(room_ledger_amount) AS room_ledger_amount,
SUM(unique_customers) AS total_unique_customers,
SUM(unique_tables) AS total_unique_tables,
SUM(trashed_seconds) AS trashed_seconds,
SUM(trashed_count) AS trashed_count
FROM billiards_dws.dws_assistant_daily_detail
WHERE site_id = %s AND ({month_where})
GROUP BY assistant_id, assistant_nickname, assistant_level_code, assistant_level_name,
DATE_TRUNC('month', stat_date)
"""
rows = self.db.query(sql, (site_id,))
return [dict(row) for row in rows] if rows else []
def _extract_monthly_uniques(
self,
site_id: int,
months: List[date]
) -> List[Dict[str, Any]]:
"""
从DWD按月直接去重客户与台桌
"""
if not months:
return []
start_month = min(months)
end_month = max(months)
next_month = (end_month.replace(day=28) + timedelta(days=4)).replace(day=1)
sql = """
SELECT
site_assistant_id AS assistant_id,
DATE_TRUNC('month', start_use_time)::DATE AS stat_month,
COUNT(DISTINCT CASE WHEN tenant_member_id > 0 THEN tenant_member_id END) AS unique_customers,
COUNT(DISTINCT site_table_id) AS unique_tables
FROM billiards_dwd.dwd_assistant_service_log
WHERE site_id = %s
AND start_use_time >= %s
AND start_use_time < %s
AND is_delete = 0
GROUP BY site_assistant_id, DATE_TRUNC('month', start_use_time)
"""
rows = self.db.query(sql, (site_id, start_month, next_month))
return [dict(row) for row in rows] if rows else []
def _extract_assistant_info(self, site_id: int) -> Dict[int, Dict[str, Any]]:
"""
提取助教基本信息
"""
sql = """
SELECT
assistant_id,
nickname,
level AS assistant_level,
entry_time AS hire_date
FROM billiards_dwd.dim_assistant
WHERE site_id = %s
AND scd2_is_current = 1 -- 当前有效记录
"""
rows = self.db.query(sql, (site_id,))
result = {}
for row in (rows or []):
row_dict = dict(row)
result[row_dict['assistant_id']] = row_dict
return result
# ==========================================================================
# 数据转换方法
# ==========================================================================
def _process_month(
    self,
    daily_aggregates: List[Dict[str, Any]],
    assistant_info: Dict[int, Dict[str, Any]],
    monthly_unique_index: Dict[Tuple[int, date], Dict[str, Any]],
    month: date,
    site_id: int
) -> List[Dict[str, Any]]:
    """
    Build the monthly summary records for a single month.

    Args:
        daily_aggregates: Monthly aggregate rows for all requested months;
            only rows whose ``stat_month`` equals ``month`` are used.
        assistant_info: Assistant rows keyed by assistant id (source of
            ``hire_date``).
        monthly_unique_index: Distinct customer/table counts keyed by
            ``(assistant_id, stat_month)``.
        month: First day of the month being processed.
        site_id: Site id written into every record.

    Returns:
        One record per aggregate row of the month, with rank fields filled
        in by ``_calculate_ranks``; an empty list when the month has no rows.
    """
    # Keep only the aggregate rows that belong to this month
    month_data = [
        agg for agg in daily_aggregates
        if agg.get('stat_month') == month
    ]
    if not month_data:
        return []
    # Build the monthly summary records
    month_records = []
    for agg in month_data:
        assistant_id = agg.get('assistant_id')
        # Assistants missing from the dimension get an empty info dict
        asst_info = assistant_info.get(assistant_id, {})
        # Effective performance = total hours minus trashed hours
        total_hours = self.safe_decimal(agg.get('total_hours', 0))
        trashed_hours = self.seconds_to_hours(self.safe_int(agg.get('trashed_seconds', 0)))
        effective_hours = total_hours - trashed_hours
        # Determine whether the assistant is a new hire in this month
        hire_date = asst_info.get('hire_date')
        is_new_hire = False
        if hire_date:
            # Normalize a datetime hire date to a plain date
            if isinstance(hire_date, datetime):
                hire_date = hire_date.date()
            is_new_hire = self.is_new_hire_in_month(hire_date, month)
        # Tier matching: by default the effective hours are used as-is
        tier_hours = effective_hours
        max_tier_level = None
        if is_new_hire:
            # New hires are tiered on projected hours (daily average * 30),
            # optionally capped at a maximum tier level
            tier_hours = self._calc_new_hire_tier_hours(effective_hours, self.safe_int(agg.get('work_days', 0)))
            if self._should_apply_new_hire_tier_cap(month, hire_date):
                max_tier_level = self._get_new_hire_max_tier_level()
        tier = self.get_performance_tier(
            tier_hours,
            is_new_hire,
            effective_date=month,
            max_tier_level=max_tier_level
        )
        # Level information as of month end (stored on the record)
        month_end = self._get_month_end(month)
        level_info = self.get_assistant_level_asof(assistant_id, month_end)
        # Monthly distinct customers/tables, de-duplicated directly from DWD;
        # when the index has no entry (or lacks the key), fall back to the
        # summed daily counts from the aggregate row
        unique_info = monthly_unique_index.get((assistant_id, month), {})
        unique_customers = self.safe_int(
            unique_info.get('unique_customers', agg.get('total_unique_customers', 0))
        )
        unique_tables = self.safe_int(
            unique_info.get('unique_tables', agg.get('total_unique_tables', 0))
        )
        record = {
            'site_id': site_id,
            # tenant id falls back to the site id when not configured
            'tenant_id': self.config.get("app.tenant_id", site_id),
            'assistant_id': assistant_id,
            'assistant_nickname': agg.get('assistant_nickname'),
            'stat_month': month,
            # Prefer the month-end level lookup; otherwise use the aggregate row's level
            'assistant_level_code': level_info.get('level_code') if level_info else agg.get('assistant_level_code'),
            'assistant_level_name': level_info.get('level_name') if level_info else agg.get('assistant_level_name'),
            'hire_date': hire_date,
            'is_new_hire': is_new_hire,
            'work_days': self.safe_int(agg.get('work_days', 0)),
            'total_service_count': self.safe_int(agg.get('total_service_count', 0)),
            'base_service_count': self.safe_int(agg.get('base_service_count', 0)),
            'bonus_service_count': self.safe_int(agg.get('bonus_service_count', 0)),
            'room_service_count': self.safe_int(agg.get('room_service_count', 0)),
            'total_hours': total_hours,
            'base_hours': self.safe_decimal(agg.get('base_hours', 0)),
            'bonus_hours': self.safe_decimal(agg.get('bonus_hours', 0)),
            'room_hours': self.safe_decimal(agg.get('room_hours', 0)),
            'effective_hours': effective_hours,
            'trashed_hours': trashed_hours,
            'total_ledger_amount': self.safe_decimal(agg.get('total_ledger_amount', 0)),
            'base_ledger_amount': self.safe_decimal(agg.get('base_ledger_amount', 0)),
            'bonus_ledger_amount': self.safe_decimal(agg.get('bonus_ledger_amount', 0)),
            'room_ledger_amount': self.safe_decimal(agg.get('room_ledger_amount', 0)),
            'unique_customers': unique_customers,
            'unique_tables': unique_tables,
            'avg_service_seconds': self._calc_avg_service_seconds(agg),
            'tier_id': tier.get('tier_id') if tier else None,
            'tier_code': tier.get('tier_code') if tier else None,
            'tier_name': tier.get('tier_name') if tier else None,
            'rank_by_hours': None,  # filled in by _calculate_ranks below
            'rank_with_ties': None,  # filled in by _calculate_ranks below
        }
        month_records.append(record)
    # Compute the ranks across all records of the month (mutates in place)
    self._calculate_ranks(month_records)
    return month_records
def _get_month_end(self, month: date) -> date:
"""
获取月末日期
"""
if month.month == 12:
next_month = month.replace(year=month.year + 1, month=1, day=1)
else:
next_month = month.replace(month=month.month + 1, day=1)
return next_month - timedelta(days=1)
def _calc_avg_service_seconds(self, agg: Dict[str, Any]) -> Decimal:
"""
计算平均单次服务时长
"""
total_count = self.safe_int(agg.get('total_service_count', 0))
if total_count == 0:
return Decimal('0')
total_hours = self.safe_decimal(agg.get('total_hours', 0))
total_seconds = total_hours * Decimal('3600')
return total_seconds / Decimal(str(total_count))
def _calc_new_hire_tier_hours(self, effective_hours: Decimal, work_days: int) -> Decimal:
"""
新入职定档:日均 * 30仅用于定档不影响奖金与排名
"""
if work_days <= 0:
return Decimal('0')
return (effective_hours / Decimal(str(work_days))) * Decimal('30')
def _should_apply_new_hire_tier_cap(self, stat_month: date, hire_date: Optional[date]) -> bool:
"""
新入职封顶规则是否生效:
- 仅在规则生效月及之后(默认 2026-03-01 起)
- 仅当入职日期晚于封顶日(默认当月 25 日)
"""
if not hire_date:
return False
effective_from = self._get_new_hire_cap_effective_from()
cap_day = self._get_new_hire_cap_day()
return stat_month >= effective_from and hire_date.day > cap_day
def _get_new_hire_cap_effective_from(self) -> date:
"""
获取新入职封顶规则生效月份(默认 2026-03-01
"""
raw_value = self.config.get("dws.monthly.new_hire_cap_effective_from", "2026-03-01")
if isinstance(raw_value, datetime):
return raw_value.date()
if isinstance(raw_value, date):
return raw_value
if isinstance(raw_value, str):
try:
return datetime.strptime(raw_value.strip(), "%Y-%m-%d").date()
except ValueError:
pass
return date(2026, 3, 1)
def _get_new_hire_cap_day(self) -> int:
"""
获取新入职封顶日(默认 25
"""
value = self.safe_int(self.config.get("dws.monthly.new_hire_cap_day", 25))
return min(max(value, 1), 31)
def _get_new_hire_max_tier_level(self) -> int:
"""
获取新入职封顶档位等级(默认 2 档)
"""
value = self.safe_int(self.config.get("dws.monthly.new_hire_max_tier_level", 2))
return max(value, 0)
def _calculate_ranks(self, records: List[Dict[str, Any]]) -> None:
"""
计算排名(考虑并列)
Top3排名口径按有效业绩总小时数排名
如遇并列则都算比如2个第一则记为2个第一一个第三
"""
if not records:
return
# 按有效业绩降序排序
sorted_records = sorted(
records,
key=lambda x: x.get('effective_hours', Decimal('0')),
reverse=True
)
# 计算考虑并列的排名
values = [
(r.get('assistant_id'), r.get('effective_hours', Decimal('0')))
for r in sorted_records
]
ranked = self.calculate_rank_with_ties(values)
# 创建排名映射
rank_map = {
assistant_id: (rank, dense_rank)
for assistant_id, rank, dense_rank in ranked
}
# 更新记录
for record in records:
assistant_id = record.get('assistant_id')
if assistant_id in rank_map:
rank, _ = rank_map[assistant_id]
record['rank_by_hours'] = rank
record['rank_with_ties'] = rank # 使用考虑并列的排名
def _delete_by_months(
self,
context: TaskContext,
records: List[Dict[str, Any]]
) -> int:
"""
按月份删除已存在的数据
"""
# 获取涉及的月份
months = set(r.get('stat_month') for r in records if r.get('stat_month'))
if not months:
return 0
target_table = self.get_target_table()
full_table = f"{self.DWS_SCHEMA}.{target_table}"
total_deleted = 0
with self.db.conn.cursor() as cur:
for month in months:
sql = f"""
DELETE FROM {full_table}
WHERE site_id = %s AND stat_month = %s
"""
cur.execute(sql, (context.store_id, month))
total_deleted += cur.rowcount
return total_deleted
# Public export list, for convenient import from other modules
__all__ = ['AssistantMonthlyTask']