# -*- coding: utf-8 -*- """ 历史任务推演脚本:基于日快照指数重放 task_generator + recall_detector 逻辑 前置条件:先运行 backfill_index_snapshots.py 生成历史指数快照。 用法: cd C:\\Project\\NeoZQYY uv run python scripts/ops/simulate_historical_tasks.py \\ --start 2025-08-01 --end 2026-03-28 # 干跑模式 uv run python scripts/ops/simulate_historical_tasks.py \\ --start 2025-08-01 --end 2026-03-28 --dry-run # 清理之前的模拟数据后重跑 uv run python scripts/ops/simulate_historical_tasks.py \\ --start 2025-08-01 --end 2026-03-28 --clean """ from __future__ import annotations import argparse import json import logging import sys import time from datetime import date, datetime, timedelta from decimal import Decimal from pathlib import Path from zoneinfo import ZoneInfo # ── 环境加载 ── sys.path.insert(0, str(Path(__file__).resolve().parent)) from _env_paths import ensure_repo_root ensure_repo_root() import os from dotenv import load_dotenv _ROOT = Path(__file__).resolve().parents[2] load_dotenv(_ROOT / ".env", override=False) # 导入 task_generator 的纯函数(在 backend 目录下) _BACKEND = _ROOT / "apps" / "backend" sys.path.insert(0, str(_BACKEND)) logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", ) logger = logging.getLogger("simulate_tasks") import psycopg2 import psycopg2.extras TZ = ZoneInfo("Asia/Shanghai") # 导入 task_generator 纯函数 from app.services.task_generator import ( IndexData, determine_task_type, should_replace_task, TASK_TYPE_PRIORITY, ) # 推演截止日期(现有 active 任务从 03-29 开始) CUTOFF_DATE = date(2026, 3, 28) # 回访任务过期时长 FOLLOW_UP_HOURS = 48 def get_connections(): """获取 ETL 库和业务库连接。""" pg_dsn = os.environ.get("PG_DSN") app_dsn = os.environ.get("APP_DB_DSN") if not pg_dsn or not app_dsn: print("错误:PG_DSN 和 APP_DB_DSN 必须在 .env 中配置", file=sys.stderr) sys.exit(1) etl_conn = psycopg2.connect(pg_dsn) etl_conn.set_client_encoding("UTF8") app_conn = psycopg2.connect(app_dsn) app_conn.set_client_encoding("UTF8") return etl_conn, app_conn def load_index_snapshot(etl_conn, site_id: int, stat_date: date) -> dict: """加载指定日期的指数快照。 返回: { "relation": {(assistant_id, member_id): {rs, os_label, ...}}, "wbi": {member_id: Decimal}, "nci": {member_id: Decimal}, } """ result = {"relation": {}, "wbi": {}, "nci": {}} with etl_conn.cursor() as cur: # 关系指数 cur.execute( """SELECT assistant_id, member_id, rs_display, os_label, os_share, session_count FROM dws.dws_member_assistant_relation_index WHERE site_id = %s AND stat_date = %s""", (site_id, stat_date), ) for row in cur.fetchall(): result["relation"][(row[0], row[1])] = { "rs": Decimal(str(row[2])), "os_label": row[3], "os_share": Decimal(str(row[4])), "session_count": row[5], } # WBI cur.execute( """SELECT member_id, display_score FROM dws.dws_member_winback_index WHERE site_id = %s AND stat_date = %s""", (site_id, stat_date), ) for row in cur.fetchall(): result["wbi"][row[0]] = Decimal(str(row[1])) if row[1] else Decimal(0) # NCI cur.execute( """SELECT member_id, display_score FROM dws.dws_member_newconv_index WHERE site_id = %s AND stat_date = %s""", (site_id, stat_date), ) for row in cur.fetchall(): result["nci"][row[0]] = Decimal(str(row[1])) if row[1] else Decimal(0) etl_conn.commit() return result def load_settlements_for_day(etl_conn, site_id: int, d: date) -> dict: """加载指定日期的结算记录。 返回:{(assistant_id, member_id): latest_pay_time} """ settlements = {} day_start = datetime(d.year, d.month, d.day, 0, 0, 0, tzinfo=TZ) day_end = day_start + timedelta(days=1) with etl_conn.cursor() as cur: cur.execute( """ SELECT sl.site_assistant_id AS assistant_id, sh.member_id, MAX(sh.pay_time) AS latest_pay_time FROM dwd.dwd_settlement_head sh JOIN dwd.dwd_assistant_service_log sl ON sl.order_settle_id = sh.order_settle_id AND sl.is_delete = 0 WHERE sh.site_id = %s AND sh.settle_type IN (1, 3) AND sh.pay_time >= %s AND sh.pay_time < %s GROUP BY sl.site_assistant_id, sh.member_id """, (site_id, day_start, day_end), ) for row in cur.fetchall(): if row[0] and row[1]: settlements[(row[0], row[1])] = row[2] etl_conn.commit() return settlements def simulate_day( app_conn, etl_conn, site_id: int, d: date, snapshot: dict, active_tasks: dict, stats: dict, dry_run: bool = False, ): """模拟单天的任务生成和召回检测。 active_tasks: {(assistant_id, member_id): {"id": int, "task_type": str, "created_at": datetime, "expires_at": datetime|None}} stats: 累计统计 """ day_datetime = datetime(d.year, d.month, d.day, 7, 0, 0, tzinfo=TZ) # ── 1. 过期检测(回访任务 expires_at < 当天) ── expired_keys = [] for key, task in active_tasks.items(): if task.get("expires_at") and task["expires_at"] < day_datetime: expired_keys.append(key) for key in expired_keys: task = active_tasks.pop(key) stats["expired"] += 1 if not dry_run: with app_conn.cursor() as cur: cur.execute( "UPDATE biz.coach_tasks SET status = 'expired', updated_at = %s WHERE id = %s", (day_datetime, task["id"]), ) _insert_history(cur, task["id"], "expired", "active", "expired", task["task_type"], task["task_type"], {"reason": "follow_up_expired", "simulated": True}) # ── 2. 任务生成(基于指数快照) ── relation = snapshot["relation"] wbi_map = snapshot["wbi"] nci_map = snapshot["nci"] # 收集 MAIN/COMANAGE 对 ownership_pairs = [ (aid, mid, info) for (aid, mid), info in relation.items() if info["os_label"] in ("MAIN", "COMANAGE") and info["session_count"] > 0 ] for aid, mid, info in ownership_pairs: wbi = wbi_map.get(mid, Decimal(0)) nci = nci_map.get(mid, Decimal(0)) rs = info["rs"] index_data = IndexData( site_id=site_id, assistant_id=aid, member_id=mid, wbi=wbi, nci=nci, rs=rs, has_active_recall=False, has_follow_up_note=False, ) new_type = determine_task_type(index_data) if not new_type: continue key = (aid, mid) existing = active_tasks.get(key) if existing: if existing["task_type"] == new_type: continue # 同类型跳过 if not should_replace_task(existing["task_type"], new_type): continue # 关闭旧任务 if not dry_run: with app_conn.cursor() as cur: cur.execute( "UPDATE biz.coach_tasks SET status = 'inactive', updated_at = %s WHERE id = %s", (day_datetime, existing["id"]), ) _insert_history(cur, existing["id"], "type_change_close", "active", "inactive", existing["task_type"], new_type, {"reason": "replaced_by_simulation", "simulated": True}) # 计算 priority_score priority_score = float(max(wbi, nci)) if new_type in ("high_priority_recall", "priority_recall") else float(rs) if dry_run: active_tasks[key] = { "id": None, "task_type": new_type, "created_at": day_datetime, "expires_at": None, } stats["created"] += 1 continue with app_conn.cursor() as cur: cur.execute( """INSERT INTO biz.coach_tasks (site_id, assistant_id, member_id, task_type, status, priority_score, created_at, updated_at) VALUES (%s, %s, %s, %s, 'active', %s, %s, %s) RETURNING id""", (site_id, aid, mid, new_type, priority_score, day_datetime, day_datetime), ) task_id = cur.fetchone()[0] _insert_history(cur, task_id, "created", None, "active", None, new_type, {"reason": "simulation_generated", "simulated": True}) active_tasks[key] = { "id": task_id, "task_type": new_type, "created_at": day_datetime, "expires_at": None, } stats["created"] += 1 # ── 3. 召回检测(基于当天结算记录) ── settlements = load_settlements_for_day(etl_conn, site_id, d) for (aid, mid), pay_time in settlements.items(): key = (aid, mid) task = active_tasks.get(key) # 写 recall_event if not dry_run: with app_conn.cursor() as cur: try: cur.execute( """INSERT INTO biz.recall_events (site_id, assistant_id, member_id, pay_time, task_id, task_type, created_at) VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (site_id, assistant_id, member_id, (date_trunc('day', pay_time AT TIME ZONE 'Asia/Shanghai'))) DO NOTHING RETURNING id""", (site_id, aid, mid, pay_time, task["id"] if task else None, task["task_type"] if task else None, day_datetime), ) inserted = cur.fetchone() if inserted: stats["recall_events"] += 1 except Exception: pass # 去重冲突或其他 if not task: continue # 完成召回任务 if task["task_type"] in ("high_priority_recall", "priority_recall"): if pay_time > task["created_at"]: stats["completed"] += 1 if not dry_run: with app_conn.cursor() as cur: cur.execute( """UPDATE biz.coach_tasks SET status = 'completed', completed_at = %s, completed_task_type = %s, completion_type = 'auto', updated_at = %s WHERE id = %s AND status = 'active'""", (pay_time, task["task_type"], day_datetime, task["id"]), ) _insert_history(cur, task["id"], "completed", "active", "completed", task["task_type"], task["task_type"], {"service_time": str(pay_time), "simulated": True}) # 生成回访任务 expires_at = pay_time + timedelta(hours=FOLLOW_UP_HOURS) if not dry_run: with app_conn.cursor() as cur: cur.execute( """INSERT INTO biz.coach_tasks (site_id, assistant_id, member_id, task_type, status, expires_at, created_at, updated_at) VALUES (%s, %s, %s, 'follow_up_visit', 'active', %s, %s, %s) RETURNING id""", (site_id, aid, mid, expires_at, day_datetime, day_datetime), ) fu_id = cur.fetchone()[0] _insert_history(cur, fu_id, "created", None, "active", None, "follow_up_visit", {"reason": "recall_completed", "simulated": True}) active_tasks[key] = { "id": fu_id, "task_type": "follow_up_visit", "created_at": day_datetime, "expires_at": expires_at, } else: active_tasks[key] = { "id": None, "task_type": "follow_up_visit", "created_at": day_datetime, "expires_at": expires_at, } stats["follow_up_created"] += 1 if not dry_run: app_conn.commit() def _insert_history(cur, task_id, action, old_status, new_status, old_task_type, new_task_type, detail=None): """在 coach_task_history 中记录变更。""" if task_id is None: return cur.execute( """INSERT INTO biz.coach_task_history (task_id, action, old_status, new_status, old_task_type, new_task_type, detail) VALUES (%s, %s, %s, %s, %s, %s, %s)""", (task_id, action, old_status, new_status, old_task_type, new_task_type, json.dumps(detail) if detail else None), ) def get_site_id(etl_conn) -> int: """从 relation_index 获取 site_id。""" with etl_conn.cursor() as cur: cur.execute("SELECT DISTINCT site_id FROM dws.dws_member_assistant_relation_index LIMIT 1") row = cur.fetchone() if not row: raise RuntimeError("relation_index 表为空,请先运行 backfill_index_snapshots.py") return row[0] etl_conn.commit() def clean_simulated_data(app_conn, cutoff: date): """清理模拟产生的数据。""" cutoff_dt = datetime(cutoff.year, cutoff.month, cutoff.day, 23, 59, 59, tzinfo=TZ) with app_conn.cursor() as cur: # 先删 history(外键依赖) cur.execute( """DELETE FROM biz.coach_task_history WHERE task_id IN (SELECT id FROM biz.coach_tasks WHERE created_at < %s)""", (cutoff_dt,), ) history_count = cur.rowcount # 删 recall_events cur.execute( "DELETE FROM biz.recall_events WHERE created_at < %s", (cutoff_dt,), ) events_count = cur.rowcount # 删 coach_tasks cur.execute( "DELETE FROM biz.coach_tasks WHERE created_at < %s", (cutoff_dt,), ) tasks_count = cur.rowcount app_conn.commit() logger.info( "清理完成: %d 条 history, %d 条 recall_events, %d 条 coach_tasks", history_count, events_count, tasks_count, ) def run_simulation(start_date: date, end_date: date, dry_run: bool = False, clean: bool = False): """执行历史任务推演主流程。""" if end_date > CUTOFF_DATE: logger.warning("end_date %s 超过截止日期 %s,自动截断", end_date, CUTOFF_DATE) end_date = CUTOFF_DATE etl_conn, app_conn = get_connections() site_id = get_site_id(etl_conn) if clean: logger.info("清理 %s 之前的模拟数据...", CUTOFF_DATE) clean_simulated_data(app_conn, CUTOFF_DATE) total_days = (end_date - start_date).days + 1 logger.info( "推演参数: %s ~ %s (%d天), site_id=%s, dry_run=%s", start_date, end_date, total_days, site_id, dry_run, ) # 内存中维护 active 任务集 active_tasks: dict[tuple[int, int], dict] = {} stats = { "created": 0, "completed": 0, "expired": 0, "follow_up_created": 0, "recall_events": 0, "skipped_no_snapshot": 0, } t0 = time.time() current = start_date while current <= end_date: snapshot = load_index_snapshot(etl_conn, site_id, current) if not snapshot["relation"] and not snapshot["wbi"] and not snapshot["nci"]: stats["skipped_no_snapshot"] += 1 current += timedelta(days=1) continue simulate_day(app_conn, etl_conn, site_id, current, snapshot, active_tasks, stats, dry_run) day_num = (current - start_date).days + 1 if day_num % 30 == 0 or current == end_date: elapsed = time.time() - t0 logger.info( "进度: %s (%d/%d) | 已创建=%d 已完成=%d 已过期=%d 回访=%d 事件=%d | %.0fs", current, day_num, total_days, stats["created"], stats["completed"], stats["expired"], stats["follow_up_created"], stats["recall_events"], elapsed, ) current += timedelta(days=1) total_elapsed = time.time() - t0 logger.info("=" * 60) logger.info("推演完成: %.1f 秒 (%.1f 分钟)", total_elapsed, total_elapsed / 60) logger.info(" 任务创建: %d", stats["created"]) logger.info(" 任务完成: %d", stats["completed"]) logger.info(" 任务过期: %d", stats["expired"]) logger.info(" 回访生成: %d", stats["follow_up_created"]) logger.info(" 召回事件: %d", stats["recall_events"]) logger.info(" 跳过(无快照): %d", stats["skipped_no_snapshot"]) logger.info(" 推演结束时 active 任务数: %d", len(active_tasks)) # 统计类型分布 type_dist = {} for task in active_tasks.values(): tt = task["task_type"] type_dist[tt] = type_dist.get(tt, 0) + 1 for tt, cnt in sorted(type_dist.items()): logger.info(" %s: %d", tt, cnt) etl_conn.close() app_conn.close() def main(): parser = argparse.ArgumentParser(description="历史任务推演(基于指数日快照)") parser.add_argument("--start", required=True, help="起始日期 (YYYY-MM-DD)") parser.add_argument("--end", required=True, help="结束日期 (YYYY-MM-DD)") parser.add_argument("--dry-run", action="store_true", help="干跑模式") parser.add_argument("--clean", action="store_true", help="清理之前的模拟数据后重跑") args = parser.parse_args() start_date = date.fromisoformat(args.start) end_date = date.fromisoformat(args.end) if start_date > end_date: print("错误:start 必须 <= end", file=sys.stderr) sys.exit(1) run_simulation(start_date, end_date, dry_run=args.dry_run, clean=args.clean) if __name__ == "__main__": main()