Files
Neo-ZQYY/scripts/ops/simulate_historical_tasks.py
Neo 2a7a5d68aa feat: 2026-04-15~04-20 累积变更基线 — 多主线合流
主线 1: rns1-customer-coach-api + 04-miniapp-core-business 后端实施
  - 新增 GET /xcx/coaches/{id}/banner 轻量接口
  - performance/records 加 coach_id 参数 + view_board_coach 权限分流
  - coach/customer/performance/board/task 服务层重构
  - fdw_queries 结算单粒度聚合 + consumption_summary 视图统一
  - task_generator 回访宽限 72h + UPSERT 替代策略 + Step 5 保底清理
  - recall_detector settle_type=3 双重限制 + 门店级 resolved

主线 2: 小程序权限分流 + 新增 coach-service-records 管理者视角业绩明细页
  - perf-progress 共享模块去重 task-list/coach-detail 动画逻辑
  - isScattered 散客标记端到端
  - foodDetail/phoneFull/creator* 字段透传

主线 3: P19 指数回测框架 Phase 1+2
  - 3 个指数表 stat_date 日快照模式
  - 新增 DWS_INDEX_BACKFILL / DWS_TASK_SIMULATION 工具任务
  - task_engine 升级 HTTP 实时 + 推演回测双模式

主线 4: Core 维度层启用
  - 新增 CORE_DIM_SYNC 任务(DWD → core 4 维度表)
  - 修复 app 视图空查询问题

主线 5: member_project_tag 改为 LAST_30_VISITS 消费次数窗口

主线 6: 2 个迁移 SQL 已执行(stat_date + member_project_tag 新窗口)
  - schema 基线与 DDL 快照同步

主线 7: 开发机路径迁移 C:\NeoZQYY → C:\Project\NeoZQYY(约 95% 改动量)

附带: 新建运维脚本(churned_customer_report / simulate_historical_tasks /
      backfill_index_snapshots)+ tools/task-analysis/ 任务分析工具

合计 157 文件。未包含中间产物(tmp/ .playwright-mcp/ inspect-* excel/sheet 分析 txt)。
审计记录见下一个 commit。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 06:32:07 +08:00

525 lines
18 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
Historical task simulation: replays the task_generator + recall_detector
logic over daily index snapshots.

Prerequisite: run backfill_index_snapshots.py first to generate the
historical index snapshots.

Usage:
    cd C:\\Project\\NeoZQYY
    uv run python scripts/ops/simulate_historical_tasks.py \\
        --start 2025-08-01 --end 2026-03-28

    # Dry-run mode
    uv run python scripts/ops/simulate_historical_tasks.py \\
        --start 2025-08-01 --end 2026-03-28 --dry-run

    # Clean previously simulated data, then re-run
    uv run python scripts/ops/simulate_historical_tasks.py \\
        --start 2025-08-01 --end 2026-03-28 --clean
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
from datetime import date, datetime, timedelta
from decimal import Decimal
from pathlib import Path
from zoneinfo import ZoneInfo

# ── Environment bootstrap ──
# Make sibling ops modules importable, then normalise the process to the
# repo root before anything reads relative paths or .env.
sys.path.insert(0, str(Path(__file__).resolve().parent))
from _env_paths import ensure_repo_root
ensure_repo_root()

import os
from dotenv import load_dotenv
_ROOT = Path(__file__).resolve().parents[2]
load_dotenv(_ROOT / ".env", override=False)

# Import pure functions from task_generator (lives under the backend dir).
_BACKEND = _ROOT / "apps" / "backend"
sys.path.insert(0, str(_BACKEND))

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("simulate_tasks")

import psycopg2
import psycopg2.extras

TZ = ZoneInfo("Asia/Shanghai")

# Pure decision functions reused from the production task generator.
from app.services.task_generator import (
    IndexData,
    determine_task_type,
    should_replace_task,
    TASK_TYPE_PRIORITY,
)

# Simulation cutoff date (real active tasks start from 03-29).
CUTOFF_DATE = date(2026, 3, 28)
# Follow-up task expiry window, in hours.
# NOTE(review): the commit log mentions a 72h follow-up grace window in
# task_generator — confirm 48h is intentional for the simulation.
FOLLOW_UP_HOURS = 48
def get_connections():
    """Open connections to the ETL database and the business database.

    Reads PG_DSN / APP_DB_DSN from the environment (loaded from .env) and
    exits with status 1 when either is missing.

    Returns:
        (etl_conn, app_conn) — two psycopg2 connections, UTF8 client encoding.
    """
    dsns = [os.environ.get(name) for name in ("PG_DSN", "APP_DB_DSN")]
    if not all(dsns):
        print("错误PG_DSN 和 APP_DB_DSN 必须在 .env 中配置", file=sys.stderr)
        sys.exit(1)
    connections = []
    for dsn in dsns:
        conn = psycopg2.connect(dsn)
        conn.set_client_encoding("UTF8")
        connections.append(conn)
    return tuple(connections)
def load_index_snapshot(etl_conn, site_id: int, stat_date: date) -> dict:
    """Load the index snapshot for a single day.

    Returns:
        {
            "relation": {(assistant_id, member_id): {rs, os_label, os_share, session_count}},
            "wbi": {member_id: Decimal},
            "nci": {member_id: Decimal},
        }
    """
    snapshot = {"relation": {}, "wbi": {}, "nci": {}}
    with etl_conn.cursor() as cur:
        # Relation index: one entry per (assistant, member) pair.
        cur.execute(
            """SELECT assistant_id, member_id, rs_display, os_label, os_share, session_count
            FROM dws.dws_member_assistant_relation_index
            WHERE site_id = %s AND stat_date = %s""",
            (site_id, stat_date),
        )
        for aid, mid, rs_display, os_label, os_share, session_count in cur.fetchall():
            snapshot["relation"][(aid, mid)] = {
                "rs": Decimal(str(rs_display)),
                "os_label": os_label,
                "os_share": Decimal(str(os_share)),
                "session_count": session_count,
            }
        # Win-back (WBI) and new-conversion (NCI) indices: member -> score.
        for bucket, sql in (
            (
                "wbi",
                """SELECT member_id, display_score
            FROM dws.dws_member_winback_index
            WHERE site_id = %s AND stat_date = %s""",
            ),
            (
                "nci",
                """SELECT member_id, display_score
            FROM dws.dws_member_newconv_index
            WHERE site_id = %s AND stat_date = %s""",
            ),
        ):
            cur.execute(sql, (site_id, stat_date))
            for member_id, score in cur.fetchall():
                # NULL/zero scores collapse to Decimal(0).
                snapshot[bucket][member_id] = Decimal(str(score)) if score else Decimal(0)
    # Close out the read transaction.
    etl_conn.commit()
    return snapshot
def load_settlements_for_day(etl_conn, site_id: int, d: date) -> dict:
    """Load settlement records paid on the given calendar day (Asia/Shanghai).

    Only settle_type 1 and 3 rows are counted, joined to the assistant
    service log; rows missing an assistant or member id are skipped.

    Returns:
        {(assistant_id, member_id): latest_pay_time}
    """
    window_start = datetime(d.year, d.month, d.day, 0, 0, 0, tzinfo=TZ)
    window_end = window_start + timedelta(days=1)
    with etl_conn.cursor() as cur:
        cur.execute(
            """
            SELECT sl.site_assistant_id AS assistant_id,
            sh.member_id,
            MAX(sh.pay_time) AS latest_pay_time
            FROM dwd.dwd_settlement_head sh
            JOIN dwd.dwd_assistant_service_log sl
            ON sl.order_settle_id = sh.order_settle_id
            AND sl.is_delete = 0
            WHERE sh.site_id = %s
            AND sh.settle_type IN (1, 3)
            AND sh.pay_time >= %s AND sh.pay_time < %s
            GROUP BY sl.site_assistant_id, sh.member_id
            """,
            (site_id, window_start, window_end),
        )
        rows = cur.fetchall()
    # Close out the read transaction.
    etl_conn.commit()
    return {
        (assistant_id, member_id): latest_pay
        for assistant_id, member_id, latest_pay in rows
        if assistant_id and member_id
    }
def simulate_day(
    app_conn,
    etl_conn,
    site_id: int,
    d: date,
    snapshot: dict,
    active_tasks: dict,
    stats: dict,
    dry_run: bool = False,
):
    """Simulate one day of task generation and recall detection.

    Mutates ``active_tasks`` and ``stats`` in place; commits on ``app_conn``
    at the end unless ``dry_run`` is set.

    active_tasks: {(assistant_id, member_id): {"id": int, "task_type": str, "created_at": datetime, "expires_at": datetime|None}}
    stats: cumulative counters
    """
    # All simulated writes are stamped at 07:00 local time on day `d`.
    day_datetime = datetime(d.year, d.month, d.day, 7, 0, 0, tzinfo=TZ)
    # ── 1. Expiry check (follow-up tasks whose expires_at < today) ──
    expired_keys = []
    for key, task in active_tasks.items():
        if task.get("expires_at") and task["expires_at"] < day_datetime:
            expired_keys.append(key)
    for key in expired_keys:
        task = active_tasks.pop(key)
        stats["expired"] += 1
        if not dry_run:
            with app_conn.cursor() as cur:
                cur.execute(
                    "UPDATE biz.coach_tasks SET status = 'expired', updated_at = %s WHERE id = %s",
                    (day_datetime, task["id"]),
                )
                _insert_history(cur, task["id"], "expired", "active", "expired",
                                task["task_type"], task["task_type"],
                                {"reason": "follow_up_expired", "simulated": True})
    # ── 2. Task generation (driven by the index snapshot) ──
    relation = snapshot["relation"]
    wbi_map = snapshot["wbi"]
    nci_map = snapshot["nci"]
    # Collect MAIN/COMANAGE ownership pairs with at least one session.
    ownership_pairs = [
        (aid, mid, info)
        for (aid, mid), info in relation.items()
        if info["os_label"] in ("MAIN", "COMANAGE") and info["session_count"] > 0
    ]
    for aid, mid, info in ownership_pairs:
        wbi = wbi_map.get(mid, Decimal(0))
        nci = nci_map.get(mid, Decimal(0))
        rs = info["rs"]
        # Recall flags are forced to False here: the simulation tracks its own
        # active-task state in memory rather than querying the live tables.
        index_data = IndexData(
            site_id=site_id,
            assistant_id=aid,
            member_id=mid,
            wbi=wbi,
            nci=nci,
            rs=rs,
            has_active_recall=False,
            has_follow_up_note=False,
        )
        new_type = determine_task_type(index_data)
        if not new_type:
            continue
        key = (aid, mid)
        existing = active_tasks.get(key)
        if existing:
            if existing["task_type"] == new_type:
                continue  # same type: nothing to do
            if not should_replace_task(existing["task_type"], new_type):
                continue
            # Close the old task before creating the replacement.
            if not dry_run:
                with app_conn.cursor() as cur:
                    cur.execute(
                        "UPDATE biz.coach_tasks SET status = 'inactive', updated_at = %s WHERE id = %s",
                        (day_datetime, existing["id"]),
                    )
                    _insert_history(cur, existing["id"], "type_change_close", "active", "inactive",
                                    existing["task_type"], new_type,
                                    {"reason": "replaced_by_simulation", "simulated": True})
        # priority_score: recall tasks rank by the stronger of WBI/NCI,
        # everything else by relation score.
        priority_score = float(max(wbi, nci)) if new_type in ("high_priority_recall", "priority_recall") else float(rs)
        if dry_run:
            active_tasks[key] = {
                "id": None,
                "task_type": new_type,
                "created_at": day_datetime,
                "expires_at": None,
            }
            stats["created"] += 1
            continue
        with app_conn.cursor() as cur:
            cur.execute(
                """INSERT INTO biz.coach_tasks
                (site_id, assistant_id, member_id, task_type, status, priority_score, created_at, updated_at)
                VALUES (%s, %s, %s, %s, 'active', %s, %s, %s)
                RETURNING id""",
                (site_id, aid, mid, new_type, priority_score, day_datetime, day_datetime),
            )
            task_id = cur.fetchone()[0]
            _insert_history(cur, task_id, "created", None, "active", None, new_type,
                            {"reason": "simulation_generated", "simulated": True})
        active_tasks[key] = {
            "id": task_id,
            "task_type": new_type,
            "created_at": day_datetime,
            "expires_at": None,
        }
        stats["created"] += 1
    # ── 3. Recall detection (driven by the day's settlement records) ──
    settlements = load_settlements_for_day(etl_conn, site_id, d)
    for (aid, mid), pay_time in settlements.items():
        key = (aid, mid)
        task = active_tasks.get(key)
        # Record the recall event (deduped per member per local day).
        if not dry_run:
            with app_conn.cursor() as cur:
                # NOTE(review): in psycopg2 a failed statement aborts the whole
                # transaction; the except below swallows that, so later writes
                # before the final commit would fail. ON CONFLICT DO NOTHING
                # should make this path unreachable in practice — confirm.
                try:
                    cur.execute(
                        """INSERT INTO biz.recall_events
                        (site_id, assistant_id, member_id, pay_time, task_id, task_type, created_at)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT (site_id, assistant_id, member_id,
                        (date_trunc('day', pay_time AT TIME ZONE 'Asia/Shanghai')))
                        DO NOTHING RETURNING id""",
                        (site_id, aid, mid, pay_time,
                         task["id"] if task else None,
                         task["task_type"] if task else None,
                         day_datetime),
                    )
                    inserted = cur.fetchone()
                    if inserted:
                        stats["recall_events"] += 1
                except Exception:
                    pass  # dedup conflict or other
        if not task:
            continue
        # Complete the recall task when a payment landed after it was created.
        if task["task_type"] in ("high_priority_recall", "priority_recall"):
            if pay_time > task["created_at"]:
                stats["completed"] += 1
                if not dry_run:
                    with app_conn.cursor() as cur:
                        cur.execute(
                            """UPDATE biz.coach_tasks
                            SET status = 'completed', completed_at = %s, completed_task_type = %s,
                            completion_type = 'auto', updated_at = %s
                            WHERE id = %s AND status = 'active'""",
                            (pay_time, task["task_type"], day_datetime, task["id"]),
                        )
                        _insert_history(cur, task["id"], "completed", "active", "completed",
                                        task["task_type"], task["task_type"],
                                        {"service_time": str(pay_time), "simulated": True})
                # Spawn the follow-up visit task with a FOLLOW_UP_HOURS window.
                expires_at = pay_time + timedelta(hours=FOLLOW_UP_HOURS)
                if not dry_run:
                    with app_conn.cursor() as cur:
                        cur.execute(
                            """INSERT INTO biz.coach_tasks
                            (site_id, assistant_id, member_id, task_type, status, expires_at, created_at, updated_at)
                            VALUES (%s, %s, %s, 'follow_up_visit', 'active', %s, %s, %s)
                            RETURNING id""",
                            (site_id, aid, mid, expires_at, day_datetime, day_datetime),
                        )
                        fu_id = cur.fetchone()[0]
                        _insert_history(cur, fu_id, "created", None, "active", None, "follow_up_visit",
                                        {"reason": "recall_completed", "simulated": True})
                    active_tasks[key] = {
                        "id": fu_id,
                        "task_type": "follow_up_visit",
                        "created_at": day_datetime,
                        "expires_at": expires_at,
                    }
                else:
                    active_tasks[key] = {
                        "id": None,
                        "task_type": "follow_up_visit",
                        "created_at": day_datetime,
                        "expires_at": expires_at,
                    }
                stats["follow_up_created"] += 1
    # One commit per simulated day.
    if not dry_run:
        app_conn.commit()
def _insert_history(cur, task_id, action, old_status, new_status, old_task_type, new_task_type, detail=None):
"""在 coach_task_history 中记录变更。"""
if task_id is None:
return
cur.execute(
"""INSERT INTO biz.coach_task_history
(task_id, action, old_status, new_status, old_task_type, new_task_type, detail)
VALUES (%s, %s, %s, %s, %s, %s, %s)""",
(task_id, action, old_status, new_status, old_task_type, new_task_type,
json.dumps(detail) if detail else None),
)
def get_site_id(etl_conn) -> int:
    """Fetch the single site_id present in the relation index table.

    Returns:
        The site_id of the first row found.

    Raises:
        RuntimeError: when the table is empty (snapshots not backfilled yet).
    """
    with etl_conn.cursor() as cur:
        cur.execute("SELECT DISTINCT site_id FROM dws.dws_member_assistant_relation_index LIMIT 1")
        row = cur.fetchone()
    if not row:
        raise RuntimeError("relation_index 表为空,请先运行 backfill_index_snapshots.py")
    # Bug fix: the original placed etl_conn.commit() AFTER the return, making
    # it unreachable and leaving the read transaction open. Commit before
    # returning, consistent with the other loader functions.
    etl_conn.commit()
    return row[0]
def clean_simulated_data(app_conn, cutoff: date):
    """Delete simulation-produced rows created before the end of the cutoff day.

    Deletes coach_task_history first (FK dependency on coach_tasks), then
    recall_events, then coach_tasks, committing once at the end.

    NOTE(review): the filter is only ``created_at < cutoff`` — any real
    pre-cutoff rows would be removed too; confirm nothing but simulated
    data can exist before the cutoff.
    """
    cutoff_dt = datetime(cutoff.year, cutoff.month, cutoff.day, 23, 59, 59, tzinfo=TZ)
    statements = (
        # history first (FK dependency)
        """DELETE FROM biz.coach_task_history
        WHERE task_id IN (SELECT id FROM biz.coach_tasks WHERE created_at < %s)""",
        "DELETE FROM biz.recall_events WHERE created_at < %s",
        "DELETE FROM biz.coach_tasks WHERE created_at < %s",
    )
    counts = []
    with app_conn.cursor() as cur:
        for sql in statements:
            cur.execute(sql, (cutoff_dt,))
            counts.append(cur.rowcount)
    app_conn.commit()
    history_count, events_count, tasks_count = counts
    logger.info(
        "清理完成: %d 条 history, %d 条 recall_events, %d 条 coach_tasks",
        history_count, events_count, tasks_count,
    )
def run_simulation(start_date: date, end_date: date, dry_run: bool = False, clean: bool = False):
    """Main driver: replay task generation day by day over [start_date, end_date].

    Caps end_date at CUTOFF_DATE, optionally wipes prior simulated data,
    then iterates one day at a time, skipping days with no index snapshot.
    Progress is logged every 30 simulated days and a summary at the end.
    """
    if end_date > CUTOFF_DATE:
        logger.warning("end_date %s 超过截止日期 %s,自动截断", end_date, CUTOFF_DATE)
        end_date = CUTOFF_DATE
    etl_conn, app_conn = get_connections()
    site_id = get_site_id(etl_conn)
    if clean:
        logger.info("清理 %s 之前的模拟数据...", CUTOFF_DATE)
        clean_simulated_data(app_conn, CUTOFF_DATE)
    total_days = (end_date - start_date).days + 1
    logger.info(
        "推演参数: %s ~ %s (%d天), site_id=%s, dry_run=%s",
        start_date, end_date, total_days, site_id, dry_run,
    )
    # In-memory active task set, carried across days.
    active_tasks: dict[tuple[int, int], dict] = {}
    stats = {
        "created": 0, "completed": 0, "expired": 0,
        "follow_up_created": 0, "recall_events": 0, "skipped_no_snapshot": 0,
    }
    t0 = time.time()
    current = start_date
    while current <= end_date:
        snapshot = load_index_snapshot(etl_conn, site_id, current)
        # No snapshot at all for this day: count it and move on.
        if not snapshot["relation"] and not snapshot["wbi"] and not snapshot["nci"]:
            stats["skipped_no_snapshot"] += 1
            current += timedelta(days=1)
            continue
        simulate_day(app_conn, etl_conn, site_id, current, snapshot, active_tasks, stats, dry_run)
        day_num = (current - start_date).days + 1
        if day_num % 30 == 0 or current == end_date:
            elapsed = time.time() - t0
            logger.info(
                "进度: %s (%d/%d) | 已创建=%d 已完成=%d 已过期=%d 回访=%d 事件=%d | %.0fs",
                current, day_num, total_days,
                stats["created"], stats["completed"], stats["expired"],
                stats["follow_up_created"], stats["recall_events"], elapsed,
            )
        current += timedelta(days=1)
    total_elapsed = time.time() - t0
    logger.info("=" * 60)
    logger.info("推演完成: %.1f 秒 (%.1f 分钟)", total_elapsed, total_elapsed / 60)
    logger.info(" 任务创建: %d", stats["created"])
    logger.info(" 任务完成: %d", stats["completed"])
    logger.info(" 任务过期: %d", stats["expired"])
    logger.info(" 回访生成: %d", stats["follow_up_created"])
    logger.info(" 召回事件: %d", stats["recall_events"])
    logger.info(" 跳过(无快照): %d", stats["skipped_no_snapshot"])
    logger.info(" 推演结束时 active 任务数: %d", len(active_tasks))
    # Distribution of task types still active at the end of the run.
    type_dist = {}
    for task in active_tasks.values():
        tt = task["task_type"]
        type_dist[tt] = type_dist.get(tt, 0) + 1
    for tt, cnt in sorted(type_dist.items()):
        logger.info(" %s: %d", tt, cnt)
    etl_conn.close()
    app_conn.close()
def main():
    """Command-line entry point for the historical task simulation."""
    parser = argparse.ArgumentParser(description="历史任务推演(基于指数日快照)")
    parser.add_argument("--start", required=True, help="起始日期 (YYYY-MM-DD)")
    parser.add_argument("--end", required=True, help="结束日期 (YYYY-MM-DD)")
    parser.add_argument("--dry-run", action="store_true", help="干跑模式")
    parser.add_argument("--clean", action="store_true", help="清理之前的模拟数据后重跑")
    opts = parser.parse_args()
    window = (date.fromisoformat(opts.start), date.fromisoformat(opts.end))
    if window[0] > window[1]:
        print("错误start 必须 <= end", file=sys.stderr)
        sys.exit(1)
    run_simulation(window[0], window[1], dry_run=opts.dry_run, clean=opts.clean)


if __name__ == "__main__":
    main()