fix(ai): F1-5a 沙箱 batch-run 接入 runtime_context (W1 / 阶段 A 主体)

Neo F1-5 反馈: "让沙箱起到其真正的作用. 真正的模拟日期, 仅能看到沙箱设定日期
及之前日期的数据, 并运行 AI 的各个业务."

调研发现 (4 个并行子代理): batch-run 端点 _run_batch 是空壳 stub
(只 logger.info, 实际不跑 AI), GUC apply_runtime_session_vars 0 处调用
(dead code), 7 张业务表 6 张有 runtime 复合索引唯独 ai_run_logs 漏建,
App2/2a 3 行 _calc_date_range 漏传 ref_date.

本 commit (F1-5a 阶段 A 主体, F1-5b 后续完整 zqyy_app RLS 视图层):

后端核心:
- admin_service.py: _run_batch 真实化 (Semaphore(5)+asyncio.gather+
  return_exceptions=True+ctx_snapshot 防漂移); estimate 入口抓
  RuntimeContext 快照, confirm 取出传给 worker
- admin_ai.py: confirm_batch_run lazy 注入 dispatcher
- admin_service.retry_trigger_job: INSERT 落 runtime_mode +
  sandbox_instance_id 列 (用 runtime_insert_columns helper)
- runtime_context.py: get_runtime_context 加 bind_to_session 参数,
  激活 GUC app.current_business_date / app.current_runtime_mode
- run_log_service.create_log: 启用 bind_to_session=True 试点

App2/2a 3 行 ref_date 修复:
- app2_finance_prompt.py:817 储值卡余额变化板块
- app2_finance_prompt.py:841 日粒度 series + 异常检测窗口
- app2a_finance_area_prompt.py:466 区域日粒度 series

DB:
- migrations/20260505__ai_run_logs_runtime_index.sql:
  补 (site_id, runtime_mode, sandbox_instance_id, created_at DESC) 复合索引

前端:
- AIOperations.tsx: 顶部加 sandbox 模式提示条 (Alert 显示 sandbox_date +
  sandbox_instance_id + 影响范围 + 切回 live 入口)

未做 (留 F1-5b 完整 zqyy_app RLS 视图层一并):
- B1 admin_service 6 处 CURRENT_DATE -> business_date
- B2 fdw_queries 异常分支兜底
- GUC 完整传递 (fdw_queries / page_context 等)
- 测试 3 套 (.gitignore:71 排除, F2-2 入仓时 commit)
- P20 SPEC §6/§10/§11/§15 (F1-5b 完整收口后同步更准确)

Neo 决策: docs/_overview/wave1-findings/F1-5-impl-decisions.md

详见 docs/audit/changes/2026-05-05__wave1_f1_5a_sandbox_batch_run.md
This commit is contained in:
Neo
2026-05-05 03:01:48 +08:00
parent a99bbd9a74
commit 421e193041
10 changed files with 909 additions and 40 deletions

View File

@@ -10,7 +10,7 @@
import React, { useEffect, useState, useCallback } from "react";
import {
Card, Row, Col, Select, Input, Button, Table, Tag, Space,
Alert, Card, Row, Col, Select, Input, Button, Table, Tag, Space,
Checkbox, Modal, Statistic, message, Typography,
} from "antd";
import { ReloadOutlined } from "@ant-design/icons";
@@ -20,6 +20,8 @@ import {
getAlerts, ackAlert, ignoreAlert, runApp, triggerEvent,
type AlertItem, type AppType, type BatchRunEstimate,
} from "../api/adminAI";
// F1-5a: sandbox 模式提示条数据源
import { fetchRuntimeContext, type RuntimeContext } from "../api/runtimeContext";
const EVENT_TYPE_OPTIONS = [
{ label: "消费事件App3→App8→App7 [+ App4→App5]", value: "consumption" },
@@ -213,6 +215,24 @@ const AIOperations: React.FC = () => {
}
};
// ---- F1-5a: Sandbox 模式提示条 ----
// 沙箱机制 P0-7 主线:让运维进入 AI 操作页前能看到当前 sandbox 状态,
// 避免"以为 live 模式"误触发批量执行实际跑在 sandbox 数据集上的混淆。
const [runtimeCtx, setRuntimeCtx] = useState<RuntimeContext | null>(null);
useEffect(() => {
// 复用 cacheSiteId 作为当前关注 site(默认 2790685415443269,与 cacheSiteId / runSiteId 一致)
let cancelled = false;
(async () => {
try {
const ctx = await fetchRuntimeContext(cacheSiteId);
if (!cancelled) setRuntimeCtx(ctx);
} catch {
// 失败不阻断页面渲染(get_runtime_context 表不存在时后端降级 live)
}
})();
return () => { cancelled = true; };
}, [cacheSiteId]);
// ---- Card 4: 告警管理 ----
const [alerts, setAlerts] = useState<AlertItem[]>([]);
const [alertTotal, setAlertTotal] = useState(0);
@@ -282,6 +302,28 @@ const AIOperations: React.FC = () => {
<div>
<Title level={4} style={{ marginBottom: 16 }}>AI </Title>
{runtimeCtx && runtimeCtx.is_sandbox && (
<Alert
type="warning"
showIcon
style={{ marginBottom: 16 }}
message={
<span>
<strong></strong> · <strong>{runtimeCtx.sandbox_date ?? "—"}</strong> ·
<code>{runtimeCtx.sandbox_instance_id ?? "—"}</code>
</span>
}
description={
<span>
site_id={cacheSiteId} AI ( / / / )使
({runtimeCtx.sandbox_date}) ;ETL ,/
ai_run_logs runtime_mode=sandbox + sandbox_instance_id ,
live live, <a href="/settings/runtime-context"></a>
</span>
}
/>
)}
<Row gutter={16} style={{ marginBottom: 16 }}>
{/* Card 1: 手动重跑 */}
<Col span={12}>

View File

@@ -814,7 +814,10 @@ async def build_prompt(
# 避免 AI 在只看当期充值/消耗时对"余额为何涨"的矛盾自圆其说
if area == "all" and isinstance(recharge := board_data.get("recharge"), dict):
try:
start_date_obj, _end = _calc_date_range(board_time)
from app.services.runtime_context import get_runtime_context
start_date_obj, _end = _calc_date_range(
board_time, ref_date=get_runtime_context(site_id).business_date,
)
opening = _fetch_card_balance_opening(site_id, str(start_date_obj))
closing = float(recharge.get("card_balance") or 0)
period_recharge = float(recharge.get("actual_income") or 0)
@@ -838,7 +841,10 @@ async def build_prompt(
# - 日粒度异常:同星期均值基线下的极端偏离
if area == "all":
try:
start_date, end_date = _calc_date_range(board_time)
from app.services.runtime_context import get_runtime_context
start_date, end_date = _calc_date_range(
board_time, ref_date=get_runtime_context(site_id).business_date,
)
series = _fetch_daily_series(site_id, str(start_date), str(end_date))
# 上期序列(用于客单价环比)
prev_series: list[tuple] | None = None

View File

@@ -463,7 +463,10 @@ async def build_prompt(
# 日粒度派生(区域级)
try:
start_date, end_date = _calc_date_range(board_time)
from app.services.runtime_context import get_runtime_context
start_date, end_date = _calc_date_range(
board_time, ref_date=get_runtime_context(site_id).business_date,
)
series = _fetch_area_daily_series(
site_id, str(start_date), str(end_date), area_code=area,
)

View File

@@ -59,7 +59,10 @@ class AIRunLogService:
truncated = _truncate_prompt(request_prompt)
conn = self._get_conn()
try:
ctx = get_runtime_context(site_id, conn=conn)
# F1-5a: bind_to_session=True 激活 GUC app.current_business_date,
# 让本事务内所有 ETL 库 app.v_* 视图自动按 business_date 上界裁剪
# (如果后续 fetch 走 ETL 视图)。
ctx = get_runtime_context(site_id, conn=conn, bind_to_session=True)
runtime_mode = MODE_SANDBOX if ctx.is_sandbox else MODE_LIVE
sandbox_instance_id = ctx.sandbox_instance_id if ctx.is_sandbox else LIVE_INSTANCE_ID
with conn.cursor() as cur:

View File

@@ -285,7 +285,18 @@ async def confirm_batch_run(
body: BatchRunConfirm,
user: CurrentUser = Depends(_require_admin()),
) -> BatchRunConfirmResponse:
"""确认批量执行,后台异步执行。"""
"""确认批量执行,后台异步执行。
F1-5a: lazy 注入 dispatcher 到 _admin_svc(首次调用时绑定),
避免模块加载顺序问题(dispatcher 通常在 lifespan startup 才初始化)。
"""
if _admin_svc._dispatcher is None:
try:
from app.ai.dispatcher import get_dispatcher
_admin_svc.set_dispatcher(get_dispatcher())
except RuntimeError as exc:
raise HTTPException(status_code=503, detail=f"Dispatcher 未初始化: {exc}") from exc
try:
await _admin_svc.confirm_batch(batch_id=body.batch_id)
except ValueError as exc:

View File

@@ -13,9 +13,18 @@ import asyncio
import logging
import uuid
from datetime import datetime, timezone, timedelta
from typing import TYPE_CHECKING, Any
from app.ai.budget_tracker import BudgetTracker
from app.database import get_connection
from app.services.runtime_context import (
RuntimeContext,
get_runtime_context,
runtime_insert_columns,
)
if TYPE_CHECKING:
from app.ai.dispatcher import AIDispatcher
logger = logging.getLogger(__name__)
@@ -25,13 +34,22 @@ AVG_TOKENS_PER_CALL = 2000
# 批量执行内存存储 TTL
_BATCH_TTL_SECONDS = 600 # 10 分钟
# F1-5a 批量执行并发上限
# Neo 决策 N=5(与 dispatcher 现有 circuit_breaker / rate_limiter 配合,不打爆 DashScope 1000 RPM 限制)
_BATCH_CONCURRENCY = 5
class AdminAIService:
"""AI 监控后台聚合服务。"""
def __init__(self, budget_tracker: BudgetTracker | None = None) -> None:
self._budget = budget_tracker
self._batch_store: dict[str, dict] = {} # batch_id → {params, expires_at}
self._batch_store: dict[str, dict] = {} # batch_id → {params, ctx_snapshot, expires_at}
self._dispatcher: AIDispatcher | None = None # F1-5a: lifespan 启动时注入
def set_dispatcher(self, dispatcher: AIDispatcher) -> None:
    """F1-5a: store the injected dispatcher; _run_batch uses it to actually execute AI calls.

    Intended to be called once at application startup (per the surrounding
    comments, during lifespan init) — presumably before any batch confirm;
    confirm_batch_run also lazy-injects as a fallback. TODO confirm ordering.
    """
    # Simple attribute assignment; no validation — caller owns lifecycle.
    self._dispatcher = dispatcher
# ── Dashboard ─────────────────────────────────────────
@@ -344,29 +362,37 @@ class AdminAIService:
return _row_to_dict(cols, row)
async def retry_trigger_job(self, job_id: int) -> int:
"""创建新 trigger_jobis_forced=true返回新 job_id。"""
"""创建新 trigger_job(is_forced=true),返回新 job_id。
F1-5a: INSERT 显式落 runtime_mode + sandbox_instance_id
与原 trigger_job 的 runtime 上下文保持一致(避免依赖默认值导致重试时丢失 sandbox 标记)。
"""
original = await self.get_trigger_job(job_id)
if original is None:
raise ValueError(f"trigger_job {job_id} 不存在")
site_id = original["site_id"]
cols, placeholders, runtime_params = runtime_insert_columns(site_id)
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute(
"""
f"""
INSERT INTO biz.ai_trigger_jobs
(event_type, member_id, site_id, connector_type,
payload, app_chain, is_forced, status)
VALUES (%s, %s, %s, %s, %s, %s, true, 'pending')
payload, app_chain, is_forced, status, {cols})
VALUES (%s, %s, %s, %s, %s, %s, true, 'pending', {placeholders})
RETURNING id
""",
(
original["event_type"],
original.get("member_id"),
original["site_id"],
site_id,
original.get("connector_type", "feiqiu"),
original.get("payload"),
original.get("app_chain"),
*runtime_params,
),
)
new_id = cur.fetchone()[0]
@@ -576,22 +602,41 @@ class AdminAIService:
async def estimate_batch(
self, app_types: list[str], member_ids: list[int], site_id: int,
) -> dict:
"""生成 batch_id存入内存TTL 10min返回预估。"""
"""生成 batch_id(存入内存,TTL 10min),返回预估。
F1-5a: estimate 阶段抓 RuntimeContext 快照存入 _batch_store;
confirm 时取出传给 _run_batch,避免 estimate→confirm 间 Neo 切 sandbox 模式造成数据漂移污染。
"""
self._cleanup_expired_batches()
batch_id = uuid.uuid4().hex
estimated_calls = len(app_types) * len(member_ids)
estimated_tokens = estimated_calls * AVG_TOKENS_PER_CALL
# F1-5a: 抓 ctx_snapshot
ctx_snapshot = get_runtime_context(site_id)
self._batch_store[batch_id] = {
"params": {
"app_types": app_types,
"member_ids": member_ids,
"site_id": site_id,
"batch_id": batch_id, # 用于 _run_batch 内 triggered_by 标注
},
"ctx_snapshot": ctx_snapshot,
"expires_at": datetime.now(timezone.utc) + timedelta(seconds=_BATCH_TTL_SECONDS),
}
logger.info(
"批量执行预估: batch_id=%s apps=%s members=%d site_id=%s mode=%s sandbox_date=%s",
batch_id,
app_types,
len(member_ids),
site_id,
ctx_snapshot.mode,
ctx_snapshot.sandbox_date.isoformat() if ctx_snapshot.sandbox_date else None,
)
return {
"batch_id": batch_id,
"estimated_calls": estimated_calls,
@@ -599,7 +644,7 @@ class AdminAIService:
}
async def confirm_batch(self, batch_id: str) -> None:
"""取出参数,异步执行批量调用。"""
"""取出参数 + ctx_snapshot,异步执行批量调用。"""
self._cleanup_expired_batches()
entry = self._batch_store.pop(batch_id, None)
@@ -607,28 +652,77 @@ class AdminAIService:
raise ValueError(f"batch_id 无效或已过期: {batch_id}")
params = entry["params"]
ctx_snapshot: RuntimeContext = entry["ctx_snapshot"]
logger.info(
"批量执行确认: batch_id=%s apps=%s members=%d site_id=%s",
"批量执行确认: batch_id=%s apps=%s members=%d site_id=%s mode=%s",
batch_id,
params["app_types"],
len(params["member_ids"]),
params["site_id"],
ctx_snapshot.mode,
)
# 后台异步执行(具体调用链由路由层注入 dispatcher 处理)
asyncio.create_task(
self._run_batch(params["app_types"], params["member_ids"], params["site_id"])
)
# F1-5a: 后台异步执行,传入 ctx_snapshot 防止运行时 Neo 切模式
asyncio.create_task(self._run_batch(params, ctx_snapshot))
async def _run_batch(
self, app_types: list[str], member_ids: list[int], site_id: int,
self, params: dict[str, Any], ctx_snapshot: RuntimeContext,
) -> None:
"""后台批量执行(占位实现,实际由 dispatcher 驱动)。"""
"""F1-5a: 后台批量执行真正实现。
- Semaphore(_BATCH_CONCURRENCY=5) 限并发,避免打爆 DashScope 限流
- asyncio.gather + return_exceptions=True(单个 member 失败不连坐)
- context 显式带 business_date(来自 ctx_snapshot),prompt builder 自取沙箱日期
- triggered_by=f"batch:{batch_id}":打标 ai_run_logs(Wave 2 进度查询基础)
"""
if self._dispatcher is None:
logger.error(
"批量执行失败: dispatcher 未注入 batch_id=%s",
params.get("batch_id"),
)
return
app_types: list[str] = params["app_types"]
member_ids: list[int] = params["member_ids"]
site_id: int = params["site_id"]
batch_id: str = params["batch_id"]
sem = asyncio.Semaphore(_BATCH_CONCURRENCY)
triggered_by = f"batch:{batch_id}"
business_date_iso = ctx_snapshot.business_date.isoformat()
async def _run_one(app_type: str, member_id: int) -> None:
async with sem:
try:
await self._dispatcher.run_single_app(
app_type=app_type,
context={
"site_id": site_id,
"member_id": member_id,
"business_date": business_date_iso,
},
triggered_by=triggered_by,
)
except Exception:
# 单个失败已写 ai_run_logs此处仅记录不连坐其他 member
logger.exception(
"批量执行单步失败 batch_id=%s app=%s member=%s",
batch_id, app_type, member_id,
)
tasks = [
asyncio.create_task(_run_one(at, mid))
for at in app_types
for mid in member_ids
]
logger.info(
"批量执行开始: apps=%s members=%d site_id=%s",
app_types, len(member_ids), site_id,
"批量执行开始: batch_id=%s tasks=%d concurrency=%d mode=%s",
batch_id, len(tasks), _BATCH_CONCURRENCY, ctx_snapshot.mode,
)
# 实际执行逻辑在路由层通过 dispatcher.handle_trigger 驱动
# 此处仅记录日志,避免服务层直接依赖 dispatcher 实例
await asyncio.gather(*tasks, return_exceptions=True)
logger.info("批量执行完成: batch_id=%s tasks=%d", batch_id, len(tasks))
def _cleanup_expired_batches(self) -> None:
"""清理过期 batch。"""

View File

@@ -8,6 +8,7 @@
from __future__ import annotations
import logging
import uuid
from dataclasses import dataclass
from datetime import date, datetime, time, timedelta, timezone
@@ -15,6 +16,8 @@ from typing import Any
from app import config
logger = logging.getLogger(__name__)
_LOCAL_TZ = timezone(timedelta(hours=8))
MODE_LIVE = "live"
MODE_SANDBOX = "sandbox"
@@ -85,10 +88,23 @@ def _default_context(site_id: int) -> RuntimeContext:
return RuntimeContext(site_id=site_id)
def get_runtime_context(site_id: int, conn: Any | None = None) -> RuntimeContext:
def get_runtime_context(
site_id: int,
conn: Any | None = None,
*,
bind_to_session: bool = False,
) -> RuntimeContext:
"""读取门店运行上下文。
表不存在或未配置时降级为 live保证迁移前不影响正式链路。
F1-5a 新增 ``bind_to_session``:当 True 且 conn 非空时,在返回前调用
``apply_runtime_session_vars(conn, ctx)`` 设置 GUC ``app.current_business_date`` /
``app.current_runtime_mode``,激活 ETL 库 26 个 ``app.v_*`` 视图的业务日上界裁剪
(`app.business_date_now()` 函数读取 GUC)。
使用场景:fdw_queries 等走 ETL 库视图的查询入口处显式 ``bind_to_session=True``。
其余只读取 ctx 不查 ETL 视图的调用方保持默认 ``False`` 即可。
"""
own_conn = conn is None
if own_conn:
@@ -120,22 +136,34 @@ def get_runtime_context(site_id: int, conn: Any | None = None) -> RuntimeContext
conn.close()
if not row:
return _default_context(site_id)
ctx = _default_context(site_id)
else:
mode, sandbox_date, sandbox_instance_id, ai_mode, status = row
if mode not in (MODE_LIVE, MODE_SANDBOX):
mode = MODE_LIVE
if mode == MODE_SANDBOX and (sandbox_date is None or not sandbox_instance_id):
mode = MODE_LIVE
mode, sandbox_date, sandbox_instance_id, ai_mode, status = row
if mode not in (MODE_LIVE, MODE_SANDBOX):
mode = MODE_LIVE
if mode == MODE_SANDBOX and (sandbox_date is None or not sandbox_instance_id):
mode = MODE_LIVE
ctx = RuntimeContext(
site_id=site_id,
mode=mode,
sandbox_date=sandbox_date,
sandbox_instance_id=sandbox_instance_id,
ai_mode=ai_mode or AI_MODE_LIVE,
status=status or "active",
)
return RuntimeContext(
site_id=site_id,
mode=mode,
sandbox_date=sandbox_date,
sandbox_instance_id=sandbox_instance_id,
ai_mode=ai_mode or AI_MODE_LIVE,
status=status or "active",
)
# F1-5a: 显式开启时,绑定到当前 session,激活 ETL 库视图业务日上界
if bind_to_session and not own_conn and conn is not None:
try:
apply_runtime_session_vars(conn, ctx=ctx)
except Exception:
logger.debug(
"apply_runtime_session_vars 失败(不阻塞主流程) site_id=%d", site_id,
exc_info=True,
)
return ctx
def namespace_ai_target_id(site_id: int, target_id: str, conn: Any | None = None) -> str: