fix(ai): F1-5a 沙箱 batch-run 接入 runtime_context (W1 / 阶段 A 主体)

Neo F1-5 反馈: "让沙箱起到其真正的作用. 真正的模拟日期, 仅能看到沙箱设定日期
及之前日期的数据, 并运行 AI 的各个业务."

调研发现 (4 个并行子代理): batch-run 端点 _run_batch 是空壳 stub
(只 logger.info, 实际不跑 AI), GUC apply_runtime_session_vars 0 处调用
(dead code), 7 张业务表 6 张有 runtime 复合索引唯独 ai_run_logs 漏建,
App2/2a 3 行 _calc_date_range 漏传 ref_date.

本 commit (F1-5a 阶段 A 主体, F1-5b 后续完整 zqyy_app RLS 视图层):

后端核心:
- admin_service.py: _run_batch 真实化 (Semaphore(5)+asyncio.gather+
  return_exceptions=True+ctx_snapshot 防漂移); estimate 入口抓
  RuntimeContext 快照, confirm 取出传给 worker
- admin_ai.py: confirm_batch_run lazy 注入 dispatcher
- admin_service.retry_trigger_job: INSERT 落 runtime_mode +
  sandbox_instance_id 列 (用 runtime_insert_columns helper)
- runtime_context.py: get_runtime_context 加 bind_to_session 参数,
  激活 GUC app.current_business_date / app.current_runtime_mode
- run_log_service.create_log: 启用 bind_to_session=True 试点

App2/2a 3 行 ref_date 修复:
- app2_finance_prompt.py:817 储值卡余额变化板块
- app2_finance_prompt.py:841 日粒度 series + 异常检测窗口
- app2a_finance_area_prompt.py:466 区域日粒度 series

DB:
- migrations/20260505__ai_run_logs_runtime_index.sql:
  补 (site_id, runtime_mode, sandbox_instance_id, created_at DESC) 复合索引

前端:
- AIOperations.tsx: 顶部加 sandbox 模式提示条 (Alert 显示 sandbox_date +
  sandbox_instance_id + 影响范围 + 切回 live 入口)

未做 (留 F1-5b 完整 zqyy_app RLS 视图层一并):
- B1 admin_service 6 处 CURRENT_DATE -> business_date
- B2 fdw_queries 异常分支兜底
- GUC 完整传递 (fdw_queries / page_context 等)
- 测试 3 套 (.gitignore:71 排除, F2-2 入仓时 commit)
- P20 SPEC §6/§10/§11/§15 (F1-5b 完整收口后同步更准确)

Neo 决策: docs/_overview/wave1-findings/F1-5-impl-decisions.md

详见 docs/audit/changes/2026-05-05__wave1_f1_5a_sandbox_batch_run.md
This commit is contained in:
Neo
2026-05-05 03:01:48 +08:00
parent a99bbd9a74
commit 421e193041
10 changed files with 909 additions and 40 deletions

View File

@@ -10,7 +10,7 @@
import React, { useEffect, useState, useCallback } from "react";
import {
Card, Row, Col, Select, Input, Button, Table, Tag, Space,
Alert, Card, Row, Col, Select, Input, Button, Table, Tag, Space,
Checkbox, Modal, Statistic, message, Typography,
} from "antd";
import { ReloadOutlined } from "@ant-design/icons";
@@ -20,6 +20,8 @@ import {
getAlerts, ackAlert, ignoreAlert, runApp, triggerEvent,
type AlertItem, type AppType, type BatchRunEstimate,
} from "../api/adminAI";
// F1-5a: sandbox 模式提示条数据源
import { fetchRuntimeContext, type RuntimeContext } from "../api/runtimeContext";
const EVENT_TYPE_OPTIONS = [
{ label: "消费事件App3→App8→App7 [+ App4→App5]", value: "consumption" },
@@ -213,6 +215,24 @@ const AIOperations: React.FC = () => {
}
};
// ---- F1-5a: Sandbox 模式提示条 ----
// 沙箱机制 P0-7 主线:让运维进入 AI 操作页前能看到当前 sandbox 状态,
// 避免"以为 live 模式"误触发批量执行实际跑在 sandbox 数据集上的混淆。
const [runtimeCtx, setRuntimeCtx] = useState<RuntimeContext | null>(null);
useEffect(() => {
// 复用 cacheSiteId 作为当前关注 site(默认 2790685415443269,与 cacheSiteId / runSiteId 一致)
let cancelled = false;
(async () => {
try {
const ctx = await fetchRuntimeContext(cacheSiteId);
if (!cancelled) setRuntimeCtx(ctx);
} catch {
// 失败不阻断页面渲染(get_runtime_context 表不存在时后端降级 live)
}
})();
return () => { cancelled = true; };
}, [cacheSiteId]);
// ---- Card 4: 告警管理 ----
const [alerts, setAlerts] = useState<AlertItem[]>([]);
const [alertTotal, setAlertTotal] = useState(0);
@@ -282,6 +302,28 @@ const AIOperations: React.FC = () => {
<div>
<Title level={4} style={{ marginBottom: 16 }}>AI </Title>
{runtimeCtx && runtimeCtx.is_sandbox && (
<Alert
type="warning"
showIcon
style={{ marginBottom: 16 }}
message={
<span>
<strong></strong> · <strong>{runtimeCtx.sandbox_date ?? "—"}</strong> ·
<code>{runtimeCtx.sandbox_instance_id ?? "—"}</code>
</span>
}
description={
<span>
site_id={cacheSiteId} AI ( / / / )使
({runtimeCtx.sandbox_date}) ;ETL ,/
ai_run_logs runtime_mode=sandbox + sandbox_instance_id ,
live live, <a href="/settings/runtime-context"></a>
</span>
}
/>
)}
<Row gutter={16} style={{ marginBottom: 16 }}>
{/* Card 1: 手动重跑 */}
<Col span={12}>

View File

@@ -814,7 +814,10 @@ async def build_prompt(
# 避免 AI 在只看当期充值/消耗时对"余额为何涨"的矛盾自圆其说
if area == "all" and isinstance(recharge := board_data.get("recharge"), dict):
try:
start_date_obj, _end = _calc_date_range(board_time)
from app.services.runtime_context import get_runtime_context
start_date_obj, _end = _calc_date_range(
board_time, ref_date=get_runtime_context(site_id).business_date,
)
opening = _fetch_card_balance_opening(site_id, str(start_date_obj))
closing = float(recharge.get("card_balance") or 0)
period_recharge = float(recharge.get("actual_income") or 0)
@@ -838,7 +841,10 @@ async def build_prompt(
# - 日粒度异常:同星期均值基线下的极端偏离
if area == "all":
try:
start_date, end_date = _calc_date_range(board_time)
from app.services.runtime_context import get_runtime_context
start_date, end_date = _calc_date_range(
board_time, ref_date=get_runtime_context(site_id).business_date,
)
series = _fetch_daily_series(site_id, str(start_date), str(end_date))
# 上期序列(用于客单价环比)
prev_series: list[tuple] | None = None

View File

@@ -463,7 +463,10 @@ async def build_prompt(
# 日粒度派生(区域级)
try:
start_date, end_date = _calc_date_range(board_time)
from app.services.runtime_context import get_runtime_context
start_date, end_date = _calc_date_range(
board_time, ref_date=get_runtime_context(site_id).business_date,
)
series = _fetch_area_daily_series(
site_id, str(start_date), str(end_date), area_code=area,
)

View File

@@ -59,7 +59,10 @@ class AIRunLogService:
truncated = _truncate_prompt(request_prompt)
conn = self._get_conn()
try:
ctx = get_runtime_context(site_id, conn=conn)
# F1-5a: bind_to_session=True 激活 GUC app.current_business_date,
# 让本事务内所有 ETL 库 app.v_* 视图自动按 business_date 上界裁剪
# (如果后续 fetch 走 ETL 视图)。
ctx = get_runtime_context(site_id, conn=conn, bind_to_session=True)
runtime_mode = MODE_SANDBOX if ctx.is_sandbox else MODE_LIVE
sandbox_instance_id = ctx.sandbox_instance_id if ctx.is_sandbox else LIVE_INSTANCE_ID
with conn.cursor() as cur:

View File

@@ -285,7 +285,18 @@ async def confirm_batch_run(
body: BatchRunConfirm,
user: CurrentUser = Depends(_require_admin()),
) -> BatchRunConfirmResponse:
"""确认批量执行,后台异步执行。"""
"""确认批量执行,后台异步执行。
F1-5a: lazy 注入 dispatcher 到 _admin_svc(首次调用时绑定),
避免模块加载顺序问题(dispatcher 通常在 lifespan startup 才初始化)。
"""
if _admin_svc._dispatcher is None:
try:
from app.ai.dispatcher import get_dispatcher
_admin_svc.set_dispatcher(get_dispatcher())
except RuntimeError as exc:
raise HTTPException(status_code=503, detail=f"Dispatcher 未初始化: {exc}") from exc
try:
await _admin_svc.confirm_batch(batch_id=body.batch_id)
except ValueError as exc:

View File

@@ -13,9 +13,18 @@ import asyncio
import logging
import uuid
from datetime import datetime, timezone, timedelta
from typing import TYPE_CHECKING, Any
from app.ai.budget_tracker import BudgetTracker
from app.database import get_connection
from app.services.runtime_context import (
RuntimeContext,
get_runtime_context,
runtime_insert_columns,
)
if TYPE_CHECKING:
from app.ai.dispatcher import AIDispatcher
logger = logging.getLogger(__name__)
@@ -25,13 +34,22 @@ AVG_TOKENS_PER_CALL = 2000
# 批量执行内存存储 TTL
_BATCH_TTL_SECONDS = 600 # 10 分钟
# F1-5a 批量执行并发上限
# Neo 决策 N=5(与 dispatcher 现有 circuit_breaker / rate_limiter 配合,不打爆 DashScope 1000 RPM 限制)
_BATCH_CONCURRENCY = 5
class AdminAIService:
"""AI 监控后台聚合服务。"""
def __init__(self, budget_tracker: BudgetTracker | None = None) -> None:
self._budget = budget_tracker
self._batch_store: dict[str, dict] = {} # batch_id → {params, expires_at}
self._batch_store: dict[str, dict] = {} # batch_id → {params, ctx_snapshot, expires_at}
self._dispatcher: AIDispatcher | None = None # F1-5a: lifespan 启动时注入
def set_dispatcher(self, dispatcher: AIDispatcher) -> None:
    """F1-5a: store the injected dispatcher; _run_batch uses it to actually execute AI calls.

    Intended to be called once at application startup (per the surrounding
    comments, during lifespan init) — presumably before any batch confirm;
    confirm_batch_run also lazy-injects as a fallback. TODO confirm ordering.
    """
    # Simple attribute assignment; no validation — caller owns lifecycle.
    self._dispatcher = dispatcher
# ── Dashboard ─────────────────────────────────────────
@@ -344,29 +362,37 @@ class AdminAIService:
return _row_to_dict(cols, row)
async def retry_trigger_job(self, job_id: int) -> int:
"""创建新 trigger_jobis_forced=true返回新 job_id。"""
"""创建新 trigger_job(is_forced=true),返回新 job_id。
F1-5a: INSERT 显式落 runtime_mode + sandbox_instance_id
与原 trigger_job 的 runtime 上下文保持一致(避免依赖默认值导致重试时丢失 sandbox 标记)。
"""
original = await self.get_trigger_job(job_id)
if original is None:
raise ValueError(f"trigger_job {job_id} 不存在")
site_id = original["site_id"]
cols, placeholders, runtime_params = runtime_insert_columns(site_id)
conn = get_connection()
try:
with conn.cursor() as cur:
cur.execute(
"""
f"""
INSERT INTO biz.ai_trigger_jobs
(event_type, member_id, site_id, connector_type,
payload, app_chain, is_forced, status)
VALUES (%s, %s, %s, %s, %s, %s, true, 'pending')
payload, app_chain, is_forced, status, {cols})
VALUES (%s, %s, %s, %s, %s, %s, true, 'pending', {placeholders})
RETURNING id
""",
(
original["event_type"],
original.get("member_id"),
original["site_id"],
site_id,
original.get("connector_type", "feiqiu"),
original.get("payload"),
original.get("app_chain"),
*runtime_params,
),
)
new_id = cur.fetchone()[0]
@@ -576,22 +602,41 @@ class AdminAIService:
async def estimate_batch(
self, app_types: list[str], member_ids: list[int], site_id: int,
) -> dict:
"""生成 batch_id存入内存TTL 10min返回预估。"""
"""生成 batch_id(存入内存,TTL 10min),返回预估。
F1-5a: estimate 阶段抓 RuntimeContext 快照存入 _batch_store;
confirm 时取出传给 _run_batch,避免 estimate→confirm 间 Neo 切 sandbox 模式造成数据漂移污染。
"""
self._cleanup_expired_batches()
batch_id = uuid.uuid4().hex
estimated_calls = len(app_types) * len(member_ids)
estimated_tokens = estimated_calls * AVG_TOKENS_PER_CALL
# F1-5a: 抓 ctx_snapshot
ctx_snapshot = get_runtime_context(site_id)
self._batch_store[batch_id] = {
"params": {
"app_types": app_types,
"member_ids": member_ids,
"site_id": site_id,
"batch_id": batch_id, # 用于 _run_batch 内 triggered_by 标注
},
"ctx_snapshot": ctx_snapshot,
"expires_at": datetime.now(timezone.utc) + timedelta(seconds=_BATCH_TTL_SECONDS),
}
logger.info(
"批量执行预估: batch_id=%s apps=%s members=%d site_id=%s mode=%s sandbox_date=%s",
batch_id,
app_types,
len(member_ids),
site_id,
ctx_snapshot.mode,
ctx_snapshot.sandbox_date.isoformat() if ctx_snapshot.sandbox_date else None,
)
return {
"batch_id": batch_id,
"estimated_calls": estimated_calls,
@@ -599,7 +644,7 @@ class AdminAIService:
}
async def confirm_batch(self, batch_id: str) -> None:
"""取出参数,异步执行批量调用。"""
"""取出参数 + ctx_snapshot,异步执行批量调用。"""
self._cleanup_expired_batches()
entry = self._batch_store.pop(batch_id, None)
@@ -607,28 +652,77 @@ class AdminAIService:
raise ValueError(f"batch_id 无效或已过期: {batch_id}")
params = entry["params"]
ctx_snapshot: RuntimeContext = entry["ctx_snapshot"]
logger.info(
"批量执行确认: batch_id=%s apps=%s members=%d site_id=%s",
"批量执行确认: batch_id=%s apps=%s members=%d site_id=%s mode=%s",
batch_id,
params["app_types"],
len(params["member_ids"]),
params["site_id"],
ctx_snapshot.mode,
)
# 后台异步执行(具体调用链由路由层注入 dispatcher 处理)
asyncio.create_task(
self._run_batch(params["app_types"], params["member_ids"], params["site_id"])
)
# F1-5a: 后台异步执行,传入 ctx_snapshot 防止运行时 Neo 切模式
asyncio.create_task(self._run_batch(params, ctx_snapshot))
async def _run_batch(
self, app_types: list[str], member_ids: list[int], site_id: int,
self, params: dict[str, Any], ctx_snapshot: RuntimeContext,
) -> None:
"""后台批量执行(占位实现,实际由 dispatcher 驱动)。"""
"""F1-5a: 后台批量执行真正实现。
- Semaphore(_BATCH_CONCURRENCY=5) 限并发,避免打爆 DashScope 限流
- asyncio.gather + return_exceptions=True(单个 member 失败不连坐)
- context 显式带 business_date(来自 ctx_snapshot),prompt builder 自取沙箱日期
- triggered_by=f"batch:{batch_id}":打标 ai_run_logs(Wave 2 进度查询基础)
"""
if self._dispatcher is None:
logger.error(
"批量执行失败: dispatcher 未注入 batch_id=%s",
params.get("batch_id"),
)
return
app_types: list[str] = params["app_types"]
member_ids: list[int] = params["member_ids"]
site_id: int = params["site_id"]
batch_id: str = params["batch_id"]
sem = asyncio.Semaphore(_BATCH_CONCURRENCY)
triggered_by = f"batch:{batch_id}"
business_date_iso = ctx_snapshot.business_date.isoformat()
async def _run_one(app_type: str, member_id: int) -> None:
async with sem:
try:
await self._dispatcher.run_single_app(
app_type=app_type,
context={
"site_id": site_id,
"member_id": member_id,
"business_date": business_date_iso,
},
triggered_by=triggered_by,
)
except Exception:
# 单个失败已写 ai_run_logs此处仅记录不连坐其他 member
logger.exception(
"批量执行单步失败 batch_id=%s app=%s member=%s",
batch_id, app_type, member_id,
)
tasks = [
asyncio.create_task(_run_one(at, mid))
for at in app_types
for mid in member_ids
]
logger.info(
"批量执行开始: apps=%s members=%d site_id=%s",
app_types, len(member_ids), site_id,
"批量执行开始: batch_id=%s tasks=%d concurrency=%d mode=%s",
batch_id, len(tasks), _BATCH_CONCURRENCY, ctx_snapshot.mode,
)
# 实际执行逻辑在路由层通过 dispatcher.handle_trigger 驱动
# 此处仅记录日志,避免服务层直接依赖 dispatcher 实例
await asyncio.gather(*tasks, return_exceptions=True)
logger.info("批量执行完成: batch_id=%s tasks=%d", batch_id, len(tasks))
def _cleanup_expired_batches(self) -> None:
"""清理过期 batch。"""

View File

@@ -8,6 +8,7 @@
from __future__ import annotations
import logging
import uuid
from dataclasses import dataclass
from datetime import date, datetime, time, timedelta, timezone
@@ -15,6 +16,8 @@ from typing import Any
from app import config
logger = logging.getLogger(__name__)
_LOCAL_TZ = timezone(timedelta(hours=8))
MODE_LIVE = "live"
MODE_SANDBOX = "sandbox"
@@ -85,10 +88,23 @@ def _default_context(site_id: int) -> RuntimeContext:
return RuntimeContext(site_id=site_id)
def get_runtime_context(site_id: int, conn: Any | None = None) -> RuntimeContext:
def get_runtime_context(
site_id: int,
conn: Any | None = None,
*,
bind_to_session: bool = False,
) -> RuntimeContext:
"""读取门店运行上下文。
表不存在或未配置时降级为 live保证迁移前不影响正式链路。
F1-5a 新增 ``bind_to_session``:当 True 且 conn 非空时,在返回前调用
``apply_runtime_session_vars(conn, ctx)`` 设置 GUC ``app.current_business_date`` /
``app.current_runtime_mode``,激活 ETL 库 26 个 ``app.v_*`` 视图的业务日上界裁剪
(`app.business_date_now()` 函数读取 GUC)。
使用场景:fdw_queries 等走 ETL 库视图的查询入口处显式 ``bind_to_session=True``。
其余只读取 ctx 不查 ETL 视图的调用方保持默认 ``False`` 即可。
"""
own_conn = conn is None
if own_conn:
@@ -120,22 +136,34 @@ def get_runtime_context(site_id: int, conn: Any | None = None) -> RuntimeContext
conn.close()
if not row:
return _default_context(site_id)
ctx = _default_context(site_id)
else:
mode, sandbox_date, sandbox_instance_id, ai_mode, status = row
if mode not in (MODE_LIVE, MODE_SANDBOX):
mode = MODE_LIVE
if mode == MODE_SANDBOX and (sandbox_date is None or not sandbox_instance_id):
mode = MODE_LIVE
mode, sandbox_date, sandbox_instance_id, ai_mode, status = row
if mode not in (MODE_LIVE, MODE_SANDBOX):
mode = MODE_LIVE
if mode == MODE_SANDBOX and (sandbox_date is None or not sandbox_instance_id):
mode = MODE_LIVE
ctx = RuntimeContext(
site_id=site_id,
mode=mode,
sandbox_date=sandbox_date,
sandbox_instance_id=sandbox_instance_id,
ai_mode=ai_mode or AI_MODE_LIVE,
status=status or "active",
)
return RuntimeContext(
site_id=site_id,
mode=mode,
sandbox_date=sandbox_date,
sandbox_instance_id=sandbox_instance_id,
ai_mode=ai_mode or AI_MODE_LIVE,
status=status or "active",
)
# F1-5a: 显式开启时,绑定到当前 session,激活 ETL 库视图业务日上界
if bind_to_session and not own_conn and conn is not None:
try:
apply_runtime_session_vars(conn, ctx=ctx)
except Exception:
logger.debug(
"apply_runtime_session_vars 失败(不阻塞主流程) site_id=%d", site_id,
exc_info=True,
)
return ctx
def namespace_ai_target_id(site_id: int, target_id: str, conn: Any | None = None) -> str: