UI-3 AIDashboard sandbox 提示 + today_calls 分组: - 后端 schemas/admin_ai.py DashboardResponse 加 today_live_calls / today_sandbox_calls 字段(默认 0,向后兼容) - 后端 services/ai/admin_service.py _get_range_stats SELECT 加 2 个 FILTER COUNT 表达式 - 前端 api/adminAI.ts DashboardResponse 类型补 2 字段 - 前端 pages/AIDashboard.tsx - 顶部加 sandbox Alert 提示条,选中 site sandbox 模式下显示业务日 + 实例 ID - today_calls 卡片下方加分组 Tag(实时 X / 沙箱 Y),feature flag 控制 - import fetchRuntimeContext + useEffect 拉 RuntimeContext - apps/admin-web/.env.example 新建,加 VITE_AI_RUNTIME_GROUPING=false 默认值说明 UI-5 AITriggerJobs runtime 列: - 后端 schemas/admin_ai.py TriggerJobItem 加 runtime_mode / sandbox_instance_id 可选字段 - 后端 admin_service.py list_trigger_jobs / get_trigger_job 各加 SELECT 列 - 前端 adminAI.ts TriggerJobItem 类型补 2 字段 - 前端 pages/AITriggerJobs.tsx 列表 columns 加运行模式 + 沙箱实例(同 UI-1 模式),详情 Modal 加 2 项(同 UI-2 模式) 双口径验证(Playwright + DB 直查): - UI-3 4a live: 选中默认门店,无 Alert,today_card 仅显示总数(flag off) - UI-3 4b sandbox=4-20: Alert 显示"沙箱 + 业务日 + sbx_…",today_calls=93(sandbox 当日) - UI-5 4a/4b: SQL INSERT 注入 walkthrough 测试行(id=9 live, id=10 sandbox),列表正确渲染 Tag + 短哈希 trend_7d 双线 / app_distribution 堆叠分布等更深入分组改造延后到 Wave C(§8.3 风险:破坏图表)。 审计: - docs/audit/changes/2026-05-05__wave1_f1_5b_ui3_aidashboard_sandbox.md - docs/audit/changes/2026-05-05__wave1_f1_5b_ui5_aitriggerjobs_runtime.md Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1068 lines
39 KiB
Python
1068 lines
39 KiB
Python
"""AI 监控后台聚合服务层。
|
||
|
||
提供 Dashboard 总览、调度任务管理、调用记录查询、缓存失效、
|
||
Token 预算、批量执行(含成本二次确认)、告警管理等功能。
|
||
|
||
所有数据库操作使用 psycopg2 同步连接,方法签名为 async(FastAPI 兼容)。
|
||
查询强制 site_id 隔离(当 site_id 参数不为 None 时)。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import uuid
|
||
from datetime import datetime, timezone, timedelta
|
||
from typing import TYPE_CHECKING, Any
|
||
|
||
import psycopg2.extras
|
||
|
||
from app.ai.budget_tracker import BudgetTracker
|
||
from app.database import get_connection
|
||
from app.services.runtime_context import (
|
||
RuntimeContext,
|
||
as_runtime_today_param,
|
||
get_runtime_context,
|
||
runtime_insert_columns,
|
||
)
|
||
|
||
if TYPE_CHECKING:
|
||
from app.ai.dispatcher import AIDispatcher
|
||
|
||
logger = logging.getLogger(__name__)

# Batch-execution estimate: average token consumption per call.
AVG_TOKENS_PER_CALL = 2000

# TTL (seconds) for in-memory batch-execution entries.
_BATCH_TTL_SECONDS = 600  # 10 minutes

# F1-5a batch-execution concurrency cap.
# Neo decision N=5 (cooperates with the dispatcher's existing circuit_breaker /
# rate_limiter so we do not blow through the DashScope 1000 RPM limit).
_BATCH_CONCURRENCY = 5
|
||
|
||
|
||
class AdminAIService:
    """Aggregated service layer for the AI monitoring admin backend."""

    def __init__(self, budget_tracker: BudgetTracker | None = None) -> None:
        # Optional token-budget tracker; when absent, get_budget() falls back
        # to querying biz.ai_run_logs directly.
        self._budget = budget_tracker
        # batch_id → {params, ctx_snapshot, expires_at}; in-memory, TTL-governed.
        self._batch_store: dict[str, dict] = {}
        # F1-5a: injected at lifespan startup (see set_dispatcher).
        self._dispatcher: AIDispatcher | None = None

    def set_dispatcher(self, dispatcher: AIDispatcher) -> None:
        """F1-5a: injected at lifespan startup; used by _run_batch for real AI calls."""
        self._dispatcher = dispatcher
|
||
|
||
# ── Dashboard ─────────────────────────────────────────
|
||
|
||
async def get_dashboard(
|
||
self,
|
||
site_id: int | None = None,
|
||
range_days: int | None = None,
|
||
date_from: str | None = None,
|
||
date_to: str | None = None,
|
||
) -> dict:
|
||
"""聚合所有 Dashboard 数据。
|
||
|
||
时间范围优先级:
|
||
1. 若 date_from / date_to 同时给出(指定日期)→ 闭区间 [from, to]
|
||
2. 若 range_days=N → [CURRENT_DATE - (N-1) days, 现在]
|
||
3. 默认 range_days=1(今日)
|
||
"""
|
||
today_stats = await self._get_range_stats(site_id, range_days, date_from, date_to)
|
||
trend_7d = await self._get_7d_trend(site_id)
|
||
app_dist = await self._get_app_distribution(site_id)
|
||
app_health = await self._get_app_health(site_id)
|
||
budget = await self.get_budget()
|
||
recent_alerts = await self._get_recent_alerts(site_id)
|
||
return {
|
||
**today_stats,
|
||
"trend_7d": trend_7d,
|
||
"app_distribution": app_dist,
|
||
"budget": budget,
|
||
"recent_alerts": recent_alerts,
|
||
"app_health": app_health,
|
||
}
|
||
|
||
    async def _get_range_stats(
        self,
        site_id: int | None,
        range_days: int | None,
        date_from: str | None,
        date_to: str | None,
    ) -> dict:
        """Call count, success rate, token usage and average latency for the window.

        Field names keep the today_* prefix for compatibility with the frontend
        DashboardResponse schema.

        F1-5b A1 — base day for the time window:
        - site_id is not None: RuntimeContext.business_date (virtual day in sandbox mode)
        - site_id is None: global aggregation, PG CURRENT_DATE (no single business day)
        """
        site_clause, site_params = _site_filter(site_id)

        if date_from and date_to:
            # Explicit closed date interval [date_from, date_to].
            time_clause = "created_at >= %s::date AND created_at < (%s::date + INTERVAL '1 day')"
            time_params: tuple = (date_from, date_to)
        elif site_id is not None:
            days = range_days if range_days and range_days > 0 else 1
            # Business day of the selected site (sandbox mode yields the virtual day).
            today = as_runtime_today_param(site_id)
            time_clause = (
                "created_at >= %s::date - (%s::int - 1) * INTERVAL '1 day' "
                "AND created_at < %s::date + INTERVAL '1 day'"
            )
            time_params = (today, days, today)
        else:
            days = range_days if range_days and range_days > 0 else 1
            time_clause = (
                "created_at >= CURRENT_DATE - (%s::int - 1) * INTERVAL '1 day' "
                "AND created_at < CURRENT_DATE + INTERVAL '1 day'"
            )
            time_params = (days,)

        params = time_params + site_params

        conn = get_connection()
        try:
            with conn.cursor() as cur:
                # F1-5b UI-3: SELECT adds live/sandbox grouped COUNTs;
                # total_calls should equal today_live + today_sandbox.
                cur.execute(
                    f"""
                    SELECT
                        COUNT(*) AS total_calls,
                        COUNT(*) FILTER (WHERE status = 'success') AS success_count,
                        COALESCE(SUM(tokens_used), 0) AS total_tokens,
                        COALESCE(AVG(latency_ms) FILTER (WHERE latency_ms IS NOT NULL), 0)
                            AS avg_latency,
                        COUNT(*) FILTER (WHERE COALESCE(runtime_mode, 'live') = 'live')
                            AS live_calls,
                        COUNT(*) FILTER (WHERE runtime_mode = 'sandbox') AS sandbox_calls
                    FROM biz.ai_run_logs
                    WHERE {time_clause}
                    {site_clause}
                    """,
                    params,
                )
                row = cur.fetchone()
                conn.commit()
        finally:
            conn.close()

        # The aggregate always yields one row; guard anyway.
        total, success, tokens, avg_lat, live_cnt, sandbox_cnt = row if row else (0, 0, 0, 0, 0, 0)
        rate = round(success / total, 4) if total > 0 else 0.0
        return {
            "today_calls": total,
            "today_success_rate": rate,
            "today_tokens": int(tokens),
            "today_avg_latency_ms": round(float(avg_lat), 2),
            "today_live_calls": int(live_cnt),
            "today_sandbox_calls": int(sandbox_cnt),
        }
|
||
|
||
    async def _get_7d_trend(self, site_id: int | None) -> list[dict]:
        """Daily aggregation over the last 7 days.

        F1-5b A1: site_id not None uses business-day bounds (sandbox takes the
        virtual day); site_id None aggregates globally with CURRENT_DATE.
        2026-05-05 patch: also add the upper bound < %s + 1 day so sandbox does
        not leak future rows.
        """
        site_clause, site_params = _site_filter(site_id)
        if site_id is not None:
            today = as_runtime_today_param(site_id)
            time_clause = (
                "created_at >= %s::date - INTERVAL '6 days' "
                "AND created_at < %s::date + INTERVAL '1 day'"
            )
            params = (today, today) + site_params
        else:
            time_clause = (
                "created_at >= CURRENT_DATE - INTERVAL '6 days' "
                "AND created_at < CURRENT_DATE + INTERVAL '1 day'"
            )
            params = site_params
        conn = get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    f"""
                    SELECT
                        created_at::date AS day,
                        COUNT(*) AS calls,
                        COUNT(*) FILTER (WHERE status = 'success') AS success_count
                    FROM biz.ai_run_logs
                    WHERE {time_clause}
                    {site_clause}
                    GROUP BY day
                    ORDER BY day
                    """,
                    params,
                )
                rows = cur.fetchall()
                conn.commit()
        finally:
            conn.close()

        return [
            {
                "date": row[0].isoformat(),
                "calls": row[1],
                "success_rate": round(row[2] / row[1], 4) if row[1] > 0 else 0.0,
            }
            for row in rows
        ]
|
||
|
||
    async def _get_app_distribution(self, site_id: int | None) -> list[dict]:
        """Per-app call share over the 7-day window.

        F1-5b A1: site_id not None uses business-day bounds (sandbox takes the
        virtual day); site_id None aggregates globally with CURRENT_DATE.
        2026-05-05 patch: also add the upper bound < %s + 1 day so sandbox does
        not leak future rows.
        """
        site_clause, site_params = _site_filter(site_id)
        if site_id is not None:
            today = as_runtime_today_param(site_id)
            time_clause = (
                "created_at >= %s::date - INTERVAL '6 days' "
                "AND created_at < %s::date + INTERVAL '1 day'"
            )
            params = (today, today) + site_params
        else:
            time_clause = (
                "created_at >= CURRENT_DATE - INTERVAL '6 days' "
                "AND created_at < CURRENT_DATE + INTERVAL '1 day'"
            )
            params = site_params
        conn = get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    f"""
                    SELECT app_type, COUNT(*) AS cnt
                    FROM biz.ai_run_logs
                    WHERE {time_clause}
                    {site_clause}
                    GROUP BY app_type
                    ORDER BY cnt DESC
                    """,
                    params,
                )
                rows = cur.fetchall()
                conn.commit()
        finally:
            conn.close()

        # `or 1` guards the zero-row case so the division below cannot fail.
        total = sum(r[1] for r in rows) or 1
        return [
            {
                "app_type": row[0],
                "count": row[1],
                "percentage": round(row[1] / total, 4),
            }
            for row in rows
        ]
|
||
|
||
async def _get_app_health(self, site_id: int | None) -> list[dict]:
|
||
"""各 App 最近一次调用状态。"""
|
||
site_clause, params = _site_filter(site_id)
|
||
conn = get_connection()
|
||
try:
|
||
with conn.cursor() as cur:
|
||
cur.execute(
|
||
f"""
|
||
SELECT DISTINCT ON (app_type)
|
||
app_type,
|
||
status AS last_status,
|
||
created_at AS last_call_at
|
||
FROM biz.ai_run_logs
|
||
WHERE TRUE {site_clause}
|
||
ORDER BY app_type, created_at DESC
|
||
""",
|
||
params,
|
||
)
|
||
rows = cur.fetchall()
|
||
conn.commit()
|
||
finally:
|
||
conn.close()
|
||
|
||
return [
|
||
{
|
||
"app_type": row[0],
|
||
"last_status": row[1],
|
||
"last_call_at": row[2].isoformat() if row[2] else None,
|
||
}
|
||
for row in rows
|
||
]
|
||
|
||
    async def _get_recent_alerts(self, site_id: int | None, limit: int = 10) -> list[dict]:
        """Most recent alert events (Dashboard panel)."""
        site_clause, params = _site_filter(site_id)
        params = (*params, limit)
        conn = get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    f"""
                    SELECT id, app_type, status, alert_status,
                           error_message, created_at
                    FROM biz.ai_run_logs
                    WHERE status IN ('failed', 'timeout', 'circuit_open')
                    {site_clause}
                    ORDER BY created_at DESC
                    LIMIT %s
                    """,
                    params,
                )
                cols = [d[0] for d in cur.description]
                rows = cur.fetchall()
                conn.commit()
        finally:
            conn.close()

        return [_row_to_dict(cols, r) for r in rows]
|
||
|
||
# ── 调度任务 ──────────────────────────────────────────
|
||
|
||
    async def list_trigger_jobs(
        self, filters: dict, page: int = 1, page_size: int = 20,
    ) -> dict:
        """Paginated query over ai_trigger_jobs plus today's dedup-skip count.

        filters may contain: event_type / status / site_id (equality) and
        date_from / date_to (created_at range, inclusive bounds).
        """
        where_parts: list[str] = []
        params: list = []

        # Equality filters.
        for key in ("event_type", "status", "site_id"):
            if filters.get(key) is not None:
                where_parts.append(f"{key} = %s")
                params.append(filters[key])

        # created_at range filters.
        if filters.get("date_from"):
            where_parts.append("created_at >= %s")
            params.append(filters["date_from"])
        if filters.get("date_to"):
            where_parts.append("created_at <= %s")
            params.append(filters["date_to"])

        where_sql = ("WHERE " + " AND ".join(where_parts)) if where_parts else ""
        offset = (page - 1) * page_size

        conn = get_connection()
        try:
            with conn.cursor() as cur:
                # Total row count for the pagination header.
                cur.execute(
                    f"SELECT COUNT(*) FROM biz.ai_trigger_jobs {where_sql}",
                    params,
                )
                total = cur.fetchone()[0]

                # Page of rows.
                # F1-5b UI-5: the list SELECT also surfaces runtime_mode + sandbox_instance_id.
                cur.execute(
                    f"""
                    SELECT id, event_type, member_id, status, app_chain,
                           is_forced, site_id, started_at, finished_at, created_at,
                           runtime_mode, sandbox_instance_id
                    FROM biz.ai_trigger_jobs
                    {where_sql}
                    ORDER BY created_at DESC
                    LIMIT %s OFFSET %s
                    """,
                    (*params, page_size, offset),
                )
                cols = [d[0] for d in cur.description]
                rows = cur.fetchall()

                # Today's deduplication-skip count.
                cur.execute(
                    """
                    SELECT COUNT(*)
                    FROM biz.ai_trigger_jobs
                    WHERE status = 'skipped_duplicate'
                      AND created_at >= CURRENT_DATE
                      AND created_at < CURRENT_DATE + INTERVAL '1 day'
                    """,
                )
                today_skipped = cur.fetchone()[0]
                conn.commit()
        finally:
            conn.close()

        return {
            "items": [_row_to_dict(cols, r) for r in rows],
            "total": total,
            "page": page,
            "page_size": page_size,
            "today_skipped_duplicates": today_skipped,
        }
|
||
|
||
async def get_trigger_job(self, job_id: int) -> dict | None:
|
||
"""单条调度任务详情。"""
|
||
conn = get_connection()
|
||
try:
|
||
with conn.cursor() as cur:
|
||
# F1-5b UI-5: 详情 SELECT 加 runtime_mode + sandbox_instance_id 透出
|
||
cur.execute(
|
||
"""
|
||
SELECT id, event_type, member_id, status, app_chain,
|
||
is_forced, site_id, started_at, finished_at,
|
||
created_at, payload, error_message, connector_type,
|
||
runtime_mode, sandbox_instance_id
|
||
FROM biz.ai_trigger_jobs
|
||
WHERE id = %s
|
||
""",
|
||
(job_id,),
|
||
)
|
||
cols = [d[0] for d in cur.description]
|
||
row = cur.fetchone()
|
||
conn.commit()
|
||
finally:
|
||
conn.close()
|
||
|
||
if row is None:
|
||
return None
|
||
return _row_to_dict(cols, row)
|
||
|
||
async def retry_trigger_job(self, job_id: int) -> int:
|
||
"""创建新 trigger_job(is_forced=true),返回新 job_id。
|
||
|
||
F1-5a: INSERT 显式落 runtime_mode + sandbox_instance_id,
|
||
与原 trigger_job 的 runtime 上下文保持一致(避免依赖默认值导致重试时丢失 sandbox 标记)。
|
||
"""
|
||
original = await self.get_trigger_job(job_id)
|
||
if original is None:
|
||
raise ValueError(f"trigger_job {job_id} 不存在")
|
||
|
||
site_id = original["site_id"]
|
||
cols, placeholders, runtime_params = runtime_insert_columns(site_id)
|
||
|
||
conn = get_connection()
|
||
try:
|
||
with conn.cursor() as cur:
|
||
cur.execute(
|
||
f"""
|
||
INSERT INTO biz.ai_trigger_jobs
|
||
(event_type, member_id, site_id, connector_type,
|
||
payload, app_chain, is_forced, status, {cols})
|
||
VALUES (%s, %s, %s, %s, %s, %s, true, 'pending', {placeholders})
|
||
RETURNING id
|
||
""",
|
||
(
|
||
original["event_type"],
|
||
original.get("member_id"),
|
||
site_id,
|
||
original.get("connector_type", "feiqiu"),
|
||
# F1-5a 走查发现:psycopg2 把 jsonb 列读成 dict,
|
||
# INSERT 时需 Json() 适配,否则 "can't adapt type 'dict'"
|
||
psycopg2.extras.Json(original["payload"])
|
||
if original.get("payload") is not None
|
||
else None,
|
||
original.get("app_chain"),
|
||
*runtime_params,
|
||
),
|
||
)
|
||
new_id = cur.fetchone()[0]
|
||
conn.commit()
|
||
except Exception:
|
||
conn.rollback()
|
||
raise
|
||
finally:
|
||
conn.close()
|
||
|
||
return new_id
|
||
|
||
# ── 调用记录 ──────────────────────────────────────────
|
||
|
||
    async def list_run_logs(
        self, filters: dict, page: int = 1, page_size: int = 20,
    ) -> dict:
        """Paginated query over ai_run_logs.

        filters may contain: app_type / status / trigger_type / site_id (equality)
        and date_from / date_to (created_at range, inclusive bounds).
        """
        where_parts: list[str] = []
        params: list = []

        for key in ("app_type", "status", "trigger_type", "site_id"):
            if filters.get(key) is not None:
                where_parts.append(f"{key} = %s")
                params.append(filters[key])

        if filters.get("date_from"):
            where_parts.append("created_at >= %s")
            params.append(filters["date_from"])
        if filters.get("date_to"):
            where_parts.append("created_at <= %s")
            params.append(filters["date_to"])

        where_sql = ("WHERE " + " AND ".join(where_parts)) if where_parts else ""
        offset = (page - 1) * page_size

        conn = get_connection()
        try:
            with conn.cursor() as cur:
                # Total row count for the pagination header.
                cur.execute(
                    f"SELECT COUNT(*) FROM biz.ai_run_logs {where_sql}",
                    params,
                )
                total = cur.fetchone()[0]

                # Page of rows; runtime_mode + sandbox_instance_id surfaced for the UI.
                cur.execute(
                    f"""
                    SELECT id, app_type, trigger_type, member_id,
                           tokens_used, latency_ms, status, site_id, created_at,
                           runtime_mode, sandbox_instance_id
                    FROM biz.ai_run_logs
                    {where_sql}
                    ORDER BY created_at DESC
                    LIMIT %s OFFSET %s
                    """,
                    (*params, page_size, offset),
                )
                cols = [d[0] for d in cur.description]
                rows = cur.fetchall()
                conn.commit()
        finally:
            conn.close()

        return {
            "items": [_row_to_dict(cols, r) for r in rows],
            "total": total,
            "page": page,
            "page_size": page_size,
        }
|
||
|
||
async def get_run_log(self, log_id: int) -> dict | None:
|
||
"""单条调用记录详情(含完整 prompt/response,不脱敏)。"""
|
||
conn = get_connection()
|
||
try:
|
||
with conn.cursor() as cur:
|
||
cur.execute(
|
||
"""
|
||
SELECT id, app_type, trigger_type, member_id,
|
||
tokens_used, latency_ms, status, site_id,
|
||
created_at, request_prompt, response_text,
|
||
error_message, session_id, finished_at,
|
||
runtime_mode, sandbox_instance_id
|
||
FROM biz.ai_run_logs
|
||
WHERE id = %s
|
||
""",
|
||
(log_id,),
|
||
)
|
||
cols = [d[0] for d in cur.description]
|
||
row = cur.fetchone()
|
||
conn.commit()
|
||
finally:
|
||
conn.close()
|
||
|
||
if row is None:
|
||
return None
|
||
return _row_to_dict(cols, row)
|
||
|
||
# ── 缓存管理 ──────────────────────────────────────────
|
||
|
||
    async def invalidate_cache(
        self, site_id: int, app_type: str | None = None, member_id: int | None = None,
    ) -> int:
        """Bulk cache invalidation; returns the number of affected rows.

        Narrows by cache_type / target_id when app_type / member_id are given.
        """
        where_parts = ["site_id = %s"]
        params: list = [site_id]

        if app_type is not None:
            where_parts.append("cache_type = %s")
            params.append(app_type)
        if member_id is not None:
            # target_id column is text, hence str().
            where_parts.append("target_id = %s")
            params.append(str(member_id))

        where_sql = " AND ".join(where_parts)

        conn = get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    f"""
                    UPDATE biz.ai_cache
                    SET status = 'invalidated'
                    WHERE {where_sql}
                      AND status != 'invalidated'
                    """,
                    params,
                )
                affected = cur.rowcount
                conn.commit()
        except Exception:
            conn.rollback()
            raise
        finally:
            conn.close()

        # Phase 1.4: broadcast a cache_invalidated event so admin-web / the
        # mini-program can refresh in real time. Best-effort: failures only logged.
        if affected > 0:
            try:
                from app.ai.event_bus import AIEvent, get_event_bus
                get_event_bus().publish(AIEvent(
                    type="cache_invalidated",
                    site_id=site_id,
                    payload={
                        "cache_type": app_type,
                        "member_id": member_id,
                        "affected": affected,
                    },
                ))
            except Exception:
                logger.debug("cache_invalidated 事件广播失败", exc_info=True)

        return affected
|
||
|
||
# ── Token 预算 ────────────────────────────────────────
|
||
|
||
    async def get_budget(self) -> dict:
        """Token budget usage (daily and monthly, with usage percentages)."""
        if self._budget is not None:
            # Preferred path: delegate to the injected BudgetTracker.
            status = self._budget.check_budget()
            daily_limit = self._budget.daily_limit
            monthly_limit = self._budget.monthly_limit
            return {
                "daily_used": status.daily_used,
                "daily_limit": daily_limit,
                "daily_pct": round(status.daily_used / daily_limit, 4) if daily_limit > 0 else 0.0,
                "monthly_used": status.monthly_used,
                "monthly_limit": monthly_limit,
                "monthly_pct": round(status.monthly_used / monthly_limit, 4) if monthly_limit > 0 else 0.0,
            }

        # Fallback without a BudgetTracker: query usage directly.
        conn = get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    SELECT
                        COALESCE(SUM(tokens_used) FILTER (
                            WHERE created_at >= CURRENT_DATE
                              AND created_at < CURRENT_DATE + INTERVAL '1 day'
                        ), 0) AS daily_used,
                        COALESCE(SUM(tokens_used) FILTER (
                            WHERE created_at >= date_trunc('month', CURRENT_DATE)
                              AND created_at < date_trunc('month', CURRENT_DATE) + INTERVAL '1 month'
                        ), 0) AS monthly_used
                    FROM biz.ai_run_logs
                    WHERE status = 'success'
                    """,
                )
                row = cur.fetchone()
                conn.commit()
        finally:
            conn.close()

        daily_used, monthly_used = (int(row[0]), int(row[1])) if row else (0, 0)
        # Hard-coded fallback limits used only when no tracker is configured.
        daily_limit = 100_000
        monthly_limit = 2_000_000
        return {
            "daily_used": daily_used,
            "daily_limit": daily_limit,
            "daily_pct": round(daily_used / daily_limit, 4) if daily_limit > 0 else 0.0,
            "monthly_used": monthly_used,
            "monthly_limit": monthly_limit,
            "monthly_pct": round(monthly_used / monthly_limit, 4) if monthly_limit > 0 else 0.0,
        }
|
||
|
||
# ── 批量执行 ──────────────────────────────────────────
|
||
|
||
    async def estimate_batch(
        self, app_types: list[str], member_ids: list[int], site_id: int,
    ) -> dict:
        """Create a batch_id, store it in memory (TTL 10 min) and return the estimate.

        F1-5a: a RuntimeContext snapshot is captured at estimate time and kept in
        _batch_store; confirm passes it on to _run_batch, so a Neo sandbox-mode
        switch between estimate and confirm cannot cause data drift/pollution.
        """
        self._cleanup_expired_batches()

        batch_id = uuid.uuid4().hex
        # Full cross product: every app runs once per member.
        estimated_calls = len(app_types) * len(member_ids)
        estimated_tokens = estimated_calls * AVG_TOKENS_PER_CALL

        # F1-5a: capture the ctx snapshot.
        ctx_snapshot = get_runtime_context(site_id)

        self._batch_store[batch_id] = {
            "params": {
                "app_types": app_types,
                "member_ids": member_ids,
                "site_id": site_id,
                "batch_id": batch_id,  # used by _run_batch for the triggered_by tag
            },
            "ctx_snapshot": ctx_snapshot,
            "expires_at": datetime.now(timezone.utc) + timedelta(seconds=_BATCH_TTL_SECONDS),
        }

        logger.info(
            "批量执行预估: batch_id=%s apps=%s members=%d site_id=%s mode=%s sandbox_date=%s",
            batch_id,
            app_types,
            len(member_ids),
            site_id,
            ctx_snapshot.mode,
            ctx_snapshot.sandbox_date.isoformat() if ctx_snapshot.sandbox_date else None,
        )

        return {
            "batch_id": batch_id,
            "estimated_calls": estimated_calls,
            "estimated_tokens": estimated_tokens,
        }
|
||
|
||
async def confirm_batch(self, batch_id: str) -> None:
|
||
"""取出参数 + ctx_snapshot,异步执行批量调用。"""
|
||
self._cleanup_expired_batches()
|
||
|
||
entry = self._batch_store.pop(batch_id, None)
|
||
if entry is None:
|
||
raise ValueError(f"batch_id 无效或已过期: {batch_id}")
|
||
|
||
params = entry["params"]
|
||
ctx_snapshot: RuntimeContext = entry["ctx_snapshot"]
|
||
logger.info(
|
||
"批量执行确认: batch_id=%s apps=%s members=%d site_id=%s mode=%s",
|
||
batch_id,
|
||
params["app_types"],
|
||
len(params["member_ids"]),
|
||
params["site_id"],
|
||
ctx_snapshot.mode,
|
||
)
|
||
# F1-5a: 后台异步执行,传入 ctx_snapshot 防止运行时 Neo 切模式
|
||
asyncio.create_task(self._run_batch(params, ctx_snapshot))
|
||
|
||
    async def _run_batch(
        self, params: dict[str, Any], ctx_snapshot: RuntimeContext,
    ) -> None:
        """F1-5a: the actual background batch execution.

        - Semaphore(_BATCH_CONCURRENCY=5) caps concurrency so DashScope limits hold
        - asyncio.gather with return_exceptions=True: one failing member does not
          abort the others
        - the per-call context explicitly carries business_date (from ctx_snapshot)
          so the prompt builder picks up the sandbox date
        - triggered_by=f"batch:{batch_id}" tags ai_run_logs; basis for the Wave 2
          progress query
        """
        if self._dispatcher is None:
            # set_dispatcher() was never called — nothing we can execute.
            logger.error(
                "批量执行失败: dispatcher 未注入 batch_id=%s",
                params.get("batch_id"),
            )
            return

        app_types: list[str] = params["app_types"]
        member_ids: list[int] = params["member_ids"]
        site_id: int = params["site_id"]
        batch_id: str = params["batch_id"]

        sem = asyncio.Semaphore(_BATCH_CONCURRENCY)
        triggered_by = f"batch:{batch_id}"
        business_date_iso = ctx_snapshot.business_date.isoformat()

        async def _run_one(app_type: str, member_id: int) -> None:
            # One dispatcher call per (app, member) pair, bounded by the semaphore.
            async with sem:
                try:
                    await self._dispatcher.run_single_app(
                        app_type=app_type,
                        context={
                            "site_id": site_id,
                            "member_id": member_id,
                            "business_date": business_date_iso,
                        },
                        triggered_by=triggered_by,
                    )
                except Exception:
                    # The failure is already written to ai_run_logs; just log here
                    # so one member's failure does not cancel the others.
                    logger.exception(
                        "批量执行单步失败 batch_id=%s app=%s member=%s",
                        batch_id, app_type, member_id,
                    )

        tasks = [
            asyncio.create_task(_run_one(at, mid))
            for at in app_types
            for mid in member_ids
        ]

        logger.info(
            "批量执行开始: batch_id=%s tasks=%d concurrency=%d mode=%s",
            batch_id, len(tasks), _BATCH_CONCURRENCY, ctx_snapshot.mode,
        )

        await asyncio.gather(*tasks, return_exceptions=True)

        logger.info("批量执行完成: batch_id=%s tasks=%d", batch_id, len(tasks))
|
||
|
||
def _cleanup_expired_batches(self) -> None:
|
||
"""清理过期 batch。"""
|
||
now = datetime.now(timezone.utc)
|
||
expired = [
|
||
bid for bid, entry in self._batch_store.items()
|
||
if entry["expires_at"] <= now
|
||
]
|
||
for bid in expired:
|
||
del self._batch_store[bid]
|
||
if expired:
|
||
logger.debug("清理过期 batch: %d 个", len(expired))
|
||
|
||
# ── 告警管理 ──────────────────────────────────────────
|
||
|
||
    async def list_alerts(
        self,
        alert_status: str | None = None,
        site_id: int | None = None,
        page: int = 1,
        page_size: int = 20,
    ) -> dict:
        """Alert list: ai_run_logs WHERE status IN ('failed','timeout','circuit_open')."""
        where_parts = ["status IN ('failed', 'timeout', 'circuit_open')"]
        params: list = []

        if alert_status is not None:
            if alert_status == "pending":
                # "pending" covers both NULL and the literal 'pending'.
                where_parts.append("(alert_status IS NULL OR alert_status = 'pending')")
            else:
                where_parts.append("alert_status = %s")
                params.append(alert_status)

        if site_id is not None:
            where_parts.append("site_id = %s")
            params.append(site_id)

        where_sql = "WHERE " + " AND ".join(where_parts)
        offset = (page - 1) * page_size

        conn = get_connection()
        try:
            with conn.cursor() as cur:
                # Total row count for the pagination header.
                cur.execute(
                    f"SELECT COUNT(*) FROM biz.ai_run_logs {where_sql}",
                    params,
                )
                total = cur.fetchone()[0]

                cur.execute(
                    f"""
                    SELECT id, app_type, status, alert_status,
                           error_message, created_at
                    FROM biz.ai_run_logs
                    {where_sql}
                    ORDER BY created_at DESC
                    LIMIT %s OFFSET %s
                    """,
                    (*params, page_size, offset),
                )
                cols = [d[0] for d in cur.description]
                rows = cur.fetchall()
                conn.commit()
        finally:
            conn.close()

        return {
            "items": [_row_to_dict(cols, r) for r in rows],
            "total": total,
            "page": page,
            "page_size": page_size,
        }
|
||
|
||
async def ack_alert(self, log_id: int) -> str:
|
||
"""确认告警:alert_status → acknowledged。"""
|
||
conn = get_connection()
|
||
try:
|
||
with conn.cursor() as cur:
|
||
cur.execute(
|
||
"""
|
||
UPDATE biz.ai_run_logs
|
||
SET alert_status = 'acknowledged'
|
||
WHERE id = %s
|
||
AND status IN ('failed', 'timeout', 'circuit_open')
|
||
""",
|
||
(log_id,),
|
||
)
|
||
conn.commit()
|
||
except Exception:
|
||
conn.rollback()
|
||
raise
|
||
finally:
|
||
conn.close()
|
||
|
||
return "acknowledged"
|
||
|
||
async def ignore_alert(self, log_id: int) -> str:
|
||
"""忽略告警:alert_status → ignored。"""
|
||
conn = get_connection()
|
||
try:
|
||
with conn.cursor() as cur:
|
||
cur.execute(
|
||
"""
|
||
UPDATE biz.ai_run_logs
|
||
SET alert_status = 'ignored'
|
||
WHERE id = %s
|
||
AND status IN ('failed', 'timeout', 'circuit_open')
|
||
""",
|
||
(log_id,),
|
||
)
|
||
conn.commit()
|
||
except Exception:
|
||
conn.rollback()
|
||
raise
|
||
finally:
|
||
conn.close()
|
||
|
||
return "ignored"
|
||
|
||
# ── 触发器管理(biz.trigger_jobs)───────────────────────
|
||
|
||
    async def list_triggers(self) -> list[dict]:
        """List all AI-related triggers (job_type prefixed ai_, plus task_generator).

        Returned fields: id / job_name / job_type / trigger_condition / trigger_config /
        status / description / last_run_at / next_run_at / last_error
        """
        conn = get_connection()
        try:
            with conn.cursor() as cur:
                cur.execute(
                    """
                    SELECT id, job_name, job_type, trigger_condition,
                           trigger_config, status, description,
                           last_run_at, next_run_at, last_error
                    FROM biz.trigger_jobs
                    WHERE job_type LIKE 'ai_%' OR job_name = 'task_generator'
                    ORDER BY trigger_condition DESC, job_name
                    """
                )
                cols = [d[0] for d in cur.description]
                rows = cur.fetchall()
                conn.commit()
        finally:
            conn.close()
        return [_row_to_dict(cols, r) for r in rows]
|
||
|
||
async def update_trigger(
|
||
self, trigger_id: int,
|
||
status_new: str | None = None,
|
||
cron_expression: str | None = None,
|
||
description: str | None = None,
|
||
) -> dict:
|
||
"""更新触发器:启用/禁用、修改 cron、改描述。
|
||
|
||
仅允许修改 ai_ 前缀或 task_generator 的触发器。
|
||
"""
|
||
if status_new is not None and status_new not in ("enabled", "disabled"):
|
||
raise ValueError(f"非法 status: {status_new}")
|
||
|
||
sets: list[str] = []
|
||
params: list = []
|
||
if status_new is not None:
|
||
sets.append("status = %s")
|
||
params.append(status_new)
|
||
if cron_expression is not None:
|
||
sets.append("trigger_config = jsonb_set(trigger_config, '{cron_expression}', to_jsonb(%s::text))")
|
||
params.append(cron_expression)
|
||
if description is not None:
|
||
sets.append("description = %s")
|
||
params.append(description)
|
||
|
||
if not sets:
|
||
raise ValueError("至少修改一个字段")
|
||
|
||
params.append(trigger_id)
|
||
conn = get_connection()
|
||
try:
|
||
with conn.cursor() as cur:
|
||
cur.execute(
|
||
f"""
|
||
UPDATE biz.trigger_jobs
|
||
SET {", ".join(sets)}
|
||
WHERE id = %s
|
||
AND (job_type LIKE 'ai_%%' OR job_name = 'task_generator')
|
||
RETURNING id, job_name, job_type, trigger_condition,
|
||
trigger_config, status, description,
|
||
last_run_at, next_run_at, last_error
|
||
""",
|
||
params,
|
||
)
|
||
row = cur.fetchone()
|
||
if row is None:
|
||
conn.rollback()
|
||
raise ValueError("触发器不存在或不可修改")
|
||
cols = [d[0] for d in cur.description]
|
||
conn.commit()
|
||
except Exception:
|
||
conn.rollback()
|
||
raise
|
||
finally:
|
||
conn.close()
|
||
return _row_to_dict(cols, row)
|
||
|
||
# ── 预热进度(app2_finance 72 组合)──────────────────────
|
||
|
||
    async def get_prewarm_progress(self, site_id: int) -> dict:
        """Progress of the app2_finance 72-combination cache prewarm.

        Returns: total=72, done=N, missing=[{time_dimension, area}], last_updated
        """
        # 8 time dimensions x 9 areas = 72 expected "time__area" target_ids.
        time_dims = (
            "this_month", "last_month", "this_week", "last_week",
            "this_quarter", "last_quarter", "last_3_months", "last_6_months",
        )
        areas = (
            "all", "hall", "hallA", "hallB", "hallC",
            "vip", "snooker", "mahjong", "ktv",
        )
        expected = {f"{t}__{a}" for t in time_dims for a in areas}

        conn = get_connection()
        try:
            with conn.cursor() as cur:
                # The escaped LIKE pattern matches only target_ids containing a
                # literal "__" separator (underscore is a LIKE wildcard otherwise).
                cur.execute(
                    """
                    SELECT target_id, max(created_at) AS last_updated
                    FROM biz.ai_cache
                    WHERE cache_type = 'app2_finance'
                      AND site_id = %s
                      AND target_id LIKE %s ESCAPE '\\'
                    GROUP BY target_id
                    """,
                    (site_id, r'%\_\_%'),
                )
                rows = cur.fetchall()
                conn.commit()
        finally:
            conn.close()

        done_map = {r[0]: r[1] for r in rows}
        missing = sorted(expected - set(done_map.keys()))
        last = max(done_map.values()) if done_map else None
        return {
            "total": len(expected),
            "done": len(expected & set(done_map.keys())),
            "missing": [
                {"target_id": m, "time_dimension": m.split("__")[0], "area": m.split("__")[1]}
                for m in missing
            ],
            "last_updated": last.isoformat() if last else None,
        }
|
||
|
||
|
||
# ── 工具函数 ──────────────────────────────────────────────
|
||
|
||
|
||
def _site_filter(site_id: int | None) -> tuple[str, tuple]:
|
||
"""生成 site_id 过滤子句和参数。"""
|
||
if site_id is None:
|
||
return "", ()
|
||
return "AND site_id = %s", (site_id,)
|
||
|
||
|
||
def _row_to_dict(columns: list[str], row: tuple) -> dict:
|
||
"""将数据库行转换为 dict,处理 datetime 序列化。"""
|
||
result = {}
|
||
for col, val in zip(columns, row):
|
||
if isinstance(val, datetime):
|
||
result[col] = val.isoformat()
|
||
else:
|
||
result[col] = val
|
||
return result
|