Files
Neo-ZQYY/scripts/ops/test_chat_ai_quality.py

346 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""RNS1.4 CHAT module — AI response quality evaluation script.

Calls the Bailian API directly (OpenAI-compatible protocol), simulates
conversations for 4 entry scenarios, and evaluates the quality of the AI
replies (semantic relevance, Chinese correctness, context understanding).
Output: Markdown evaluation report -> docs/reports/
"""
from __future__ import annotations
import asyncio
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
# Load the repo-root .env (two directory levels up from this script).
_root = Path(__file__).resolve().parents[2]
load_dotenv(_root / ".env")
# Bailian endpoint configuration — all three variables are required.
BAILIAN_API_KEY = os.environ.get("BAILIAN_API_KEY")
BAILIAN_BASE_URL = os.environ.get("BAILIAN_BASE_URL")
BAILIAN_MODEL = os.environ.get("BAILIAN_MODEL")
if not all([BAILIAN_API_KEY, BAILIAN_BASE_URL, BAILIAN_MODEL]):
    print("ERROR: 缺少 BAILIAN_API_KEY / BAILIAN_BASE_URL / BAILIAN_MODEL 环境变量")
    sys.exit(1)
# NOTE(review): `openai` is imported after the env check — presumably so a
# missing config fails fast with the clear message above; confirm intent.
import openai
client = openai.AsyncOpenAI(api_key=BAILIAN_API_KEY, base_url=BAILIAN_BASE_URL)
# System prompt serialized as JSON; ensure_ascii=False keeps the Chinese
# text human-readable in logs and in the model input.
SYSTEM_PROMPT = json.dumps(
    {"task": "你是台球门店的 AI 助手,根据用户的问题和当前页面上下文提供帮助。"},
    ensure_ascii=False,
)
# ── 4 test scenarios (one per chat entry point) ──────────────────────
# Each scenario carries: a display name, the entry `context_type`, a short
# description, and an OpenAI-style message list. The first user message of
# the context-bearing scenarios is a JSON payload describing the page the
# assistant-coach navigated from; the last user message is the actual question.
SCENARIOS: list[dict] = [
    {
        "name": "场景1: task 入口 — 维客任务咨询",
        "context_type": "task",
        "description": "助教从任务详情页进入,询问如何完成一个维客任务",
        "messages": [
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {
                # Page-context payload: retention task detail page.
                "role": "user",
                "content": json.dumps(
                    {
                        "current_time": datetime.now().isoformat(),
                        "source_page": "task-detail",
                        "page_context": {
                            "task_type": "retention",
                            "member_name": "张三",
                            "priority_score": 85,
                            "expires_at": "2026-03-25",
                        },
                        "screen_content": "维客任务张三优先级85分3月25日到期",
                    },
                    ensure_ascii=False,
                ),
            },
            {
                "role": "user",
                "content": "这个客户最近消费频率下降了,我应该怎么跟他沟通比较好?有什么话术建议吗?",
            },
        ],
    },
    {
        "name": "场景2: customer 入口 — 客户详情咨询",
        "context_type": "customer",
        "description": "助教从客户详情页进入,询问客户消费情况分析",
        "messages": [
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {
                # Page-context payload: customer detail page (VIP member).
                "role": "user",
                "content": json.dumps(
                    {
                        "current_time": datetime.now().isoformat(),
                        "source_page": "customer-detail",
                        "page_context": {
                            "member_id": 12345,
                            "member_name": "李四",
                            "member_level": "VIP",
                            "last_visit": "2026-03-15",
                            "total_consumption": "¥8,500",
                        },
                        "screen_content": "客户李四VIP会员累计消费¥8,500最近到店3月15日",
                    },
                    ensure_ascii=False,
                ),
            },
            {
                "role": "user",
                "content": "帮我分析一下这个客户的消费习惯,他适合推荐什么课程?",
            },
        ],
    },
    {
        "name": "场景3: coach 入口 — 助教业绩咨询",
        "context_type": "coach",
        "description": "助教从自己的详情页进入,询问业绩提升建议",
        "messages": [
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {
                # Page-context payload: the coach's own performance page.
                "role": "user",
                "content": json.dumps(
                    {
                        "current_time": datetime.now().isoformat(),
                        "source_page": "coach-detail",
                        "page_context": {
                            "coach_name": "王教练",
                            "monthly_lessons": 45,
                            "monthly_revenue": "¥12,000",
                            "customer_count": 28,
                        },
                        "screen_content": "助教王教练本月课时45节收入¥12,000服务客户28人",
                    },
                    ensure_ascii=False,
                ),
            },
            {
                "role": "user",
                "content": "我这个月业绩一般,有什么方法可以提升客户续课率?",
            },
        ],
    },
    {
        # General entry: no page-context payload, just the question.
        "name": "场景4: general 入口 — 通用对话",
        "context_type": "general",
        "description": "助教从首页直接进入聊天,无特定上下文",
        "messages": [
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": "台球馆周末客流量大的时候,怎么合理安排台位和助教排班?",
            },
        ],
    },
]
# ── Multi-turn follow-up questions (appended to scenario 1 only) ─────
FOLLOWUP_MESSAGES: list[str] = [
    "如果他说最近比较忙没时间来,我该怎么回应?",
    "好的,那如果他愿意来,我应该推荐什么样的课程套餐?",
]
async def call_ai(messages: list[dict]) -> tuple[str, float, int | None]:
    """Call the Bailian chat-completions API (non-streaming).

    Args:
        messages: OpenAI-style chat messages (``role``/``content`` dicts).

    Returns:
        Tuple of ``(reply_text, elapsed_seconds, total_tokens)`` where
        ``total_tokens`` is ``None`` when the response carries no usage info.
    """
    # FIX: use the monotonic perf_counter for latency measurement —
    # time.time() is wall-clock and can jump (NTP adjustment, DST) mid-call.
    t0 = time.perf_counter()
    response = await client.chat.completions.create(
        model=BAILIAN_MODEL,
        messages=messages,
        temperature=0.7,
        max_tokens=2000,
    )
    elapsed = time.perf_counter() - t0
    # A missing/None message content is normalized to the empty string.
    content = response.choices[0].message.content or ""
    tokens = response.usage.total_tokens if response.usage else None
    return content, elapsed, tokens
async def call_ai_stream(messages: list[dict]) -> tuple[str, float, int]:
    """Call the Bailian chat-completions API in streaming mode.

    Args:
        messages: OpenAI-style chat messages (``role``/``content`` dicts).

    Returns:
        Tuple of ``(full_reply, elapsed_seconds, chunk_count)``.
        ``chunk_count`` counts only chunks that carried delta content
        (keep-alive/empty deltas are ignored).
    """
    # FIX: use the monotonic perf_counter for latency measurement —
    # time.time() is wall-clock and can jump (NTP adjustment, DST) mid-stream.
    t0 = time.perf_counter()
    chunks: list[str] = []
    chunk_count = 0
    response = await client.chat.completions.create(
        model=BAILIAN_MODEL,
        messages=messages,
        temperature=0.7,
        max_tokens=2000,
        stream=True,
    )
    async for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            chunks.append(chunk.choices[0].delta.content)
            chunk_count += 1
    elapsed = time.perf_counter() - t0
    return "".join(chunks), elapsed, chunk_count
async def run_scenario(scenario: dict) -> dict:
    """Run one scenario end to end and return its result dict.

    Round 1 always goes through the non-streaming endpoint. For the
    ``"task"`` scenario only, each follow-up in FOLLOWUP_MESSAGES is then
    sent through the streaming endpoint, with the assistant reply appended
    to the history between rounds so the model sees the full conversation.

    Returns:
        ``{"name", "description", "rounds": [per-round result dicts]}``.
    """
    print(f"\n{'='*60}")
    print(f" {scenario['name']}")
    print(f"{'='*60}")
    results = {"name": scenario["name"], "description": scenario["description"], "rounds": []}
    # Shallow-copy so appending replies does not mutate the shared SCENARIOS template.
    messages = list(scenario["messages"])
    # Round 1: non-streaming.
    user_msg = messages[-1]["content"]
    print(f"\n[用户] {user_msg[:80]}...")
    reply, elapsed, tokens = await call_ai(messages)
    print(f"[AI] ({elapsed:.1f}s, {tokens} tokens) {reply[:100]}...")
    results["rounds"].append({
        "round": 1,
        "mode": "非流式",
        "user_message": user_msg,
        "ai_reply": reply,
        "elapsed_seconds": round(elapsed, 2),
        "tokens_used": tokens,
    })
    messages.append({"role": "assistant", "content": reply})
    # Follow-up rounds (streaming; scenario 1 / "task" entry only).
    if scenario["context_type"] == "task":
        for i, followup in enumerate(FOLLOWUP_MESSAGES):
            messages.append({"role": "user", "content": followup})
            print(f"\n[用户] {followup}")
            reply_s, elapsed_s, chunk_count = await call_ai_stream(messages)
            print(f"[AI-Stream] ({elapsed_s:.1f}s, {chunk_count} chunks) {reply_s[:100]}...")
            results["rounds"].append({
                "round": i + 2,
                "mode": "流式",
                "user_message": followup,
                "ai_reply": reply_s,
                "elapsed_seconds": round(elapsed_s, 2),
                "chunk_count": chunk_count,
            })
            # Keep the streamed reply in history for the next follow-up.
            messages.append({"role": "assistant", "content": reply_s})
    return results
async def main():
    """Run every scenario sequentially and write the Markdown report.

    A scenario failure (network/API error) is recorded in the results
    instead of aborting, so one bad scenario does not lose the others.
    """
    print("RNS1.4 CHAT AI 质量评估 — 开始")
    print(f"模型: {BAILIAN_MODEL}")
    print(f"端点: {BAILIAN_BASE_URL}")
    print(f"时间: {datetime.now().isoformat()}")
    all_results: list[dict] = []
    for scenario in SCENARIOS:
        try:
            result = await run_scenario(scenario)
            all_results.append(result)
        except Exception as e:
            # Best-effort: record the failure and keep running the rest.
            print(f"\n ❌ 场景失败: {e}")
            all_results.append({
                "name": scenario["name"],
                "description": scenario["description"],
                "error": str(e),
            })
    # Generate the Markdown report.
    report = generate_report(all_results)
    # FIX: the filename previously hard-coded the date "2026-03-20";
    # derive it from the actual run date so later runs don't reuse a
    # stale-dated filename.
    date_tag = datetime.now().strftime("%Y-%m-%d")
    output_path = _root / "docs" / "reports" / f"{date_tag}__rns14_chat_ai_quality_eval.md"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(report, encoding="utf-8")
    print(f"\n✅ 报告已输出: {output_path}")
def generate_report(results: list[dict]) -> str:
    """Render the scenario results as a Markdown evaluation report.

    Args:
        results: One dict per scenario. A successful scenario carries
            ``name``, ``description`` and ``rounds`` (each round holds the
            user message, AI reply and timing metadata); a failed one
            carries ``name``, ``description`` and ``error``.

    Returns:
        The complete Markdown document as a single string.
    """
    lines: list[str] = []
    lines.append("# RNS1.4 CHAT 模块 AI 返回质量评估报告")
    lines.append("")
    lines.append(f"- 评估时间: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    lines.append(f"- 模型: {BAILIAN_MODEL}")
    lines.append(f"- 端点: {BAILIAN_BASE_URL}")
    lines.append(f"- 场景数: {len(results)}")
    lines.append("")
    lines.append("---")
    lines.append("")
    for r in results:
        lines.append(f"## {r['name']}")
        lines.append("")
        lines.append(f"**场景描述**: {r['description']}")
        lines.append("")
        if "error" in r:
            # Failed scenarios still appear in the report so the run is auditable.
            lines.append(f"**❌ 执行失败**: {r['error']}")
            lines.append("")
            continue
        for rd in r.get("rounds", []):
            # FIX: the closing full-width paren was missing from this heading.
            lines.append(f"### 第 {rd['round']} 轮({rd['mode']})")
            lines.append("")
            lines.append("**用户发送**:")
            lines.append("")
            lines.append("```")
            lines.append(rd["user_message"])
            lines.append("```")
            lines.append("")
            lines.append("**AI 回复**:")
            lines.append("")
            lines.append("```")
            lines.append(rd["ai_reply"])
            lines.append("```")
            lines.append("")
            # Truthy checks deliberately hide zero/absent token & chunk counts.
            meta_parts = [f"耗时 {rd['elapsed_seconds']}s"]
            if rd.get("tokens_used"):
                meta_parts.append(f"tokens: {rd['tokens_used']}")
            if rd.get("chunk_count"):
                meta_parts.append(f"chunks: {rd['chunk_count']}")
            lines.append(f"**性能**: {' | '.join(meta_parts)}")
            lines.append("")
        lines.append("---")
        lines.append("")
    # Scoring table placeholder — filled in manually by the evaluator.
    lines.append("## 综合评价")
    lines.append("")
    lines.append("| 维度 | 评分 | 说明 |")
    lines.append("|------|------|------|")
    lines.append("| 语义相关性 | — | — |")
    lines.append("| 中文表达质量 | — | — |")
    lines.append("| 上下文理解 | — | — |")
    lines.append("| 多轮连贯性 | — | — |")
    lines.append("| 响应速度 | — | — |")
    lines.append("| 流式输出稳定性 | — | — |")
    lines.append("")
    lines.append("> 评分标准: ✅ 优秀 / ⚠️ 可接受 / ❌ 不合格")
    lines.append("")
    return "\n".join(lines)
# Script entry point: run the async evaluation driver.
if __name__ == "__main__":
    asyncio.run(main())