# -*- coding: utf-8 -*-
"""AI response quality evaluation script for the RNS1.4 CHAT module.

Calls the Bailian API directly (OpenAI-compatible protocol), simulates
conversations for the four entry scenarios, and evaluates reply quality
(semantic relevance, Chinese correctness, context understanding).

Output: Markdown evaluation report -> docs/reports/
"""
from __future__ import annotations

import asyncio
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv

# Load the repository-root .env before any configuration is read.
_root = Path(__file__).resolve().parents[2]
load_dotenv(_root / ".env")

BAILIAN_API_KEY = os.environ.get("BAILIAN_API_KEY")
BAILIAN_BASE_URL = os.environ.get("BAILIAN_BASE_URL")
BAILIAN_MODEL = os.environ.get("BAILIAN_MODEL")

# Fail fast when any required configuration value is missing.
if not all([BAILIAN_API_KEY, BAILIAN_BASE_URL, BAILIAN_MODEL]):
    print("ERROR: 缺少 BAILIAN_API_KEY / BAILIAN_BASE_URL / BAILIAN_MODEL 环境变量")
    sys.exit(1)

# Imported after the config check so a client is never built from bad config.
import openai

client = openai.AsyncOpenAI(api_key=BAILIAN_API_KEY, base_url=BAILIAN_BASE_URL)

# The system prompt is serialized as JSON (non-ASCII preserved) to mirror the
# message format the CHAT module sends in production.
SYSTEM_PROMPT = json.dumps(
    {"task": "你是台球门店的 AI 助手,根据用户的问题和当前页面上下文提供帮助。"},
    ensure_ascii=False,
)
|
||
|
||
# ── The 4 test scenarios ─────────────────────────────────────


def _context_message(source_page: str, page_context: dict, screen_content: str) -> dict:
    """Build the JSON page-context user message that precedes the real question."""
    return {
        "role": "user",
        "content": json.dumps(
            {
                "current_time": datetime.now().isoformat(),
                "source_page": source_page,
                "page_context": page_context,
                "screen_content": screen_content,
            },
            ensure_ascii=False,
        ),
    }


SCENARIOS: list[dict] = [
    {
        "name": "场景1: task 入口 — 维客任务咨询",
        "context_type": "task",
        "description": "助教从任务详情页进入,询问如何完成一个维客任务",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            _context_message(
                "task-detail",
                {
                    "task_type": "retention",
                    "member_name": "张三",
                    "priority_score": 85,
                    "expires_at": "2026-03-25",
                },
                "维客任务:张三,优先级85分,3月25日到期",
            ),
            {
                "role": "user",
                "content": "这个客户最近消费频率下降了,我应该怎么跟他沟通比较好?有什么话术建议吗?",
            },
        ],
    },
    {
        "name": "场景2: customer 入口 — 客户详情咨询",
        "context_type": "customer",
        "description": "助教从客户详情页进入,询问客户消费情况分析",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            _context_message(
                "customer-detail",
                {
                    "member_id": 12345,
                    "member_name": "李四",
                    "member_level": "VIP",
                    "last_visit": "2026-03-15",
                    "total_consumption": "¥8,500",
                },
                "客户:李四,VIP会员,累计消费¥8,500,最近到店3月15日",
            ),
            {
                "role": "user",
                "content": "帮我分析一下这个客户的消费习惯,他适合推荐什么课程?",
            },
        ],
    },
    {
        "name": "场景3: coach 入口 — 助教业绩咨询",
        "context_type": "coach",
        "description": "助教从自己的详情页进入,询问业绩提升建议",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            _context_message(
                "coach-detail",
                {
                    "coach_name": "王教练",
                    "monthly_lessons": 45,
                    "monthly_revenue": "¥12,000",
                    "customer_count": 28,
                },
                "助教:王教练,本月课时45节,收入¥12,000,服务客户28人",
            ),
            {
                "role": "user",
                "content": "我这个月业绩一般,有什么方法可以提升客户续课率?",
            },
        ],
    },
    {
        "name": "场景4: general 入口 — 通用对话",
        "context_type": "general",
        "description": "助教从首页直接进入聊天,无特定上下文",
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": "台球馆周末客流量大的时候,怎么合理安排台位和助教排班?",
            },
        ],
    },
]
|
||
|
||
# ── Multi-turn follow-ups (appended to scenario 1 only) ──────

FOLLOWUP_MESSAGES: list[str] = [
    "如果他说最近比较忙没时间来,我该怎么回应?",
    "好的,那如果他愿意来,我应该推荐什么样的课程套餐?",
]
|
||
|
||
|
||
async def call_ai(messages: list[dict]) -> tuple[str, float, int | None]:
    """Call the Bailian API (non-streaming).

    Args:
        messages: OpenAI-style chat messages.

    Returns:
        Tuple of (reply content, elapsed seconds, total tokens used —
        ``None`` when the API response carries no usage info).
    """
    # perf_counter is monotonic: unlike time.time() it cannot jump if the
    # wall clock is adjusted mid-call, so the elapsed figure is reliable.
    t0 = time.perf_counter()
    response = await client.chat.completions.create(
        model=BAILIAN_MODEL,
        messages=messages,
        temperature=0.7,
        max_tokens=2000,
    )
    elapsed = time.perf_counter() - t0
    # content may be None on some providers; normalize to "".
    content = response.choices[0].message.content or ""
    tokens = response.usage.total_tokens if response.usage else None
    return content, elapsed, tokens
|
||
|
||
|
||
async def call_ai_stream(messages: list[dict]) -> tuple[str, float, int]:
    """Call the Bailian API in streaming mode.

    Args:
        messages: OpenAI-style chat messages.

    Returns:
        Tuple of (full reply text, elapsed seconds, number of content chunks).
    """
    # perf_counter is monotonic: unlike time.time() it cannot jump if the
    # wall clock is adjusted mid-stream, so the elapsed figure is reliable.
    t0 = time.perf_counter()
    chunks: list[str] = []
    chunk_count = 0
    response = await client.chat.completions.create(
        model=BAILIAN_MODEL,
        messages=messages,
        temperature=0.7,
        max_tokens=2000,
        stream=True,
    )
    async for chunk in response:
        # Some frames (role deltas, final usage frame) carry no content.
        if chunk.choices and chunk.choices[0].delta.content:
            chunks.append(chunk.choices[0].delta.content)
            chunk_count += 1
    elapsed = time.perf_counter() - t0
    return "".join(chunks), elapsed, chunk_count
|
||
|
||
|
||
async def run_scenario(scenario: dict) -> dict:
    """Execute one scenario end-to-end and return its result record."""
    banner = "=" * 60
    print(f"\n{banner}")
    print(f" {scenario['name']}")
    print(banner)

    record = {"name": scenario["name"], "description": scenario["description"], "rounds": []}
    history = list(scenario["messages"])

    # Round 1: non-streaming call on the scenario's initial question.
    question = history[-1]["content"]
    print(f"\n[用户] {question[:80]}...")
    reply, elapsed, tokens = await call_ai(history)
    print(f"[AI] ({elapsed:.1f}s, {tokens} tokens) {reply[:100]}...")
    record["rounds"].append(
        {
            "round": 1,
            "mode": "非流式",
            "user_message": question,
            "ai_reply": reply,
            "elapsed_seconds": round(elapsed, 2),
            "tokens_used": tokens,
        }
    )
    history.append({"role": "assistant", "content": reply})

    # Follow-up rounds (streaming) are exercised for scenario 1 ("task") only.
    if scenario["context_type"] == "task":
        for round_no, followup in enumerate(FOLLOWUP_MESSAGES, start=2):
            history.append({"role": "user", "content": followup})
            print(f"\n[用户] {followup}")
            stream_reply, stream_elapsed, n_chunks = await call_ai_stream(history)
            print(f"[AI-Stream] ({stream_elapsed:.1f}s, {n_chunks} chunks) {stream_reply[:100]}...")
            record["rounds"].append(
                {
                    "round": round_no,
                    "mode": "流式",
                    "user_message": followup,
                    "ai_reply": stream_reply,
                    "elapsed_seconds": round(stream_elapsed, 2),
                    "chunk_count": n_chunks,
                }
            )
            history.append({"role": "assistant", "content": stream_reply})

    return record
|
||
|
||
|
||
async def main():
    """Run every scenario and write the Markdown evaluation report."""
    print("RNS1.4 CHAT AI 质量评估 — 开始")
    print(f"模型: {BAILIAN_MODEL}")
    print(f"端点: {BAILIAN_BASE_URL}")
    print(f"时间: {datetime.now().isoformat()}")

    all_results: list[dict] = []
    for scenario in SCENARIOS:
        # One failed scenario must not abort the whole evaluation run.
        try:
            all_results.append(await run_scenario(scenario))
        except Exception as exc:
            print(f"\n ❌ 场景失败: {exc}")
            all_results.append(
                {
                    "name": scenario["name"],
                    "description": scenario["description"],
                    "error": str(exc),
                }
            )

    # Render and persist the Markdown report.
    report = generate_report(all_results)
    # NOTE(review): the report filename date is hardcoded — confirm intentional.
    output_path = _root / "docs" / "reports" / "2026-03-20__rns14_chat_ai_quality_eval.md"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(report, encoding="utf-8")
    print(f"\n✅ 报告已输出: {output_path}")
|
||
|
||
|
||
def generate_report(
    results: list[dict],
    *,
    model: str | None = None,
    endpoint: str | None = None,
) -> str:
    """Render the Markdown evaluation report.

    Args:
        results: Per-scenario records from ``run_scenario``, or error records
            containing an ``"error"`` key.
        model: Model name shown in the header; defaults to ``BAILIAN_MODEL``.
        endpoint: API endpoint shown in the header; defaults to
            ``BAILIAN_BASE_URL``.

    Returns:
        The complete report as a single Markdown string.
    """
    # Keyword-only overrides keep the default behavior identical while
    # decoupling rendering from module globals (useful for testing).
    if model is None:
        model = BAILIAN_MODEL
    if endpoint is None:
        endpoint = BAILIAN_BASE_URL

    lines: list[str] = [
        "# RNS1.4 CHAT 模块 AI 返回质量评估报告",
        "",
        f"- 评估时间: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"- 模型: {model}",
        f"- 端点: {endpoint}",
        f"- 场景数: {len(results)}",
        "",
        "---",
        "",
    ]

    for r in results:
        lines.append(f"## {r['name']}")
        lines.append("")
        lines.append(f"**场景描述**: {r['description']}")
        lines.append("")

        if "error" in r:
            lines.append(f"**❌ 执行失败**: {r['error']}")
            lines.append("")
            continue

        for rd in r.get("rounds", []):
            lines.append(f"### 第 {rd['round']} 轮({rd['mode']})")
            lines.append("")
            lines.append("**用户发送**:")
            lines.append("")
            lines.append("```")
            lines.append(rd["user_message"])
            lines.append("```")
            lines.append("")
            lines.append("**AI 回复**:")
            lines.append("")
            lines.append("```")
            lines.append(rd["ai_reply"])
            lines.append("```")
            lines.append("")

            meta_parts = [f"耗时 {rd['elapsed_seconds']}s"]
            # `is not None` so a legitimate value of 0 is still reported
            # (the previous truthiness check silently dropped zeros).
            if rd.get("tokens_used") is not None:
                meta_parts.append(f"tokens: {rd['tokens_used']}")
            if rd.get("chunk_count") is not None:
                meta_parts.append(f"chunks: {rd['chunk_count']}")
            lines.append(f"**性能**: {' | '.join(meta_parts)}")
            lines.append("")

        lines.append("---")
        lines.append("")

    # Evaluation placeholder table — filled in manually by the reviewer.
    lines.extend(
        [
            "## 综合评价",
            "",
            "| 维度 | 评分 | 说明 |",
            "|------|------|------|",
            "| 语义相关性 | — | — |",
            "| 中文表达质量 | — | — |",
            "| 上下文理解 | — | — |",
            "| 多轮连贯性 | — | — |",
            "| 响应速度 | — | — |",
            "| 流式输出稳定性 | — | — |",
            "",
            "> 评分标准: ✅ 优秀 / ⚠️ 可接受 / ❌ 不合格",
            "",
        ]
    )

    return "\n".join(lines)
|
||
|
||
|
||
# Script entry point.
if __name__ == "__main__":
    asyncio.run(main())
|