# -*- coding: utf-8 -*- """ RNS1.4 CHAT 模块 AI 返回质量评估脚本。 直接调用百炼 API(OpenAI 兼容协议),模拟 4 种入口场景的对话, 评估 AI 回复的质量(语义相关性、中文正确性、上下文理解能力)。 输出:Markdown 评估报告 → docs/reports/ """ from __future__ import annotations import asyncio import json import os import sys import time from datetime import datetime from pathlib import Path from dotenv import load_dotenv # 加载根 .env _root = Path(__file__).resolve().parents[2] load_dotenv(_root / ".env") BAILIAN_API_KEY = os.environ.get("BAILIAN_API_KEY") BAILIAN_BASE_URL = os.environ.get("BAILIAN_BASE_URL") BAILIAN_MODEL = os.environ.get("BAILIAN_MODEL") if not all([BAILIAN_API_KEY, BAILIAN_BASE_URL, BAILIAN_MODEL]): print("ERROR: 缺少 BAILIAN_API_KEY / BAILIAN_BASE_URL / BAILIAN_MODEL 环境变量") sys.exit(1) import openai client = openai.AsyncOpenAI(api_key=BAILIAN_API_KEY, base_url=BAILIAN_BASE_URL) SYSTEM_PROMPT = json.dumps( {"task": "你是台球门店的 AI 助手,根据用户的问题和当前页面上下文提供帮助。"}, ensure_ascii=False, ) # ── 4 个测试场景 ────────────────────────────────────────────── SCENARIOS: list[dict] = [ { "name": "场景1: task 入口 — 维客任务咨询", "context_type": "task", "description": "助教从任务详情页进入,询问如何完成一个维客任务", "messages": [ { "role": "system", "content": SYSTEM_PROMPT, }, { "role": "user", "content": json.dumps( { "current_time": datetime.now().isoformat(), "source_page": "task-detail", "page_context": { "task_type": "retention", "member_name": "张三", "priority_score": 85, "expires_at": "2026-03-25", }, "screen_content": "维客任务:张三,优先级85分,3月25日到期", }, ensure_ascii=False, ), }, { "role": "user", "content": "这个客户最近消费频率下降了,我应该怎么跟他沟通比较好?有什么话术建议吗?", }, ], }, { "name": "场景2: customer 入口 — 客户详情咨询", "context_type": "customer", "description": "助教从客户详情页进入,询问客户消费情况分析", "messages": [ { "role": "system", "content": SYSTEM_PROMPT, }, { "role": "user", "content": json.dumps( { "current_time": datetime.now().isoformat(), "source_page": "customer-detail", "page_context": { "member_id": 12345, "member_name": "李四", "member_level": "VIP", "last_visit": "2026-03-15", "total_consumption": "¥8,500", }, "screen_content": "客户:李四,VIP会员,累计消费¥8,500,最近到店3月15日", }, ensure_ascii=False, ), }, { "role": "user", "content": "帮我分析一下这个客户的消费习惯,他适合推荐什么课程?", }, ], }, { "name": "场景3: coach 入口 — 助教业绩咨询", "context_type": "coach", "description": "助教从自己的详情页进入,询问业绩提升建议", "messages": [ { "role": "system", "content": SYSTEM_PROMPT, }, { "role": "user", "content": json.dumps( { "current_time": datetime.now().isoformat(), "source_page": "coach-detail", "page_context": { "coach_name": "王教练", "monthly_lessons": 45, "monthly_revenue": "¥12,000", "customer_count": 28, }, "screen_content": "助教:王教练,本月课时45节,收入¥12,000,服务客户28人", }, ensure_ascii=False, ), }, { "role": "user", "content": "我这个月业绩一般,有什么方法可以提升客户续课率?", }, ], }, { "name": "场景4: general 入口 — 通用对话", "context_type": "general", "description": "助教从首页直接进入聊天,无特定上下文", "messages": [ { "role": "system", "content": SYSTEM_PROMPT, }, { "role": "user", "content": "台球馆周末客流量大的时候,怎么合理安排台位和助教排班?", }, ], }, ] # ── 多轮追问(场景1 追加) ──────────────────────────────────── FOLLOWUP_MESSAGES = [ "如果他说最近比较忙没时间来,我该怎么回应?", "好的,那如果他愿意来,我应该推荐什么样的课程套餐?", ] async def call_ai(messages: list[dict]) -> tuple[str, float, int | None]: """调用百炼 API,返回 (回复内容, 耗时秒, tokens_used)。""" t0 = time.time() response = await client.chat.completions.create( model=BAILIAN_MODEL, messages=messages, temperature=0.7, max_tokens=2000, ) elapsed = time.time() - t0 content = response.choices[0].message.content or "" tokens = response.usage.total_tokens if response.usage else None return content, elapsed, tokens async def call_ai_stream(messages: list[dict]) -> tuple[str, float, int]: """流式调用百炼 API,返回 (完整回复, 耗时秒, chunk数)。""" t0 = time.time() chunks: list[str] = [] chunk_count = 0 response = await client.chat.completions.create( model=BAILIAN_MODEL, messages=messages, temperature=0.7, max_tokens=2000, stream=True, ) async for chunk in response: if chunk.choices and chunk.choices[0].delta.content: chunks.append(chunk.choices[0].delta.content) chunk_count += 1 elapsed = time.time() - t0 return "".join(chunks), elapsed, chunk_count async def run_scenario(scenario: dict) -> dict: """执行单个场景,返回结果字典。""" print(f"\n{'='*60}") print(f" {scenario['name']}") print(f"{'='*60}") results = {"name": scenario["name"], "description": scenario["description"], "rounds": []} messages = list(scenario["messages"]) # 第一轮:非流式 user_msg = messages[-1]["content"] print(f"\n[用户] {user_msg[:80]}...") reply, elapsed, tokens = await call_ai(messages) print(f"[AI] ({elapsed:.1f}s, {tokens} tokens) {reply[:100]}...") results["rounds"].append({ "round": 1, "mode": "非流式", "user_message": user_msg, "ai_reply": reply, "elapsed_seconds": round(elapsed, 2), "tokens_used": tokens, }) messages.append({"role": "assistant", "content": reply}) # 第二轮:流式(仅场景1) if scenario["context_type"] == "task": for i, followup in enumerate(FOLLOWUP_MESSAGES): messages.append({"role": "user", "content": followup}) print(f"\n[用户] {followup}") reply_s, elapsed_s, chunk_count = await call_ai_stream(messages) print(f"[AI-Stream] ({elapsed_s:.1f}s, {chunk_count} chunks) {reply_s[:100]}...") results["rounds"].append({ "round": i + 2, "mode": "流式", "user_message": followup, "ai_reply": reply_s, "elapsed_seconds": round(elapsed_s, 2), "chunk_count": chunk_count, }) messages.append({"role": "assistant", "content": reply_s}) return results async def main(): print("RNS1.4 CHAT AI 质量评估 — 开始") print(f"模型: {BAILIAN_MODEL}") print(f"端点: {BAILIAN_BASE_URL}") print(f"时间: {datetime.now().isoformat()}") all_results: list[dict] = [] for scenario in SCENARIOS: try: result = await run_scenario(scenario) all_results.append(result) except Exception as e: print(f"\n ❌ 场景失败: {e}") all_results.append({ "name": scenario["name"], "description": scenario["description"], "error": str(e), }) # 生成 Markdown 报告 report = generate_report(all_results) output_path = _root / "docs" / "reports" / "2026-03-20__rns14_chat_ai_quality_eval.md" output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(report, encoding="utf-8") print(f"\n✅ 报告已输出: {output_path}") def generate_report(results: list[dict]) -> str: """生成 Markdown 评估报告。""" lines: list[str] = [] lines.append("# RNS1.4 CHAT 模块 AI 返回质量评估报告") lines.append("") lines.append(f"- 评估时间: {datetime.now().strftime('%Y-%m-%d %H:%M')}") lines.append(f"- 模型: {BAILIAN_MODEL}") lines.append(f"- 端点: {BAILIAN_BASE_URL}") lines.append(f"- 场景数: {len(results)}") lines.append("") lines.append("---") lines.append("") for r in results: lines.append(f"## {r['name']}") lines.append("") lines.append(f"**场景描述**: {r['description']}") lines.append("") if "error" in r: lines.append(f"**❌ 执行失败**: {r['error']}") lines.append("") continue for rd in r.get("rounds", []): lines.append(f"### 第 {rd['round']} 轮({rd['mode']})") lines.append("") lines.append(f"**用户发送**:") lines.append("") lines.append(f"```") lines.append(rd["user_message"]) lines.append(f"```") lines.append("") lines.append(f"**AI 回复**:") lines.append("") lines.append(f"```") lines.append(rd["ai_reply"]) lines.append(f"```") lines.append("") meta_parts = [f"耗时 {rd['elapsed_seconds']}s"] if rd.get("tokens_used"): meta_parts.append(f"tokens: {rd['tokens_used']}") if rd.get("chunk_count"): meta_parts.append(f"chunks: {rd['chunk_count']}") lines.append(f"**性能**: {' | '.join(meta_parts)}") lines.append("") lines.append("---") lines.append("") # AI 评价占位(由执行者填写) lines.append("## 综合评价") lines.append("") lines.append("| 维度 | 评分 | 说明 |") lines.append("|------|------|------|") lines.append("| 语义相关性 | — | — |") lines.append("| 中文表达质量 | — | — |") lines.append("| 上下文理解 | — | — |") lines.append("| 多轮连贯性 | — | — |") lines.append("| 响应速度 | — | — |") lines.append("| 流式输出稳定性 | — | — |") lines.append("") lines.append("> 评分标准: ✅ 优秀 / ⚠️ 可接受 / ❌ 不合格") lines.append("") return "\n".join(lines) if __name__ == "__main__": asyncio.run(main())