1. docs/ai/app2a_finance_area_system_prompt_20260422_v1.md (新建 · v1.2 生产版): - 基于 app2_finance V5.1 派生 - 板块 C 改"业态收入结构" · 板块 E 改"业态定位与对比" - 新增 H7 硬约束:业态特征引用必须紧跟 payload 真实数据 - H6 扩展区域级 6 类字段缺失降级(储值卡/分渠道现金流/现金流出/会员占比/按星期/日异常) - 经 3 次修正:v1"稀疏" → v1.1 纠正为业务真实 0/非 0 → v1.2 纠正为字段存在/整块缺失 - 已同步百炼控制台 APP ID 0ae965029bc54706bcff44f511ac716b 2. docs/ai/app2_finance_multi_app_design.md (新建 · v2 定稿): - 6 章 + 3 附录 · Q1-Q7 全部决策 · 6 阶段 28 项 checklist - 72 组合数据源支持度三档梳理(必须 / 业务级全店 / 字段存在 vs 整块缺失) - 2 套 prompt 拼接方案 · 2 个派生百炼 APP 策略 3. docs/audit/changes/2026-04-23__app2a_finance_area_integrated.md (新建): - 完整审计记录 · 13 高风险文件逐项注解 - 数据库变更 + 风险与回滚 + 验证方式 + 合规检查 4. docs/audit/audit_dashboard.md (刷新 · 135 条记录) 5. scripts/ab_test_app2a_area.py (新建): - 8 业态 × 3 轮 = 24 次采样评估含金量 - 自动检测 H1/H2/H3/H7 硬约束通过率 + seq11 三色灯分布 6. scripts/ab_to_cache.py (新建): - 复用 A/B 结果直接写 ai_cache · 绕开百炼预算验证 UI 端到端 A/B 实测 24/24 成功 · 12 条齐整率 100% · H1/H3/H7 100% · 达生产级。 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
213 lines
7.9 KiB
Python
213 lines
7.9 KiB
Python
"""App2a 区域财务洞察 system prompt 含金量评估脚本。
|
||
|
||
对 8 业态 × 3 轮 = 24 次百炼调用采样,验证 v1.2 system prompt 输出质量是否
|
||
达到 V5.1 全域版同等水准(店长视角:准确性/洞察深度/稳定性)。
|
||
|
||
用法:
|
||
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/ab_test_app2a_area.py
|
||
|
||
输出:
|
||
export/ai-ab-test/round_v1_app2a_area/<area>_round<i>.json # 24 个原始文件
|
||
export/ai-ab-test/round_v1_app2a_area/summary.json # 汇总报告
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
sys.path.insert(0, 'apps/backend')
|
||
from dotenv import load_dotenv
|
||
|
||
load_dotenv(dotenv_path=os.path.join(os.getcwd(), '.env'))
|
||
|
||
from app.ai.config import AIConfig
|
||
from app.ai.dashscope_client import DashScopeClient
|
||
from app.ai.prompts import build_app2a_area_prompt
|
||
|
||
|
||
# Site identifier sent as 'site_id' in every prompt-builder payload.
SITE_ID = 2790685415443269
# Value sent as 'time_dimension' in every prompt-builder payload.
TIME_DIMENSION = 'this_month'
# Area keys sampled in turn; each one is sent as 'area' to the prompt builder.
AREAS = ('hall', 'hallA', 'hallB', 'hallC', 'vip', 'snooker', 'mahjong', 'ktv')
# Number of sampling rounds executed for each area.
ROUNDS_PER_AREA = 3
# Directory that receives the per-round JSON files and summary.json.
OUT_DIR = Path('export/ai-ab-test/round_v1_app2a_area')
|
||
|
||
|
||
def classify_light(content: str) -> str:
    """Return the traffic-light label whose marker appears in ``content``.

    Markers are probed in the fixed order red, yellow, green and the first
    one found wins, so text mentioning several lights is labelled by the
    most severe one. ``'unknown'`` is returned when no marker is present.
    """
    probes = (
        ('red', r'🔴|红灯'),
        ('yellow', r'🟡|黄灯'),
        ('green', r'🟢|绿灯'),
    )
    for label, pattern in probes:
        if re.search(pattern, content) is not None:
            return label
    return 'unknown'
|
||
|
||
|
||
# Trend words that H2 requires to be backed by a numeric anchor.
_TREND_WORD_RE = re.compile(
    r'(下滑|下降|上升|提升|收缩|萎缩|承压|走弱|走强|加剧|恶化|显著|大幅)'
)
# A number followed by a percent sign or the yuan unit.
_NUMERIC_ANCHOR_RE = re.compile(r'[-+]?\d+(?:\.\d+)?\s*(?:%|元)')
# Sentence boundaries: ASCII ';', ideographic full stop, full-width ';' (U+FF1B).
_SENTENCE_SPLIT_RE = re.compile(r'[;。\uff1b]')
# H1: an explicit "N days in the current period" statement.
_ALIGN_DAYS_RE = re.compile(r'当期\s*\d+\s*天')
# H3: an industry benchmark immediately followed by a digit.
_INDUSTRY_NUMBER_RE = re.compile(r'行业(均值|警戒线|标准)\s*\d')
# seq 12: a cadence word and an action verb must both be present.
_CADENCE_RE = re.compile(r'(每(周|日|月)|双周)')
_ACTION_RE = re.compile(r'(启动|暂停|停用|核查|调整|触发|输出|排查)')


def analyze(parsed) -> dict:
    """Score one parsed model response against the hard-constraint heuristics.

    Args:
        parsed: Either a list of insight items, or a dict whose ``'insights'``
            key holds that list. Items are read as dicts with ``'content'``
            and ``'seq'`` keys; non-dict items are counted but never matched.

    Returns:
        ``{'count': 0, 'valid': False, 'reasons': [...]}`` when no list can be
        found. Otherwise a dict with:

        * ``count`` / ``valid``: number of items, and whether it equals 12.
        * ``h1_align_caliber``: some item states the comparison caliber.
        * ``h2_no_bare_trend``: no sentence has a trend word without a number
          followed by ``%`` or ``元``.
        * ``h3_no_fake_industry``: no item quotes an industry benchmark number.
        * ``h7_trait_ref``: some item mentions ``业态``.
        * ``seq11_light``: light label of the last item with ``seq == 11``,
          or ``None`` if there is none.
        * ``seq12_has_action``: a ``seq == 12`` item has a cadence word and an
          action verb.
    """
    if not isinstance(parsed, list):
        parsed = parsed.get('insights') if isinstance(parsed, dict) else None
    if not isinstance(parsed, list):
        return {'count': 0, 'valid': False, 'reasons': ['未返回列表']}

    has_align = False            # H1: comparison caliber stated
    has_number_in_trend = True   # H2: every trend word has a numeric anchor
    has_industry_fake = False    # H3: fabricated industry figure found
    has_trait_ref = False        # H7: business-format trait referenced
    light = None
    tracking_has_action = False

    for ins in parsed:
        if not isinstance(ins, dict):
            continue
        raw = ins.get('content') or ''
        # A non-string content would make the substring/regex checks raise;
        # coerce it so one malformed item cannot abort the whole analysis.
        content = raw if isinstance(raw, str) else str(raw)
        seq = ins.get('seq')

        if (
            '对比口径' in content
            or '同天数对齐' in content
            or _ALIGN_DAYS_RE.search(content)
        ):
            has_align = True

        # '业态' also covers the longer phrases '业态说明' and '业态特征'.
        if '业态' in content:
            has_trait_ref = True

        # H2 is judged per sentence: a trend word needs its number in the
        # same sentence, not merely somewhere in the item.
        if has_number_in_trend and any(
            _TREND_WORD_RE.search(sentence)
            and not _NUMERIC_ANCHOR_RE.search(sentence)
            for sentence in _SENTENCE_SPLIT_RE.split(content)
        ):
            has_number_in_trend = False

        if _INDUSTRY_NUMBER_RE.search(content):
            has_industry_fake = True

        if seq == 11:
            light = classify_light(content)
        if seq == 12 and _CADENCE_RE.search(content) and _ACTION_RE.search(content):
            tracking_has_action = True

    count = len(parsed)
    return {
        'count': count,
        'valid': count == 12,
        'h1_align_caliber': has_align,
        'h2_no_bare_trend': has_number_in_trend,
        'h3_no_fake_industry': not has_industry_fake,
        'h7_trait_ref': has_trait_ref,
        'seq11_light': light,
        'seq12_has_action': tracking_has_action,
    }
|
||
|
||
|
||
async def run_one(client, cfg, area: str, round_idx: int) -> dict:
    """Run a single sampling call for ``area`` and package the outcome.

    The prompt is built first; the app call itself is timed, and any
    exception it raises is captured in the returned record instead of
    propagating.

    Returns:
        A record with the area, round index, success flag, duration in
        seconds (rounded to 2 decimals), token count, prompt length, the
        parsed response, the error text (or ``None``) and the ``analyze``
        result (``None`` when the parsed response is falsy).
    """
    payload = {
        'site_id': SITE_ID,
        'time_dimension': TIME_DIMENSION,
        'area': area,
    }
    prompt = await build_app2a_area_prompt(payload)

    started = time.monotonic()
    try:
        parsed, tokens, _ = await client.call_app(
            app_id=cfg.app_id_2a_finance_area,
            prompt=prompt,
        )
    except Exception as exc:
        elapsed = time.monotonic() - started
        parsed, tokens = None, 0
        ok = False
        error = f'{type(exc).__name__}: {exc}'
    else:
        elapsed = time.monotonic() - started
        ok = True
        error = None

    analysis = analyze(parsed) if parsed else None
    return {
        'area': area,
        'round_idx': round_idx,
        'ok': ok,
        'duration_s': round(elapsed, 2),
        'tokens': tokens,
        'prompt_len': len(prompt),
        'parsed': parsed,
        'error': error,
        'analysis': analysis,
    }
|
||
|
||
|
||
async def main():
    """Sample every area for the configured number of rounds and report.

    Each round's record is written to ``OUT_DIR/<area>_round<i>.json`` as it
    completes; afterwards an aggregate ``summary.json`` is written and a
    human-readable summary is printed.
    """
    cfg = AIConfig.from_env()
    client = DashScopeClient(api_key=cfg.api_key, workspace_id=cfg.workspace_id)

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Flatten the area x round grid so progress can be numbered with enumerate.
    schedule = [
        (area, round_no)
        for area in AREAS
        for round_no in range(1, ROUNDS_PER_AREA + 1)
    ]
    total = len(AREAS) * ROUNDS_PER_AREA
    results = []
    for done, (area, round_no) in enumerate(schedule, start=1):
        print(f'[{done:>2}/{total}] {area:8s} round {round_no} ...', flush=True)
        record = await run_one(client, cfg, area, round_no)

        out_file = OUT_DIR / f'{area}_round{round_no}.json'
        with out_file.open('w', encoding='utf-8') as fh:
            json.dump(record, fh, ensure_ascii=False, indent=2, default=str)

        if record['ok']:
            status = 'OK'
        else:
            status = f"FAIL ({record['error']})"
        analysis = record.get('analysis') or {}
        print(f' {status} · {record["duration_s"]}s · tokens={record["tokens"]} · 12 条={analysis.get("valid")} · 三色={analysis.get("seq11_light")}')
        results.append(record)

    # Aggregate report
    success = [rec for rec in results if rec['ok']]
    divisor = max(len(success), 1)

    # Per-area breakdown
    by_area = {}
    for area in AREAS:
        area_results = [rec for rec in results if rec['area'] == area]
        ok_rounds = [rec for rec in area_results if rec['ok']]
        analyses = [(rec.get('analysis') or {}) for rec in ok_rounds]
        valid_12 = sum(1 for item in analyses if item.get('valid'))
        lights = [item.get('seq11_light') for item in analyses]
        by_area[area] = {
            'rounds': len(area_results),
            'ok': len(ok_rounds),
            'valid_12_pct': round(100 * valid_12 / max(len(area_results), 1), 1),
            'light_distribution': {
                lg: lights.count(lg)
                for lg in ('red', 'yellow', 'green', 'unknown')
            },
        }

    # Hard-constraint pass rates, over successful rounds that have an analysis
    analyzed = [rec['analysis'] for rec in success if rec.get('analysis')]

    def pass_rate(key: str) -> float:
        if not analyzed:
            return 0.0
        hits = sum(1 for item in analyzed if item.get(key))
        return round(100 * hits / len(analyzed), 1)

    constraint_keys = (
        ('H1 对比口径显式', 'h1_align_caliber'),
        ('H2 无裸趋势词', 'h2_no_bare_trend'),
        ('H3 无编造行业数字', 'h3_no_fake_industry'),
        ('H7 引用业态特征', 'h7_trait_ref'),
        ('seq12 含触发动作', 'seq12_has_action'),
    )
    pass_rates = {label: pass_rate(key) for label, key in constraint_keys}

    summary = {
        'total': total,
        'success': len(success),
        'failed': total - len(success),
        'avg_duration_s': round(sum(rec['duration_s'] for rec in success) / divisor, 2),
        'avg_tokens': int(sum(rec['tokens'] for rec in success) / divisor),
        'by_area': by_area,
        'hard_constraints_pass_rate': pass_rates,
    }

    with (OUT_DIR / 'summary.json').open('w', encoding='utf-8') as fh:
        json.dump(summary, fh, ensure_ascii=False, indent=2, default=str)

    print()
    print('=' * 60)
    print(f'采样完成:{summary["success"]}/{summary["total"]} 成功')
    print(f'平均耗时:{summary["avg_duration_s"]}s · 平均 tokens:{summary["avg_tokens"]}')
    print()
    print('硬约束通过率:')
    for label, rate in summary['hard_constraints_pass_rate'].items():
        print(f' {label}: {rate}%')
    print()
    print('各业态 valid_12 率:')
    for area, info in summary['by_area'].items():
        print(f' {area:8s}: {info["valid_12_pct"]}% · 灯色 {info["light_distribution"]}')
|
||
|
||
|
||
# Script entry point: run the sampling coroutine to completion when this file
# is executed directly (not when it is imported).
if __name__ == '__main__':
    asyncio.run(main())
|