Files
Neo-ZQYY/scripts/ab_test_app2a_area.py
Neo d269ee6401 docs(ai): app2a v1.2 system prompt + 多 APP 派生设计 v2 + 审计 + A/B 脚本
1. docs/ai/app2a_finance_area_system_prompt_20260422_v1.md (新建 · v1.2 生产版):
   - 基于 app2_finance V5.1 派生
   - 板块 C 改"业态收入结构" · 板块 E 改"业态定位与对比"
   - 新增 H7 硬约束:业态特征引用必须紧跟 payload 真实数据
   - H6 扩展区域级 6 类字段缺失降级(储值卡/分渠道现金流/现金流出/会员占比/按星期/日异常)
   - 经 3 次修正:v1"稀疏" → v1.1 纠正为业务真实 0/非 0 → v1.2 纠正为字段存在/整块缺失
   - 已同步百炼控制台 APP ID 0ae965029bc54706bcff44f511ac716b

2. docs/ai/app2_finance_multi_app_design.md (新建 · v2 定稿):
   - 6 章 + 3 附录 · Q1-Q7 全部决策 · 6 阶段 28 项 checklist
   - 72 组合数据源支持度三档梳理(必须 / 业务级全店 / 字段存在 vs 整块缺失)
   - 2 套 prompt 拼接方案 · 2 个派生百炼 APP 策略

3. docs/audit/changes/2026-04-23__app2a_finance_area_integrated.md (新建):
   - 完整审计记录 · 13 高风险文件逐项注解
   - 数据库变更 + 风险与回滚 + 验证方式 + 合规检查

4. docs/audit/audit_dashboard.md (刷新 · 135 条记录)

5. scripts/ab_test_app2a_area.py (新建):
   - 8 业态 × 3 轮 = 24 次采样评估含金量
   - 自动检测 H1/H2/H3/H7 硬约束通过率 + seq11 三色灯分布

6. scripts/ab_to_cache.py (新建):
   - 复用 A/B 结果直接写 ai_cache · 绕开百炼预算验证 UI 端到端

A/B 实测 24/24 成功 · 12 条齐整率 100% · H1/H3/H7 100% · 达生产级。

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 21:56:46 +08:00

213 lines
7.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""App2a 区域财务洞察 system prompt 含金量评估脚本。
对 8 业态 × 3 轮 = 24 次百炼调用采样,验证 v1.2 system prompt 输出质量是否
达到 V5.1 全域版同等水准(店长视角:准确性/洞察深度/稳定性)。
用法:
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/ab_test_app2a_area.py
输出:
export/ai-ab-test/round_v1_app2a_area/<area>_round<i>.json # 24 个原始文件
export/ai-ab-test/round_v1_app2a_area/summary.json # 汇总报告
"""
from __future__ import annotations
import asyncio
import json
import os
import re
import sys
import time
from pathlib import Path
sys.path.insert(0, 'apps/backend')
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join(os.getcwd(), '.env'))
from app.ai.config import AIConfig
from app.ai.dashscope_client import DashScopeClient
from app.ai.prompts import build_app2a_area_prompt
# Site (store) id sent to the prompt builder as 'site_id' for every sample.
SITE_ID = 2790685415443269
# Time-dimension key sent to the prompt builder as 'time_dimension'.
TIME_DIMENSION = 'this_month'
# Area keys to sample; each one is called ROUNDS_PER_AREA times.
AREAS = ('hall', 'hallA', 'hallB', 'hallC', 'vip', 'snooker', 'mahjong', 'ktv')
# Number of calls issued per area.
ROUNDS_PER_AREA = 3
# Directory that receives the per-call JSON files and summary.json.
OUT_DIR = Path('export/ai-ab-test/round_v1_app2a_area')
def classify_light(content: str) -> str:
    """Map insight text to a traffic-light colour name.

    Markers (emoji or the Chinese word for the light) are probed in the
    order red, yellow, green, so when several colours are mentioned the
    first one in that order wins.

    Returns:
        ``'red'``, ``'yellow'`` or ``'green'``; ``'unknown'`` when no marker
        is present in ``content``.
    """
    markers = (
        ('red', r'🔴|红灯'),
        ('yellow', r'🟡|黄灯'),
        ('green', r'🟢|绿灯'),
    )
    for colour, pattern in markers:
        if re.search(pattern, content):
            return colour
    return 'unknown'
def analyze(parsed) -> dict:
    """Compute heuristic quality indicators for one model response.

    Args:
        parsed: Decoded response. Either a list of insight items, or a dict
            carrying that list under ``'insights'``. Items are expected to be
            dicts with ``'seq'`` and ``'content'``; items of any other type
            are counted but contribute no text.

    Returns:
        If no list can be found:
        ``{'count': 0, 'valid': False, 'reasons': [...]}``.
        Otherwise a dict with:

        * ``count`` / ``valid``: number of items, and whether it is exactly 12.
        * ``h1_align_caliber``: some item states the comparison caliber.
        * ``h2_no_bare_trend``: every sentence containing a trend word also
          contains a number followed by ``%`` or ``元``.
        * ``h3_no_fake_industry``: no item quotes an "industry average /
          warning line / standard" figure.
        * ``h7_trait_ref``: some item mentions the business-area wording.
        * ``seq11_light``: colour returned by ``classify_light`` for the item
          with ``seq == 11`` (``None`` if there is no such item).
        * ``seq12_has_action``: the ``seq == 12`` item contains both a cadence
          word and an action verb.
    """
    if not isinstance(parsed, list):
        parsed = parsed.get('insights') if isinstance(parsed, dict) else None
    if not isinstance(parsed, list):
        return {'count': 0, 'valid': False, 'reasons': ['未返回列表']}

    has_align = False            # H1: comparison caliber stated
    has_number_in_trend = True   # H2: trend words all carry a numeric anchor
    has_industry_fake = False    # H3: fabricated industry figure found
    has_trait_ref = False        # H7: business-area wording referenced
    light = None
    tracking_has_action = False

    for ins in parsed:
        if isinstance(ins, dict):
            raw_content = ins.get('content')
            seq = ins.get('seq')
        else:
            raw_content, seq = None, None

        # The response is model-generated and not schema-validated: a
        # non-string content (number, list, dict) would make re.search raise
        # TypeError, so coerce it to text instead of crashing the run.
        if isinstance(raw_content, str):
            content = raw_content
        elif raw_content:
            content = str(raw_content)
        else:
            content = ''

        # Accept a numeric string such as "11" so the seq-specific checks
        # below still fire; anything non-numeric is left untouched.
        if isinstance(seq, str):
            try:
                seq = int(seq.strip())
            except ValueError:
                pass

        if (
            '对比口径' in content
            or '同天数对齐' in content
            or re.search(r'当期\s*\d+\s*天', content)
        ):
            has_align = True

        # '业态' alone already covers the longer phrases '业态说明' and
        # '业态特征', so a single substring test is equivalent.
        if '业态' in content:
            has_trait_ref = True

        # H2: a sentence that uses a trend word must also quote a figure.
        trend_words = re.findall(
            r'(下滑|下降|上升|提升|收缩|萎缩|承压|走弱|走强|加剧|恶化|显著|大幅)',
            content,
        )
        if trend_words:
            # Split on ASCII ';', full-width '；' and '。' so that a number in
            # one clause cannot mask a bare trend word in the next clause.
            for sentence in re.split(r'[;；。]', content):
                uses_trend = any(w in sentence for w in trend_words)
                has_figure = re.search(r'[-+]?\d+(?:\.\d+)?\s*(?:%|元)', sentence)
                if uses_trend and not has_figure:
                    has_number_in_trend = False
                    break

        # H3: "industry average / warning line / standard" followed by a digit.
        if re.search(r'行业(均值|警戒线|标准)\s*\d', content):
            has_industry_fake = True

        if seq == 11:
            light = classify_light(content)
        if seq == 12:
            has_cadence = re.search(r'(每(周|日|月)|双周)', content)
            has_verb = re.search(r'(启动|暂停|停用|核查|调整|触发|输出|排查)', content)
            if has_cadence and has_verb:
                tracking_has_action = True

    count = len(parsed)
    return {
        'count': count,
        'valid': count == 12,
        'h1_align_caliber': has_align,
        'h2_no_bare_trend': has_number_in_trend,
        'h3_no_fake_industry': not has_industry_fake,
        'h7_trait_ref': has_trait_ref,
        'seq11_light': light,
        'seq12_has_action': tracking_has_action,
    }
async def run_one(client, cfg, area: str, round_idx: int) -> dict:
    """Perform one sampling call for ``area`` and package its outcome.

    The prompt is built first (outside the timed and guarded section); the
    application call itself is timed, and any exception it raises is recorded
    in the result instead of propagating.

    Args:
        client: Object exposing an awaitable ``call_app(app_id=..., prompt=...)``
            that returns a 3-tuple whose first two elements are the parsed
            response and the token count.
        cfg: Configuration object providing ``app_id_2a_finance_area``.
        area: Area key placed in the prompt request.
        round_idx: Round number, copied into the result unchanged.

    Returns:
        A dict with ``area``, ``round_idx``, ``ok``, ``duration_s``,
        ``tokens``, ``prompt_len``, ``parsed``, ``error`` and ``analysis``
        (``None`` when the parsed response is empty or the call failed).
    """
    request = {
        'site_id': SITE_ID,
        'time_dimension': TIME_DIMENSION,
        'area': area,
    }
    prompt = await build_app2a_area_prompt(request)

    parsed, tokens, error = None, 0, None
    started = time.monotonic()
    try:
        parsed, tokens, _ = await client.call_app(
            app_id=cfg.app_id_2a_finance_area,
            prompt=prompt,
        )
    except Exception as exc:
        # Record the failure and fall back to the "nothing returned" values.
        parsed, tokens = None, 0
        error = f'{type(exc).__name__}: {exc}'
    elapsed = time.monotonic() - started

    analysis = analyze(parsed) if parsed else None
    return {
        'area': area,
        'round_idx': round_idx,
        'ok': error is None,
        'duration_s': round(elapsed, 2),
        'tokens': tokens,
        'prompt_len': len(prompt),
        'parsed': parsed,
        'error': error,
        'analysis': analysis,
    }
def _write_json(path, payload) -> None:
    """Serialize ``payload`` as indented UTF-8 JSON into ``path``."""
    text = json.dumps(payload, ensure_ascii=False, indent=2, default=str)
    path.write_text(text, encoding='utf-8')


def _build_summary(results: list, total: int) -> dict:
    """Aggregate the per-call result dicts into the summary report."""
    success = [r for r in results if r['ok']]
    divisor = max(len(success), 1)  # avoid dividing by zero when every call failed
    summary = {
        'total': total,
        'success': len(success),
        'failed': total - len(success),
        'avg_duration_s': round(sum(r['duration_s'] for r in success) / divisor, 2),
        'avg_tokens': int(sum(r['tokens'] for r in success) / divisor),
        'by_area': {},
        'hard_constraints_pass_rate': {},
    }

    # Per-area breakdown.
    for area in AREAS:
        area_results = [r for r in results if r['area'] == area]
        ok_rounds = [r for r in area_results if r['ok']]
        valid_12 = sum(1 for r in ok_rounds if (r.get('analysis') or {}).get('valid'))
        # Rounds without an analysis or without a seq-11 item yield None here
        # and therefore fall into none of the four buckets below.
        lights = [(r.get('analysis') or {}).get('seq11_light') for r in ok_rounds]
        summary['by_area'][area] = {
            'rounds': len(area_results),
            'ok': len(ok_rounds),
            'valid_12_pct': round(100 * valid_12 / max(len(area_results), 1), 1),
            'light_distribution': {
                lg: lights.count(lg) for lg in ('red', 'yellow', 'green', 'unknown')
            },
        }

    # Pass rates are computed over successful calls that produced an analysis.
    analysed = [r for r in success if r.get('analysis')]

    def pass_rate(key: str) -> float:
        if not analysed:
            return 0.0
        hits = sum(1 for r in analysed if (r.get('analysis') or {}).get(key))
        return round(100 * hits / len(analysed), 1)

    summary['hard_constraints_pass_rate'] = {
        'H1 对比口径显式': pass_rate('h1_align_caliber'),
        'H2 无裸趋势词': pass_rate('h2_no_bare_trend'),
        'H3 无编造行业数字': pass_rate('h3_no_fake_industry'),
        'H7 引用业态特征': pass_rate('h7_trait_ref'),
        'seq12 含触发动作': pass_rate('seq12_has_action'),
    }
    return summary


def _print_summary(summary: dict) -> None:
    """Print the human-readable end-of-run report to stdout."""
    print()
    print('=' * 60)
    print(f'采样完成:{summary["success"]}/{summary["total"]} 成功')
    # The token figure previously ran straight into its label; a separator
    # is added so the value reads the same way as the duration next to it.
    print(f'平均耗时:{summary["avg_duration_s"]}s · 平均 tokens:{summary["avg_tokens"]}')
    print()
    print('硬约束通过率:')
    for k, v in summary['hard_constraints_pass_rate'].items():
        print(f' {k}: {v}%')
    print()
    print('各业态 valid_12 率:')
    for area, info in summary['by_area'].items():
        print(f' {area:8s}: {info["valid_12_pct"]}% · 灯色 {info["light_distribution"]}')


async def main():
    """Run every area/round sample, persist the results and report a summary.

    For each area in ``AREAS`` and each round, one call is awaited (calls run
    sequentially), its result is written to ``OUT_DIR/<area>_round<i>.json``
    and a one-line status is printed. After all calls, the aggregated report
    is written to ``OUT_DIR/summary.json`` and printed.
    """
    cfg = AIConfig.from_env()
    client = DashScopeClient(api_key=cfg.api_key, workspace_id=cfg.workspace_id)
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    total = len(AREAS) * ROUNDS_PER_AREA
    results = []
    done = 0
    for area in AREAS:
        for i in range(1, ROUNDS_PER_AREA + 1):
            done += 1
            print(f'[{done:>2}/{total}] {area:8s} round {i} ...', flush=True)
            r = await run_one(client, cfg, area, i)
            # Persist each result immediately so completed samples survive
            # an interruption later in the sweep.
            _write_json(OUT_DIR / f'{area}_round{i}.json', r)
            status = 'OK' if r['ok'] else f"FAIL ({r['error']})"
            analysis = r.get('analysis') or {}
            print(f' {status} · {r["duration_s"]}s · tokens={r["tokens"]} · 12 条={analysis.get("valid")} · 三色={analysis.get("seq11_light")}')
            results.append(r)

    summary = _build_summary(results, total)
    _write_json(OUT_DIR / 'summary.json', summary)
    _print_summary(summary)
# Script entry point: drive the async main() to completion when run directly.
if __name__ == '__main__':
    asyncio.run(main())