Files
Neo-ZQYY/scripts/analyze_ab_content_quality.py
Neo caf179a5da feat: 2026-04-15~05-02 累积变更基线 — AI 重构 + Runtime Context + DWS 修复
涵盖(每条对应已存的审计记录):
- AI 模块拆分:apps/backend/app/ai/apps -> prompts/(8 个 APP + app2a 派生)
  audit: 2026-04-20__ai-module-complete.md
- admin-web AI 管理套件:AIDashboard / AIOperations / AIRunLogs / AITriggers / TriggerManager
  audit: 2026-04-21__admin-web-ai-management-suite.md
- App2 财务洞察 prompt v3 -> v5.1 + 小程序 AI 接入(chat / board-finance)
  audit: 2026-04-22__app2_prompt_v5_1_and_miniprogram_ai_insight.md
- App2 prewarm 全过滤器 + AI 触发器 cron reschedule
  audit: 2026-04-21__app2-finance-prewarm-all-filters.md
  migration: 20260420_ai_trigger_jobs_and_app2_prewarm.sql / 20260421_app2_prewarm_cron_reschedule.sql
- AppType 联合类型对齐 + adminAiAppTypes.test.ts
  audit: 2026-04-30__admin_web_ai_app_type_alignment.md
- DashScope tokens_used 提取修复
  audit: 2026-04-30__backend_dashscope_tokens_used_extraction.md
- App3 线索完整详情 prompt
  audit: 2026-05-01__backend_app3_full_detail_prompt.md
- Runtime Context 沙箱(5-1~5-2 主线):
  - 后端 schema/service + admin_runtime_context / xcx_runtime_clock 两个 router
  - admin-web RuntimeContext.tsx + miniprogram runtime-clock.ts
  - migration: 20260501__runtime_context_sandbox.sql
  - tools/db/verify_admin_web_sandbox.py + verify_sandbox_end_to_end.py
  - database/changes: 7 份 sandbox_* 验证报告
- 飞球 DWS 修复:finance_area_daily 区域汇总 + task_engine 调整
  + RLS 视图业务日上界(migration 20260502 + scripts/ops/gen_rls_business_date_migration.py)

合规:
- .gitignore 启用 tmp/ 排除
- 不入仓:apps/etl/connectors/feiqiu/.env(API_TOKEN secret,本地修改保留)

待验证清单:
- docs/audit/changes/2026-05-04__cumulative_baseline_pending_verification.md
  每个主题的功能完整性 / 上线验证几乎都未收口,按优先级 P0~P3 逐一处理
2026-05-04 02:30:19 +08:00

337 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""App2 A/B 测试 · 内容质量深度分析器。
围绕"实际信息质量"评估,非表现形式(加粗/时长)。
分析维度(按板块):
- 板块 Aseq 1-2客单价环比是否原字段引用非推测、是否识别"对比口径"
- 板块 Bseq 3-4最大优惠来源是否点明、手动调整是否合规表述禁用"抹零/免单 X 元"
- 板块 Cseq 5-6是否引用权威字段"储值卡余额变化"(期初/期末/其他调整)
- 板块 Dseq 7-8支出完整性 + 人力成本占比
- 板块 Eseq 9-10seq 9 是否含"旺淡倍率"seq 10 是否标"同周X均值/期均"基线
- 板块 Fseq 11-12seq 11 三色灯 + top 2 原因seq 12 跟踪节奏 + 触发动作 + 阈值
违规检测:
- 禁用行业数字payload 未提供):警戒线/均值/参考值/30%/40% 等无锚点百分比
- 禁用单期推测("提升/下降/显著增长"但未引用 _环比
- 编造字段payload 不含的字段名)
用法:
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_ab_content_quality.py --dir export/ai-ab-test/round_a
# 对比两轮:
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_ab_content_quality.py --compare export/ai-ab-test/round_a export/ai-ab-test/round_b
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
from statistics import mean, stdev
def _iter_rounds(dir_path: Path) -> list[dict]:
"""读取目录下 round_XX.json 按 round_idx 排序。"""
files = sorted(dir_path.glob('round_*.json'))
out = []
for f in files:
data = json.loads(f.read_text(encoding='utf-8'))
data['_file'] = f.name
out.append(data)
return out
def _get_seq(insights: list, seq: int) -> dict | None:
for ins in insights:
if isinstance(ins, dict) and ins.get('seq') == seq:
return ins
return None
def _has_number(text: str) -> bool:
"""content 是否含 ≥1 个具体数字或百分比。"""
return bool(re.search(r'\d+(\.\d+)?%?', text))
def analyze_round(parsed: dict | None) -> dict:
    """Score a single model response for content quality.

    *parsed* is the parsed JSON of one round (or None/falsy when parsing
    failed, in which case every metric stays at its zeroed default).
    Returns a flat dict of ~17 metrics: structural stats, per-section
    keyword/regex checks (A-F), and violation counters.
    """
    metrics: dict = {
        # basics
        'count': 0,
        'seq_complete': False,
        'has_number_rate': 0.0,  # share of insights containing a number (target 100%)
        'avg_content_len': 0,
        # Section A
        'A_unit_econ_ref': False,  # seq 1-2 cite unit-economics fields (per-order value / member share / daily orders)
        'A_env_bi_ref': False,  # cites a real _环比 (MoM) field value
        'A_calib_ref': False,  # cites the comparison calibre ("同天数对齐" / "同期" etc.)
        # Section B
        'B_top_source': False,  # names the largest discount source
        'B_manual_violation': False,  # violation: phrased as "抹零 X 元" / "免单 X 元"
        # Section C
        'C_balance_change_ref': False,  # cites "stored-value balance change" fields (opening / closing / other adjustments)
        # Section D
        'D_labor_ratio_ref': False,  # labor-cost ratio cited
        'D_zero_expense_flag': False,  # flags zero expenses or missing data
        # Section E
        'E_weekday_ratio': False,  # seq 9 carries a peak/off-peak multiple ("X 倍" / "X.XX 倍")
        'E_anomaly_baseline': False,  # seq 10 labels its baseline type ("同周" / "期均" / "基线")
        # Section F
        'F_light': 'unknown',
        'F_top2_reasons': False,  # seq 11 lists >= 2 reasons ("1)...2)..." / "原因一...原因二")
        'F_tracking_trigger': False,  # seq 12 has tracking cadence + trigger action
        # violations
        'V_industry_number': 0,  # count of fabricated industry-number mentions
        'V_speculation': 0,  # single-period speculation ("提升/下降/显著" without a _环比 citation)
    }
    if not parsed:
        return metrics
    insights = parsed.get('insights') or []
    if not isinstance(insights, list):
        return metrics
    metrics['count'] = len(insights)
    # seq_complete: the integer seqs present must be exactly 1..12
    seqs = [ins.get('seq') for ins in insights if isinstance(ins, dict)]
    metrics['seq_complete'] = sorted([s for s in seqs if isinstance(s, int)]) == list(range(1, 13))
    total_len = 0
    with_number = 0
    for ins in insights:
        if not isinstance(ins, dict):
            continue
        body = (ins.get('content') or '')
        total_len += len(body)
        if _has_number(body):
            with_number += 1
    if insights:
        metrics['has_number_rate'] = round(with_number / len(insights), 2)
        metrics['avg_content_len'] = round(total_len / len(insights))
    # Section A: concatenated content of seq 1-2, then keyword/regex probes
    a_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (1, 2))
    metrics['A_unit_econ_ref'] = any(kw in a_texts for kw in ('客单价', '会员订单占比', '会员占比', '日均订单'))
    # either the literal field suffix "_环比" or "环比 ... <signed percent>" phrasing
    metrics['A_env_bi_ref'] = '_环比' in a_texts or bool(re.search(r'环比[^字段][^"]*?[+-]?\d+\.?\d*%', a_texts))
    metrics['A_calib_ref'] = any(kw in a_texts for kw in ('对比口径', '同天数对齐', '同期', '同日数', '截断到', '对比期'))
    # Section B
    b_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (3, 4))
    metrics['B_top_source'] = any(kw in b_texts for kw in ('最大', '主导', '占比最高', '占比超', '团购优惠', '主要来源'))
    # violation: literally saying "抹零 X 元" / "免单 X 元" (not merely naming the category)
    metrics['B_manual_violation'] = bool(re.search(r'(抹零|免单)\s*\d+[\.\d]*\s*元', b_texts))
    # Section C
    c_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (5, 6))
    metrics['C_balance_change_ref'] = any(kw in c_texts for kw in ('期初', '期末', '余额变化', '其他调整', '非充值/消耗'))
    # Section D
    d_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (7, 8))
    metrics['D_labor_ratio_ref'] = any(kw in d_texts for kw in ('人力成本', '助教成本', '占成交收入', '占比'))
    metrics['D_zero_expense_flag'] = any(kw in d_texts for kw in ('支出为 0', '支出全 0', '支出全0', '支出为0', '0 元', '0元', '数据缺失', '数据不完整', '数据完整性', '未录入'))
    # Section E
    seq9 = (_get_seq(insights, 9) or {}).get('content') or ''
    seq10 = (_get_seq(insights, 10) or {}).get('content') or ''
    metrics['E_weekday_ratio'] = bool(re.search(r'\d+\.?\d*\s*倍|比.*\d+\.?\d*', seq9))
    metrics['E_anomaly_baseline'] = any(kw in seq10 for kw in ('同周', '期均', '基线', '同星期'))
    # Section F: first matching light wins (red > yellow > green)
    seq11 = (_get_seq(insights, 11) or {}).get('content') or ''
    seq12 = (_get_seq(insights, 12) or {}).get('content') or ''
    if re.search(r'🔴|红灯', seq11):
        metrics['F_light'] = 'red'
    elif re.search(r'🟡|黄灯', seq11):
        metrics['F_light'] = 'yellow'
    elif re.search(r'🟢|绿灯', seq11):
        metrics['F_light'] = 'green'
    # matches "原因1:" / "原因 1" / "1)" / "1." / "1、" / "①" / "原因一" / "其一" — both reason 1 AND 2 required
    metrics['F_top2_reasons'] = bool(re.search(r'原因\s*1|1\s*[\)\.、:]|①|原因一|其一', seq11)) and \
        bool(re.search(r'原因\s*2|2\s*[\)\.、:]|②|原因二|其二', seq11))
    # requires all three: an action verb, a cadence expression, and a threshold/comparison
    metrics['F_tracking_trigger'] = any(kw in seq12 for kw in ('启动', '触发', '召回', '立即')) and \
        bool(re.search(r'(每周|每月|每日|每天|每\s*\d+|周期性|定期)', seq12)) and \
        bool(re.search(r'<|>|≥|≤|低于|超过|达到|阈值', seq12))
    # fabricated industry numbers (payload only supplies "周中客流规律"; all other industry stats are banned)
    # typical wording: "行业警戒线" "行业均值" "行业标准" "行业参考" followed by a number
    all_text = ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict))
    metrics['V_industry_number'] = len(re.findall(r'行业(警戒线|均值|标准|参考值|基线|基准|水平|经验值|通常|一般)[^,。;,]*\d+\.?\d*%?', all_text))
    # violation: single-period speculation (trend wording but no cited _环比 field value)
    # heuristic: a sentence contains "提升/下降/大幅/明显/显著" yet no %-suffixed number
    speculation_hits = 0
    for sent in re.split(r'[。;\n]', all_text):
        if not sent.strip():
            continue
        has_trend_word = bool(re.search(r'(提升|下降|上升|下滑|显著|大幅|明显)', sent))
        has_pct_number = bool(re.search(r'[+-]?\d+\.?\d*%', sent))
        if has_trend_word and not has_pct_number:
            # allow meta-instructions such as "不推测" / "禁止推测"
            if re.search(r'(推测|不|禁)', sent):
                continue
            speculation_hits += 1
    metrics['V_speculation'] = speculation_hits
    return metrics
def summarize(rounds: list[dict], label: str) -> dict:
    """Aggregate per-round content-quality metrics into one distribution.

    Returns a dict holding hit rates per check, violation totals, the
    traffic-light distribution, and the raw per-round metric dicts.
    """
    per = [analyze_round(r.get('parsed')) for r in rounds]

    def _rate(key: str) -> float:
        # Share of rounds where the boolean metric is truthy.
        if not per:
            return 0.0
        hits = sum(1 for p in per if p.get(key))
        return round(hits / len(per), 2)

    def _avg(key: str) -> float:
        # Arithmetic mean of a numeric metric across rounds.
        if not per:
            return 0.0
        return round(mean(p.get(key, 0) for p in per), 2)

    lights: dict[str, int] = {}
    for p in per:
        colour = p.get('F_light', 'unknown')
        lights[colour] = 1 + lights.get(colour, 0)

    # Rate table; insertion order matters for the JSON archive, so the two
    # special entries come first, then the plain hit-rate keys in fixed order.
    rates = {
        'seq_complete': _rate('seq_complete'),
        'has_number': _avg('has_number_rate'),
    }
    for key in (
        'A_unit_econ_ref', 'A_env_bi_ref', 'A_calib_ref',
        'B_top_source', 'C_balance_change_ref',
        'D_labor_ratio_ref', 'D_zero_expense_flag',
        'E_weekday_ratio', 'E_anomaly_baseline',
        'F_top2_reasons', 'F_tracking_trigger',
    ):
        rates[key] = _rate(key)

    return {
        'label': label,
        'n': len(rounds),
        'rates': rates,
        'violations': {
            'B_manual': sum(1 for p in per if p.get('B_manual_violation')),
            'industry_number_total': sum(p.get('V_industry_number', 0) for p in per),
            'speculation_total': sum(p.get('V_speculation', 0) for p in per),
        },
        'light_distribution': lights,
        'avg_content_len': _avg('avg_content_len'),
        'per_round': per,
    }
def print_summary(s: dict) -> None:
print(f"\n=== Round {s['label'].upper()} 内容质量汇总n={s['n']}===")
print(f" 结构完整性:")
print(f" seq 1-12 完整率: {s['rates']['seq_complete']:.0%}")
print(f" 每条含数字比例: {s['rates']['has_number']:.0%}")
print(f" 平均 content 字数: {s['avg_content_len']:.0f}")
print(f" 板块 A · 收入:")
print(f" 引用单位经济字段: {s['rates']['A_unit_econ_ref']:.0%}")
print(f" 引用 _环比 真实值: {s['rates']['A_env_bi_ref']:.0%}")
print(f" 引用对比口径: {s['rates']['A_calib_ref']:.0%} ★ v4 新增规则的关键指标")
print(f" 板块 B · 优惠:")
print(f" 点明最大来源: {s['rates']['B_top_source']:.0%}")
print(f" 板块 C · 储值卡:")
print(f" 引用余额变化字段: {s['rates']['C_balance_change_ref']:.0%}")
print(f" 板块 D · 成本:")
print(f" 引用人力成本占比: {s['rates']['D_labor_ratio_ref']:.0%}")
print(f" 标注 0 支出/数据缺失: {s['rates']['D_zero_expense_flag']:.0%}")
print(f" 板块 E · 时间规律:")
print(f" seq 9 含旺/淡倍率: {s['rates']['E_weekday_ratio']:.0%}")
print(f" seq 10 标注基线类型: {s['rates']['E_anomaly_baseline']:.0%}")
print(f" 板块 F · 综合:")
print(f" 三色灯分布: {s['light_distribution']}")
print(f" seq 11 列 top 2 原因: {s['rates']['F_top2_reasons']:.0%}")
print(f" seq 12 节奏+触发+阈值:{s['rates']['F_tracking_trigger']:.0%}")
print(f" 违规统计(越低越好):")
print(f" 手动调整违规次数: {s['violations']['B_manual']} / {s['n']}")
print(f" 行业数字编造总计: {s['violations']['industry_number_total']}")
print(f" 单期推测总计: {s['violations']['speculation_total']}")
def print_compare(*summaries: dict) -> None:
labels = [s['label'].upper() for s in summaries]
header = ' vs '.join(labels)
print(f"\n======= {header} 多方对比表 =======")
col_w = 10
print(f"{'指标':<34}" + ''.join(f"{l:>{col_w}}" for l in labels))
print('-' * (34 + col_w * len(labels)))
def _row(name: str, values: list, fmt: str = 'percent') -> None:
cells = []
for v in values:
if fmt == 'percent':
cells.append(f'{v:.0%}')
else:
cells.append(str(v))
print(f"{name:<34}" + ''.join(f"{c:>{col_w}}" for c in cells))
for k, name in (
('seq_complete', 'seq 1-12 完整率'),
('has_number', '每条含数字比例'),
('A_unit_econ_ref', 'A 引用单位经济字段'),
('A_env_bi_ref', 'A 引用 _环比 真实值'),
('A_calib_ref', 'A 引用对比口径 ★'),
('B_top_source', 'B 点明最大优惠来源'),
('C_balance_change_ref', 'C 引用余额变化字段'),
('D_labor_ratio_ref', 'D 引用人力成本占比'),
('D_zero_expense_flag', 'D 标注 0 支出'),
('E_weekday_ratio', 'E seq 9 含倍率'),
('E_anomaly_baseline', 'E seq 10 标注基线'),
('F_top2_reasons', 'F seq 11 列 top 2 原因 ★'),
('F_tracking_trigger', 'F seq 12 节奏+触发+阈值'),
):
_row(name, [s['rates'][k] for s in summaries], 'percent')
print('-' * (34 + col_w * len(summaries)))
print('违规次数(越低越好):')
_row(' 手动调整违规', [s['violations']['B_manual'] for s in summaries], 'int')
_row(' 行业数字编造', [s['violations']['industry_number_total'] for s in summaries], 'int')
_row(' 单期推测', [s['violations']['speculation_total'] for s in summaries], 'int')
print('-' * (34 + col_w * len(summaries)))
_row(' 平均字数', [f"{s['avg_content_len']:.0f}" for s in summaries], 'int')
_row(' 样本数', [s['n'] for s in summaries], 'int')
print()
for s in summaries:
print(f" {s['label'].upper()} 三色灯分布: {s['light_distribution']}")
def main() -> None:
    """CLI entry point: single-directory analysis or multi-round comparison."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', help='单目录分析:如 export/ai-ab-test/round_a')
    parser.add_argument('--compare', nargs='+', metavar='DIR', help='多轮对比2-4 个目录)')
    args = parser.parse_args()

    if args.compare:
        dirs = [Path(d) for d in args.compare]
        if len(dirs) < 2:
            sys.exit('--compare 至少 2 个目录')
        summaries = []
        for d in dirs:
            rounds = _iter_rounds(d)
            if not rounds:
                sys.exit(f'目录无 round_*.json{d}')
            summaries.append(summarize(rounds, d.name.replace('round_', '')))
        for s in summaries:
            print_summary(s)
        print_compare(*summaries)
        # Archive the comparison for later inspection.
        tag = '_'.join(s['label'] for s in summaries)
        out_path = Path(f'export/ai-ab-test/_compare_{tag}.json')
        payload = json.dumps({s['label']: s for s in summaries}, ensure_ascii=False, indent=2)
        out_path.write_text(payload, encoding='utf-8')
        print(f'\n[done] 对比 JSON 已存: {out_path}')
        return

    if args.dir:
        source = Path(args.dir)
        rounds = _iter_rounds(source)
        if not rounds:
            sys.exit(f'目录无 round_*.json{args.dir}')
        print_summary(summarize(rounds, source.name.replace('round_', '')))
        return

    parser.error('需指定 --dir 或 --compare')
# Script entry guard: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()