"""App2 A/B 测试 · 内容质量深度分析器。 围绕"实际信息质量"评估,非表现形式(加粗/时长)。 分析维度(按板块): - 板块 A(seq 1-2):客单价环比是否原字段引用(非推测)、是否识别"对比口径" - 板块 B(seq 3-4):最大优惠来源是否点明、手动调整是否合规表述(禁用"抹零/免单 X 元") - 板块 C(seq 5-6):是否引用权威字段"储值卡余额变化"(期初/期末/其他调整) - 板块 D(seq 7-8):支出完整性 + 人力成本占比 - 板块 E(seq 9-10):seq 9 是否含"旺淡倍率";seq 10 是否标"同周X均值/期均"基线 - 板块 F(seq 11-12):seq 11 三色灯 + top 2 原因;seq 12 跟踪节奏 + 触发动作 + 阈值 违规检测: - 禁用行业数字(payload 未提供):警戒线/均值/参考值/30%/40% 等无锚点百分比 - 禁用单期推测("提升/下降/显著增长"但未引用 _环比) - 编造字段(payload 不含的字段名) 用法: PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_ab_content_quality.py --dir export/ai-ab-test/round_a # 对比两轮: PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_ab_content_quality.py --compare export/ai-ab-test/round_a export/ai-ab-test/round_b """ from __future__ import annotations import argparse import json import re import sys from pathlib import Path from statistics import mean, stdev def _iter_rounds(dir_path: Path) -> list[dict]: """读取目录下 round_XX.json 按 round_idx 排序。""" files = sorted(dir_path.glob('round_*.json')) out = [] for f in files: data = json.loads(f.read_text(encoding='utf-8')) data['_file'] = f.name out.append(data) return out def _get_seq(insights: list, seq: int) -> dict | None: for ins in insights: if isinstance(ins, dict) and ins.get('seq') == seq: return ins return None def _has_number(text: str) -> bool: """content 是否含 ≥1 个具体数字或百分比。""" return bool(re.search(r'\d+(\.\d+)?%?', text)) def analyze_round(parsed: dict | None) -> dict: """分析单次返回的内容质量。返回 17 项指标。""" metrics: dict = { # 基础 'count': 0, 'seq_complete': False, 'has_number_rate': 0.0, # 每条含数字的比例(目标 100%) 'avg_content_len': 0, # 板块 A 'A_unit_econ_ref': False, # seq 1-2 中引用单位经济字段(客单价/会员占比/日均订单) 'A_env_bi_ref': False, # 引用 _环比 字段的真实值 'A_calib_ref': False, # 引用对比口径("同天数对齐"/"同期") # 板块 B 'B_top_source': False, # 点明最大优惠来源 'B_manual_violation': False, # 违规:说了"抹零 X 元" / "免单 X 元" # 板块 C 'C_balance_change_ref': False, # 引用"储值卡余额变化"字段(期初/期末/其他调整) # 板块 D 'D_labor_ratio_ref': False, # 人力成本占比 'D_zero_expense_flag': False, # 标注支出 0 或数据缺失 # 板块 E 'E_weekday_ratio': False, # seq 9 含旺/淡倍率("X 倍" / "X.XX 倍") 'E_anomaly_baseline': False, # seq 10 标注基线类型("同周" / "期均" / "基线") # 板块 F 'F_light': 'unknown', 'F_top2_reasons': False, # seq 11 列 ≥2 原因(1)...2)... / 原因一...原因二) 'F_tracking_trigger': False, # seq 12 含跟踪节奏 + 触发动作 # 违规 'V_industry_number': 0, # 编造行业数字提及次数 'V_speculation': 0, # 单期推测(未引用 _环比 却说"提升/下降/显著") } if not parsed: return metrics insights = parsed.get('insights') or [] if not isinstance(insights, list): return metrics metrics['count'] = len(insights) seqs = [ins.get('seq') for ins in insights if isinstance(ins, dict)] metrics['seq_complete'] = sorted([s for s in seqs if isinstance(s, int)]) == list(range(1, 13)) total_len = 0 with_number = 0 for ins in insights: if not isinstance(ins, dict): continue body = (ins.get('content') or '') total_len += len(body) if _has_number(body): with_number += 1 if insights: metrics['has_number_rate'] = round(with_number / len(insights), 2) metrics['avg_content_len'] = round(total_len / len(insights)) # 板块 A a_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (1, 2)) metrics['A_unit_econ_ref'] = any(kw in a_texts for kw in ('客单价', '会员订单占比', '会员占比', '日均订单')) metrics['A_env_bi_ref'] = '_环比' in a_texts or bool(re.search(r'环比[^字段][^"]*?[+-]?\d+\.?\d*%', a_texts)) metrics['A_calib_ref'] = any(kw in a_texts for kw in ('对比口径', '同天数对齐', '同期', '同日数', '截断到', '对比期')) # 板块 B b_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (3, 4)) metrics['B_top_source'] = any(kw in b_texts for kw in ('最大', '主导', '占比最高', '占比超', '团购优惠', '主要来源')) # 违规:直接说"抹零 X 元"/"免单 X 元"(不是说"抹零/免单"这个类目名) metrics['B_manual_violation'] = bool(re.search(r'(抹零|免单)\s*\d+[\.\d]*\s*元', b_texts)) # 板块 C c_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (5, 6)) metrics['C_balance_change_ref'] = any(kw in c_texts for kw in ('期初', '期末', '余额变化', '其他调整', '非充值/消耗')) # 板块 D d_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (7, 8)) metrics['D_labor_ratio_ref'] = any(kw in d_texts for kw in ('人力成本', '助教成本', '占成交收入', '占比')) metrics['D_zero_expense_flag'] = any(kw in d_texts for kw in ('支出为 0', '支出全 0', '支出全0', '支出为0', '0 元', '0元', '数据缺失', '数据不完整', '数据完整性', '未录入')) # 板块 E seq9 = (_get_seq(insights, 9) or {}).get('content') or '' seq10 = (_get_seq(insights, 10) or {}).get('content') or '' metrics['E_weekday_ratio'] = bool(re.search(r'\d+\.?\d*\s*倍|比.*\d+\.?\d*', seq9)) metrics['E_anomaly_baseline'] = any(kw in seq10 for kw in ('同周', '期均', '基线', '同星期')) # 板块 F seq11 = (_get_seq(insights, 11) or {}).get('content') or '' seq12 = (_get_seq(insights, 12) or {}).get('content') or '' if re.search(r'🔴|红灯', seq11): metrics['F_light'] = 'red' elif re.search(r'🟡|黄灯', seq11): metrics['F_light'] = 'yellow' elif re.search(r'🟢|绿灯', seq11): metrics['F_light'] = 'green' # 匹配 "原因1:" / "原因 1:" / "1)" / "1." / "1、" / "①" / "原因一" / "其一" metrics['F_top2_reasons'] = bool(re.search(r'原因\s*1|1\s*[\))\.、::]|①|原因一|其一', seq11)) and \ bool(re.search(r'原因\s*2|2\s*[\))\.、::]|②|原因二|其二', seq11)) metrics['F_tracking_trigger'] = any(kw in seq12 for kw in ('启动', '触发', '召回', '立即')) and \ bool(re.search(r'(每周|每月|每日|每天|每\s*\d+|周期性|定期)', seq12)) and \ bool(re.search(r'<|>|≥|≤|低于|超过|达到|阈值', seq12)) # 违规:行业数字(payload 只提供"周中客流规律",其他均禁) # 典型措辞:"行业警戒线" "行业均值" "行业标准" "行业参考" + 数字 all_text = ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict)) metrics['V_industry_number'] = len(re.findall(r'行业(警戒线|均值|标准|参考值|基线|基准|水平|经验值|通常|一般)[^,。;,]*\d+\.?\d*%?', all_text)) # 违规:单期推测(句子含"提升/下降/显著增长/大幅"等但未引用 _环比 字段值) # 启发式:句子中有"提升/下降/大幅/明显/显著"但句子内没有带 % 的数字 speculation_hits = 0 for sent in re.split(r'[。;\n]', all_text): if not sent.strip(): continue has_trend_word = bool(re.search(r'(提升|下降|上升|下滑|显著|大幅|明显)', sent)) has_pct_number = bool(re.search(r'[+-]?\d+\.?\d*%', sent)) if has_trend_word and not has_pct_number: # 允许"不推测"/"禁止推测"这类元指令 if re.search(r'(推测|不|禁)', sent): continue speculation_hits += 1 metrics['V_speculation'] = speculation_hits return metrics def summarize(rounds: list[dict], label: str) -> dict: """聚合 10 次的内容质量分布。""" per = [analyze_round(r.get('parsed')) for r in rounds] # 汇总 def _rate(key: str) -> float: vals = [1 if p.get(key) else 0 for p in per] return round(sum(vals) / len(vals), 2) if vals else 0.0 def _avg(key: str) -> float: vals = [p.get(key, 0) for p in per] return round(mean(vals), 2) if vals else 0.0 lights: dict[str, int] = {} for p in per: l = p.get('F_light', 'unknown') lights[l] = lights.get(l, 0) + 1 summary = { 'label': label, 'n': len(rounds), 'rates': { 'seq_complete': _rate('seq_complete'), 'has_number': _avg('has_number_rate'), 'A_unit_econ_ref': _rate('A_unit_econ_ref'), 'A_env_bi_ref': _rate('A_env_bi_ref'), 'A_calib_ref': _rate('A_calib_ref'), 'B_top_source': _rate('B_top_source'), 'C_balance_change_ref': _rate('C_balance_change_ref'), 'D_labor_ratio_ref': _rate('D_labor_ratio_ref'), 'D_zero_expense_flag': _rate('D_zero_expense_flag'), 'E_weekday_ratio': _rate('E_weekday_ratio'), 'E_anomaly_baseline': _rate('E_anomaly_baseline'), 'F_top2_reasons': _rate('F_top2_reasons'), 'F_tracking_trigger': _rate('F_tracking_trigger'), }, 'violations': { 'B_manual': sum(1 for p in per if p.get('B_manual_violation')), 'industry_number_total': sum(p.get('V_industry_number', 0) for p in per), 'speculation_total': sum(p.get('V_speculation', 0) for p in per), }, 'light_distribution': lights, 'avg_content_len': _avg('avg_content_len'), 'per_round': per, } return summary def print_summary(s: dict) -> None: print(f"\n=== Round {s['label'].upper()} 内容质量汇总(n={s['n']})===") print(f" 结构完整性:") print(f" seq 1-12 完整率: {s['rates']['seq_complete']:.0%}") print(f" 每条含数字比例: {s['rates']['has_number']:.0%}") print(f" 平均 content 字数: {s['avg_content_len']:.0f}") print(f" 板块 A · 收入:") print(f" 引用单位经济字段: {s['rates']['A_unit_econ_ref']:.0%}") print(f" 引用 _环比 真实值: {s['rates']['A_env_bi_ref']:.0%}") print(f" 引用对比口径: {s['rates']['A_calib_ref']:.0%} ★ v4 新增规则的关键指标") print(f" 板块 B · 优惠:") print(f" 点明最大来源: {s['rates']['B_top_source']:.0%}") print(f" 板块 C · 储值卡:") print(f" 引用余额变化字段: {s['rates']['C_balance_change_ref']:.0%}") print(f" 板块 D · 成本:") print(f" 引用人力成本占比: {s['rates']['D_labor_ratio_ref']:.0%}") print(f" 标注 0 支出/数据缺失: {s['rates']['D_zero_expense_flag']:.0%}") print(f" 板块 E · 时间规律:") print(f" seq 9 含旺/淡倍率: {s['rates']['E_weekday_ratio']:.0%}") print(f" seq 10 标注基线类型: {s['rates']['E_anomaly_baseline']:.0%}") print(f" 板块 F · 综合:") print(f" 三色灯分布: {s['light_distribution']}") print(f" seq 11 列 top 2 原因: {s['rates']['F_top2_reasons']:.0%}") print(f" seq 12 节奏+触发+阈值:{s['rates']['F_tracking_trigger']:.0%}") print(f" 违规统计(越低越好):") print(f" 手动调整违规次数: {s['violations']['B_manual']} / {s['n']}") print(f" 行业数字编造总计: {s['violations']['industry_number_total']}") print(f" 单期推测总计: {s['violations']['speculation_total']}") def print_compare(*summaries: dict) -> None: labels = [s['label'].upper() for s in summaries] header = ' vs '.join(labels) print(f"\n======= {header} 多方对比表 =======") col_w = 10 print(f"{'指标':<34}" + ''.join(f"{l:>{col_w}}" for l in labels)) print('-' * (34 + col_w * len(labels))) def _row(name: str, values: list, fmt: str = 'percent') -> None: cells = [] for v in values: if fmt == 'percent': cells.append(f'{v:.0%}') else: cells.append(str(v)) print(f"{name:<34}" + ''.join(f"{c:>{col_w}}" for c in cells)) for k, name in ( ('seq_complete', 'seq 1-12 完整率'), ('has_number', '每条含数字比例'), ('A_unit_econ_ref', 'A 引用单位经济字段'), ('A_env_bi_ref', 'A 引用 _环比 真实值'), ('A_calib_ref', 'A 引用对比口径 ★'), ('B_top_source', 'B 点明最大优惠来源'), ('C_balance_change_ref', 'C 引用余额变化字段'), ('D_labor_ratio_ref', 'D 引用人力成本占比'), ('D_zero_expense_flag', 'D 标注 0 支出'), ('E_weekday_ratio', 'E seq 9 含倍率'), ('E_anomaly_baseline', 'E seq 10 标注基线'), ('F_top2_reasons', 'F seq 11 列 top 2 原因 ★'), ('F_tracking_trigger', 'F seq 12 节奏+触发+阈值'), ): _row(name, [s['rates'][k] for s in summaries], 'percent') print('-' * (34 + col_w * len(summaries))) print('违规次数(越低越好):') _row(' 手动调整违规', [s['violations']['B_manual'] for s in summaries], 'int') _row(' 行业数字编造', [s['violations']['industry_number_total'] for s in summaries], 'int') _row(' 单期推测', [s['violations']['speculation_total'] for s in summaries], 'int') print('-' * (34 + col_w * len(summaries))) _row(' 平均字数', [f"{s['avg_content_len']:.0f}" for s in summaries], 'int') _row(' 样本数', [s['n'] for s in summaries], 'int') print() for s in summaries: print(f" {s['label'].upper()} 三色灯分布: {s['light_distribution']}") def main() -> None: parser = argparse.ArgumentParser() parser.add_argument('--dir', help='单目录分析:如 export/ai-ab-test/round_a') parser.add_argument('--compare', nargs='+', metavar='DIR', help='多轮对比(2-4 个目录)') args = parser.parse_args() if args.compare: dirs = [Path(d) for d in args.compare] if len(dirs) < 2: sys.exit('--compare 至少 2 个目录') summaries = [] for d in dirs: rounds = _iter_rounds(d) if not rounds: sys.exit(f'目录无 round_*.json:{d}') label = d.name.replace('round_', '') summaries.append(summarize(rounds, label)) for s in summaries: print_summary(s) print_compare(*summaries) # 存档对比 JSON tag = '_'.join(s['label'] for s in summaries) out_path = Path(f'export/ai-ab-test/_compare_{tag}.json') out_path.write_text(json.dumps({s['label']: s for s in summaries}, ensure_ascii=False, indent=2), encoding='utf-8') print(f'\n[done] 对比 JSON 已存: {out_path}') elif args.dir: rounds = _iter_rounds(Path(args.dir)) if not rounds: sys.exit(f'目录无 round_*.json:{args.dir}') label = Path(args.dir).name.replace('round_', '') s = summarize(rounds, label) print_summary(s) else: parser.error('需指定 --dir 或 --compare') if __name__ == '__main__': main()