Files
Neo-ZQYY/scripts/analyze_ab_content_quality.py
Neo caf179a5da feat: 2026-04-15~05-02 累积变更基线 — AI 重构 + Runtime Context + DWS 修复
涵盖(每条对应已存的审计记录):
- AI 模块拆分:apps/backend/app/ai/apps -> prompts/(8 个 APP + app2a 派生)
  audit: 2026-04-20__ai-module-complete.md
- admin-web AI 管理套件:AIDashboard / AIOperations / AIRunLogs / AITriggers / TriggerManager
  audit: 2026-04-21__admin-web-ai-management-suite.md
- App2 财务洞察 prompt v3 -> v5.1 + 小程序 AI 接入(chat / board-finance)
  audit: 2026-04-22__app2_prompt_v5_1_and_miniprogram_ai_insight.md
- App2 prewarm 全过滤器 + AI 触发器 cron reschedule
  audit: 2026-04-21__app2-finance-prewarm-all-filters.md
  migration: 20260420_ai_trigger_jobs_and_app2_prewarm.sql / 20260421_app2_prewarm_cron_reschedule.sql
- AppType 联合类型对齐 + adminAiAppTypes.test.ts
  audit: 2026-04-30__admin_web_ai_app_type_alignment.md
- DashScope tokens_used 提取修复
  audit: 2026-04-30__backend_dashscope_tokens_used_extraction.md
- App3 线索完整详情 prompt
  audit: 2026-05-01__backend_app3_full_detail_prompt.md
- Runtime Context 沙箱(5-1~5-2 主线):
  - 后端 schema/service + admin_runtime_context / xcx_runtime_clock 两个 router
  - admin-web RuntimeContext.tsx + miniprogram runtime-clock.ts
  - migration: 20260501__runtime_context_sandbox.sql
  - tools/db/verify_admin_web_sandbox.py + verify_sandbox_end_to_end.py
  - database/changes: 7 份 sandbox_* 验证报告
- 飞球 DWS 修复:finance_area_daily 区域汇总 + task_engine 调整
  + RLS 视图业务日上界(migration 20260502 + scripts/ops/gen_rls_business_date_migration.py)

合规:
- .gitignore 启用 tmp/ 排除
- 不入仓:apps/etl/connectors/feiqiu/.env(API_TOKEN secret,本地修改保留)

待验证清单:
- docs/audit/changes/2026-05-04__cumulative_baseline_pending_verification.md
  每个主题的功能完整性 / 上线验证几乎都未收口,按优先级 P0~P3 逐一处理
2026-05-04 02:30:19 +08:00

337 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""App2 A/B 测试 · 内容质量深度分析器。
围绕"实际信息质量"评估,非表现形式(加粗/时长)。
分析维度(按板块):
- 板块 Aseq 1-2客单价环比是否原字段引用非推测、是否识别"对比口径"
- 板块 Bseq 3-4最大优惠来源是否点明、手动调整是否合规表述禁用"抹零/免单 X 元"
- 板块 Cseq 5-6是否引用权威字段"储值卡余额变化"(期初/期末/其他调整)
- 板块 Dseq 7-8支出完整性 + 人力成本占比
- 板块 Eseq 9-10seq 9 是否含"旺淡倍率"seq 10 是否标"同周X均值/期均"基线
- 板块 Fseq 11-12seq 11 三色灯 + top 2 原因seq 12 跟踪节奏 + 触发动作 + 阈值
违规检测:
- 禁用行业数字payload 未提供):警戒线/均值/参考值/30%/40% 等无锚点百分比
- 禁用单期推测("提升/下降/显著增长"但未引用 _环比
- 编造字段payload 不含的字段名)
用法:
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_ab_content_quality.py --dir export/ai-ab-test/round_a
# 对比两轮:
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_ab_content_quality.py --compare export/ai-ab-test/round_a export/ai-ab-test/round_b
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
from statistics import mean, stdev
def _iter_rounds(dir_path: Path) -> list[dict]:
"""读取目录下 round_XX.json 按 round_idx 排序。"""
files = sorted(dir_path.glob('round_*.json'))
out = []
for f in files:
data = json.loads(f.read_text(encoding='utf-8'))
data['_file'] = f.name
out.append(data)
return out
def _get_seq(insights: list, seq: int) -> dict | None:
for ins in insights:
if isinstance(ins, dict) and ins.get('seq') == seq:
return ins
return None
def _has_number(text: str) -> bool:
"""content 是否含 ≥1 个具体数字或百分比。"""
return bool(re.search(r'\d+(\.\d+)?%?', text))
def analyze_round(parsed: dict | None) -> dict:
    """Score a single model response for content quality.

    *parsed* is the parsed JSON of one round (or None/falsy when parsing
    failed, in which case every metric stays at its zeroed default).
    Returns a flat dict of ~17 metrics: structural stats, per-section
    keyword/regex checks (A-F), and violation counters.
    """
    metrics: dict = {
        # basics
        'count': 0,
        'seq_complete': False,
        'has_number_rate': 0.0,  # share of insights containing a number (target 100%)
        'avg_content_len': 0,
        # Section A
        'A_unit_econ_ref': False,  # seq 1-2 cite unit-economics fields (per-order value / member share / daily orders)
        'A_env_bi_ref': False,  # cites a real _环比 (MoM) field value
        'A_calib_ref': False,  # cites the comparison calibre ("同天数对齐" / "同期" etc.)
        # Section B
        'B_top_source': False,  # names the largest discount source
        'B_manual_violation': False,  # violation: phrased as "抹零 X 元" / "免单 X 元"
        # Section C
        'C_balance_change_ref': False,  # cites "stored-value balance change" fields (opening / closing / other adjustments)
        # Section D
        'D_labor_ratio_ref': False,  # labor-cost ratio cited
        'D_zero_expense_flag': False,  # flags zero expenses or missing data
        # Section E
        'E_weekday_ratio': False,  # seq 9 carries a peak/off-peak multiple ("X 倍" / "X.XX 倍")
        'E_anomaly_baseline': False,  # seq 10 labels its baseline type ("同周" / "期均" / "基线")
        # Section F
        'F_light': 'unknown',
        'F_top2_reasons': False,  # seq 11 lists >= 2 reasons ("1)...2)..." / "原因一...原因二")
        'F_tracking_trigger': False,  # seq 12 has tracking cadence + trigger action
        # violations
        'V_industry_number': 0,  # count of fabricated industry-number mentions
        'V_speculation': 0,  # single-period speculation ("提升/下降/显著" without a _环比 citation)
    }
    if not parsed:
        return metrics
    insights = parsed.get('insights') or []
    if not isinstance(insights, list):
        return metrics
    metrics['count'] = len(insights)
    # seq_complete: the integer seqs present must be exactly 1..12
    seqs = [ins.get('seq') for ins in insights if isinstance(ins, dict)]
    metrics['seq_complete'] = sorted([s for s in seqs if isinstance(s, int)]) == list(range(1, 13))
    total_len = 0
    with_number = 0
    for ins in insights:
        if not isinstance(ins, dict):
            continue
        body = (ins.get('content') or '')
        total_len += len(body)
        if _has_number(body):
            with_number += 1
    if insights:
        metrics['has_number_rate'] = round(with_number / len(insights), 2)
        metrics['avg_content_len'] = round(total_len / len(insights))
    # Section A: concatenated content of seq 1-2, then keyword/regex probes
    a_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (1, 2))
    metrics['A_unit_econ_ref'] = any(kw in a_texts for kw in ('客单价', '会员订单占比', '会员占比', '日均订单'))
    # either the literal field suffix "_环比" or "环比 ... <signed percent>" phrasing
    metrics['A_env_bi_ref'] = '_环比' in a_texts or bool(re.search(r'环比[^字段][^"]*?[+-]?\d+\.?\d*%', a_texts))
    metrics['A_calib_ref'] = any(kw in a_texts for kw in ('对比口径', '同天数对齐', '同期', '同日数', '截断到', '对比期'))
    # Section B
    b_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (3, 4))
    metrics['B_top_source'] = any(kw in b_texts for kw in ('最大', '主导', '占比最高', '占比超', '团购优惠', '主要来源'))
    # violation: literally saying "抹零 X 元" / "免单 X 元" (not merely naming the category)
    metrics['B_manual_violation'] = bool(re.search(r'(抹零|免单)\s*\d+[\.\d]*\s*元', b_texts))
    # Section C
    c_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (5, 6))
    metrics['C_balance_change_ref'] = any(kw in c_texts for kw in ('期初', '期末', '余额变化', '其他调整', '非充值/消耗'))
    # Section D
    d_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (7, 8))
    metrics['D_labor_ratio_ref'] = any(kw in d_texts for kw in ('人力成本', '助教成本', '占成交收入', '占比'))
    metrics['D_zero_expense_flag'] = any(kw in d_texts for kw in ('支出为 0', '支出全 0', '支出全0', '支出为0', '0 元', '0元', '数据缺失', '数据不完整', '数据完整性', '未录入'))
    # Section E
    seq9 = (_get_seq(insights, 9) or {}).get('content') or ''
    seq10 = (_get_seq(insights, 10) or {}).get('content') or ''
    metrics['E_weekday_ratio'] = bool(re.search(r'\d+\.?\d*\s*倍|比.*\d+\.?\d*', seq9))
    metrics['E_anomaly_baseline'] = any(kw in seq10 for kw in ('同周', '期均', '基线', '同星期'))
    # Section F: first matching light wins (red > yellow > green)
    seq11 = (_get_seq(insights, 11) or {}).get('content') or ''
    seq12 = (_get_seq(insights, 12) or {}).get('content') or ''
    if re.search(r'🔴|红灯', seq11):
        metrics['F_light'] = 'red'
    elif re.search(r'🟡|黄灯', seq11):
        metrics['F_light'] = 'yellow'
    elif re.search(r'🟢|绿灯', seq11):
        metrics['F_light'] = 'green'
    # matches "原因1:" / "原因 1" / "1)" / "1." / "1、" / "①" / "原因一" / "其一" — both reason 1 AND 2 required
    metrics['F_top2_reasons'] = bool(re.search(r'原因\s*1|1\s*[\)\.、:]|①|原因一|其一', seq11)) and \
        bool(re.search(r'原因\s*2|2\s*[\)\.、:]|②|原因二|其二', seq11))
    # requires all three: an action verb, a cadence expression, and a threshold/comparison
    metrics['F_tracking_trigger'] = any(kw in seq12 for kw in ('启动', '触发', '召回', '立即')) and \
        bool(re.search(r'(每周|每月|每日|每天|每\s*\d+|周期性|定期)', seq12)) and \
        bool(re.search(r'<|>|≥|≤|低于|超过|达到|阈值', seq12))
    # fabricated industry numbers (payload only supplies "周中客流规律"; all other industry stats are banned)
    # typical wording: "行业警戒线" "行业均值" "行业标准" "行业参考" followed by a number
    all_text = ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict))
    metrics['V_industry_number'] = len(re.findall(r'行业(警戒线|均值|标准|参考值|基线|基准|水平|经验值|通常|一般)[^,。;,]*\d+\.?\d*%?', all_text))
    # violation: single-period speculation (trend wording but no cited _环比 field value)
    # heuristic: a sentence contains "提升/下降/大幅/明显/显著" yet no %-suffixed number
    speculation_hits = 0
    for sent in re.split(r'[。;\n]', all_text):
        if not sent.strip():
            continue
        has_trend_word = bool(re.search(r'(提升|下降|上升|下滑|显著|大幅|明显)', sent))
        has_pct_number = bool(re.search(r'[+-]?\d+\.?\d*%', sent))
        if has_trend_word and not has_pct_number:
            # allow meta-instructions such as "不推测" / "禁止推测"
            if re.search(r'(推测|不|禁)', sent):
                continue
            speculation_hits += 1
    metrics['V_speculation'] = speculation_hits
    return metrics
def summarize(rounds: list[dict], label: str) -> dict:
    """Aggregate per-round content-quality metrics into one distribution.

    Returns a dict holding hit rates per check, violation totals, the
    traffic-light distribution, and the raw per-round metric dicts.
    """
    per = [analyze_round(r.get('parsed')) for r in rounds]

    def _rate(key: str) -> float:
        # Share of rounds where the boolean metric is truthy.
        if not per:
            return 0.0
        hits = sum(1 for p in per if p.get(key))
        return round(hits / len(per), 2)

    def _avg(key: str) -> float:
        # Arithmetic mean of a numeric metric across rounds.
        if not per:
            return 0.0
        return round(mean(p.get(key, 0) for p in per), 2)

    lights: dict[str, int] = {}
    for p in per:
        colour = p.get('F_light', 'unknown')
        lights[colour] = 1 + lights.get(colour, 0)

    # Rate table; insertion order matters for the JSON archive, so the two
    # special entries come first, then the plain hit-rate keys in fixed order.
    rates = {
        'seq_complete': _rate('seq_complete'),
        'has_number': _avg('has_number_rate'),
    }
    for key in (
        'A_unit_econ_ref', 'A_env_bi_ref', 'A_calib_ref',
        'B_top_source', 'C_balance_change_ref',
        'D_labor_ratio_ref', 'D_zero_expense_flag',
        'E_weekday_ratio', 'E_anomaly_baseline',
        'F_top2_reasons', 'F_tracking_trigger',
    ):
        rates[key] = _rate(key)

    return {
        'label': label,
        'n': len(rounds),
        'rates': rates,
        'violations': {
            'B_manual': sum(1 for p in per if p.get('B_manual_violation')),
            'industry_number_total': sum(p.get('V_industry_number', 0) for p in per),
            'speculation_total': sum(p.get('V_speculation', 0) for p in per),
        },
        'light_distribution': lights,
        'avg_content_len': _avg('avg_content_len'),
        'per_round': per,
    }
def print_summary(s: dict) -> None:
print(f"\n=== Round {s['label'].upper()} 内容质量汇总n={s['n']}===")
print(f" 结构完整性:")
print(f" seq 1-12 完整率: {s['rates']['seq_complete']:.0%}")
print(f" 每条含数字比例: {s['rates']['has_number']:.0%}")
print(f" 平均 content 字数: {s['avg_content_len']:.0f}")
print(f" 板块 A · 收入:")
print(f" 引用单位经济字段: {s['rates']['A_unit_econ_ref']:.0%}")
print(f" 引用 _环比 真实值: {s['rates']['A_env_bi_ref']:.0%}")
print(f" 引用对比口径: {s['rates']['A_calib_ref']:.0%} ★ v4 新增规则的关键指标")
print(f" 板块 B · 优惠:")
print(f" 点明最大来源: {s['rates']['B_top_source']:.0%}")
print(f" 板块 C · 储值卡:")
print(f" 引用余额变化字段: {s['rates']['C_balance_change_ref']:.0%}")
print(f" 板块 D · 成本:")
print(f" 引用人力成本占比: {s['rates']['D_labor_ratio_ref']:.0%}")
print(f" 标注 0 支出/数据缺失: {s['rates']['D_zero_expense_flag']:.0%}")
print(f" 板块 E · 时间规律:")
print(f" seq 9 含旺/淡倍率: {s['rates']['E_weekday_ratio']:.0%}")
print(f" seq 10 标注基线类型: {s['rates']['E_anomaly_baseline']:.0%}")
print(f" 板块 F · 综合:")
print(f" 三色灯分布: {s['light_distribution']}")
print(f" seq 11 列 top 2 原因: {s['rates']['F_top2_reasons']:.0%}")
print(f" seq 12 节奏+触发+阈值:{s['rates']['F_tracking_trigger']:.0%}")
print(f" 违规统计(越低越好):")
print(f" 手动调整违规次数: {s['violations']['B_manual']} / {s['n']}")
print(f" 行业数字编造总计: {s['violations']['industry_number_total']}")
print(f" 单期推测总计: {s['violations']['speculation_total']}")
def print_compare(*summaries: dict) -> None:
labels = [s['label'].upper() for s in summaries]
header = ' vs '.join(labels)
print(f"\n======= {header} 多方对比表 =======")
col_w = 10
print(f"{'指标':<34}" + ''.join(f"{l:>{col_w}}" for l in labels))
print('-' * (34 + col_w * len(labels)))
def _row(name: str, values: list, fmt: str = 'percent') -> None:
cells = []
for v in values:
if fmt == 'percent':
cells.append(f'{v:.0%}')
else:
cells.append(str(v))
print(f"{name:<34}" + ''.join(f"{c:>{col_w}}" for c in cells))
for k, name in (
('seq_complete', 'seq 1-12 完整率'),
('has_number', '每条含数字比例'),
('A_unit_econ_ref', 'A 引用单位经济字段'),
('A_env_bi_ref', 'A 引用 _环比 真实值'),
('A_calib_ref', 'A 引用对比口径 ★'),
('B_top_source', 'B 点明最大优惠来源'),
('C_balance_change_ref', 'C 引用余额变化字段'),
('D_labor_ratio_ref', 'D 引用人力成本占比'),
('D_zero_expense_flag', 'D 标注 0 支出'),
('E_weekday_ratio', 'E seq 9 含倍率'),
('E_anomaly_baseline', 'E seq 10 标注基线'),
('F_top2_reasons', 'F seq 11 列 top 2 原因 ★'),
('F_tracking_trigger', 'F seq 12 节奏+触发+阈值'),
):
_row(name, [s['rates'][k] for s in summaries], 'percent')
print('-' * (34 + col_w * len(summaries)))
print('违规次数(越低越好):')
_row(' 手动调整违规', [s['violations']['B_manual'] for s in summaries], 'int')
_row(' 行业数字编造', [s['violations']['industry_number_total'] for s in summaries], 'int')
_row(' 单期推测', [s['violations']['speculation_total'] for s in summaries], 'int')
print('-' * (34 + col_w * len(summaries)))
_row(' 平均字数', [f"{s['avg_content_len']:.0f}" for s in summaries], 'int')
_row(' 样本数', [s['n'] for s in summaries], 'int')
print()
for s in summaries:
print(f" {s['label'].upper()} 三色灯分布: {s['light_distribution']}")
def main() -> None:
    """CLI entry point: single-directory analysis or multi-round comparison."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', help='单目录分析:如 export/ai-ab-test/round_a')
    parser.add_argument('--compare', nargs='+', metavar='DIR', help='多轮对比2-4 个目录)')
    args = parser.parse_args()

    if args.compare:
        dirs = [Path(d) for d in args.compare]
        if len(dirs) < 2:
            sys.exit('--compare 至少 2 个目录')
        summaries = []
        for d in dirs:
            rounds = _iter_rounds(d)
            if not rounds:
                sys.exit(f'目录无 round_*.json{d}')
            summaries.append(summarize(rounds, d.name.replace('round_', '')))
        for s in summaries:
            print_summary(s)
        print_compare(*summaries)
        # Archive the comparison for later inspection.
        tag = '_'.join(s['label'] for s in summaries)
        out_path = Path(f'export/ai-ab-test/_compare_{tag}.json')
        payload = json.dumps({s['label']: s for s in summaries}, ensure_ascii=False, indent=2)
        out_path.write_text(payload, encoding='utf-8')
        print(f'\n[done] 对比 JSON 已存: {out_path}')
        return

    if args.dir:
        source = Path(args.dir)
        rounds = _iter_rounds(source)
        if not rounds:
            sys.exit(f'目录无 round_*.json{args.dir}')
        print_summary(summarize(rounds, source.name.replace('round_', '')))
        return

    parser.error('需指定 --dir 或 --compare')
# Script entry guard: run the CLI only when executed directly, not on import.
if __name__ == '__main__':
    main()