"""App2 financial insights: store-manager-perspective scoring method v2.

The evaluation asks "what can a store manager do / learn after reading these
12 insights" and scores three dimensions.

[1] Accuracy (objectively correct, higher is better)
    - numbers come from the payload, nothing fabricated
    - authoritative fields are cited (not raw-metric fallbacks)
    - business rules are respected (manual adjustments are not itemised, no
      industry numbers, no single-period speculation, comparison basis stated)
    - missing data / zero expenses are explicitly flagged

[2] Insight depth (business value, higher is better)
    - non-obvious signals are surfaced
    - multiple metrics are analysed together (seq 11)
    - root causes rather than symptoms
    - seq 12 tracking has all four elements (metric / threshold / cadence /
      triggered action)
    - hollow phrasing ("pay attention to XX", "strengthen XX") is penalised

[3] Stability (reliability, higher is better; stable does not mean rigid)
    - health rating direction is consistent across runs (mode share)
    - key reasons converge (IoU of the key signals cited by seq 11)
    - seq 12 picks the same tracking metric across runs
    - length / duration fluctuate little (low CV)

Usage:
    PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_store_manager_quality.py \\
        --compare export/ai-ab-test/round_a export/ai-ab-test/round_b \\
                  export/ai-ab-test/round_v5 export/ai-ab-test/round_v5_1
"""

from __future__ import annotations

import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
from statistics import mean, stdev

# ===== Core business keyword tables (store-manager vocabulary) =====

# Deep-insight signals: (regex, category label). One hit per category.
DEEP_SIGNALS = [
    # Stored-value card liability dynamics
    ('消耗.*充值|充值.*消耗', '储值卡充消对比'),
    ('存量消费|复购乏力|复购.*收缩|复购.*减弱', '复购解读'),
    ('负债.*累积|兑付压力|负债.*减轻', '负债解读'),
    # Gap between the two average-order-value bases
    ('按成交收入.*按发生额|按发生额.*按成交收入', '客单价双口径对比'),
    ('每单.*让利|让利.*量级|让利.*金额', '让利量化'),
    # Business-format attribution
    ('业态|定位|散客|团购为主|车站|商场', '业态归因'),
    # Data completeness
    ('数据.*缺失|数据.*完整|录入.*缺失|支出.*全.*0|支出.*为.*0|虚高风险|净利.*可信', '数据质量质疑'),
    # Joint deterioration / structural imbalance
    ('结构失衡|协同|多指标.*同|同向恶化', '结构性洞察'),
    # Depth of time-pattern analysis
    ('周六.*周.*倍|周末.*工作日|旺淡日.*倍', '周规律倍率'),
    ('同周|同星期|同周基线', '基线识别'),
]

# Hollow phrasing (penalised)
HOLLOW_PATTERNS = [
    r'^关注\s*[^,。]+$',
    r'建议关注',
    r'加强\s*(\w+)?运营',
    r'提升\s*(\w+)?管理',
    r'需要重视',
    r'应当注意',
]

# Trend words: the same sentence must carry a % or an absolute number,
# otherwise it counts as a single-period speculation violation.
TREND_WORDS = ['下滑', '下降', '上升', '提升', '收缩', '萎缩', '承压', '走弱', '走强',
               '加剧', '恶化', '持续', '显著', '大幅', '明显', '锐减', '攀升']

# Business keywords (regexes) looked up in seq 11 to build the per-round
# "key signal" fingerprint used for the IoU stability score.
_KEY_SIGNAL_VOCAB = [
    '会员占比', '会员订单占比', '客单价', '储值卡余额', '储值卡',
    '支出.*0', '支出缺失', '数据缺失', '复购', '优惠',
    '助教成本', '人力成本', '成交收入', '现金流入',
]

# (regex, label) pairs tried in order against seq 12; the first hit wins.
_TRACKING_INDICATORS = [
    ('储值卡余额', '储值卡余额'),
    ('会员占比|会员订单占比', '会员占比'),
    ('客单价', '客单价'),
    ('现金流出|支出', '现金流出/支出'),
    ('成交收入', '成交收入'),
    ('助教|人力成本', '助教/人力成本'),
]


def _iter_rounds(dir_path: Path) -> list[dict]:
    """Load every ``round_*.json`` in *dir_path*, sorted by file name.

    Each decoded object gets an extra ``_file`` key holding the file name.
    """
    out = []
    for f in sorted(dir_path.glob('round_*.json')):
        data = json.loads(f.read_text(encoding='utf-8'))
        data['_file'] = f.name
        out.append(data)
    return out


def _get_seq(insights: list, seq: int) -> dict | None:
    """Return the first dict in *insights* whose ``seq`` equals *seq*, else None."""
    for ins in insights:
        if isinstance(ins, dict) and ins.get('seq') == seq:
            return ins
    return None


def _seq_text(insights: list, *seqs: int) -> str:
    """Join the ``content`` of the given seq numbers with spaces ('' when missing)."""
    return ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in seqs)


def _all_text(insights: list) -> str:
    """Join the ``content`` of every dict entry in *insights* with spaces."""
    return ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict))


def analyze_accuracy(insights: list) -> dict:
    """Score the accuracy checks for one round.

    Returns a dict of 0/1 flags plus ``speculation_count`` (lower is better)
    and ``number_rate`` (share of insights whose content contains a digit).
    """
    a_texts = _seq_text(insights, 1, 2)
    b_texts = _seq_text(insights, 3, 4)
    c_texts = _seq_text(insights, 5, 6)
    d_texts = _seq_text(insights, 7, 8)
    all_text = _all_text(insights)

    # 1. Comparison basis is cited explicitly (hard requirement V5.1 H1).
    calib_explicit = bool(re.search(
        r'对比口径|同期对齐|同天数对齐|当期\s*\d+\s*天.*上期|\d+\s*天同期', a_texts))

    # 2. Authoritative fields are cited (C: balance change / A: period-over-period).
    authority_c = any(k in c_texts for k in ('期初', '期末', '余额变化', '其他调整'))
    authority_a = bool(re.search(r'[+-]?\d+\.?\d*%', a_texts))

    # 3. Rule compliance.
    no_industry_number = not re.search(
        r'行业(警戒线|均值|标准|参考|基准|水平|经验值|通常).*\d+\.?\d*%?', all_text)
    no_manual_detail = not re.search(r'(抹零|免单)\s*\d+[\.\d]*\s*元', b_texts)

    # 4. Single-period speculation: a trend word without a numeric anchor
    #    in the same sentence.
    speculation_count = 0
    for sent in re.split(r'[。;\n]', all_text):
        if not sent.strip():
            continue
        has_trend_word = any(w in sent for w in TREND_WORDS)
        has_number = bool(re.search(r'[+-]?\d+\.?\d*%|\d{3,}元|\d+\s*元|\-\d+', sent))
        if not has_trend_word or has_number:
            continue
        # Meta statements such as "speculation is forbidden" are not violations.
        if '推测' in sent or '禁' in sent or '不能' in sent:
            continue
        speculation_count += 1

    # 5. Data completeness is flagged in section D. The first alternative is
    #    a regex, so this must be a regex search: a plain substring test
    #    would look for the literal text "支出.*0" and never match.
    data_integrity_flagged = bool(re.search(
        r'支出.*0|数据缺失|录入|不完整|虚高|无法评估|可信度', d_texts))

    # 6. Share of insights that contain at least one digit.
    ins_with_number = sum(
        1 for ins in insights
        if isinstance(ins, dict) and re.search(r'\d+', ins.get('content') or ''))
    number_rate = ins_with_number / len(insights) if insights else 0

    return {
        'calib_explicit': int(calib_explicit),
        'authority_c': int(authority_c),
        'authority_a_env_bi': int(authority_a),
        'no_industry_number': int(no_industry_number),
        'no_manual_detail': int(no_manual_detail),
        'speculation_count': speculation_count,  # lower is better
        'data_integrity_flagged': int(data_integrity_flagged),
        'number_rate': round(number_rate, 2),
    }


def analyze_depth(insights: list) -> dict:
    """Score the insight-depth checks for one round."""
    all_text = _all_text(insights)

    # 1. Deep-signal categories hit (out of len(DEEP_SIGNALS)).
    deep_hits = [name for pattern, name in DEEP_SIGNALS if re.search(pattern, all_text)]

    # 2. Hollow phrasing: at most one count per insight.
    hollow_count = sum(
        1 for ins in insights
        if isinstance(ins, dict)
        and any(re.search(p, ins.get('content') or '') for p in HOLLOW_PATTERNS))

    # 3. seq 11: "top 2 reasons" structure, plus whether the text carries an
    #    interpretive word rather than only numbers.
    seq11 = _seq_text(insights, 11)
    has_r1 = bool(re.search(r'原因\s*1|1\s*[\))\.、::]|①|原因一', seq11))
    has_r2 = bool(re.search(r'原因\s*2|2\s*[\))\.、::]|②|原因二', seq11))
    has_semantic = bool(re.search(
        r'收缩|虚高|失衡|风险|无法评估|可信|压力|乏力|不足|崩塌|疲态|暴露', seq11))
    f_top2 = int(has_r1 and has_r2)
    f_top2_semantic = int(has_r1 and has_r2 and has_semantic)

    # 4. seq 12: all four tracking elements (metric / threshold / cadence / action).
    seq12 = _seq_text(insights, 12)
    # Alternation, not a character class: "[a|b]" would match any single
    # listed character (including "|") and accept almost any text.
    has_indicator = bool(re.search(
        r'储值卡余额|会员|客单价|成交收入|现金流入|现金流出|支出|占比|助教|订单', seq12))
    has_rhythm = bool(re.search(r'(每[周月日天]|每\s*\d+|双周|旬|定期|周期性|连续)', seq12))
    has_threshold = bool(re.search(r'<\s*-?\d+|>\s*-?\d+|≥|≤|达到|跌破|超过|低于|目标', seq12))
    has_action = bool(re.search(
        r'(启动|触发|召回|发起|立即|核查|补录|校准|活动|赠金|审批|预算)', seq12))
    seq12_4elem = int(has_indicator and has_rhythm and has_threshold and has_action)

    # 5. Multi-metric analysis in seq 11: two connectives, or at least three
    #    numbers taken as a proxy for multiple signals.
    collaborative = (
        bool(re.search(r'(\+|与|及|和|协同|同时|叠加|共同|双收缩|双下降).*(\+|与|及|和)', seq11))
        or bool(re.search(r'\d+.*\d+.*\d+', seq11))
    )

    return {
        'deep_hit_count': len(deep_hits),  # 0-10
        'deep_hits': deep_hits,
        'hollow_count': hollow_count,  # lower is better
        'f_top2_structure': f_top2,
        'f_top2_with_semantic': f_top2_semantic,
        'seq12_4elem_complete': seq12_4elem,
        'collaborative_analysis': int(collaborative),
    }


def analyze_stability_round(insights: list) -> dict:
    """Extract the per-round "fingerprint" used by the stability aggregation."""
    # 1. Health-rating light colour mentioned in seq 11 (red wins over
    #    yellow, yellow over green).
    seq11 = _seq_text(insights, 11)
    if re.search(r'🔴|红灯', seq11):
        light = 'red'
    elif re.search(r'🟡|黄灯', seq11):
        light = 'yellow'
    elif re.search(r'🟢|绿灯', seq11):
        light = 'green'
    else:
        light = 'unknown'

    # 2. Key signals: the vocabulary entries that match seq 11.
    signals = {kw for kw in _KEY_SIGNAL_VOCAB if re.search(kw, seq11)}

    # 3. Tracking metric chosen by seq 12 (title + body, first match wins).
    seq12 = _get_seq(insights, 12) or {}
    seq12_text = (seq12.get('title') or '') + ' ' + (seq12.get('content') or '')
    tracking_indicator = next(
        (label for kw, label in _TRACKING_INDICATORS if re.search(kw, seq12_text)),
        'unknown')

    # 4. Average content length (denominator is the full list length).
    avg_len = 0
    if insights:
        total_len = sum(len(ins.get('content') or '') for ins in insights if isinstance(ins, dict))
        avg_len = total_len / len(insights)

    return {
        'light': light,
        'seq11_signals': signals,
        'tracking_indicator': tracking_indicator,
        'avg_content_len': round(avg_len, 1),
    }


def _cv(values: list) -> float:
    """Coefficient of variation (stdev / mean); 0 for < 2 values or mean <= 0."""
    if len(values) > 1 and mean(values) > 0:
        return stdev(values) / mean(values)
    return 0


def summarize(rounds_data: list[dict], label: str, perf: list[dict] | None = None) -> dict:
    """Aggregate the three-dimension scores over all rounds.

    Args:
        rounds_data: decoded ``round_*.json`` objects; insights are read from
            ``rd['parsed']['insights']``. Rounds without a non-empty insight
            list are ignored.
        label: name attached to the result.
        perf: optional per-round dicts with ``duration_s`` / ``tokens``.

    Returns:
        ``{'label', 'n': 0}`` when no round is usable, otherwise a dict with
        ``accuracy``, ``depth``, ``stability`` details and a 0-100
        ``composite`` score (accuracy 40% + depth 35% + stability 25%).
    """
    accuracies = []
    depths = []
    stab_fingerprints = []
    for rd in rounds_data:
        insights = (rd.get('parsed') or {}).get('insights') or []
        if not isinstance(insights, list) or not insights:
            continue
        accuracies.append(analyze_accuracy(insights))
        depths.append(analyze_depth(insights))
        stab_fingerprints.append(analyze_stability_round(insights))

    n = len(accuracies)
    if n == 0:
        return {'label': label, 'n': 0}

    # === Accuracy aggregation ===
    acc_scores = {
        'calib_explicit_rate': mean(a['calib_explicit'] for a in accuracies),
        'authority_c_rate': mean(a['authority_c'] for a in accuracies),
        'authority_a_env_bi_rate': mean(a['authority_a_env_bi'] for a in accuracies),
        'no_industry_number_rate': mean(a['no_industry_number'] for a in accuracies),
        'no_manual_detail_rate': mean(a['no_manual_detail'] for a in accuracies),
        'speculation_avg': mean(a['speculation_count'] for a in accuracies),
        'data_integrity_flagged_rate': mean(a['data_integrity_flagged'] for a in accuracies),
        'number_rate_avg': mean(a['number_rate'] for a in accuracies),
    }

    # === Insight-depth aggregation ===
    depth_scores = {
        'deep_hit_avg': mean(d['deep_hit_count'] for d in depths),
        'deep_hit_union': len(set().union(*(d['deep_hits'] for d in depths))),
        'hollow_avg': mean(d['hollow_count'] for d in depths),
        'f_top2_rate': mean(d['f_top2_structure'] for d in depths),
        'f_top2_with_semantic_rate': mean(d['f_top2_with_semantic'] for d in depths),
        'seq12_4elem_rate': mean(d['seq12_4elem_complete'] for d in depths),
        'collab_analysis_rate': mean(d['collaborative_analysis'] for d in depths),
    }

    # === Stability aggregation (mode share / IoU / CV) ===
    light_counter = Counter(s['light'] for s in stab_fingerprints)
    light_mode, light_mode_count = light_counter.most_common(1)[0]

    # Mean pairwise IoU of the seq 11 signal sets; pairs where both sets
    # are empty are skipped.
    iou_scores = []
    for i in range(n):
        for j in range(i + 1, n):
            s1 = stab_fingerprints[i]['seq11_signals']
            s2 = stab_fingerprints[j]['seq11_signals']
            union = s1 | s2
            if union:
                iou_scores.append(len(s1 & s2) / len(union))
    seq11_iou = mean(iou_scores) if iou_scores else 0

    ti_counter = Counter(s['tracking_indicator'] for s in stab_fingerprints)
    tracking_mode, tracking_mode_count = ti_counter.most_common(1)[0]

    content_cv = _cv([s['avg_content_len'] for s in stab_fingerprints])

    perf_data = perf or []
    durations = [p['duration_s'] for p in perf_data if p.get('duration_s') is not None]
    tokens = [p['tokens'] for p in perf_data if p.get('tokens') is not None]

    stab_scores = {
        'light_mode': light_mode,
        'light_mode_rate': light_mode_count / n,
        'light_distribution': dict(light_counter),
        'seq11_signal_iou': seq11_iou,
        'tracking_mode': tracking_mode,
        'tracking_mode_rate': tracking_mode_count / n,
        'content_len_cv': content_cv,
        'duration_cv': _cv(durations),
        'tokens_cv': _cv(tokens),
        'duration_mean': mean(durations) if durations else 0,
        'tokens_mean': mean(tokens) if tokens else 0,
    }

    # === Composite score (0-100, weighted) ===
    acc_composite = (
        acc_scores['calib_explicit_rate'] * 0.25
        + acc_scores['authority_c_rate'] * 0.15
        + acc_scores['authority_a_env_bi_rate'] * 0.10
        + acc_scores['no_industry_number_rate'] * 0.15
        + acc_scores['no_manual_detail_rate'] * 0.10
        + max(0, 1 - acc_scores['speculation_avg'] / 5) * 0.15  # 5 speculations -> 0
        + acc_scores['data_integrity_flagged_rate'] * 0.10
    )
    depth_composite = (
        min(depth_scores['deep_hit_avg'] / 5, 1) * 0.30  # 5+ categories -> full marks
        + max(0, 1 - depth_scores['hollow_avg'] / 3) * 0.15  # 3 hollow phrases -> 0
        + depth_scores['f_top2_with_semantic_rate'] * 0.25
        + depth_scores['seq12_4elem_rate'] * 0.20
        + depth_scores['collab_analysis_rate'] * 0.10
    )
    stab_composite = (
        stab_scores['light_mode_rate'] * 0.30
        + stab_scores['seq11_signal_iou'] * 0.25
        + stab_scores['tracking_mode_rate'] * 0.25
        + max(0, 1 - stab_scores['content_len_cv'] * 2) * 0.10  # CV 0.5 -> 0
        + max(0, 1 - stab_scores['duration_cv'] * 2) * 0.10
    )
    overall = acc_composite * 0.4 + depth_composite * 0.35 + stab_composite * 0.25

    return {
        'label': label,
        'n': n,
        'accuracy': acc_scores,
        'depth': depth_scores,
        'stability': stab_scores,
        'composite': {
            'accuracy': round(acc_composite * 100, 1),
            'depth': round(depth_composite * 100, 1),
            'stability': round(stab_composite * 100, 1),
            'overall': round(overall * 100, 1),
        },
    }


def _load_perf(dir_path: Path) -> list[dict]:
    """Read ``meta.duration_s`` / ``meta.tokens`` from every ``round_*.json``."""
    out = []
    for f in sorted(dir_path.glob('round_*.json')):
        data = json.loads(f.read_text(encoding='utf-8'))
        meta = data.get('meta') or {}
        out.append({'duration_s': meta.get('duration_s'), 'tokens': meta.get('tokens')})
    return out


def print_summary(s: dict) -> None:
    """Print the detailed report for one summary returned by ``summarize``."""
    if s['n'] == 0:
        print(f"\n=== {s['label'].upper()} 数据为空 ===")
        return
    c = s['composite']
    print(f"\n=== Round {s['label'].upper()} · 店长视角评分(n={s['n']})===")
    print(f" 【综合评分】 {c['overall']:.1f} / 100")
    print(f" 准确性 {c['accuracy']:.1f} | 洞察深度 {c['depth']:.1f} | 稳定性 {c['stability']:.1f}")

    print("\n -- 准确性明细 --")
    a = s['accuracy']
    print(f" 对比口径显式引用: {a['calib_explicit_rate']:.0%}")
    print(f" C 权威字段(余额变化): {a['authority_c_rate']:.0%}")
    print(f" A 权威字段(环比数字): {a['authority_a_env_bi_rate']:.0%}")
    print(f" 禁行业数字合规: {a['no_industry_number_rate']:.0%}")
    print(f" 禁手动调整拆明细合规: {a['no_manual_detail_rate']:.0%}")
    print(f" 单期推测违规/次 (低好): {a['speculation_avg']:.1f}")
    print(f" 数据完整性标注: {a['data_integrity_flagged_rate']:.0%}")

    print("\n -- 洞察深度明细 --")
    d = s['depth']
    print(f" 深度信号命中/次 (满 10): {d['deep_hit_avg']:.1f}")
    print(f" 跨轮覆盖信号数: {d['deep_hit_union']} / 10")
    print(f" 空洞表达/次 (低好): {d['hollow_avg']:.1f}")
    print(f" seq 11 top 2 结构: {d['f_top2_rate']:.0%}")
    print(f" seq 11 有意义解读: {d['f_top2_with_semantic_rate']:.0%}")
    print(f" seq 12 四要素齐全: {d['seq12_4elem_rate']:.0%}")
    print(f" 多指标协同分析: {d['collab_analysis_rate']:.0%}")

    print("\n -- 稳定性明细 --")
    st = s['stability']
    print(f" 评级众数 ({st['light_mode']}) 占比: {st['light_mode_rate']:.0%} [{st['light_distribution']}]")
    print(f" seq 11 原因信号 IoU: {st['seq11_signal_iou']:.0%}")
    print(f" 跟踪指标众数 ({st['tracking_mode']}) 占比: {st['tracking_mode_rate']:.0%}")
    print(f" 字数 CV (低好): {st['content_len_cv']:.2f}")
    print(f" 时长 CV (低好): {st['duration_cv']:.2f} 均值 {st['duration_mean']:.1f}s")
    print(f" tokens CV (低好): {st['tokens_cv']:.2f} 均值 {st['tokens_mean']:.0f}")


def print_compare(*summaries: dict) -> None:
    """Print a side-by-side comparison table for several summaries.

    Summaries with ``n == 0`` carry no score sections, so they are left out
    of the table instead of raising ``KeyError``.
    """
    summaries = tuple(s for s in summaries if s.get('n'))
    if not summaries:
        print('\n[skip] 无可对比的有效数据')
        return

    labels = [s['label'].upper() for s in summaries]
    header = ' vs '.join(labels)
    col_w = 12
    print(f"\n======= {header} 店长视角综合评分 =======")
    print(f"{'指标':<34}" + ''.join(f"{l:>{col_w}}" for l in labels))
    print('-' * (34 + col_w * len(labels)))

    def _row(name: str, values: list, fmt: str = 'float', higher_better: bool = True) -> None:
        # higher_better is accepted but currently has no effect on the output.
        cells = []
        for v in values:
            if fmt == 'percent':
                cells.append(f'{v:.0%}')
            elif fmt == 'int':
                cells.append(str(v))
            elif fmt == 'float1':
                cells.append(f'{v:.1f}')
            else:
                cells.append(f'{v:.2f}')
        print(f"{name:<34}" + ''.join(f"{c:>{col_w}}" for c in cells))

    def _section(section: str, rows: list) -> None:
        # rows: (row title, key inside s[section], format name)
        for title, key, fmt in rows:
            _row(title, [s[section][key] for s in summaries], fmt)

    print('【综合】')
    _section('composite', [
        (' 总分 / 100', 'overall', 'float1'),
        (' 准确性 (40%)', 'accuracy', 'float1'),
        (' 洞察深度 (35%)', 'depth', 'float1'),
        (' 稳定性 (25%)', 'stability', 'float1'),
    ])

    print('【准确性】')
    _section('accuracy', [
        (' 对比口径显式 ★', 'calib_explicit_rate', 'percent'),
        (' C 权威字段', 'authority_c_rate', 'percent'),
        (' 数据完整性标注', 'data_integrity_flagged_rate', 'percent'),
        (' 单期推测违规/次 ↓', 'speculation_avg', 'float1'),
        (' 禁行业数字合规', 'no_industry_number_rate', 'percent'),
    ])

    print('【洞察深度】')
    _section('depth', [
        (' 深度信号命中/次', 'deep_hit_avg', 'float1'),
        (' 跨轮信号覆盖 /10', 'deep_hit_union', 'int'),
        (' 空洞表达/次 ↓', 'hollow_avg', 'float1'),
        (' seq 11 top 2 结构', 'f_top2_rate', 'percent'),
        (' seq 11 有意义解读 ★', 'f_top2_with_semantic_rate', 'percent'),
        (' seq 12 四要素齐全', 'seq12_4elem_rate', 'percent'),
        (' 多指标协同分析', 'collab_analysis_rate', 'percent'),
    ])

    print('【稳定性(同数据下越一致越好)】')
    _section('stability', [
        (' 评级众数占比 ★', 'light_mode_rate', 'percent'),
        (' seq 11 原因信号 IoU', 'seq11_signal_iou', 'percent'),
        (' 跟踪指标众数占比 ★', 'tracking_mode_rate', 'percent'),
        (' 字数 CV ↓', 'content_len_cv', 'float'),
        (' 时长 CV ↓', 'duration_cv', 'float'),
        (' 均时长 s', 'duration_mean', 'float1'),
        (' 均 tokens', 'tokens_mean', 'float1'),
    ])

    print()
    for s in summaries:
        print(f" {s['label'].upper()} 灯色分布: {s['stability']['light_distribution']}")


def main() -> None:
    """CLI entry point: ``--dir`` for one directory, ``--compare`` for several."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', help='单目录分析')
    parser.add_argument('--compare', nargs='+', metavar='DIR', help='多目录对比(2-5 个)')
    args = parser.parse_args()

    if args.compare:
        summaries = []
        for d in (Path(p) for p in args.compare):
            rounds = _iter_rounds(d)
            if not rounds:
                print(f'[skip] {d} 无数据')
                continue
            label = d.name.replace('round_', '')
            summaries.append(summarize(rounds, label, _load_perf(d)))

        if not summaries:
            # Every directory was skipped: nothing to compare or export.
            print('[skip] 无可对比的有效数据')
            return

        for s in summaries:
            print_summary(s)
        print_compare(*summaries)

        tag = '_'.join(s['label'] for s in summaries)
        out_path = Path(f'export/ai-ab-test/_manager_quality_{tag}.json')
        # The export directory may not exist when run from another cwd.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(
            json.dumps({s['label']: s for s in summaries},
                       ensure_ascii=False, indent=2, default=str),
            encoding='utf-8')
        print(f'\n[done] 店长视角评分 JSON 已存: {out_path}')
    elif args.dir:
        dir_path = Path(args.dir)
        label = dir_path.name.replace('round_', '')
        print_summary(summarize(_iter_rounds(dir_path), label, _load_perf(dir_path)))
    else:
        parser.error('需指定 --dir 或 --compare')


if __name__ == '__main__':
    main()