Files
Neo-ZQYY/scripts/analyze_store_manager_quality.py
Neo caf179a5da feat: 2026-04-15~05-02 累积变更基线 — AI 重构 + Runtime Context + DWS 修复
涵盖(每条对应已存的审计记录):
- AI 模块拆分:apps/backend/app/ai/apps -> prompts/(8 个 APP + app2a 派生)
  audit: 2026-04-20__ai-module-complete.md
- admin-web AI 管理套件:AIDashboard / AIOperations / AIRunLogs / AITriggers / TriggerManager
  audit: 2026-04-21__admin-web-ai-management-suite.md
- App2 财务洞察 prompt v3 -> v5.1 + 小程序 AI 接入(chat / board-finance)
  audit: 2026-04-22__app2_prompt_v5_1_and_miniprogram_ai_insight.md
- App2 prewarm 全过滤器 + AI 触发器 cron reschedule
  audit: 2026-04-21__app2-finance-prewarm-all-filters.md
  migration: 20260420_ai_trigger_jobs_and_app2_prewarm.sql / 20260421_app2_prewarm_cron_reschedule.sql
- AppType 联合类型对齐 + adminAiAppTypes.test.ts
  audit: 2026-04-30__admin_web_ai_app_type_alignment.md
- DashScope tokens_used 提取修复
  audit: 2026-04-30__backend_dashscope_tokens_used_extraction.md
- App3 线索完整详情 prompt
  audit: 2026-05-01__backend_app3_full_detail_prompt.md
- Runtime Context 沙箱(5-1~5-2 主线):
  - 后端 schema/service + admin_runtime_context / xcx_runtime_clock 两个 router
  - admin-web RuntimeContext.tsx + miniprogram runtime-clock.ts
  - migration: 20260501__runtime_context_sandbox.sql
  - tools/db/verify_admin_web_sandbox.py + verify_sandbox_end_to_end.py
  - database/changes: 7 份 sandbox_* 验证报告
- 飞球 DWS 修复:finance_area_daily 区域汇总 + task_engine 调整
  + RLS 视图业务日上界(migration 20260502 + scripts/ops/gen_rls_business_date_migration.py)

合规:
- .gitignore 启用 tmp/ 排除
- 不入仓:apps/etl/connectors/feiqiu/.env(API_TOKEN secret,本地修改保留)

待验证清单:
- docs/audit/changes/2026-05-04__cumulative_baseline_pending_verification.md
  每个主题的功能完整性 / 上线验证几乎都未收口,按优先级 P0~P3 逐一处理
2026-05-04 02:30:19 +08:00

532 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""App2 财务洞察 · 店长视角评分方法 v2。
围绕"店长读了这 12 条能做什么 / 学到什么"做评估,三大维度:
【1】准确性客观正确 · 越高越好)
- 数字取自 payload无编造
- 引用权威字段(非原始指标兜底)
- 遵守业务规则(手动调整不拆明细 / 禁行业数字 / 禁单期推测 / 对比口径)
- 数据缺失/0 支出主动标注
【2】洞察深度业务价值 · 越高越好)
- 非显性信号挖掘("消耗>充值=存量消费"/"差值=平均让利"等解读)
- 多指标协同分析seq 11 结构失衡 + 原因互相印证)
- 根因而非表象(如"数据录入缺失"而不只"支出为 0"
- seq 12 跟踪 4 要素齐全(指标 / 阈值 / 节奏 / 触发动作)
- 避免空洞表达("关注 XX" / "加强 XX" 被扣分)
【3】稳定性可靠性 · 越高越好;稳定 ≠ 僵化)
- 健康度评级方向一致(同数据下 10 次评级应相似,灯色众数占比 ≥ 80%
- 关键原因收敛seq 11 原因 1/2 所引 key signal 的 IoU ≥ 60%
- seq 12 跟踪指标选择一致10 次中 TOP 指标命中率高)
- 字数/时长波动小(内容饱满但不冗余 · CV 低)
用法:
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_store_manager_quality.py \\
--compare export/ai-ab-test/round_a export/ai-ab-test/round_b \\
export/ai-ab-test/round_v5 export/ai-ab-test/round_v5_1
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
from statistics import mean, stdev
# ===== Core business keyword bank (store-manager vocabulary) =====
# Deep-insight key signals: (regex pattern, label) pairs. Each pattern is
# matched (re.search) against the joined text of all insights; each label can
# count at most once per round (see analyze_depth).
DEEP_SIGNALS = [
    # Stored-value-card liability dynamics
    ('消耗.*充值|充值.*消耗', '储值卡充消对比'),
    ('存量消费|复购乏力|复购.*收缩|复购.*减弱', '复购解读'),
    ('负债.*累积|兑付压力|负债.*减轻', '负债解读'),
    # Dual-calibre average-ticket gap
    ('按成交收入.*按发生额|按发生额.*按成交收入', '客单价双口径对比'),
    ('每单.*让利|让利.*量级|让利.*金额', '让利量化'),
    # Business-format attribution
    ('业态|定位|散客|团购为主|车站|商场', '业态归因'),
    # Data completeness
    ('数据.*缺失|数据.*完整|录入.*缺失|支出.*全.*0|支出.*为.*0|虚高风险|净利.*可信', '数据质量质疑'),
    # Synchronised deterioration / structural imbalance
    ('结构失衡|协同|多指标.*同|同向恶化', '结构性洞察'),
    # Weekly-pattern depth
    ('周六.*周.*倍|周末.*工作日|旺淡日.*倍', '周规律倍率'),
    ('同周|同星期|同周基线', '基线识别'),
]

# Hollow phrasing (penalised): checked per insight in analyze_depth; the first
# matching pattern counts that insight once.
HOLLOW_PATTERNS = [
    r'^关注\s*[^,。]+$',
    r'建议关注',
    r'加强\s*(\w+)?运营',
    r'提升\s*(\w+)?管理',
    r'需要重视',
    r'应当注意',
]

# Trend words: a sentence containing one must also carry a % or an absolute
# number in the same sentence, otherwise it counts as a single-period
# speculation violation (see analyze_accuracy).
TREND_WORDS = ['下滑', '下降', '上升', '提升', '收缩', '萎缩', '承压', '走弱', '走强',
               '加剧', '恶化', '持续', '显著', '大幅', '明显', '锐减', '攀升']
def _iter_rounds(dir_path: Path) -> list[dict]:
    """Load every ``round_*.json`` under *dir_path*, sorted by filename.

    Each parsed payload is tagged with its source filename under ``'_file'``.
    """
    records: list[dict] = []
    for path in sorted(dir_path.glob('round_*.json')):
        payload = json.loads(path.read_text(encoding='utf-8'))
        records.append({**payload, '_file': path.name})
    return records
def _get_seq(insights: list, seq: int) -> dict | None:
    """Return the first insight dict whose ``'seq'`` equals *seq*, else None.

    Non-dict entries in *insights* are ignored.
    """
    return next(
        (item for item in insights
         if isinstance(item, dict) and item.get('seq') == seq),
        None,
    )
def analyze_accuracy(insights: list) -> dict:
    """Accuracy scoring: 0/1 flags plus a violation count and a rate.

    Section texts are grouped by seq number: A=(1,2), B=(3,4), C=(5,6),
    D=(7,8). Returns a dict of int flags, ``speculation_count`` (lower is
    better) and ``number_rate`` (share of insights carrying a digit).
    """
    a_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (1, 2))
    b_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (3, 4))
    c_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (5, 6))
    d_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (7, 8))
    all_text = ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict))
    # 1. Explicit comparison-calibre citation (V5.1 H1 hard requirement).
    calib_explicit = bool(re.search(r'对比口径|同期对齐|同天数对齐|当期\s*\d+\s*天.*上期|\d+\s*天同期', a_texts))
    # 2. Authoritative-field citation (C: stored-value balance-change fields /
    #    A: an actual month-over-month percentage).
    authority_c = any(k in c_texts for k in ('期初', '期末', '余额变化', '其他调整'))
    authority_a = bool(re.search(r'[+-]?\d+\.?\d*%', a_texts))  # real MoM number present
    # 3. Rule compliance: no industry benchmark numbers; no itemised breakdown
    #    of manual adjustments (抹零/免单) in section B.
    no_industry_number = not bool(re.search(r'行业(警戒线|均值|标准|参考|基准|水平|经验值|通常).*\d+\.?\d*%?', all_text))
    no_manual_detail = not bool(re.search(r'(抹零|免单)\s*\d+[\.\d]*\s*元', b_texts))
    # 4. Single-period speculation: a trend word with no numeric anchor in the
    #    same sentence counts as one violation.
    speculation_count = 0
    for sent in re.split(r'[。;\n]', all_text):
        if not sent.strip():
            continue
        has_trend_word = any(w in sent for w in TREND_WORDS)
        has_number = bool(re.search(r'[+-]?\d+\.?\d*%|\d{3,}元|\d+\s*元|\-\d+', sent))
        if has_trend_word and not has_number:
            # Skip meta-instruction sentences such as "禁止推测".
            # BUG FIX: the original tested `'' in sent`, which is always True
            # and therefore suppressed every violation. The keyword was lost
            # to an encoding mishap; reconstructed as '禁止' from the comment
            # above — TODO confirm the intended keyword.
            if '推测' in sent or '禁止' in sent or '不能' in sent:
                continue
            speculation_count += 1
    # 5. Data-integrity flag (section D proactively calls out zero spend).
    # NOTE(review): '支出.*0' looks like a regex but is tested with substring
    # containment, so it can never match normal prose — presumably re.search
    # was intended; kept as-is pending confirmation.
    data_integrity_flagged = any(k in d_texts for k in
                                 ('支出.*0', '数据缺失', '录入', '不完整', '虚高', '无法评估', '可信度'))
    # 6. Share of insights containing at least one digit.
    ins_with_number = sum(1 for ins in insights if isinstance(ins, dict) and
                          re.search(r'\d+', ins.get('content') or ''))
    number_rate = ins_with_number / len(insights) if insights else 0
    return {
        'calib_explicit': int(calib_explicit),
        'authority_c': int(authority_c),
        'authority_a_env_bi': int(authority_a),
        'no_industry_number': int(no_industry_number),
        'no_manual_detail': int(no_manual_detail),
        'speculation_count': speculation_count,  # lower is better
        'data_integrity_flagged': int(data_integrity_flagged),
        'number_rate': round(number_rate, 2),
    }
def analyze_depth(insights: list) -> dict:
    """Insight-depth scoring: deep-signal hits, hollow phrasing, seq 11/12
    structure checks and multi-indicator collaboration."""
    all_text = ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict))
    # 1. Deep-signal classes hit (out of the 10 in DEEP_SIGNALS).
    deep_hits: list[str] = []
    for pattern, name in DEEP_SIGNALS:
        if re.search(pattern, all_text):
            deep_hits.append(name)
    deep_hit_count = len(deep_hits)
    # 2. Hollow-phrase count: each insight counts at most once (first match wins).
    hollow_count = 0
    for ins in insights:
        if not isinstance(ins, dict):
            continue
        body = ins.get('content') or ''
        for pattern in HOLLOW_PATTERNS:
            if re.search(pattern, body):
                hollow_count += 1
                break
    # 3. seq 11: top-2 reason structure, and whether the reasons carry a
    #    meaningful interpretation (causal vocabulary, not just numbers).
    seq11 = (_get_seq(insights, 11) or {}).get('content') or ''
    has_r1 = bool(re.search(r'原因\s*1|1\s*[\)\.、:]|①|原因一', seq11))
    has_r2 = bool(re.search(r'原因\s*2|2\s*[\)\.、:]|②|原因二', seq11))
    f_top2 = int(has_r1 and has_r2)
    has_semantic = bool(re.search(r'收缩|虚高|失衡|风险|无法评估|可信|压力|乏力|不足|崩塌|疲态|暴露',
                                  seq11))
    f_top2_semantic = int(has_r1 and has_r2 and has_semantic)
    # 4. seq 12: all four tracking elements present (indicator / threshold /
    #    cadence / trigger action).
    seq12 = (_get_seq(insights, 12) or {}).get('content') or ''
    # BUG FIX: the original wrapped the indicator alternatives in [...] — a
    # character class that matches ANY single listed character ('|' included),
    # so has_indicator was effectively always true. Rewritten as alternation.
    has_indicator = bool(re.search(r'储值卡余额|会员|客单价|成交收入|现金流入|现金流出|支出|占比|助教|订单', seq12))
    has_rhythm = bool(re.search(r'(每[周月日天]|每\s*\d+|双周|旬|定期|周期性|连续)', seq12))
    has_threshold = bool(re.search(r'<\s*-?\d+|>\s*-?\d+|≥|≤|达到|跌破|超过|低于|目标', seq12))
    has_action = bool(re.search(r'(启动|触发|召回|发起|立即|核查|补录|校准|活动|赠金|审批|预算)', seq12))
    seq12_4elem = int(has_indicator and has_rhythm and has_threshold and has_action)
    # 5. Collaborative analysis in seq 11: connective words appearing twice,
    #    or at least three numbers (taken as multiple signals cited together).
    collaborative = bool(re.search(r'(\+|与|及|和|协同|同时|叠加|共同|双收缩|双下降).*(\+|与|及|和)', seq11)) or \
        bool(re.search(r'\d+.*\d+.*\d+', seq11))
    return {
        'deep_hit_count': deep_hit_count,  # 0-10
        'deep_hits': deep_hits,
        'hollow_count': hollow_count,  # lower is better
        'f_top2_structure': f_top2,
        'f_top2_with_semantic': f_top2_semantic,
        'seq12_4elem_complete': seq12_4elem,
        'collaborative_analysis': int(collaborative),
    }
def analyze_stability_round(insights: list) -> dict:
    """Extract the per-round "fingerprint" used by the stability aggregation.

    Captures: health-light colour, the set of seq-11 key signals, the seq-12
    tracking indicator, and the mean content length.
    """
    seq11_text = (_get_seq(insights, 11) or {}).get('content') or ''

    # 1. Health-rating light colour; priority red > yellow > green, matching
    #    either the emoji or the Chinese label.
    light = 'unknown'
    for colour, marker in (('red', r'🔴|红灯'), ('yellow', r'🟡|黄灯'), ('green', r'🟢|绿灯')):
        if re.search(marker, seq11_text):
            light = colour
            break

    # 2. Business key signals cited inside seq 11 (vocabulary entries are
    #    regex patterns; the matched pattern itself is kept as the signal id).
    KEY_SIGNAL_VOCAB = [
        '会员占比', '会员订单占比',
        '客单价',
        '储值卡余额', '储值卡',
        '支出.*0', '支出缺失', '数据缺失',
        '复购',
        '优惠',
        '助教成本', '人力成本',
        '成交收入', '现金流入',
    ]
    signals = {vocab for vocab in KEY_SIGNAL_VOCAB if re.search(vocab, seq11_text)}

    # 3. Tracking indicator chosen in seq 12 (title + body; first match wins).
    seq12_entry = _get_seq(insights, 12) or {}
    haystack = (seq12_entry.get('title') or '') + ' ' + (seq12_entry.get('content') or '')
    tracking_indicator = 'unknown'
    for marker, label in (
        ('储值卡余额', '储值卡余额'),
        ('会员占比|会员订单占比', '会员占比'),
        ('客单价', '客单价'),
        ('现金流出|支出', '现金流出/支出'),
        ('成交收入', '成交收入'),
        ('助教|人力成本', '助教/人力成本'),
    ):
        if re.search(marker, haystack):
            tracking_indicator = label
            break

    # 4. Mean content length: only dict entries contribute characters, but the
    #    divisor is the full list length (matches original behaviour).
    avg_len = 0
    if insights:
        total_chars = sum(len(item.get('content') or '')
                          for item in insights if isinstance(item, dict))
        avg_len = total_chars / len(insights)

    return {
        'light': light,
        'seq11_signals': signals,
        'tracking_indicator': tracking_indicator,
        'avg_content_len': round(avg_len, 1),
    }
def summarize(rounds_data: list[dict], label: str, perf: list[dict] | None = None) -> dict:
    """Aggregate accuracy / depth / stability scores across n rounds.

    Args:
        rounds_data: raw round_*.json dicts (each expected to carry 'parsed').
        label: display label for this round set.
        perf: per-round performance records ({'duration_s', 'tokens'}) taken
            from each file's meta block (see _load_perf).

    Returns:
        {'label', 'n': 0} when no round has a usable insights list; otherwise
        per-dimension detail plus a 0-100 weighted 'composite' section.
    """
    accuracies = []
    depths = []
    stab_fingerprints = []
    for rd in rounds_data:
        # Rounds without a non-empty parsed.insights list are skipped entirely.
        insights = (rd.get('parsed') or {}).get('insights') or []
        if not isinstance(insights, list) or not insights:
            continue
        accuracies.append(analyze_accuracy(insights))
        depths.append(analyze_depth(insights))
        stab_fingerprints.append(analyze_stability_round(insights))
    n = len(accuracies)
    if n == 0:
        return {'label': label, 'n': 0}
    # === Accuracy aggregation (rates are means of per-round 0/1 flags) ===
    acc_scores = {
        'calib_explicit_rate': mean(a['calib_explicit'] for a in accuracies),
        'authority_c_rate': mean(a['authority_c'] for a in accuracies),
        'authority_a_env_bi_rate': mean(a['authority_a_env_bi'] for a in accuracies),
        'no_industry_number_rate': mean(a['no_industry_number'] for a in accuracies),
        'no_manual_detail_rate': mean(a['no_manual_detail'] for a in accuracies),
        'speculation_avg': mean(a['speculation_count'] for a in accuracies),
        'data_integrity_flagged_rate': mean(a['data_integrity_flagged'] for a in accuracies),
        'number_rate_avg': mean(a['number_rate'] for a in accuracies),
    }
    # === Depth aggregation ===
    depth_scores = {
        'deep_hit_avg': mean(d['deep_hit_count'] for d in depths),
        # Distinct deep-signal labels covered across ALL rounds combined.
        'deep_hit_union': len(set.union(*[set(d['deep_hits']) for d in depths])) if depths else 0,
        'hollow_avg': mean(d['hollow_count'] for d in depths),
        'f_top2_rate': mean(d['f_top2_structure'] for d in depths),
        'f_top2_with_semantic_rate': mean(d['f_top2_with_semantic'] for d in depths),
        'seq12_4elem_rate': mean(d['seq12_4elem_complete'] for d in depths),
        'collab_analysis_rate': mean(d['collaborative_analysis'] for d in depths),
    }
    # === Stability aggregation (core: mode share / IoU / CV) ===
    # Light-colour mode share.
    light_counter = Counter(s['light'] for s in stab_fingerprints)
    light_mode_rate = light_counter.most_common(1)[0][1] / n
    # seq 11 signal IoU: mean Jaccard over all unordered round pairs
    # (pairs with an empty union are excluded from the mean).
    iou_scores = []
    for i in range(n):
        for j in range(i + 1, n):
            s1, s2 = stab_fingerprints[i]['seq11_signals'], stab_fingerprints[j]['seq11_signals']
            union = s1 | s2
            if not union:
                continue
            iou_scores.append(len(s1 & s2) / len(union))
    seq11_iou = mean(iou_scores) if iou_scores else 0
    # Tracking-indicator consistency (mode share).
    ti_counter = Counter(s['tracking_indicator'] for s in stab_fingerprints)
    tracking_mode_rate = ti_counter.most_common(1)[0][1] / n
    # Content-length coefficient of variation (lower = more stable).
    content_lens = [s['avg_content_len'] for s in stab_fingerprints]
    content_cv = (stdev(content_lens) / mean(content_lens)) if len(content_lens) > 1 and mean(content_lens) > 0 else 0
    # Performance CV from the externally supplied perf records.
    perf_data = perf or []
    durations = [p['duration_s'] for p in perf_data if p.get('duration_s') is not None]
    tokens = [p['tokens'] for p in perf_data if p.get('tokens') is not None]
    duration_cv = (stdev(durations) / mean(durations)) if len(durations) > 1 and mean(durations) > 0 else 0
    tokens_cv = (stdev(tokens) / mean(tokens)) if len(tokens) > 1 and mean(tokens) > 0 else 0
    stab_scores = {
        'light_mode': light_counter.most_common(1)[0][0],
        'light_mode_rate': light_mode_rate,
        'light_distribution': dict(light_counter),
        'seq11_signal_iou': seq11_iou,
        'tracking_mode': ti_counter.most_common(1)[0][0],
        'tracking_mode_rate': tracking_mode_rate,
        'content_len_cv': content_cv,
        'duration_cv': duration_cv,
        'tokens_cv': tokens_cv,
        'duration_mean': mean(durations) if durations else 0,
        'tokens_mean': mean(tokens) if tokens else 0,
    }
    # === Composite (0-100, three weighted dimensions) ===
    # accuracy 40% + depth 35% + stability 25%
    acc_composite = (
        acc_scores['calib_explicit_rate'] * 0.25 +
        acc_scores['authority_c_rate'] * 0.15 +
        acc_scores['authority_a_env_bi_rate'] * 0.10 +
        acc_scores['no_industry_number_rate'] * 0.15 +
        acc_scores['no_manual_detail_rate'] * 0.10 +
        max(0, 1 - acc_scores['speculation_avg'] / 5) * 0.15 +  # 5 violations zero this term
        acc_scores['data_integrity_flagged_rate'] * 0.10
    )
    depth_composite = (
        min(depth_scores['deep_hit_avg'] / 5, 1) * 0.30 +  # 5+ deep-signal classes = full marks
        max(0, 1 - depth_scores['hollow_avg'] / 3) * 0.15 +  # 3 hollow phrases zero this term
        depth_scores['f_top2_with_semantic_rate'] * 0.25 +
        depth_scores['seq12_4elem_rate'] * 0.20 +
        depth_scores['collab_analysis_rate'] * 0.10
    )
    stab_composite = (
        stab_scores['light_mode_rate'] * 0.30 +  # same-colour share
        stab_scores['seq11_signal_iou'] * 0.25 +  # reason-signal IoU
        stab_scores['tracking_mode_rate'] * 0.25 +  # tracking-indicator consistency
        max(0, 1 - stab_scores['content_len_cv'] * 2) * 0.10 +  # CV 0.5 zeroes this term
        max(0, 1 - stab_scores['duration_cv'] * 2) * 0.10
    )
    overall = acc_composite * 0.4 + depth_composite * 0.35 + stab_composite * 0.25
    return {
        'label': label,
        'n': n,
        'accuracy': acc_scores,
        'depth': depth_scores,
        'stability': stab_scores,
        'composite': {
            'accuracy': round(acc_composite * 100, 1),
            'depth': round(depth_composite * 100, 1),
            'stability': round(stab_composite * 100, 1),
            'overall': round(overall * 100, 1),
        },
    }
def _load_perf(dir_path: Path) -> list[dict]:
    """Pull duration/tokens out of each round_XX.json's ``meta`` block.

    Missing meta fields come through as None so CV computations can filter them.
    """
    perf_records: list[dict] = []
    for path in sorted(dir_path.glob('round_*.json')):
        meta = json.loads(path.read_text(encoding='utf-8')).get('meta') or {}
        perf_records.append({
            'duration_s': meta.get('duration_s'),
            'tokens': meta.get('tokens'),
        })
    return perf_records
def print_summary(s: dict) -> None:
    """Render one summarize() result as a human-readable stdout report."""
    if s['n'] == 0:
        print(f"\n=== {s['label'].upper()} 数据为空 ===")
        return
    comp, acc, dep, stab = s['composite'], s['accuracy'], s['depth'], s['stability']
    # Assemble every report line first, then emit with a single print; the
    # output is byte-identical to printing line by line.
    report = [
        f"\n=== Round {s['label'].upper()} · 店长视角评分n={s['n']}===",
        f" 【综合评分】 {comp['overall']:.1f} / 100",
        f" 准确性 {comp['accuracy']:.1f} | 洞察深度 {comp['depth']:.1f} | 稳定性 {comp['stability']:.1f}",
        f"\n -- 准确性明细 --",
        f" 对比口径显式引用: {acc['calib_explicit_rate']:.0%}",
        f" C 权威字段(余额变化): {acc['authority_c_rate']:.0%}",
        f" A 权威字段(环比数字): {acc['authority_a_env_bi_rate']:.0%}",
        f" 禁行业数字合规: {acc['no_industry_number_rate']:.0%}",
        f" 禁手动调整拆明细合规: {acc['no_manual_detail_rate']:.0%}",
        f" 单期推测违规/次 (低好): {acc['speculation_avg']:.1f}",
        f" 数据完整性标注: {acc['data_integrity_flagged_rate']:.0%}",
        f"\n -- 洞察深度明细 --",
        f" 深度信号命中/次 (满 10): {dep['deep_hit_avg']:.1f}",
        f" 跨轮覆盖信号数: {dep['deep_hit_union']} / 10",
        f" 空洞表达/次 (低好): {dep['hollow_avg']:.1f}",
        f" seq 11 top 2 结构: {dep['f_top2_rate']:.0%}",
        f" seq 11 有意义解读: {dep['f_top2_with_semantic_rate']:.0%}",
        f" seq 12 四要素齐全: {dep['seq12_4elem_rate']:.0%}",
        f" 多指标协同分析: {dep['collab_analysis_rate']:.0%}",
        f"\n -- 稳定性明细 --",
        f" 评级众数 ({stab['light_mode']}) 占比: {stab['light_mode_rate']:.0%} [{stab['light_distribution']}]",
        f" seq 11 原因信号 IoU: {stab['seq11_signal_iou']:.0%}",
        f" 跟踪指标众数 ({stab['tracking_mode']}) 占比: {stab['tracking_mode_rate']:.0%}",
        f" 字数 CV (低好): {stab['content_len_cv']:.2f}",
        f" 时长 CV (低好): {stab['duration_cv']:.2f} 均值 {stab['duration_mean']:.1f}s",
        f" tokens CV (低好): {stab['tokens_cv']:.2f} 均值 {stab['tokens_mean']:.0f}",
    ]
    print('\n'.join(report))
def print_compare(*summaries: dict) -> None:
    """Print a side-by-side comparison table for several summarize() results.

    Row labels use ★ for headline metrics and ↓ for lower-is-better rows.
    """
    labels = [s['label'].upper() for s in summaries]
    header = ' vs '.join(labels)
    col_w = 12
    print(f"\n======= {header} 店长视角综合评分 =======")
    print(f"{'指标':<34}" + ''.join(f"{lab:>{col_w}}" for lab in labels))
    print('-' * (34 + col_w * len(labels)))

    def _row(name: str, values: list, fmt: str = 'float') -> None:
        # One table row; fmt is 'percent' | 'int' | 'float1' | default 2-dp
        # float. (The original also declared an unused `higher_better`
        # parameter, which no caller passed — removed.)
        cells = []
        for v in values:
            if fmt == 'percent':
                cells.append(f'{v:.0%}')
            elif fmt == 'int':
                cells.append(str(v))
            elif fmt == 'float1':
                cells.append(f'{v:.1f}')
            else:
                cells.append(f'{v:.2f}')
        print(f"{name:<34}" + ''.join(f"{c:>{col_w}}" for c in cells))

    print('【综合】')
    _row(' 总分 / 100', [s['composite']['overall'] for s in summaries], 'float1')
    _row(' 准确性 (40%)', [s['composite']['accuracy'] for s in summaries], 'float1')
    _row(' 洞察深度 (35%)', [s['composite']['depth'] for s in summaries], 'float1')
    _row(' 稳定性 (25%)', [s['composite']['stability'] for s in summaries], 'float1')
    print('【准确性】')
    _row(' 对比口径显式 ★', [s['accuracy']['calib_explicit_rate'] for s in summaries], 'percent')
    _row(' C 权威字段', [s['accuracy']['authority_c_rate'] for s in summaries], 'percent')
    _row(' 数据完整性标注', [s['accuracy']['data_integrity_flagged_rate'] for s in summaries], 'percent')
    _row(' 单期推测违规/次 ↓', [s['accuracy']['speculation_avg'] for s in summaries], 'float1')
    _row(' 禁行业数字合规', [s['accuracy']['no_industry_number_rate'] for s in summaries], 'percent')
    print('【洞察深度】')
    _row(' 深度信号命中/次', [s['depth']['deep_hit_avg'] for s in summaries], 'float1')
    _row(' 跨轮信号覆盖 /10', [s['depth']['deep_hit_union'] for s in summaries], 'int')
    _row(' 空洞表达/次 ↓', [s['depth']['hollow_avg'] for s in summaries], 'float1')
    _row(' seq 11 top 2 结构', [s['depth']['f_top2_rate'] for s in summaries], 'percent')
    _row(' seq 11 有意义解读 ★', [s['depth']['f_top2_with_semantic_rate'] for s in summaries], 'percent')
    _row(' seq 12 四要素齐全', [s['depth']['seq12_4elem_rate'] for s in summaries], 'percent')
    _row(' 多指标协同分析', [s['depth']['collab_analysis_rate'] for s in summaries], 'percent')
    print('【稳定性(同数据下越一致越好)】')
    _row(' 评级众数占比 ★', [s['stability']['light_mode_rate'] for s in summaries], 'percent')
    _row(' seq 11 原因信号 IoU', [s['stability']['seq11_signal_iou'] for s in summaries], 'percent')
    _row(' 跟踪指标众数占比 ★', [s['stability']['tracking_mode_rate'] for s in summaries], 'percent')
    _row(' 字数 CV ↓', [s['stability']['content_len_cv'] for s in summaries], 'float')
    _row(' 时长 CV ↓', [s['stability']['duration_cv'] for s in summaries], 'float')
    _row(' 均时长 s', [s['stability']['duration_mean'] for s in summaries], 'float1')
    _row(' 均 tokens', [s['stability']['tokens_mean'] for s in summaries], 'float1')
    print()
    for s in summaries:
        print(f" {s['label'].upper()} 灯色分布: {s['stability']['light_distribution']}")
def main() -> None:
    """CLI entry point.

    Modes:
        --dir DIR          analyse a single round directory
        --compare DIR...   analyse and compare several directories, then dump
                           the combined scores to export/ai-ab-test/ as JSON
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', help='单目录分析')
    parser.add_argument('--compare', nargs='+', metavar='DIR', help='多目录对比2-5 个)')
    args = parser.parse_args()
    if args.compare:
        dirs = [Path(d) for d in args.compare]
        summaries = []
        for d in dirs:
            rounds = _iter_rounds(d)
            if not rounds:
                # Directories with no round_*.json are skipped, not fatal.
                print(f'[skip] {d} 无数据')
                continue
            perf = _load_perf(d)
            # Label is the directory name minus the 'round_' prefix.
            label = d.name.replace('round_', '')
            summaries.append(summarize(rounds, label, perf))
        for s in summaries:
            print_summary(s)
        print_compare(*summaries)
        tag = '_'.join(s['label'] for s in summaries)
        # NOTE(review): assumes export/ai-ab-test/ already exists — write_text
        # does not create parent directories; confirm against the run setup.
        out_path = Path(f'export/ai-ab-test/_manager_quality_{tag}.json')
        # default=str guards against any non-JSON-serialisable values in the
        # summary dicts.
        out_path.write_text(json.dumps({s['label']: s for s in summaries}, ensure_ascii=False, indent=2, default=str),
                            encoding='utf-8')
        print(f'\n[done] 店长视角评分 JSON 已存: {out_path}')
    elif args.dir:
        rounds = _iter_rounds(Path(args.dir))
        perf = _load_perf(Path(args.dir))
        label = Path(args.dir).name.replace('round_', '')
        s = summarize(rounds, label, perf)
        print_summary(s)
    else:
        # argparse.error prints usage and exits with status 2.
        parser.error('需指定 --dir 或 --compare')


if __name__ == '__main__':
    main()