feat: 2026-04-15~05-02 累积变更基线 — AI 重构 + Runtime Context + DWS 修复

涵盖(每条对应已存的审计记录):
- AI 模块拆分:apps/backend/app/ai/apps -> prompts/(8 个 APP + app2a 派生)
  audit: 2026-04-20__ai-module-complete.md
- admin-web AI 管理套件:AIDashboard / AIOperations / AIRunLogs / AITriggers / TriggerManager
  audit: 2026-04-21__admin-web-ai-management-suite.md
- App2 财务洞察 prompt v3 -> v5.1 + 小程序 AI 接入(chat / board-finance)
  audit: 2026-04-22__app2_prompt_v5_1_and_miniprogram_ai_insight.md
- App2 prewarm 全过滤器 + AI 触发器 cron reschedule
  audit: 2026-04-21__app2-finance-prewarm-all-filters.md
  migration: 20260420_ai_trigger_jobs_and_app2_prewarm.sql / 20260421_app2_prewarm_cron_reschedule.sql
- AppType 联合类型对齐 + adminAiAppTypes.test.ts
  audit: 2026-04-30__admin_web_ai_app_type_alignment.md
- DashScope tokens_used 提取修复
  audit: 2026-04-30__backend_dashscope_tokens_used_extraction.md
- App3 线索完整详情 prompt
  audit: 2026-05-01__backend_app3_full_detail_prompt.md
- Runtime Context 沙箱(5-1~5-2 主线):
  - 后端 schema/service + admin_runtime_context / xcx_runtime_clock 两个 router
  - admin-web RuntimeContext.tsx + miniprogram runtime-clock.ts
  - migration: 20260501__runtime_context_sandbox.sql
  - tools/db/verify_admin_web_sandbox.py + verify_sandbox_end_to_end.py
  - database/changes: 7 份 sandbox_* 验证报告
- 飞球 DWS 修复:finance_area_daily 区域汇总 + task_engine 调整
  + RLS 视图业务日上界(migration 20260502 + scripts/ops/gen_rls_business_date_migration.py)

合规:
- .gitignore 启用 tmp/ 排除
- 不入仓:apps/etl/connectors/feiqiu/.env(API_TOKEN secret,本地修改保留)

待验证清单:
- docs/audit/changes/2026-05-04__cumulative_baseline_pending_verification.md
  每个主题的功能完整性 / 上线验证几乎都未收口,按优先级 P0~P3 逐一处理
This commit is contained in:
Neo
2026-05-04 02:30:19 +08:00
parent 2010034840
commit caf179a5da
130 changed files with 14543 additions and 2717 deletions

View File

@@ -0,0 +1,531 @@
"""App2 财务洞察 · 店长视角评分方法 v2。
围绕"店长读了这 12 条能做什么 / 学到什么"做评估,三大维度:
【1】准确性客观正确 · 越高越好)
- 数字取自 payload无编造
- 引用权威字段(非原始指标兜底)
- 遵守业务规则(手动调整不拆明细 / 禁行业数字 / 禁单期推测 / 对比口径)
- 数据缺失/0 支出主动标注
【2】洞察深度业务价值 · 越高越好)
- 非显性信号挖掘("消耗>充值=存量消费"/"差值=平均让利"等解读)
- 多指标协同分析seq 11 结构失衡 + 原因互相印证)
- 根因而非表象(如"数据录入缺失"而不只"支出为 0"
- seq 12 跟踪 4 要素齐全(指标 / 阈值 / 节奏 / 触发动作)
- 避免空洞表达("关注 XX" / "加强 XX" 被扣分)
【3】稳定性可靠性 · 越高越好;稳定 ≠ 僵化)
- 健康度评级方向一致(同数据下 10 次评级应相似,灯色众数占比 ≥ 80%
- 关键原因收敛seq 11 原因 1/2 所引 key signal 的 IoU ≥ 60%
- seq 12 跟踪指标选择一致10 次中 TOP 指标命中率高)
- 字数/时长波动小(内容饱满但不冗余 · CV 低)
用法:
PYTHONIOENCODING=utf-8 .venv/Scripts/python.exe scripts/analyze_store_manager_quality.py \\
--compare export/ai-ab-test/round_a export/ai-ab-test/round_b \\
export/ai-ab-test/round_v5 export/ai-ab-test/round_v5_1
"""
from __future__ import annotations
import argparse
import json
import re
import sys
from collections import Counter
from pathlib import Path
from statistics import mean, stdev
# ===== Core business keyword library (store-manager vocabulary) =====
# Regex patterns that indicate genuinely deep insight mining; each entry is
# (pattern, signal label). Patterns/labels are Chinese runtime strings and
# must stay as-is.
DEEP_SIGNALS = [
    # Stored-value-card liability dynamics
    ('消耗.*充值|充值.*消耗', '储值卡充消对比'),
    ('存量消费|复购乏力|复购.*收缩|复购.*减弱', '复购解读'),
    ('负债.*累积|兑付压力|负债.*减轻', '负债解读'),
    # Dual-calibre average-ticket difference
    ('按成交收入.*按发生额|按发生额.*按成交收入', '客单价双口径对比'),
    ('每单.*让利|让利.*量级|让利.*金额', '让利量化'),
    # Business-format attribution
    ('业态|定位|散客|团购为主|车站|商场', '业态归因'),
    # Data completeness
    ('数据.*缺失|数据.*完整|录入.*缺失|支出.*全.*0|支出.*为.*0|虚高风险|净利.*可信', '数据质量质疑'),
    # Synergistic deterioration / structural imbalance
    ('结构失衡|协同|多指标.*同|同向恶化', '结构性洞察'),
    # Weekly-pattern depth
    ('周六.*周.*倍|周末.*工作日|旺淡日.*倍', '周规律倍率'),
    ('同周|同星期|同周基线', '基线识别'),
]
# Hollow phrasing (should be penalised when matched).
HOLLOW_PATTERNS = [
    r'^关注\s*[^,。]+$',
    r'建议关注',
    r'加强\s*(\w+)?运营',
    r'提升\s*(\w+)?管理',
    r'需要重视',
    r'应当注意',
]
# Trend words: the same sentence must also contain a % or an absolute number,
# otherwise the sentence counts as a single-period-speculation violation.
TREND_WORDS = ['下滑', '下降', '上升', '提升', '收缩', '萎缩', '承压', '走弱', '走强',
               '加剧', '恶化', '持续', '显著', '大幅', '明显', '锐减', '攀升']
def _iter_rounds(dir_path: Path) -> list[dict]:
files = sorted(dir_path.glob('round_*.json'))
out = []
for f in files:
data = json.loads(f.read_text(encoding='utf-8'))
data['_file'] = f.name
out.append(data)
return out
def _get_seq(insights: list, seq: int) -> dict | None:
for ins in insights:
if isinstance(ins, dict) and ins.get('seq') == seq:
return ins
return None
def analyze_accuracy(insights: list) -> dict:
    """Accuracy scoring (each rule flag is 0/1, plus counters and rates).

    Args:
        insights: list of insight dicts, each with 'seq' and 'content'.

    Returns:
        dict of rule-compliance flags, the single-period speculation
        violation count (lower is better) and the share of insights that
        contain at least one number.
    """
    # Concatenate the text of each two-seq board (A/B/C/D).
    a_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (1, 2))
    b_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (3, 4))
    c_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (5, 6))
    d_texts = ' '.join((_get_seq(insights, s) or {}).get('content') or '' for s in (7, 8))
    all_text = ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict))
    # 1. Explicit comparison-calibre citation (V5.1 H1 hard requirement).
    calib_explicit = bool(re.search(r'对比口径|同期对齐|同天数对齐|当期\s*\d+\s*天.*上期|\d+\s*天同期', a_texts))
    # 2. Authoritative-field citations (C: stored-value balance change; A: unit-economics MoM number).
    authority_c = any(k in c_texts for k in ('期初', '期末', '余额变化', '其他调整'))
    authority_a = bool(re.search(r'[+-]?\d+\.?\d*%', a_texts))  # has an actual MoM percentage
    # 3. Rule compliance: no industry benchmarks, no itemised manual adjustments.
    no_industry_number = not bool(re.search(r'行业(警戒线|均值|标准|参考|基准|水平|经验值|通常).*\d+\.?\d*%?', all_text))
    no_manual_detail = not bool(re.search(r'(抹零|免单)\s*\d+[\.\d]*\s*元', b_texts))
    # 4. Single-period speculation: a trend word in a sentence with no numeric anchor.
    speculation_count = 0
    for sent in re.split(r'[。;\n]', all_text):
        if not sent.strip():
            continue
        has_trend_word = any(w in sent for w in TREND_WORDS)
        has_number = bool(re.search(r'[+-]?\d+\.?\d*%|\d{3,}元|\d+\s*元|\-\d+', sent))
        if has_trend_word and not has_number:
            # Skip meta-instruction sentences (e.g. "no speculation allowed").
            # BUG FIX: the original condition also tested `'' in sent`, which
            # is always True for any string, so every violation was skipped and
            # speculation_count was permanently 0.
            if '推测' in sent or '不能' in sent:
                continue
            speculation_count += 1
    # 5. Data-completeness flag (D board proactively notes zero/missing expenses).
    # BUG FIX: these keys are regex patterns ('支出.*0' can never occur as a
    # literal substring), so match with re.search rather than `in`.
    data_integrity_flagged = any(
        re.search(k, d_texts) for k in
        ('支出.*0', '数据缺失', '录入', '不完整', '虚高', '无法评估', '可信度'))
    # 6. Share of insights that contain at least one digit.
    ins_with_number = sum(1 for ins in insights if isinstance(ins, dict) and
                          re.search(r'\d+', ins.get('content') or ''))
    number_rate = ins_with_number / len(insights) if insights else 0
    return {
        'calib_explicit': int(calib_explicit),
        'authority_c': int(authority_c),
        'authority_a_env_bi': int(authority_a),
        'no_industry_number': int(no_industry_number),
        'no_manual_detail': int(no_manual_detail),
        'speculation_count': speculation_count,  # lower is better
        'data_integrity_flagged': int(data_integrity_flagged),
        'number_rate': round(number_rate, 2),
    }
def analyze_depth(insights: list) -> dict:
    """Insight-depth scoring.

    Args:
        insights: list of insight dicts, each with 'seq' and 'content'.

    Returns:
        dict with deep-signal hit stats, hollow-phrasing count (lower is
        better), seq 11 cause-structure flags, seq 12 four-element
        completeness and a multi-metric synergy flag.
    """
    all_text = ' '.join((ins.get('content') or '') for ins in insights if isinstance(ins, dict))
    # 1. Deep-signal hits: how many of the 10 signal classes appear anywhere.
    deep_hits: list[str] = []
    for pattern, name in DEEP_SIGNALS:
        if re.search(pattern, all_text):
            deep_hits.append(name)
    deep_hit_count = len(deep_hits)
    # 2. Hollow phrasing: at most one hit counted per insight.
    hollow_count = 0
    for ins in insights:
        if not isinstance(ins, dict):
            continue
        body = ins.get('content') or ''
        for pattern in HOLLOW_PATTERNS:
            if re.search(pattern, body):
                hollow_count += 1
                break
    # 3. seq 11: top-2 cause structure, and whether each cause carries a
    #    meaningful interpretation (not just piled-up numbers).
    seq11 = (_get_seq(insights, 11) or {}).get('content') or ''
    has_r1 = bool(re.search(r'原因\s*1|1\s*[\)\.、:]|①|原因一', seq11))
    has_r2 = bool(re.search(r'原因\s*2|2\s*[\)\.、:]|②|原因二', seq11))
    f_top2 = int(has_r1 and has_r2)
    # Interpretation words that signal causal meaning rather than raw data.
    has_semantic = bool(re.search(r'收缩|虚高|失衡|风险|无法评估|可信|压力|乏力|不足|崩塌|疲态|暴露',
                                  seq11))
    f_top2_semantic = int(has_r1 and has_r2 and has_semantic)
    # 4. seq 12: all four tracking elements (metric / cadence / threshold / action).
    seq12 = (_get_seq(insights, 12) or {}).get('content') or ''
    # BUG FIX: the original pattern was [储值卡余额|会员|...] - a character
    # class that matches ANY single listed character (including '|'), so it was
    # almost always true. An alternation group is what was intended.
    has_indicator = bool(re.search(r'(储值卡余额|会员|客单价|成交收入|现金流入|现金流出|支出|占比|助教|订单)', seq12))
    has_rhythm = bool(re.search(r'(每[周月日天]|每\s*\d+|双周|旬|定期|周期性|连续)', seq12))
    has_threshold = bool(re.search(r'<\s*-?\d+|>\s*-?\d+|≥|≤|达到|跌破|超过|低于|目标', seq12))
    has_action = bool(re.search(r'(启动|触发|召回|发起|立即|核查|补录|校准|活动|赠金|审批|预算)', seq12))
    seq12_4elem = int(has_indicator and has_rhythm and has_threshold and has_action)
    # 5. Synergy detection: does seq 11 tie multiple metrics together?
    collaborative = bool(re.search(r'(\+|与|及|和|协同|同时|叠加|共同|双收缩|双下降).*(\+|与|及|和)', seq11)) or \
        bool(re.search(r'\d+.*\d+.*\d+', seq11))  # >= 3 numbers implies multi-signal
    return {
        'deep_hit_count': deep_hit_count,  # 0-10
        'deep_hits': deep_hits,
        'hollow_count': hollow_count,  # lower is better
        'f_top2_structure': f_top2,
        'f_top2_with_semantic': f_top2_semantic,
        'seq12_4elem_complete': seq12_4elem,
        'collaborative_analysis': int(collaborative),
    }
def analyze_stability_round(insights: list) -> dict:
    """Extract one round's "fingerprint" for the cross-round stability analysis.

    Args:
        insights: list of insight dicts, each with 'seq', 'content' and
            optionally 'title'.

    Returns:
        dict with the health-light colour, the set of key signals cited by
        seq 11, the tracking metric chosen in seq 12, and the average
        content length.
    """
    # 1. Health-light colour (red > yellow > green precedence).
    seq11 = (_get_seq(insights, 11) or {}).get('content') or ''
    light = 'unknown'
    if re.search(r'🔴|红灯', seq11):
        light = 'red'
    elif re.search(r'🟡|黄灯', seq11):
        light = 'yellow'
    elif re.search(r'🟢|绿灯', seq11):
        light = 'green'
    # 2. Key signals: which business keywords seq 11 mentions (regex vocab).
    KEY_SIGNAL_VOCAB = [
        '会员占比', '会员订单占比',
        '客单价',
        '储值卡余额', '储值卡',
        '支出.*0', '支出缺失', '数据缺失',
        '复购',
        '优惠',
        '助教成本', '人力成本',
        '成交收入', '现金流入',
    ]
    signals = set()
    for kw in KEY_SIGNAL_VOCAB:
        if re.search(kw, seq11):
            signals.add(kw)
    # 3. Tracking metric chosen in seq 12 (first label whose pattern matches).
    seq12_title = (_get_seq(insights, 12) or {}).get('title') or ''
    seq12_body = (_get_seq(insights, 12) or {}).get('content') or ''
    tracking_indicator = 'unknown'
    for kw, label in [
        ('储值卡余额', '储值卡余额'),
        ('会员占比|会员订单占比', '会员占比'),
        ('客单价', '客单价'),
        ('现金流出|支出', '现金流出/支出'),
        ('成交收入', '成交收入'),
        ('助教|人力成本', '助教/人力成本'),
    ]:
        if re.search(kw, seq12_title + ' ' + seq12_body):
            tracking_indicator = label
            break
    # 4. Average content length over dict-shaped insights.
    # BUG FIX: the original summed lengths only over dict entries but divided
    # by len(insights); divide by the number of entries actually summed.
    dict_insights = [ins for ins in insights if isinstance(ins, dict)]
    avg_len = 0
    if dict_insights:
        avg_len = sum(len(ins.get('content') or '') for ins in dict_insights) / len(dict_insights)
    return {
        'light': light,
        'seq11_signals': signals,
        'tracking_indicator': tracking_indicator,
        'avg_content_len': round(avg_len, 1),
    }
def summarize(rounds_data: list[dict], label: str, perf: list[dict] | None = None) -> dict:
    """Aggregate the three-dimension scores over n rounds.

    Args:
        rounds_data: raw round_*.json payloads (each containing 'parsed').
        perf: externally supplied performance records ({'duration_s', 'tokens'}
            per round), taken from each round's meta.

    Returns:
        dict with per-dimension detail scores plus a 0-100 weighted composite;
        returns {'label': ..., 'n': 0} when no round has a non-empty insights
        list.
    """
    accuracies = []
    depths = []
    stab_fingerprints = []
    # Score every round that actually contains insights; rounds without a
    # usable insights list are skipped (so n may be < len(rounds_data)).
    for rd in rounds_data:
        insights = (rd.get('parsed') or {}).get('insights') or []
        if not isinstance(insights, list) or not insights:
            continue
        accuracies.append(analyze_accuracy(insights))
        depths.append(analyze_depth(insights))
        stab_fingerprints.append(analyze_stability_round(insights))
    n = len(accuracies)
    if n == 0:
        return {'label': label, 'n': 0}
    # === Accuracy aggregation (mean of per-round 0/1 flags = rate) ===
    acc_scores = {
        'calib_explicit_rate': mean(a['calib_explicit'] for a in accuracies),
        'authority_c_rate': mean(a['authority_c'] for a in accuracies),
        'authority_a_env_bi_rate': mean(a['authority_a_env_bi'] for a in accuracies),
        'no_industry_number_rate': mean(a['no_industry_number'] for a in accuracies),
        'no_manual_detail_rate': mean(a['no_manual_detail'] for a in accuracies),
        'speculation_avg': mean(a['speculation_count'] for a in accuracies),
        'data_integrity_flagged_rate': mean(a['data_integrity_flagged'] for a in accuracies),
        'number_rate_avg': mean(a['number_rate'] for a in accuracies),
    }
    # === Insight-depth aggregation ===
    depth_scores = {
        'deep_hit_avg': mean(d['deep_hit_count'] for d in depths),
        'deep_hit_union': len(set.union(*[set(d['deep_hits']) for d in depths])) if depths else 0,
        'hollow_avg': mean(d['hollow_count'] for d in depths),
        'f_top2_rate': mean(d['f_top2_structure'] for d in depths),
        'f_top2_with_semantic_rate': mean(d['f_top2_with_semantic'] for d in depths),
        'seq12_4elem_rate': mean(d['seq12_4elem_complete'] for d in depths),
        'collab_analysis_rate': mean(d['collaborative_analysis'] for d in depths),
    }
    # === Stability aggregation (core: mode share / IoU / CV) ===
    # Light-colour mode share across rounds.
    light_counter = Counter(s['light'] for s in stab_fingerprints)
    light_mode_rate = light_counter.most_common(1)[0][1] / n
    # Mean pairwise IoU of seq 11 signal sets over all round pairs.
    iou_scores = []
    for i in range(n):
        for j in range(i + 1, n):
            s1, s2 = stab_fingerprints[i]['seq11_signals'], stab_fingerprints[j]['seq11_signals']
            union = s1 | s2
            if not union:
                continue
            iou_scores.append(len(s1 & s2) / len(union))
    seq11_iou = mean(iou_scores) if iou_scores else 0
    # Tracking-indicator consistency (mode share).
    ti_counter = Counter(s['tracking_indicator'] for s in stab_fingerprints)
    tracking_mode_rate = ti_counter.most_common(1)[0][1] / n
    # Content-length coefficient of variation (low = stable).
    content_lens = [s['avg_content_len'] for s in stab_fingerprints]
    content_cv = (stdev(content_lens) / mean(content_lens)) if len(content_lens) > 1 and mean(content_lens) > 0 else 0
    # Performance CV; perf entries with missing values are dropped first.
    perf_data = perf or []
    durations = [p['duration_s'] for p in perf_data if p.get('duration_s') is not None]
    tokens = [p['tokens'] for p in perf_data if p.get('tokens') is not None]
    duration_cv = (stdev(durations) / mean(durations)) if len(durations) > 1 and mean(durations) > 0 else 0
    tokens_cv = (stdev(tokens) / mean(tokens)) if len(tokens) > 1 and mean(tokens) > 0 else 0
    stab_scores = {
        'light_mode': light_counter.most_common(1)[0][0],
        'light_mode_rate': light_mode_rate,
        'light_distribution': dict(light_counter),
        'seq11_signal_iou': seq11_iou,
        'tracking_mode': ti_counter.most_common(1)[0][0],
        'tracking_mode_rate': tracking_mode_rate,
        'content_len_cv': content_cv,
        'duration_cv': duration_cv,
        'tokens_cv': tokens_cv,
        'duration_mean': mean(durations) if durations else 0,
        'tokens_mean': mean(tokens) if tokens else 0,
    }
    # === Composite score (0-100, three weighted dimensions) ===
    # Accuracy 40% + insight depth 35% + stability 25%.
    acc_composite = (
        acc_scores['calib_explicit_rate'] * 0.25 +
        acc_scores['authority_c_rate'] * 0.15 +
        acc_scores['authority_a_env_bi_rate'] * 0.10 +
        acc_scores['no_industry_number_rate'] * 0.15 +
        acc_scores['no_manual_detail_rate'] * 0.10 +
        max(0, 1 - acc_scores['speculation_avg'] / 5) * 0.15 +  # 5 speculations scale the term to 0
        acc_scores['data_integrity_flagged_rate'] * 0.10
    )
    depth_composite = (
        min(depth_scores['deep_hit_avg'] / 5, 1) * 0.30 +  # 5+ deep-signal classes = full marks
        max(0, 1 - depth_scores['hollow_avg'] / 3) * 0.15 +  # 3 hollow phrases scale the term to 0
        depth_scores['f_top2_with_semantic_rate'] * 0.25 +
        depth_scores['seq12_4elem_rate'] * 0.20 +
        depth_scores['collab_analysis_rate'] * 0.10
    )
    stab_composite = (
        stab_scores['light_mode_rate'] * 0.30 +  # same-light-colour share
        stab_scores['seq11_signal_iou'] * 0.25 +  # cause-signal IoU
        stab_scores['tracking_mode_rate'] * 0.25 +  # tracking-metric consistency
        max(0, 1 - stab_scores['content_len_cv'] * 2) * 0.10 +  # CV of 0.5 scales the term to 0
        max(0, 1 - stab_scores['duration_cv'] * 2) * 0.10
    )
    overall = acc_composite * 0.4 + depth_composite * 0.35 + stab_composite * 0.25
    return {
        'label': label,
        'n': n,
        'accuracy': acc_scores,
        'depth': depth_scores,
        'stability': stab_scores,
        'composite': {
            'accuracy': round(acc_composite * 100, 1),
            'depth': round(depth_composite * 100, 1),
            'stability': round(stab_composite * 100, 1),
            'overall': round(overall * 100, 1),
        },
    }
def _load_perf(dir_path: Path) -> list[dict]:
"""从 round_XX.json 的 meta 提取 duration/tokens。"""
out = []
for f in sorted(dir_path.glob('round_*.json')):
data = json.loads(f.read_text(encoding='utf-8'))
meta = data.get('meta') or {}
out.append({'duration_s': meta.get('duration_s'), 'tokens': meta.get('tokens')})
return out
def print_summary(s: dict) -> None:
    """Pretty-print one summarize() result to stdout (Chinese report labels).

    Expects the dict shape produced by summarize(); prints a short notice and
    returns early when the summary holds no data (n == 0).
    """
    if s['n'] == 0:
        print(f"\n=== {s['label'].upper()} 数据为空 ===")
        return
    c = s['composite']
    # Composite header.
    print(f"\n=== Round {s['label'].upper()} · 店长视角评分n={s['n']}===")
    print(f" 【综合评分】 {c['overall']:.1f} / 100")
    print(f" 准确性 {c['accuracy']:.1f} | 洞察深度 {c['depth']:.1f} | 稳定性 {c['stability']:.1f}")
    # Accuracy detail.
    print(f"\n -- 准确性明细 --")
    a = s['accuracy']
    print(f" 对比口径显式引用: {a['calib_explicit_rate']:.0%}")
    print(f" C 权威字段(余额变化): {a['authority_c_rate']:.0%}")
    print(f" A 权威字段(环比数字): {a['authority_a_env_bi_rate']:.0%}")
    print(f" 禁行业数字合规: {a['no_industry_number_rate']:.0%}")
    print(f" 禁手动调整拆明细合规: {a['no_manual_detail_rate']:.0%}")
    print(f" 单期推测违规/次 (低好): {a['speculation_avg']:.1f}")
    print(f" 数据完整性标注: {a['data_integrity_flagged_rate']:.0%}")
    # Insight-depth detail.
    print(f"\n -- 洞察深度明细 --")
    d = s['depth']
    print(f" 深度信号命中/次 (满 10): {d['deep_hit_avg']:.1f}")
    print(f" 跨轮覆盖信号数: {d['deep_hit_union']} / 10")
    print(f" 空洞表达/次 (低好): {d['hollow_avg']:.1f}")
    print(f" seq 11 top 2 结构: {d['f_top2_rate']:.0%}")
    print(f" seq 11 有意义解读: {d['f_top2_with_semantic_rate']:.0%}")
    print(f" seq 12 四要素齐全: {d['seq12_4elem_rate']:.0%}")
    print(f" 多指标协同分析: {d['collab_analysis_rate']:.0%}")
    # Stability detail.
    print(f"\n -- 稳定性明细 --")
    st = s['stability']
    print(f" 评级众数 ({st['light_mode']}) 占比: {st['light_mode_rate']:.0%} [{st['light_distribution']}]")
    print(f" seq 11 原因信号 IoU: {st['seq11_signal_iou']:.0%}")
    print(f" 跟踪指标众数 ({st['tracking_mode']}) 占比: {st['tracking_mode_rate']:.0%}")
    print(f" 字数 CV (低好): {st['content_len_cv']:.2f}")
    print(f" 时长 CV (低好): {st['duration_cv']:.2f} 均值 {st['duration_mean']:.1f}s")
    print(f" tokens CV (低好): {st['tokens_cv']:.2f} 均值 {st['tokens_mean']:.0f}")
def print_compare(*summaries: dict) -> None:
    """Print a side-by-side comparison table for 2+ summarize() results.

    Each column is one round label; rows group composite, accuracy, depth
    and stability metrics. Arrows in row names mark lower-is-better rows.
    """
    labels = [s['label'].upper() for s in summaries]
    header = ' vs '.join(labels)
    col_w = 12
    print(f"\n======= {header} 店长视角综合评分 =======")
    print(f"{'指标':<34}" + ''.join(f"{l:>{col_w}}" for l in labels))
    print('-' * (34 + col_w * len(labels)))
    def _row(name: str, values: list, fmt: str = 'float', higher_better: bool = True) -> None:
        # Print one table row; fmt selects the cell format.
        # NOTE(review): higher_better is currently unused — kept for callers.
        cells = []
        for v in values:
            if fmt == 'percent':
                cells.append(f'{v:.0%}')
            elif fmt == 'int':
                cells.append(str(v))
            elif fmt == 'float1':
                cells.append(f'{v:.1f}')
            else:
                cells.append(f'{v:.2f}')
        print(f"{name:<34}" + ''.join(f"{c:>{col_w}}" for c in cells))
    print('【综合】')
    _row(' 总分 / 100', [s['composite']['overall'] for s in summaries], 'float1')
    _row(' 准确性 (40%)', [s['composite']['accuracy'] for s in summaries], 'float1')
    _row(' 洞察深度 (35%)', [s['composite']['depth'] for s in summaries], 'float1')
    _row(' 稳定性 (25%)', [s['composite']['stability'] for s in summaries], 'float1')
    print('【准确性】')
    _row(' 对比口径显式 ★', [s['accuracy']['calib_explicit_rate'] for s in summaries], 'percent')
    _row(' C 权威字段', [s['accuracy']['authority_c_rate'] for s in summaries], 'percent')
    _row(' 数据完整性标注', [s['accuracy']['data_integrity_flagged_rate'] for s in summaries], 'percent')
    _row(' 单期推测违规/次 ↓', [s['accuracy']['speculation_avg'] for s in summaries], 'float1')
    _row(' 禁行业数字合规', [s['accuracy']['no_industry_number_rate'] for s in summaries], 'percent')
    print('【洞察深度】')
    _row(' 深度信号命中/次', [s['depth']['deep_hit_avg'] for s in summaries], 'float1')
    _row(' 跨轮信号覆盖 /10', [s['depth']['deep_hit_union'] for s in summaries], 'int')
    _row(' 空洞表达/次 ↓', [s['depth']['hollow_avg'] for s in summaries], 'float1')
    _row(' seq 11 top 2 结构', [s['depth']['f_top2_rate'] for s in summaries], 'percent')
    _row(' seq 11 有意义解读 ★', [s['depth']['f_top2_with_semantic_rate'] for s in summaries], 'percent')
    _row(' seq 12 四要素齐全', [s['depth']['seq12_4elem_rate'] for s in summaries], 'percent')
    _row(' 多指标协同分析', [s['depth']['collab_analysis_rate'] for s in summaries], 'percent')
    print('【稳定性(同数据下越一致越好)】')
    _row(' 评级众数占比 ★', [s['stability']['light_mode_rate'] for s in summaries], 'percent')
    _row(' seq 11 原因信号 IoU', [s['stability']['seq11_signal_iou'] for s in summaries], 'percent')
    _row(' 跟踪指标众数占比 ★', [s['stability']['tracking_mode_rate'] for s in summaries], 'percent')
    _row(' 字数 CV ↓', [s['stability']['content_len_cv'] for s in summaries], 'float')
    _row(' 时长 CV ↓', [s['stability']['duration_cv'] for s in summaries], 'float')
    _row(' 均时长 s', [s['stability']['duration_mean'] for s in summaries], 'float1')
    _row(' 均 tokens', [s['stability']['tokens_mean'] for s in summaries], 'float1')
    print()
    # Trailer: per-round light-colour distributions.
    for s in summaries:
        print(f" {s['label'].upper()} 灯色分布: {s['stability']['light_distribution']}")
def main() -> None:
    """CLI entry point: analyze one directory (--dir) or compare several (--compare).

    In compare mode the per-round summaries are printed, a comparison table is
    shown, and the combined JSON is written under export/ai-ab-test/.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', help='单目录分析')
    parser.add_argument('--compare', nargs='+', metavar='DIR', help='多目录对比2-5 个)')
    args = parser.parse_args()
    if args.compare:
        dirs = [Path(d) for d in args.compare]
        summaries = []
        for d in dirs:
            rounds = _iter_rounds(d)
            if not rounds:
                # Directories with no round_*.json are skipped, not fatal.
                print(f'[skip] {d} 无数据')
                continue
            perf = _load_perf(d)
            # Label a directory "round_x" as just "x".
            label = d.name.replace('round_', '')
            summaries.append(summarize(rounds, label, perf))
        for s in summaries:
            print_summary(s)
        print_compare(*summaries)
        # Persist the combined result keyed by label; default=str handles
        # non-JSON types (e.g. the sets inside stability fingerprints).
        tag = '_'.join(s['label'] for s in summaries)
        out_path = Path(f'export/ai-ab-test/_manager_quality_{tag}.json')
        out_path.write_text(json.dumps({s['label']: s for s in summaries}, ensure_ascii=False, indent=2, default=str),
                            encoding='utf-8')
        print(f'\n[done] 店长视角评分 JSON 已存: {out_path}')
    elif args.dir:
        rounds = _iter_rounds(Path(args.dir))
        perf = _load_perf(Path(args.dir))
        label = Path(args.dir).name.replace('round_', '')
        s = summarize(rounds, label, perf)
        print_summary(s)
    else:
        parser.error('需指定 --dir 或 --compare')
# Script entry point.
if __name__ == '__main__':
    main()