Neo-ZQYY/scripts/ops/export_v6_report.py

# -*- coding: utf-8 -*-
"""导出第六次 ETL 执行结果报告，分析所有任务的成功/失败状态。"""
import json
import re
from pathlib import Path

from dotenv import load_dotenv

load_dotenv(Path(__file__).resolve().parents[2] / ".env")

from _env_paths import get_output_path

raw_path = get_output_path("SYSTEM_LOG_ROOT") / "2026-02-21__etl_run_raw_v6.json"
data = json.loads(raw_path.read_text(encoding="utf-8"))

error_log = data.get("error_log", "")
lines = error_log.strip().split("\n")

print(f"日志总行数: {len(lines)}")

# 解析任务结果
success_tasks = []
failed_tasks = []

# 匹配成功模式
success_patterns = [
    r"任务完成:\s*(\S+)",
    r"工具类任务执行成功:\s*(\S+)",
    r"(\S+)\s*完成，统计=",
]

# 匹配失败模式
fail_pattern = re.compile(r"任务\s+(\S+)\s+失败:\s*(.*)")

for line in lines:
    # 成功
    for pat in success_patterns:
        m = re.search(pat, line)
        if m:
            task = m.group(1).strip()
            if task not in success_tasks:
                success_tasks.append(task)
            break

    # 失败
    m = fail_pattern.search(line)
    if m:
        task = m.group(1).strip()
        err_msg = m.group(2).strip()[:120]
        # 分类错误
        if "InFailedSqlTransaction" in err_msg:
            err_type = "InFailedSqlTransaction（级联）"
        elif "UndefinedColumn" in err_msg:
            err_type = f"UndefinedColumn: {err_msg}"
        elif "UniqueViolation" in err_msg:
            err_type = "UniqueViolation"
        elif "UndefinedTable" in err_msg:
            err_type = f"UndefinedTable: {err_msg}"
        else:
            err_type = err_msg
        if task not in [t for t, _ in failed_tasks]:
            failed_tasks.append((task, err_type))

# 去掉成功列表中也出现在失败列表中的（可能先成功后失败）
fail_names = {t for t, _ in failed_tasks}
success_only = [t for t in success_tasks if t not in fail_names]

print(f"\n成功: {len(success_only)}, 失败: {len(failed_tasks)}")
print("\n--- 成功任务 ---")
for i, t in enumerate(success_only, 1):
    print(f"  {i}. {t}")

print("\n--- 失败任务 ---")
for i, (t, e) in enumerate(failed_tasks, 1):
    print(f"  {i}. {t} → {e}")

# 找出根因（非级联的失败）
root_failures = [(t, e) for t, e in failed_tasks if "级联" not in e]
cascade_failures = [(t, e) for t, e in failed_tasks if "级联" in e]

print(f"\n--- 根因失败（{len(root_failures)} 个）---")
for t, e in root_failures:
    print(f"  {t} → {e}")

print(f"\n--- 级联失败（{len(cascade_failures)} 个）---")
for t, _ in cascade_failures:
    print(f"  {t}")

# 生成报告
report = f"""# 第六次 ETL 执行结果报告

- execution_id: `d9443781-e4ac-4df6-9f87-11c45d72e5ba`
- 执行时间: 2026-02-21 20:45:18 ~ 21:14:45（29 分 26 秒）
- exit_code: 0
- status: success
- 总任务数: 31
- 数据统计: 获取 171,961 / 新增 13,662 / 更新 171,595 / 跳过 0 / 错误 0

## 成功任务（{len(success_only)} 个）

| # | 任务 |
|---|------|
"""
for i, t in enumerate(success_only, 1):
    report += f"| {i} | {t} |\n"

if failed_tasks:
    report += f"""
## 失败任务（{len(failed_tasks)} 个）

| # | 任务 | 错误类型 |
|---|------|----------|
"""
    for i, (t, e) in enumerate(failed_tasks, 1):
        report += f"| {i} | {t} | {e} |\n"

    if root_failures:
        report += f"""
## 根因分析（{len(root_failures)} 个非级联失败）

"""
        for t, e in root_failures:
            report += f"- `{t}`: {e}\n"

    if cascade_failures:
        report += f"""
## 级联失败（{len(cascade_failures)} 个）

由根因失败导致 psycopg2 连接进入 InFailedSqlTransaction 状态，后续任务全部级联失败。
"""

report += """
## 与前次对比

| 轮次 | 成功 | 失败 | 耗时 | 修复的 BUG |
|------|------|------|------|-----------|
"""
report += f"| v1 | 10 | 31 | 9m51s | — |\n"
report += f"| v2 | — | — | 2m30s | BUG 1 |\n"
report += f"| v3 | 9 | 22 | 11m21s | BUG 2+3 |\n"
report += f"| v4 | 10 | 21 | 11m55s | BUG 4 |\n"
report += f"| v5 | 10 | 21 | 11m37s | BUG 5 |\n"
report += f"| v6 | {len(success_only)} | {len(failed_tasks)} | 29m26s | BUG 5+6+7 |\n"

out_path = get_output_path("SYSTEM_LOG_ROOT") / "2026-02-21__etl_run_result_v6.md"
out_path.write_text(report, encoding="utf-8")
print(f"\n报告已导出: {out_path}")