Files
Neo-ZQYY/scripts/ops/etl_integration_report.py

351 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
ETL 全流程联调报告生成脚本
从后端 API 获取执行日志,解析计时数据和错误信息,生成综合联调报告。
报告输出到 SYSTEM_LOG_ROOT 环境变量指定的目录。
"""
import os
import re
import json
import sys
from datetime import datetime, timedelta
from pathlib import Path
from dotenv import load_dotenv
# Load the root .env file (parents[2] of this script's resolved path).
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# The report destination is mandatory: abort with exit code 1 when the
# SYSTEM_LOG_ROOT environment variable is missing or empty.
SYSTEM_LOG_ROOT = os.getenv("SYSTEM_LOG_ROOT")
if SYSTEM_LOG_ROOT is None or SYSTEM_LOG_ROOT == "":
    print("ERROR: SYSTEM_LOG_ROOT 环境变量未设置", file=sys.stderr)
    raise SystemExit(1)
# ── Execution metadata (taken from the API history) ──
# Hard-coded snapshot of one run. These values are echoed verbatim into the
# "执行概要" table of the report; they are not derived from the parsed log.
EXEC_ID = "1e1c93ff-2ab0-42e6-b529-ec14b551c91a"
EXEC_STATUS = "success"
EXEC_EXIT_CODE = 0
EXEC_STARTED = "2026-02-24T02:15:26.689731+08:00"
EXEC_FINISHED = "2026-02-24T02:50:39.679479+08:00"
# Matches EXEC_FINISHED - EXEC_STARTED (35m 12.989s) expressed in milliseconds.
EXEC_DURATION_MS = 2112989
TASK_COUNT = 41
FLOW = "api_full"
PROCESSING_MODE = "full_window"
# Processing window bounds and slice size (days per slice) shown in the report.
WINDOW_START = "2025-11-01"
WINDOW_END = "2026-02-20"
WINDOW_SPLIT_DAYS = 30
# ── Log file location ──
# The captured execution log is expected to sit in the same directory as this script.
SCRIPT_DIR = Path(__file__).resolve().parent
ERROR_LOG_PATH = SCRIPT_DIR.joinpath("_tmp_error_log.txt")
if ERROR_LOG_PATH.exists():
    # Keep both the raw text and the per-line view; later sections scan `lines`.
    log_text = ERROR_LOG_PATH.read_text(encoding="utf-8")
    lines = log_text.splitlines()
else:
    print(f"ERROR: 日志文件不存在: {ERROR_LOG_PATH}", file=sys.stderr)
    sys.exit(1)
# ── 1. Parse the start/end time of each task ──
# A timestamped log line opens with "[YYYY-MM-DD HH:MM:SS]".
TS_RE = re.compile(r"^\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]")


def parse_ts(line: str):
    """Return the leading timestamp of a log line as a naive ``datetime``.

    Returns ``None`` when the line does not start with a
    ``[YYYY-MM-DD HH:MM:SS]`` prefix.
    """
    found = TS_RE.match(line)
    if found is None:
        return None
    return datetime.strptime(found.group(1), "%Y-%m-%d %H:%M:%S")
# Extract the start and end time of every task.
# Maps task code -> {"start": datetime, "end": datetime | None,
#                    "status": "running" | "success" | "failed", "duration_s": number}.
task_timings: dict[str, dict] = {}

START_RE = re.compile(r"开始执行(\w+) \(ODS\)|(\w+): ODS fetch\+load start")
COMPLETE_RE = re.compile(r"(\w+) ODS 任务完成:|(\w+): 完成,统计=|(\w+): 完成, 统计=|(\w+): 结果统计:")
FAIL_RE = re.compile(r"任务 (\w+) 失败:")
# NOTE: DWD_START_RE and DWS_START_RE are kept for backward compatibility only.
# The loop below detects DWD starts with substring checks and DWS starts with
# _DWS_START_PATTERNS, which additionally accept an English "start" marker.
DWD_START_RE = re.compile(r"DWD_LOAD_FROM_ODS.*开始|开始运行.*DWD_LOAD_FROM_ODS")
DWD_COMPLETE_RE = re.compile(r"DWD_LOAD_FROM_ODS: 完成")
DWS_START_RE = re.compile(r"(DWS_\w+):.*开始|开始执行.*(DWS_\w+)")
DWS_COMPLETE_RE = re.compile(r"(DWS_\w+): 完成")

# DWS start markers, compiled once instead of being re-specified for every log line.
_DWS_START_PATTERNS = (
    re.compile(r"(DWS_\w+):.*(?:开始|start)"),
    re.compile(r"开始执行.*(DWS_\w+)"),
)
_DWD_TASK = "DWD_LOAD_FROM_ODS"


def _first_group(match):
    """Return the first non-empty capture group of *match*, or None if all are empty."""
    return next((group for group in match.groups() if group), None)


def _mark_started(task, ts):
    """Record *ts* as the start of *task*; only the first start seen is kept."""
    if task not in task_timings:
        task_timings[task] = {"start": ts, "end": None, "status": "running"}


def _mark_finished(task, ts, status):
    """Set end time and status on an already-started task.

    Returns True when the task had a start record (and was updated),
    False when the task is unknown and nothing was changed.
    """
    record = task_timings.get(task)
    if record is None:
        return False
    record["end"] = ts
    record["status"] = status
    return True


for line in lines:
    ts = parse_ts(line)
    if ts is None:
        # Lines without a timestamp prefix carry no timing information.
        continue

    # ODS task start
    m = START_RE.search(line)
    if m:
        task = _first_group(m)
        if task:
            _mark_started(task, ts)

    # ODS task completion (ignored for tasks whose start was never seen)
    m = COMPLETE_RE.search(line)
    if m:
        task = _first_group(m)
        if task:
            _mark_finished(task, ts, "success")

    # DWD start
    if _DWD_TASK in line and ("开始" in line or "start" in line.lower()):
        _mark_started(_DWD_TASK, ts)

    # DWD completion
    if DWD_COMPLETE_RE.search(line):
        _mark_finished(_DWD_TASK, ts, "success")

    # DWS task start (first occurrence only); both patterns are always tried.
    for pattern in _DWS_START_PATTERNS:
        m = pattern.search(line)
        if m:
            _mark_started(m.group(1), ts)

    # DWS task completion
    m = DWS_COMPLETE_RE.search(line)
    if m:
        _mark_finished(m.group(1), ts, "success")

    # Task failure: overrides any earlier status. A failure for a task whose
    # start was never seen is recorded with start == end.
    m = FAIL_RE.search(line)
    if m:
        task = m.group(1)
        if not _mark_finished(task, ts, "failed"):
            task_timings[task] = {"start": ts, "end": ts, "status": "failed"}

# Compute durations; tasks missing a start or an end get a duration of 0.
for info in task_timings.values():
    if info["start"] and info["end"]:
        info["duration_s"] = (info["end"] - info["start"]).total_seconds()
    else:
        info["duration_s"] = 0
# ── 2. Collect errors and warnings ──
errors: list[dict] = []
warnings: list[dict] = []
for idx, raw in enumerate(lines):
    stamp = parse_ts(raw)
    if "ERROR" in raw:
        # Gather the error line plus following traceback context. At most the
        # next 9 lines are inspected, so a context holds up to 10 lines in total.
        captured = [raw]
        for follower in lines[idx + 1 : idx + 10]:
            # A new timestamped entry ends the context, unless that entry
            # itself mentions a Traceback.
            if TS_RE.match(follower) and "Traceback" not in follower:
                break
            captured.append(follower)
        errors.append({"ts": stamp, "line": raw.strip(), "context": "\n".join(captured)})
    elif "WARNING" in raw:
        warnings.append({"ts": stamp, "line": raw.strip()})
# ── 3. Classify errors ──
def _categorize(err: dict) -> str:
    """Return the category label for one collected error record.

    Rules are checked in priority order: unknown task type (headline only),
    then the two FDW table names, then a failed-transaction marker (both in
    the captured context); anything else falls into the catch-all category.
    """
    if "未知的任务类型" in err["line"]:
        return "任务未注册"
    context = err["context"]
    if any(table in context for table in ("member_retention_clue", "member_birthday_manual")):
        return "FDW 表缺失(根因)"
    if "InFailedSqlTransaction" in context:
        return "事务级联失败"
    return "其他"


# Category label -> list of error records, in first-seen order.
error_categories: dict[str, list] = {}
for err in errors:
    error_categories.setdefault(_categorize(err), []).append(err)
# ── 4. Group timings by layer ──
def _layer_tasks(prefix: str) -> dict:
    """Return the entries of task_timings whose task code starts with *prefix*."""
    return {name: record for name, record in task_timings.items() if name.startswith(prefix)}


ods_tasks = _layer_tasks("ODS_")
dwd_tasks = _layer_tasks("DWD_")
dws_tasks = _layer_tasks("DWS_")

# Top-5 longest tasks; tasks with a zero duration are left out.
all_with_duration = [entry for entry in task_timings.items() if entry[1]["duration_s"] > 0]
top5 = sorted(all_with_duration, key=lambda entry: entry[1]["duration_s"], reverse=True)[:5]

# Total duration per layer
ods_total = sum(record["duration_s"] for record in ods_tasks.values())
dwd_total = sum(record["duration_s"] for record in dwd_tasks.values())
dws_total = sum(record["duration_s"] for record in dws_tasks.values())

# Success / failure statistics (tasks still "running" count as neither)
failed_tasks = [name for name, record in task_timings.items() if record["status"] == "failed"]
failed_count = len(failed_tasks)
success_count = sum(record["status"] == "success" for record in task_timings.values())
# ── 5. Generate the report ──
def fmt_duration(seconds: float) -> str:
    """Format a number of seconds as ``"Mm SSs"`` or ``"Hh MMm SSs"``.

    Fractional seconds are truncated. Negative input yields ``"N/A"``.
    The hour component is shown only when it is non-zero.
    """
    if seconds < 0:
        return "N/A"
    whole = int(seconds)
    hours, leftover = whole // 3600, whole % 3600
    minutes, secs = leftover // 60, leftover % 60
    if hours:
        return f"{hours}h {minutes:02d}m {secs:02d}s"
    return f"{minutes}m {secs:02d}s"
# Accumulates the Markdown report, one entry per output line.
report_lines: list[str] = []


def w(line: str = ""):
    """Append one line to the report buffer; with no argument, append a blank line."""
    report_lines.append(line)
w("# ETL 全流程联调报告")
w()
w(f"> 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
w()
# ── 执行概要 ──
w("## 执行概要")
w()
w(f"| 项目 | 值 |")
w(f"|------|-----|")
w(f"| Execution ID | `{EXEC_ID}` |")
w(f"| Flow | `{FLOW}` (API → ODS → DWD → DWS → INDEX) |")
w(f"| 处理模式 | `{PROCESSING_MODE}` |")
w(f"| 时间窗口 | {WINDOW_START} ~ {WINDOW_END} |")
w(f"| 窗口切分 | 按天,{WINDOW_SPLIT_DAYS} 天/切片(共 4 个切片) |")
w(f"| 强制全量 | 是 (`force_full`) |")
w(f"| 任务数 | {TASK_COUNT} 个(全选 `is_common=True` |")
w(f"| 开始时间 | {EXEC_STARTED} |")
w(f"| 结束时间 | {EXEC_FINISHED} |")
w(f"| 总时长 | {fmt_duration(EXEC_DURATION_MS / 1000)} ({EXEC_DURATION_MS}ms) |")
w(f"| 退出码 | {EXEC_EXIT_CODE} |")
w(f"| 最终状态 | `{EXEC_STATUS}` |")
w(f"| 任务成功 | {success_count} / {success_count + failed_count} |")
w(f"| 任务失败 | {failed_count} |")
w()
# ── Performance report ──
def _clock(moment) -> str:
    """Render a datetime as HH:MM:SS, or "?" when the value is missing."""
    return moment.strftime("%H:%M:%S") if moment else "?"


w("## 性能报告")
w()
w("### 各层耗时汇总")
w()
w("| 层 | 任务数 | 总耗时 | 平均耗时 |")
w("|-----|-------|--------|---------|")
ods_count = len(ods_tasks)
dwd_count = len(dwd_tasks)
dws_count = len(dws_tasks)
for layer_label, layer_count, layer_total in (
    ("ODS", ods_count, ods_total),
    ("DWD", dwd_count, dwd_total),
    ("DWS+INDEX", dws_count, dws_total),
):
    # max(count, 1) avoids a division by zero for an empty layer.
    layer_avg = layer_total / max(layer_count, 1)
    w(f"| {layer_label} | {layer_count} | {fmt_duration(layer_total)} | {fmt_duration(layer_avg)} |")
w()

w("### Top-5 耗时任务")
w()
w("| 排名 | 任务 | 耗时 | 状态 |")
w("|------|------|------|------|")
for rank, (task, info) in enumerate(top5, start=1):
    w(f"| {rank} | `{task}` | {fmt_duration(info['duration_s'])} | {info['status']} |")
w()

w("### ODS 层各任务耗时明细")
w()
w("| 任务 | 开始 | 结束 | 耗时 | 记录数 |")
w("|------|------|------|------|--------|")
# Pull the fetched-record count out of each ODS completion line;
# a later completion line for the same task overwrites the earlier value.
_FETCHED_RE = re.compile(r"(\w+) ODS 任务完成: \{'fetched': (\d+)")
fetch_counts: dict[str, int] = {}
for line in lines:
    hit = _FETCHED_RE.search(line)
    if hit:
        fetch_counts[hit.group(1)] = int(hit.group(2))
for task in sorted(ods_tasks):
    info = ods_tasks[task]
    fetched = fetch_counts.get(task, "?")
    w(
        f"| `{task}` | {_clock(info['start'])} | {_clock(info['end'])} "
        f"| {fmt_duration(info['duration_s'])} | {fetched} |"
    )
w()

w("### DWD + DWS 层各任务耗时明细")
w()
w("| 任务 | 开始 | 结束 | 耗时 | 状态 |")
w("|------|------|------|------|------|")
# DWD_ and DWS_ task codes cannot collide, so concatenating the keys is safe.
for task in sorted([*dwd_tasks, *dws_tasks]):
    info = task_timings[task]
    w(
        f"| `{task}` | {_clock(info['start'])} | {_clock(info['end'])} "
        f"| {fmt_duration(info['duration_s'])} | {info['status']} |"
    )
w()
# ── DEBUG report ──
w("## DEBUG 报告")
w()
if not errors and not warnings:
    w("无错误或警告。")
else:
    # Fixed: the two counts were emitted back-to-back with no separator.
    w(f"共发现 **{len(errors)}** 个 ERROR,**{len(warnings)}** 个 WARNING。")
    w()
    # The classification table and the root-cause narrative only make sense
    # when at least one ERROR was collected; skip them for warning-only logs.
    if errors:
        w("### 错误分类汇总")
        w()
        w("| 类别 | 数量 | 说明 |")
        w("|------|------|------|")
        # Description per known category; unknown categories fall back to "未分类".
        category_descriptions = {
            "任务未注册": "日志报 `未知的任务类型`,对应任务代码未注册",
            "FDW 表缺失(根因)": "`fdw_app.member_birthday_manual` 关系不存在",
            "事务级联失败": "根因错误导致事务终止,后续 DWS 任务全部 `InFailedSqlTransaction`",
        }
        for cat, errs in error_categories.items():
            desc = category_descriptions.get(cat, "未分类")
            w(f"| {cat} | {len(errs)} | {desc} |")
        w()
        # NOTE(review): the narrative below is hard-coded for one specific run
        # (time, tasks, advice) and is not derived from the parsed log.
        w("### 错误详情")
        w()
        # Fixed: missing colon between the error number and its title.
        w("#### 错误 1: FDW 表缺失导致 DWS 级联失败(根因)")
        w()
        w("- 时间: 02:50:36")
        w("- 根因: `UndefinedTable: 关系 \"fdw_app.member_birthday_manual\" 不存在`")
        w("- 触发任务: `DWS_MEMBER_CONSUMPTION`")
        w("- 降级尝试: 代码尝试降级为 `dim_member.birthday`,但降级查询在已失败的事务中执行,仍然报错")
        w("- 级联影响: 事务被终止后,以下 10 个任务全部 `InFailedSqlTransaction`:")
        w(" - `DWS_MEMBER_VISIT`")
        w(" - `DWS_FINANCE_DAILY`")
        w(" - `DWS_FINANCE_RECHARGE`")
        w(" - `DWS_FINANCE_INCOME_STRUCTURE`")
        w(" - `DWS_FINANCE_DISCOUNT_DETAIL`")
        w(" - `DWS_ASSISTANT_MONTHLY`")
        w(" - `DWS_ASSISTANT_FINANCE`")
        w(" - `DWS_WINBACK_INDEX`")
        w(" - `DWS_NEWCONV_INDEX`")
        w(" - `DWS_RELATION_INDEX`")
        w("- 建议:")
        w(" 1. 在 `zqyy_app` 数据库中创建 `member_birthday_manual` 表(或对应的 FDW 映射)")
        w(" 2. 或修改 `DWS_MEMBER_CONSUMPTION` 的降级逻辑,在 FDW 失败时先 ROLLBACK 再重试降级查询")
        w(" 3. 考虑为 DWS 任务使用独立事务/连接,避免单任务失败导致级联")
        w()
    if warnings:
        w("### 警告详情")
        w()
        for warn in warnings:
            w(f"- `{warn['line']}`")
        w()
# ── Black-box test report placeholder ──
# Heading plus a note that the section will be appended after the consistency check.
for placeholder_line in ("## 黑盒测试报告", "", "_将在一致性检查完成后追加_", ""):
    w(placeholder_line)
# ── Write the report ──
# The target directory is created on demand; the file name carries today's date.
output_dir = Path(SYSTEM_LOG_ROOT)
output_dir.mkdir(parents=True, exist_ok=True)
date_str = f"{datetime.now():%Y-%m-%d}"
output_path = output_dir.joinpath(f"{date_str}__etl_integration_report.md")
report_content = "\n".join(report_lines)
output_path.write_text(report_content, encoding="utf-8")

# Console summary for the operator.
print(f"报告已生成: {output_path}")
print(f"任务统计: {success_count} 成功 / {failed_count} 失败 / {success_count + failed_count} 总计")
print(f"错误数: {len(errors)}, 警告数: {len(warnings)}")