"""ETL 全流程联调报告生成脚本 解析日志文件,提取计时数据、错误/警告,生成综合报告。 输出到 SYSTEM_LOG_ROOT 环境变量指定的目录。 """ import os import re import sys from pathlib import Path from datetime import datetime from collections import defaultdict from dotenv import load_dotenv # 加载根 .env load_dotenv(Path(__file__).resolve().parents[2] / ".env") SYSTEM_LOG_ROOT = os.environ.get("SYSTEM_LOG_ROOT") if not SYSTEM_LOG_ROOT: raise RuntimeError("SYSTEM_LOG_ROOT 环境变量未设置") LOG_FILE = Path(r"C:\NeoZQYY\export\ETL-Connectors\feiqiu\LOGS\2681a85399e64c76a040163f956e1907.log") if not LOG_FILE.exists(): raise FileNotFoundError(f"日志文件不存在: {LOG_FILE}") # ── 解析日志 ────────────────────────────────────────────── TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] [\w.]+: (.+)$") TASK_START_RE = re.compile(r"^(\S+): 抓取阶段开始") TASK_TOOL_START_RE = re.compile(r"^(\S+): 开始执行工具类任务") TASK_DONE_RE = re.compile(r"^(\S+): 完成,统计=(.+)$") TASK_TOOL_DONE_RE = re.compile(r"^(\S+): 工具类任务执行成功$") WINDOW_START_RE = re.compile(r"^(\S+): 开始执行\((\d+)/(\d+)\),窗口\[(.+)\]$") WINDOW_DONE_RE = re.compile(r"^(\S+): 完成\((\d+)/(\d+)\),已处理") DWD_LOAD_RE = re.compile(r"^DWD_LOAD_FROM_ODS: (.+)$") lines = LOG_FILE.read_text(encoding="utf-8", errors="replace").splitlines() errors = [] warnings = [] task_timings = {} # task_name -> {start, end, stats} window_timings = defaultdict(list) # task_name -> [(slice_idx, start_ts, end_ts)] first_ts = None last_ts = None all_tasks_done_ts = None for line in lines: m = TS_RE.match(line) if not m: continue ts_str, level, msg = m.groups() ts = datetime.strptime(ts_str, "%Y-%m-%d %H:%M:%S") if first_ts is None: first_ts = ts last_ts = ts if level == "ERROR" or level == "CRITICAL": errors.append((ts_str, msg)) elif level == "WARNING": warnings.append((ts_str, msg)) if msg.strip() == "所有任务执行完成": all_tasks_done_ts = ts # 任务开始 sm = TASK_START_RE.match(msg) if sm: tname = sm.group(1) if tname not in task_timings: task_timings[tname] = {"start": ts, "end": None, "stats": None} else: task_timings[tname]["start"] = ts sm2 = TASK_TOOL_START_RE.match(msg) if sm2: tname = sm2.group(1) task_timings[tname] = {"start": ts, "end": None, "stats": None} # 任务完成 dm = TASK_DONE_RE.match(msg) if dm: tname = dm.group(1) if tname in task_timings: task_timings[tname]["end"] = ts task_timings[tname]["stats"] = dm.group(2) dm2 = TASK_TOOL_DONE_RE.match(msg) if dm2: tname = dm2.group(1) if tname in task_timings: task_timings[tname]["end"] = ts # 窗口切片 wm = WINDOW_START_RE.match(msg) if wm: tname, idx = wm.group(1), int(wm.group(2)) window_timings[tname].append({"idx": idx, "total": int(wm.group(3)), "start": ts, "end": None}) wd = WINDOW_DONE_RE.match(msg) if wd: tname, idx = wd.group(1), int(wd.group(2)) for w in window_timings[tname]: if w["idx"] == idx and w["end"] is None: w["end"] = ts break # DWD_LOAD_FROM_ODS 特殊处理 — 从日志中找开始/结束 for line in lines: m = TS_RE.match(line) if not m: continue ts_str, level, msg = m.groups() ts = datetime.strptime(ts_str, "%Y-%m-%d %H:%M:%S") if "DWD_LOAD_FROM_ODS" in msg and "DWD_LOAD_FROM_ODS" not in task_timings: task_timings["DWD_LOAD_FROM_ODS"] = {"start": ts, "end": None, "stats": None} if "DWD_LOAD_FROM_ODS" in msg: task_timings.setdefault("DWD_LOAD_FROM_ODS", {"start": ts, "end": None, "stats": None}) task_timings["DWD_LOAD_FROM_ODS"]["end"] = ts # ── 分层统计 ────────────────────────────────────────────── def classify_layer(name): if name.startswith("ODS_"): return "ODS" elif name.startswith("DWD_"): return "DWD" elif name.startswith("DWS_"): if "INDEX" in name: return "INDEX" return "DWS" return "OTHER" layer_tasks = defaultdict(list) for tname, info in task_timings.items(): layer_tasks[classify_layer(tname)].append((tname, info)) # ── 生成报告 ────────────────────────────────────────────── total_duration = (last_ts - first_ts).total_seconds() if first_ts and last_ts else 0 total_min = total_duration / 60 out_dir = Path(SYSTEM_LOG_ROOT) out_dir.mkdir(parents=True, exist_ok=True) report_path = out_dir / "20260227__etl_integration_report.md" rpt = [] rpt.append("# ETL 全流程联调报告") rpt.append("") rpt.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") rpt.append(f"日志文件: `{LOG_FILE}`") rpt.append("") # ── 执行概要 ── rpt.append("## 执行概要") rpt.append("") rpt.append("| 项目 | 值 |") rpt.append("|------|-----|") rpt.append(f"| Flow | `api_full` |") rpt.append(f"| 处理模式 | `full_window` |") rpt.append(f"| 时间窗口 | 2025-11-01 ~ 2026-02-27 |") rpt.append(f"| 窗口切分 | 30 天 |") rpt.append(f"| force_full | ✅ |") rpt.append(f"| 开始时间 | {first_ts.strftime('%Y-%m-%d %H:%M:%S') if first_ts else 'N/A'} |") rpt.append(f"| 结束时间 | {last_ts.strftime('%Y-%m-%d %H:%M:%S') if last_ts else 'N/A'} |") rpt.append(f"| 总耗时 | {total_min:.1f} 分钟 ({total_duration:.0f} 秒) |") rpt.append(f"| 任务总数 | {len(task_timings)} |") rpt.append(f"| 错误数 | {len(errors)} |") rpt.append(f"| 警告数 | {len(warnings)} |") rpt.append(f"| 最终状态 | {'✅ 成功' if all_tasks_done_ts else '❌ 未检测到完成标记'} |") rpt.append("") # ── 性能报告 ── rpt.append("## 性能报告") rpt.append("") # 按层汇总 for layer in ["ODS", "DWD", "DWS", "INDEX"]: tasks = layer_tasks.get(layer, []) if not tasks: continue rpt.append(f"### {layer} 层({len(tasks)} 个任务)") rpt.append("") rpt.append("| 任务 | 开始 | 结束 | 耗时(秒) | 统计 |") rpt.append("|------|------|------|---------|------|") for tname, info in sorted(tasks, key=lambda x: x[1]["start"] if x[1]["start"] else datetime.min): s = info["start"].strftime("%H:%M:%S") if info["start"] else "-" e = info["end"].strftime("%H:%M:%S") if info["end"] else "-" dur = "" if info["start"] and info["end"]: dur = f"{(info['end'] - info['start']).total_seconds():.0f}" stats_short = "" if info["stats"]: # 提取 inserted/deleted 数字 ins_m = re.search(r"'inserted': (\d+)", info["stats"]) del_m = re.search(r"'deleted': (\d+)", info["stats"]) err_m = re.search(r"'errors': (\d+)", info["stats"]) parts = [] if ins_m: parts.append(f"ins={ins_m.group(1)}") if del_m: parts.append(f"del={del_m.group(1)}") if err_m and int(err_m.group(1)) > 0: parts.append(f"err={err_m.group(1)}") stats_short = ", ".join(parts) rpt.append(f"| {tname} | {s} | {e} | {dur} | {stats_short} |") rpt.append("") # Top-5 耗时 rpt.append("### Top-5 耗时任务") rpt.append("") ranked = [] for tname, info in task_timings.items(): if info["start"] and info["end"]: dur = (info["end"] - info["start"]).total_seconds() ranked.append((tname, dur)) ranked.sort(key=lambda x: -x[1]) rpt.append("| 排名 | 任务 | 耗时(秒) | 耗时(分) |") rpt.append("|------|------|---------|---------|") for i, (tname, dur) in enumerate(ranked[:5], 1): rpt.append(f"| {i} | {tname} | {dur:.0f} | {dur/60:.1f} |") rpt.append("") # 窗口切片耗时 has_windows = any(len(v) > 0 for v in window_timings.values()) if has_windows: rpt.append("### 窗口切片耗时(部分任务)") rpt.append("") for tname in sorted(window_timings.keys()): slices = window_timings[tname] if len(slices) <= 1: continue rpt.append(f"**{tname}** ({len(slices)} 个切片)") rpt.append("") rpt.append("| 切片 | 开始 | 结束 | 耗时(秒) |") rpt.append("|------|------|------|---------|") for w in slices: s = w["start"].strftime("%H:%M:%S") e = w["end"].strftime("%H:%M:%S") if w["end"] else "-" dur = f"{(w['end'] - w['start']).total_seconds():.0f}" if w["end"] else "-" rpt.append(f"| {w['idx']}/{w['total']} | {s} | {e} | {dur} |") rpt.append("") # ── DEBUG 报告 ── rpt.append("## DEBUG 报告") rpt.append("") if errors: rpt.append(f"### 错误({len(errors)} 条)") rpt.append("") for ts_str, msg in errors: rpt.append(f"- `{ts_str}` {msg}") rpt.append("") else: rpt.append("### 错误") rpt.append("") rpt.append("无错误。") rpt.append("") if warnings: rpt.append(f"### 警告({len(warnings)} 条)") rpt.append("") rpt.append("
") rpt.append("展开查看全部警告") rpt.append("") for ts_str, msg in warnings: rpt.append(f"- `{ts_str}` {msg}") rpt.append("") rpt.append("
") rpt.append("") else: rpt.append("### 警告") rpt.append("") rpt.append("无警告。") rpt.append("") # ── 黑盒测试报告占位 ── rpt.append("## 黑盒测试报告") rpt.append("") rpt.append("(待 Task 5.3 追加)") rpt.append("") report_text = "\n".join(rpt) report_path.write_text(report_text, encoding="utf-8") print(f"报告已生成: {report_path}") print(f"总耗时: {total_min:.1f} 分钟") print(f"任务数: {len(task_timings)}") print(f"错误: {len(errors)}, 警告: {len(warnings)}")