"""从 API 获取 ETL 执行日志,提取精细计时数据。一次性运维脚本。""" import requests, json, re, sys, os from datetime import datetime from collections import defaultdict, OrderedDict from pathlib import Path from dotenv import load_dotenv load_dotenv(Path(__file__).resolve().parents[2] / ".env") # 环境变量必须存在 SYSTEM_LOG_ROOT = os.environ.get("SYSTEM_LOG_ROOT") if not SYSTEM_LOG_ROOT: raise RuntimeError("SYSTEM_LOG_ROOT 环境变量未设置") EXEC_ID = "969c3195-5fea-4f72-873f-18cf75960c64" BASE_URL = "http://localhost:8000" def get_token(): r = requests.post(f"{BASE_URL}/api/auth/login", json={"username": "admin", "password": "admin123"}, timeout=10) r.raise_for_status() return r.json().get("access_token") or r.json().get("token") def get_logs(token): headers = {"Authorization": f"Bearer {token}"} r = requests.get(f"{BASE_URL}/api/execution/{EXEC_ID}/logs", headers=headers, timeout=60) r.raise_for_status() data = r.json() return data.get("error_log", "") or data.get("output_log", "") TS_RE = re.compile(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]') # 任务开始模式(覆盖 ODS / DWD / DWS / INDEX 各种写法) TASK_START_PATTERNS = [ re.compile(r'开始执行(\w+)\s+\(ODS\)'), # ODS: 开始执行ODS_XXX (ODS) re.compile(r'(\w+): 抓取阶段开始'), # DWD/DWS: XXX: 抓取阶段开始 re.compile(r'(\w+): 开始执行工具类任务'), # DWS 工具类: XXX: 开始执行工具类任务 re.compile(r'(\w+): 本地清洗入库开始'), # DWD fallback ] # 任务完成模式 TASK_DONE_PATTERNS = [ re.compile(r'(\w+) ODS 任务完成'), # ODS: XXX ODS 任务完成 re.compile(r'(\w+): 完成,统计='), # DWD/DWS: XXX: 完成,统计= re.compile(r'(\w+): 工具类任务执行成功'), # DWS 工具类 re.compile(r'(\w+): 结果统计:'), # DWS fallback ] # 任务失败模式 TASK_FAIL_RE = re.compile(r'任务\s+(\w+)\s+失败') # 窗口切片模式(全局 Flow 级别的切片,不是单任务内部切片) FLOW_SLICE_RE = re.compile(r'处理窗口切片\s*(\d+)/(\d+)') # 已知任务名前缀 KNOWN_PREFIXES = {"ODS_", "DWD_", "DWS_"} def classify_task(name): if name.startswith("ODS_"): return "ODS" elif name.startswith("DWD_"): return "DWD" elif name in ("DWS_WINBACK_INDEX", "DWS_NEWCONV_INDEX", "DWS_RELATION_INDEX", "DWS_SPENDING_POWER_INDEX"): return "INDEX" elif name.startswith("DWS_"): return "DWS" return "OTHER" def parse_ts(line): m = TS_RE.search(line) return datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S") if m else None def parse_timing(log_text): lines = log_text.split("\n") # 任务级计时(保持插入顺序) tasks = OrderedDict() # name -> {start, end, status, stage} global_start = None global_end = None # 已处理的任务名(避免重复匹配) seen_starts = set() for line in lines: ts = parse_ts(line) if ts: if global_start is None: global_start = ts global_end = ts # 任务开始 for pat in TASK_START_PATTERNS: m = pat.search(line) if m and ts: tname = m.group(1) # 只处理已知 ETL 任务名 if not any(tname.startswith(p) for p in KNOWN_PREFIXES): continue if tname not in seen_starts: seen_starts.add(tname) tasks[tname] = { "start": ts, "end": None, "status": "running", "stage": classify_task(tname), } break # 任务完成 for pat in TASK_DONE_PATTERNS: m = pat.search(line) if m and ts: tname = m.group(1) if tname in tasks and tasks[tname]["status"] == "running": tasks[tname]["end"] = ts tasks[tname]["status"] = "success" break # 任务失败 m = TASK_FAIL_RE.search(line) if m and ts: tname = m.group(1) if tname in tasks: tasks[tname]["end"] = ts tasks[tname]["status"] = "failed" # 计算耗时 for info in tasks.values(): if info["start"] and info["end"]: info["duration"] = (info["end"] - info["start"]).total_seconds() else: info["duration"] = None # 阶段汇总 stages = {} for stage_name in ["ODS", "DWD", "DWS", "INDEX"]: stage_tasks = [(n, t) for n, t in tasks.items() if t["stage"] == stage_name] if not stage_tasks: continue starts = [t["start"] for _, t in stage_tasks if t["start"]] ends = [t["end"] for _, t in stage_tasks if t["end"]] stages[stage_name] = { "start": min(starts) if starts else None, "end": max(ends) if ends else None, "task_count": len(stage_tasks), "success": sum(1 for _, t in stage_tasks if t["status"] == "success"), "failed": sum(1 for _, t in stage_tasks if t["status"] == "failed"), } if stages[stage_name]["start"] and stages[stage_name]["end"]: stages[stage_name]["duration"] = ( stages[stage_name]["end"] - stages[stage_name]["start"] ).total_seconds() else: stages[stage_name]["duration"] = None return { "global_start": global_start, "global_end": global_end, "total_duration": (global_end - global_start).total_seconds() if global_start and global_end else 0, "tasks": tasks, "stages": stages, } def fmt_dur(seconds): if seconds is None: return "N/A" m, s = divmod(int(seconds), 60) h, m = divmod(m, 60) return f"{h}h{m:02d}m{s:02d}s" if h > 0 else f"{m}m{s:02d}s" def main(): print("=== ETL 执行日志计时分析 ===\n") token = get_token() print("✓ JWT Token") log_text = get_logs(token) print(f"✓ 日志 {len(log_text)} 字符\n") timing = parse_timing(log_text) # 全局 print(f"执行: {timing['global_start']} ~ {timing['global_end']}") print(f"总耗时: {fmt_dur(timing['total_duration'])}\n") # 阶段 print("--- 阶段计时 ---") for stage in ["ODS", "DWD", "DWS", "INDEX"]: info = timing["stages"].get(stage) if info: print(f" {stage}: {fmt_dur(info['duration'])} " f"({info['success']}/{info['task_count']} 成功" f"{', ' + str(info['failed']) + ' 失败' if info['failed'] else ''})") print() # Top-5 ranked = sorted( [(n, t) for n, t in timing["tasks"].items() if t["duration"] is not None], key=lambda x: x[1]["duration"], reverse=True, ) print("--- Top-5 耗时任务 ---") for i, (name, info) in enumerate(ranked[:5]): print(f" {i+1}. {name}: {fmt_dur(info['duration'])} [{info['status']}]") print() # 全部任务 print(f"--- 全部任务 ({len(timing['tasks'])} 个) ---") for name, info in timing["tasks"].items(): status_icon = "✓" if info["status"] == "success" else "✗" if info["status"] == "failed" else "?" print(f" {status_icon} {name}: {fmt_dur(info['duration'])} [{info['stage']}]") # 输出 JSON output = { "execution_id": EXEC_ID, "global_start": str(timing["global_start"]), "global_end": str(timing["global_end"]), "total_duration_sec": timing["total_duration"], "total_duration_fmt": fmt_dur(timing["total_duration"]), "stages": {}, "top5": [], "all_tasks": {}, } for stage, info in timing["stages"].items(): output["stages"][stage] = { "duration_sec": info["duration"], "duration_fmt": fmt_dur(info["duration"]), "task_count": info["task_count"], "success": info["success"], "failed": info["failed"], } for name, info in ranked[:5]: output["top5"].append({ "task": name, "stage": info["stage"], "duration_sec": info["duration"], "duration_fmt": fmt_dur(info["duration"]), "status": info["status"], }) for name, info in timing["tasks"].items(): output["all_tasks"][name] = { "stage": info["stage"], "duration_sec": info["duration"], "duration_fmt": fmt_dur(info["duration"]), "status": info["status"], } out_path = Path(SYSTEM_LOG_ROOT) / "etl_timing_data.json" out_path.parent.mkdir(parents=True, exist_ok=True) out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8") print(f"\n✓ 计时数据 → {out_path}") if __name__ == "__main__": main()