""" 从 API 获取 ETL 执行日志,提取精细计时数据。 Task 4.1: 运维联调 - 性能计时分析 策略: - 所有任务都使用统一的切片格式 "XXX: 开始执行(n/N)" / "XXX: 完成(n/N)" - 根据任务名前缀分类到 ODS / DWD / DWS / INDEX 阶段 - 工具类任务使用 "开始执行工具类任务" / "工具类任务执行成功" - 失败任务使用 "任务 XXX 失败:" 格式 """ import os import sys import re import requests from datetime import datetime from collections import OrderedDict from dotenv import load_dotenv # 加载环境变量 load_dotenv(os.path.join(os.path.dirname(__file__), '..', '..', '.env')) BACKEND_URL = "http://localhost:8000" EXECUTION_ID = "41938155-db8c-4eec-9b81-9e5aef42fb8a" # 任务分层映射(根据 design.md 中的任务列表) INDEX_TASKS = {"DWS_WINBACK_INDEX", "DWS_NEWCONV_INDEX", "DWS_RELATION_INDEX"} DWD_TASKS = {"DWD_LOAD_FROM_ODS"} # DWS 任务 = 以 DWS_ 开头但不在 INDEX_TASKS 中的 # ODS 任务 = 以 ODS_ 开头的 def classify_task(task_name): """根据任务名分类到阶段""" if task_name in INDEX_TASKS: return "INDEX" if task_name in DWD_TASKS: return "DWD" if task_name.startswith("DWS_"): return "DWS" if task_name.startswith("ODS_"): return "ODS" return "OTHER" def login(): resp = requests.post(f"{BACKEND_URL}/api/auth/login", json={"username": "admin", "password": "admin123"}) resp.raise_for_status() return resp.json()["access_token"] def get_logs(token): headers = {"Authorization": f"Bearer {token}"} resp = requests.get(f"{BACKEND_URL}/api/execution/{EXECUTION_ID}/logs", headers=headers) resp.raise_for_status() data = resp.json() return data.get("error_log", "") or data.get("output_log", "") def parse_timestamp(ts_str): try: return datetime.strptime(ts_str, "%Y-%m-%d %H:%M:%S") except ValueError: return None def extract_timing_data(log_text): """从日志提取所有任务的计时数据""" lines = log_text.split('\n') ts_re = re.compile(r'^\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]') # 切片开始: "XXX: 开始执行(1/4),窗口[...]" # 也匹配: "XXX: 开始执行(1/4), 窗口[...]"(英文逗号) slice_start_re = re.compile( r'(\w+): 开始执行\((\d+)/(\d+)\)[,,]\s*窗口\[([^\]]+)\]') # 切片完成: "XXX: 完成(1/4),已处理 30/111.17 天" slice_end_re = re.compile( r'(\w+): 完成\((\d+)/(\d+)\)[,,]\s*已处理') # 工具类任务开始: "XXX: 开始执行工具类任务" util_start_re = re.compile(r'(\w+): 开始执行工具类任务') # 工具类任务完成: "XXX: 工具类任务执行成功" util_end_re = re.compile(r'(\w+): 工具类任务执行(成功|失败)') # 任务失败: "任务 XXX 失败:" task_fail_re = re.compile(r'任务 (\w+) 失败') # 任务未启用: "任务 XXX 未启用或不存在" task_skip_re = re.compile(r'任务 (\w+) 未启用或不存在') # 抓取阶段开始(非工具类 DWS 任务的首条日志) fetch_start_re = re.compile(r'(\w+): (抓取阶段开始|ODS fetch\+load start)') # 任务完成统计: "XXX: 完成,统计=" 或 "XXX: 完成, 统计=" task_done_re = re.compile(r'(\w+): 完成[,,]\s*统计=') # ODS 任务完成: "ODS_XXX ODS 任务完成:" ods_done_re = re.compile(r'(\w+) ODS 任务完成') # Flow 结束 flow_end_re = re.compile(r'所有任务执行完成|Flow 执行完成') flow_start_re = re.compile(r'开始执行 Flow: api_full|FLOW_API_FULL.*开始执行') # 数据结构: task_name -> {layer, start, end, status, slices: [...]} tasks = OrderedDict() flow_start = None flow_end = None # 跟踪切片开始时间 pending_slices = {} # (task_name, slice_idx) -> (ts, window) for line in lines: m = ts_re.match(line) if not m: continue ts = parse_timestamp(m.group(1)) if not ts: continue content = line[m.end():].strip() # 去掉日志级别前缀 content = re.sub( r'^(INFO|WARNING|ERROR|DEBUG|CRITICAL)\s*\|\s*\w+\s*\|\s*', '', content) # Flow 开始 if flow_start_re.search(content): if flow_start is None: flow_start = ts # Flow 结束 if flow_end_re.search(content): flow_end = ts # 切片开始 sm = slice_start_re.search(content) if sm: task_name = sm.group(1) idx = int(sm.group(2)) total = int(sm.group(3)) window = sm.group(4) layer = classify_task(task_name) if task_name not in tasks: tasks[task_name] = { 'layer': layer, 'start': ts, 'end': None, 'status': 'running', 'total_slices': total, 'slices': [] } pending_slices[(task_name, idx)] = (ts, window) continue # 切片完成 sm = slice_end_re.search(content) if sm: task_name = sm.group(1) idx = int(sm.group(2)) key = (task_name, idx) if key in pending_slices: start_ts, window = pending_slices.pop(key) dur = (ts - start_ts).total_seconds() if task_name in tasks: tasks[task_name]['slices'].append({ 'idx': idx, 'window': window, 'start': start_ts, 'end': ts, 'duration_sec': dur }) tasks[task_name]['end'] = ts continue # 工具类任务开始 sm = util_start_re.search(content) if sm: task_name = sm.group(1) layer = classify_task(task_name) if task_name not in tasks: tasks[task_name] = { 'layer': layer, 'start': ts, 'end': None, 'status': 'running', 'total_slices': 0, 'slices': [] } else: tasks[task_name]['start'] = ts continue # 工具类任务完成 sm = util_end_re.search(content) if sm: task_name = sm.group(1) success = sm.group(2) == '成功' if task_name in tasks: tasks[task_name]['end'] = ts tasks[task_name]['status'] = 'success' if success else 'failed' continue # 抓取阶段开始(作为任务首次出现的标记) sm = fetch_start_re.search(content) if sm: task_name = sm.group(1) layer = classify_task(task_name) if task_name not in tasks: tasks[task_name] = { 'layer': layer, 'start': ts, 'end': None, 'status': 'running', 'total_slices': 0, 'slices': [] } continue # 任务完成统计: "XXX: 完成,统计=" 格式 sm = task_done_re.search(content) if sm: task_name = sm.group(1) if task_name in tasks: tasks[task_name]['end'] = ts if tasks[task_name]['status'] == 'running': tasks[task_name]['status'] = 'success' continue # ODS 任务完成: "ODS_XXX ODS 任务完成:" 格式 sm = ods_done_re.search(content) if sm: task_name = sm.group(1) if task_name in tasks: tasks[task_name]['end'] = ts if tasks[task_name]['status'] == 'running': tasks[task_name]['status'] = 'success' continue # 任务未启用/跳过 sm = task_skip_re.search(content) if sm: task_name = sm.group(1) layer = classify_task(task_name) tasks[task_name] = { 'layer': layer, 'start': ts, 'end': ts, 'status': 'skipped', 'total_slices': 0, 'slices': [] } continue # 任务失败 sm = task_fail_re.search(content) if sm: task_name = sm.group(1) if task_name in tasks: tasks[task_name]['end'] = ts tasks[task_name]['status'] = 'failed' else: layer = classify_task(task_name) tasks[task_name] = { 'layer': layer, 'start': ts, 'end': ts, 'status': 'failed', 'total_slices': 0, 'slices': [] } # 计算每个任务的总耗时 for name, t in tasks.items(): if t['slices']: t['total_duration_sec'] = sum( s['duration_sec'] for s in t['slices']) # 如果所有切片都完成了,标记为成功 if t['status'] == 'running' and t.get('total_slices', 0) > 0: if len(t['slices']) >= t['total_slices']: t['status'] = 'success' # ODS 任务:切片全部完成即视为成功(total_slices 可能为 0 因为日志格式) if t['status'] == 'running' and t['layer'] == 'ODS' and len(t['slices']) == 4: t['status'] = 'success' elif t['start'] and t['end']: t['total_duration_sec'] = (t['end'] - t['start']).total_seconds() else: t['total_duration_sec'] = 0 return {'flow_start': flow_start, 'flow_end': flow_end, 'tasks': tasks} def fmt_dur(seconds): if seconds is None or seconds == 0: return "<1s" if seconds < 60: return f"{seconds:.0f}s" m = int(seconds // 60) s = seconds % 60 return f"{m}m{s:.0f}s" def fmt_ts(ts): return ts.strftime('%H:%M:%S') if ts else 'N/A' def generate_report(data): """生成 Markdown 计时报告""" out = [] w = out.append w("# ETL 执行精细计时报告\n") w(f"- execution_id: `{EXECUTION_ID}`") w(f"- Flow 开始: {data['flow_start']}") w(f"- Flow 结束: {data['flow_end']}") if data['flow_start'] and data['flow_end']: total = (data['flow_end'] - data['flow_start']).total_seconds() w(f"- 总耗时: {fmt_dur(total)}") w("") # 按阶段分组 layers = OrderedDict([('ODS', []), ('DWD', []), ('DWS', []), ('INDEX', [])]) for name, t in data['tasks'].items(): layer = t['layer'] if layer in layers: layers[layer].append((name, t)) all_durations = [] # (name, dur, layer) for Top-5 layer_totals = {} for layer, task_list in layers.items(): w(f"## {layer} 阶段\n") if not task_list: w(f"未检测到 {layer} 任务计时数据\n") layer_totals[layer] = 0 continue # 按耗时降序 task_list.sort(key=lambda x: x[1]['total_duration_sec'], reverse=True) # 区分成功、失败、跳过 success_tasks = [(n, t) for n, t in task_list if t['status'] not in ('failed', 'skipped')] failed_tasks = [(n, t) for n, t in task_list if t['status'] == 'failed'] skipped_tasks = [(n, t) for n, t in task_list if t['status'] == 'skipped'] layer_dur = 0 if success_tasks: w("| 任务 | 切片数 | 总耗时 | 开始 | 结束 | 状态 |") w("|------|--------|--------|------|------|------|") for name, t in success_tasks: dur = t['total_duration_sec'] layer_dur += dur sc = len(t['slices']) or t.get('total_slices', 0) status = '✅' if t['status'] == 'success' else '⏳' w(f"| {name} | {sc} | {fmt_dur(dur)} | {fmt_ts(t['start'])} | {fmt_ts(t['end'])} | {status} |") all_durations.append((name, dur, layer)) w("") if failed_tasks: w(f"### {layer} 失败任务\n") w("| 任务 | 失败时间 | 状态 |") w("|------|---------|------|") for name, t in failed_tasks: w(f"| {name} | {fmt_ts(t['end'])} | ❌ 失败 |") all_durations.append((name, 0, layer)) w("") if skipped_tasks: w(f"### {layer} 跳过任务\n") for name, t in skipped_tasks: w(f"- {name}(未启用)") w("") # 阶段总耗时 = 从第一个任务开始到最后一个任务结束 all_starts = [t['start'] for _, t in task_list if t['start']] all_ends = [t['end'] for _, t in task_list if t['end']] if all_starts and all_ends: stage_wall = (max(all_ends) - min(all_starts)).total_seconds() w(f"{layer} 阶段墙钟耗时(从首个任务开始到末个任务结束): **{fmt_dur(stage_wall)}**") w(f"{layer} 阶段任务累计耗时: **{fmt_dur(layer_dur)}**") layer_totals[layer] = stage_wall else: layer_totals[layer] = 0 w("") # 切片详情(仅展示有切片且耗时 > 5s 的任务) tasks_with_slices = [(n, t) for n, t in task_list if t['slices'] and t['total_duration_sec'] > 5] if tasks_with_slices: w(f"### {layer} 窗口切片详情\n") for name, t in tasks_with_slices: w(f"#### {name}\n") w("| 切片 | 窗口 | 耗时 | 开始 | 结束 |") w("|------|------|------|------|------|") for s in sorted(t['slices'], key=lambda x: x['idx']): total_s = t.get('total_slices', '?') win_short = s['window'].replace('+08:00', '') w(f"| {s['idx']}/{total_s} | {win_short} | {fmt_dur(s['duration_sec'])} | {fmt_ts(s['start'])} | {fmt_ts(s['end'])} |") w("") # 阶段汇总 w("## 阶段耗时汇总\n") w("| 阶段 | 墙钟耗时 | 占比 |") w("|------|---------|------|") total_wall = sum(layer_totals.values()) for layer in ['ODS', 'DWD', 'DWS', 'INDEX']: dur = layer_totals.get(layer, 0) pct = dur / total_wall * 100 if total_wall > 0 else 0 w(f"| {layer} | {fmt_dur(dur)} | {pct:.1f}% |") w(f"| **合计** | **{fmt_dur(total_wall)}** | **100%** |") w("") # Top-5 w("## Top-5 耗时最长的任务\n") top5 = sorted(all_durations, key=lambda x: x[1], reverse=True)[:5] w("| 排名 | 任务 | 阶段 | 耗时 |") w("|------|------|------|------|") for i, (name, dur, layer) in enumerate(top5, 1): w(f"| {i} | {name} | {layer} | {fmt_dur(dur)} |") w("") # 任务执行总结 w("## 执行统计\n") total_tasks = len(data['tasks']) success = sum(1 for t in data['tasks'].values() if t['status'] == 'success') failed = sum(1 for t in data['tasks'].values() if t['status'] == 'failed') skipped = sum(1 for t in data['tasks'].values() if t['status'] == 'skipped') running = sum(1 for t in data['tasks'].values() if t['status'] == 'running') w(f"- 总任务数: {total_tasks}") w(f"- 成功: {success}") w(f"- 失败: {failed}") if skipped: w(f"- 跳过(未启用): {skipped}") if running: w(f"- 状态不明: {running}") w("") if failed: w("### 失败任务清单\n") for name, t in data['tasks'].items(): if t['status'] == 'failed': w(f"- **{name}** ({t['layer']}) — 失败时间 {fmt_ts(t['end'])}") w("") return '\n'.join(out) def main(): print("=" * 60) print("ETL 执行日志计时数据提取") print("=" * 60) # 1. 登录 print("\n[1] 登录获取 token...") token = login() print(f" Token: {token[:20]}...") # 2. 获取日志 print("\n[2] 获取执行日志...") log_text = get_logs(token) print(f" 日志长度: {len(log_text)} 字符, {log_text.count(chr(10))} 行") if not log_text: print(" 错误: 日志为空!") sys.exit(1) # 保存原始日志 raw_path = os.path.join(os.path.dirname(__file__), '..', '..', 'export', 'temp_raw_execution_log.txt') os.makedirs(os.path.dirname(raw_path), exist_ok=True) with open(raw_path, 'w', encoding='utf-8') as f: f.write(log_text) print(f" 原始日志已保存: {raw_path}") # 3. 解析 print("\n[3] 解析计时数据...") data = extract_timing_data(log_text) print(f" Flow: {data['flow_start']} ~ {data['flow_end']}") layers = {} for t in data['tasks'].values(): layers.setdefault(t['layer'], []).append(t) for layer in ['ODS', 'DWD', 'DWS', 'INDEX']: tl = layers.get(layer, []) ok = sum(1 for t in tl if t['status'] == 'success') fail = sum(1 for t in tl if t['status'] == 'failed') print(f" {layer}: {len(tl)} 任务 ({ok} 成功, {fail} 失败)") # 4. 生成报告 print("\n[4] 生成计时报告...") report = generate_report(data) report_path = os.path.join(os.path.dirname(__file__), '..', '..', 'export', 'temp_timing_report.md') with open(report_path, 'w', encoding='utf-8') as f: f.write(report) print(f" 报告已保存: {report_path}") # 5. 打印 print("\n" + "=" * 60) print(report) if __name__ == '__main__': main()