Files
Neo-ZQYY/scripts/ops/_extract_timing.py

257 lines
9.1 KiB
Python

"""从 API 获取 ETL 执行日志,提取精细计时数据。一次性运维脚本。"""
import requests, json, re, sys, os
from datetime import datetime
from collections import defaultdict, OrderedDict
from pathlib import Path
from dotenv import load_dotenv
# Load variables from the .env file located two directories above this
# script's own folder.
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# SYSTEM_LOG_ROOT is mandatory: refuse to start when it is unset or empty.
SYSTEM_LOG_ROOT = os.environ.get("SYSTEM_LOG_ROOT")
if SYSTEM_LOG_ROOT is None or SYSTEM_LOG_ROOT == "":
    raise RuntimeError("SYSTEM_LOG_ROOT 环境变量未设置")

# Execution whose logs are analysed, and the API server they are fetched from.
EXEC_ID = "969c3195-5fea-4f72-873f-18cf75960c64"
BASE_URL = "http://localhost:8000"
def get_token():
    """Log in to the backend API and return a bearer token.

    Posts the admin credentials to ``{BASE_URL}/api/auth/login`` and reads the
    token from the ``access_token`` field of the JSON response, falling back
    to the ``token`` field.

    Returns:
        The token string.

    Raises:
        requests.HTTPError: The login endpoint answered with an error status.
        RuntimeError: The response carried neither token field.
    """
    # NOTE(review): credentials are hard-coded; tolerable only for a one-off
    # local ops script -- move them to the environment before reusing this.
    resp = requests.post(
        f"{BASE_URL}/api/auth/login",
        json={"username": "admin", "password": "admin123"},
        timeout=10,
    )
    resp.raise_for_status()
    payload = resp.json()  # decode the body once instead of once per lookup
    token = payload.get("access_token") or payload.get("token")
    if not token:
        # Fail here instead of sending "Bearer None" on the next request.
        raise RuntimeError("登录响应中未包含 access_token 或 token")
    return token
def get_logs(token):
    """Fetch the log text of execution ``EXEC_ID`` from the backend API.

    Args:
        token: Bearer token placed in the ``Authorization`` header.

    Returns:
        The ``error_log`` field of the JSON response when it is non-empty,
        otherwise ``output_log``; an empty string when neither has content.

    Raises:
        requests.HTTPError: The endpoint answered with an error status.
    """
    resp = requests.get(
        f"{BASE_URL}/api/execution/{EXEC_ID}/logs",
        headers={"Authorization": f"Bearer {token}"},
        timeout=60,
    )
    resp.raise_for_status()
    payload = resp.json()
    # A JSON null in both fields would otherwise leak None to callers that
    # expect text (len(), str.split()), so normalise to "".
    return payload.get("error_log") or payload.get("output_log") or ""
# Bracketed timestamp of a log line, e.g. "[2024-01-31 08:15:00]".
TS_RE = re.compile(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]')

# Task-start markers (covering the ODS / DWD / DWS / INDEX wordings).
# Group 1 of every pattern is the task name.
TASK_START_PATTERNS = [
    re.compile(_source)
    for _source in (
        r'开始执行(\w+)\s+\(ODS\)',       # ODS: "开始执行ODS_XXX (ODS)"
        r'(\w+): 抓取阶段开始',            # DWD/DWS: "XXX: 抓取阶段开始"
        r'(\w+): 开始执行工具类任务',      # DWS utility task: "XXX: 开始执行工具类任务"
        r'(\w+): 本地清洗入库开始',        # DWD fallback
    )
]

# Task-completion markers. Group 1 of every pattern is the task name.
TASK_DONE_PATTERNS = [
    re.compile(_source)
    for _source in (
        r'(\w+) ODS 任务完成',             # ODS: "XXX ODS 任务完成"
        r'(\w+): 完成,统计=',             # DWD/DWS: "XXX: 完成,统计="
        r'(\w+): 工具类任务执行成功',      # DWS utility task
        r'(\w+): 结果统计:',               # DWS fallback
    )
]

# Task-failure marker; group 1 is the task name.
TASK_FAIL_RE = re.compile(r'任务\s+(\w+)\s+失败')

# Window-slice marker (flow-level slices, not slices inside a single task);
# groups are the slice index and the slice total.
FLOW_SLICE_RE = re.compile(r'处理窗口切片\s*(\d+)/(\d+)')

# Prefixes a captured name must carry to be treated as an ETL task.
KNOWN_PREFIXES = {"ODS_", "DWD_", "DWS_"}
def classify_task(name):
    """Map a task name to its pipeline stage.

    Returns ``"INDEX"`` for the four explicitly listed DWS index tasks,
    ``"ODS"`` / ``"DWD"`` / ``"DWS"`` for names carrying the matching prefix,
    and ``"OTHER"`` for anything else.
    """
    index_tasks = {
        "DWS_WINBACK_INDEX",
        "DWS_NEWCONV_INDEX",
        "DWS_RELATION_INDEX",
        "DWS_SPENDING_POWER_INDEX",
    }
    # Index tasks all carry the DWS_ prefix, so they must be recognised
    # before the generic prefix table is consulted.
    if name in index_tasks:
        return "INDEX"

    for prefix, stage in (("ODS_", "ODS"), ("DWD_", "DWD"), ("DWS_", "DWS")):
        if name.startswith(prefix):
            return stage
    return "OTHER"
def parse_ts(line):
    """Extract the bracketed timestamp from a log line.

    Returns a naive ``datetime`` parsed from the first
    ``[YYYY-MM-DD HH:MM:SS]`` occurrence found by ``TS_RE``, or ``None`` when
    the line contains none.
    """
    found = TS_RE.search(line)
    if not found:
        return None
    return datetime.strptime(found.group(1), "%Y-%m-%d %H:%M:%S")
def parse_timing(log_text):
    """Derive per-task and per-stage timings from raw ETL log text.

    The log is scanned line by line. Lines without a timestamp are ignored.
    For each timestamped line:

    * the first start pattern that matches a name with a known prefix opens a
      task record (only the first start seen per task name is kept);
    * the first completion pattern that matches closes the named task as
      ``"success"`` if that task is still ``"running"``;
    * a failure marker closes the named task as ``"failed"`` whenever the
      task is known, whatever its current status.

    Returns:
        A dict with ``global_start`` / ``global_end`` (first and last
        timestamp seen, or ``None``), ``total_duration`` in seconds (``0``
        when no timestamp was seen), ``tasks`` (an ``OrderedDict`` mapping
        task name to ``start``, ``end``, ``status``, ``stage`` and
        ``duration``) and ``stages`` (per-stage ``start``, ``end``,
        ``task_count``, ``success``, ``failed`` and ``duration`` for each of
        ODS / DWD / DWS / INDEX that has at least one task).
    """
    records = OrderedDict()  # task name -> record, in first-seen order
    started = set()          # task names that already have a start record
    prefixes = tuple(KNOWN_PREFIXES)
    first_ts = None
    last_ts = None

    for raw in log_text.split("\n"):
        stamp = parse_ts(raw)
        if not stamp:
            # Every marker below needs a timestamp to be recorded.
            continue

        if first_ts is None:
            first_ts = stamp
        last_ts = stamp

        # --- task start ---
        for pattern in TASK_START_PATTERNS:
            hit = pattern.search(raw)
            if not hit:
                continue
            task = hit.group(1)
            # Only names with a known ETL prefix count as tasks.
            if not task.startswith(prefixes):
                continue
            if task not in started:
                started.add(task)
                records[task] = {
                    "start": stamp,
                    "end": None,
                    "status": "running",
                    "stage": classify_task(task),
                }
            break

        # --- task completion ---
        for pattern in TASK_DONE_PATTERNS:
            hit = pattern.search(raw)
            if not hit:
                continue
            record = records.get(hit.group(1))
            if record is not None and record["status"] == "running":
                record["end"] = stamp
                record["status"] = "success"
            break

        # --- task failure ---
        hit = TASK_FAIL_RE.search(raw)
        if hit:
            record = records.get(hit.group(1))
            if record is not None:
                record["end"] = stamp
                record["status"] = "failed"

    # Per-task duration; stays None while a task has no recorded end.
    for record in records.values():
        begun, ended = record["start"], record["end"]
        if begun and ended:
            record["duration"] = (ended - begun).total_seconds()
        else:
            record["duration"] = None

    # Per-stage roll-up: earliest start to latest end among the stage's tasks.
    stages = {}
    for stage_name in ("ODS", "DWD", "DWS", "INDEX"):
        members = [rec for rec in records.values() if rec["stage"] == stage_name]
        if not members:
            continue
        member_starts = [rec["start"] for rec in members if rec["start"]]
        member_ends = [rec["end"] for rec in members if rec["end"]]
        stage_start = min(member_starts) if member_starts else None
        stage_end = max(member_ends) if member_ends else None
        summary = {
            "start": stage_start,
            "end": stage_end,
            "task_count": len(members),
            "success": sum(1 for rec in members if rec["status"] == "success"),
            "failed": sum(1 for rec in members if rec["status"] == "failed"),
        }
        if stage_start and stage_end:
            summary["duration"] = (stage_end - stage_start).total_seconds()
        else:
            summary["duration"] = None
        stages[stage_name] = summary

    if first_ts and last_ts:
        total = (last_ts - first_ts).total_seconds()
    else:
        total = 0

    return {
        "global_start": first_ts,
        "global_end": last_ts,
        "total_duration": total,
        "tasks": records,
        "stages": stages,
    }
def fmt_dur(seconds):
    """Format a duration in seconds as ``"1h02m05s"`` or ``"2m05s"``.

    Args:
        seconds: Duration as an int or float (fractions are truncated), or
            ``None`` when the duration is unknown.

    Returns:
        ``"N/A"`` for ``None``; otherwise minutes and seconds, with an hours
        part only when it is non-zero. Negative durations are rendered as the
        magnitude with a leading ``-``.
    """
    if seconds is None:
        return "N/A"
    whole = int(seconds)
    # divmod floors toward negative infinity, so split the magnitude and put
    # the sign back afterwards; otherwise -5 would render as "59m55s".
    sign = "-" if whole < 0 else ""
    minutes, secs = divmod(abs(whole), 60)
    hours, minutes = divmod(minutes, 60)
    if hours > 0:
        return f"{sign}{hours}h{minutes:02d}m{secs:02d}s"
    return f"{sign}{minutes}m{secs:02d}s"
def main():
    """Fetch the execution log, print a timing report and save it as JSON.

    Steps: obtain a token, download the log of ``EXEC_ID``, parse it with
    ``parse_timing``, print the global span, per-stage durations, the five
    longest tasks and the full task list, then write the same data to
    ``etl_timing_data.json`` under ``SYSTEM_LOG_ROOT``.
    """
    print("=== ETL 执行日志计时分析 ===\n")
    token = get_token()
    print("✓ JWT Token")
    log_text = get_logs(token)
    print(f"✓ 日志 {len(log_text)} 字符\n")
    timing = parse_timing(log_text)

    # Global span
    print(f"执行: {timing['global_start']} ~ {timing['global_end']}")
    print(f"总耗时: {fmt_dur(timing['total_duration'])}\n")

    # Per-stage durations
    print("--- 阶段计时 ---")
    for stage in ["ODS", "DWD", "DWS", "INDEX"]:
        info = timing["stages"].get(stage)
        if info:
            # The failure count is appended only when at least one task failed.
            failed_note = f", {info['failed']} 失败" if info["failed"] else ""
            print(f" {stage}: {fmt_dur(info['duration'])} "
                  f"({info['success']}/{info['task_count']} 成功{failed_note})")
    print()

    # Top-5: only tasks with a measured duration can be ranked.
    ranked = sorted(
        [(n, t) for n, t in timing["tasks"].items() if t["duration"] is not None],
        key=lambda x: x[1]["duration"], reverse=True,
    )
    top5 = ranked[:5]
    print("--- Top-5 耗时任务 ---")
    for rank, (name, info) in enumerate(top5, start=1):
        print(f" {rank}. {name}: {fmt_dur(info['duration'])} [{info['status']}]")
    print()

    # Full task list. Success and failure get distinct markers so the two
    # states can be told apart in the listing; anything else shows "?".
    status_icons = {"success": "✓", "failed": "✗"}
    print(f"--- 全部任务 ({len(timing['tasks'])} 个) ---")
    for name, info in timing["tasks"].items():
        status_icon = status_icons.get(info["status"], "?")
        print(f" {status_icon} {name}: {fmt_dur(info['duration'])} [{info['stage']}]")

    # JSON report
    output = {
        "execution_id": EXEC_ID,
        # str() also turns a missing timestamp (None) into the text "None".
        "global_start": str(timing["global_start"]),
        "global_end": str(timing["global_end"]),
        "total_duration_sec": timing["total_duration"],
        "total_duration_fmt": fmt_dur(timing["total_duration"]),
        "stages": {
            stage: {
                "duration_sec": info["duration"],
                "duration_fmt": fmt_dur(info["duration"]),
                "task_count": info["task_count"],
                "success": info["success"],
                "failed": info["failed"],
            }
            for stage, info in timing["stages"].items()
        },
        "top5": [
            {
                "task": name, "stage": info["stage"],
                "duration_sec": info["duration"],
                "duration_fmt": fmt_dur(info["duration"]),
                "status": info["status"],
            }
            for name, info in top5
        ],
        "all_tasks": {
            name: {
                "stage": info["stage"],
                "duration_sec": info["duration"],
                "duration_fmt": fmt_dur(info["duration"]),
                "status": info["status"],
            }
            for name, info in timing["tasks"].items()
        },
    }
    out_path = Path(SYSTEM_LOG_ROOT) / "etl_timing_data.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"\n✓ 计时数据 → {out_path}")
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()