微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

View File

@@ -0,0 +1,256 @@
"""从 API 获取 ETL 执行日志,提取精细计时数据。一次性运维脚本。"""
import requests, json, re, sys, os
from datetime import datetime
from collections import defaultdict, OrderedDict
from pathlib import Path
from dotenv import load_dotenv
# Load settings from the ".env" file located two directory levels above the
# folder that contains this script.
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# The output directory must be configured; abort at import time otherwise.
SYSTEM_LOG_ROOT = os.getenv("SYSTEM_LOG_ROOT")
if SYSTEM_LOG_ROOT is None or SYSTEM_LOG_ROOT == "":
    raise RuntimeError("SYSTEM_LOG_ROOT 环境变量未设置")

# Execution whose logs are analysed, and the API endpoint that serves them.
EXEC_ID = "969c3195-5fea-4f72-873f-18cf75960c64"
BASE_URL = "http://localhost:8000"
def get_token(username="admin", password="admin123"):
    """Log in to the API at ``BASE_URL`` and return the bearer token.

    Args:
        username: Login name sent to ``/api/auth/login``.
        password: Password sent to ``/api/auth/login``.

    Returns:
        The value of ``access_token`` (preferred) or ``token`` from the JSON
        login response.

    Raises:
        requests.HTTPError: If the login request returns an error status.
        RuntimeError: If the response carries neither token field.
    """
    # NOTE(review): the default credentials are hard-coded in source; they are
    # kept as defaults only so existing ``get_token()`` callers keep working.
    r = requests.post(
        f"{BASE_URL}/api/auth/login",
        json={"username": username, "password": password},
        timeout=10,
    )
    r.raise_for_status()
    payload = r.json()  # parse the body once instead of once per lookup
    token = payload.get("access_token") or payload.get("token")
    if not token:
        # Fail here rather than sending "Bearer None" on the next request.
        raise RuntimeError(
            "login response contains neither 'access_token' nor 'token'"
        )
    return token
def get_logs(token):
    """Fetch the log text of execution ``EXEC_ID`` from the API.

    Args:
        token: Bearer token placed in the ``Authorization`` header.

    Returns:
        The ``error_log`` field when it is non-empty, otherwise the
        ``output_log`` field. Always a string: an empty string is returned
        when both fields are missing, empty or ``null``.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    headers = {"Authorization": f"Bearer {token}"}
    r = requests.get(
        f"{BASE_URL}/api/execution/{EXEC_ID}/logs",
        headers=headers,
        timeout=60,
    )
    r.raise_for_status()
    data = r.json()
    # A JSON ``null`` comes back as None even when a default is supplied to
    # ``dict.get``; the trailing ``or ""`` keeps the return type a string so
    # callers can safely call ``len()`` / ``split()`` on it.
    return data.get("error_log") or data.get("output_log") or ""
# Matches a bracketed "[YYYY-MM-DD HH:MM:SS]" timestamp; group 1 is the
# timestamp text without the brackets.
TS_RE = re.compile(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]')
# Task-start patterns (cover the various ODS / DWD / DWS / INDEX phrasings).
# In every pattern group 1 captures the task name.
TASK_START_PATTERNS = [
    re.compile(r'开始执行(\w+)\s+\(ODS\)'),  # ODS: "start executing ODS_XXX (ODS)"
    re.compile(r'(\w+): 抓取阶段开始'),  # DWD/DWS: "XXX: fetch stage started"
    re.compile(r'(\w+): 开始执行工具类任务'),  # DWS utility task: "XXX: start executing utility task"
    re.compile(r'(\w+): 本地清洗入库开始'),  # DWD fallback: "XXX: local cleansing/load started"
]
# Task-completion patterns; group 1 captures the task name.
TASK_DONE_PATTERNS = [
    re.compile(r'(\w+) ODS 任务完成'),  # ODS: "XXX ODS task completed"
    re.compile(r'(\w+): 完成,统计='),  # DWD/DWS: "XXX: done, stats="
    re.compile(r'(\w+): 工具类任务执行成功'),  # DWS utility task: "XXX: utility task succeeded"
    re.compile(r'(\w+): 结果统计:'),  # DWS fallback: "XXX: result stats:"
]
# Task-failure pattern ("task XXX failed"); group 1 captures the task name.
TASK_FAIL_RE = re.compile(r'任务\s+(\w+)\s+失败')
# Window-slice pattern ("processing window slice i/n"): slices at the global
# flow level, not slices inside a single task. Groups are (index, total).
# NOTE(review): not referenced by any code in the visible part of this script.
FLOW_SLICE_RE = re.compile(r'处理窗口切片\s*(\d+)/(\d+)')
# Known task-name prefixes.
KNOWN_PREFIXES = {"ODS_", "DWD_", "DWS_"}
def classify_task(name):
    """Map a task name to its stage label.

    Returns "ODS" or "DWD" for names with the matching prefix, "INDEX" for
    the four listed DWS index tasks, "DWS" for any other ``DWS_`` name and
    "OTHER" for everything else.
    """
    for prefix, stage in (("ODS_", "ODS"), ("DWD_", "DWD")):
        if name.startswith(prefix):
            return stage

    # These tasks carry the DWS_ prefix but are reported as their own stage,
    # so they must be checked before the generic DWS_ test.
    index_tasks = (
        "DWS_WINBACK_INDEX",
        "DWS_NEWCONV_INDEX",
        "DWS_RELATION_INDEX",
        "DWS_SPENDING_POWER_INDEX",
    )
    if name in index_tasks:
        return "INDEX"

    return "DWS" if name.startswith("DWS_") else "OTHER"
def parse_ts(line):
    """Return the first timestamp found by ``TS_RE`` in *line*.

    The captured text is parsed with the format ``%Y-%m-%d %H:%M:%S`` into a
    naive ``datetime``. Returns None when the line contains no timestamp.
    """
    found = TS_RE.search(line)
    if not found:
        return None
    return datetime.strptime(found.group(1), "%Y-%m-%d %H:%M:%S")
def _first_task_name(patterns, line, accept=None):
    """Return the task name captured by the first accepted pattern match.

    Patterns are tried in order. A match whose captured name is rejected by
    *accept* is skipped and the next pattern is tried. Returns None when no
    pattern yields an accepted name.
    """
    for pattern in patterns:
        found = pattern.search(line)
        if found and (accept is None or accept(found.group(1))):
            return found.group(1)
    return None


def _summarize_stages(tasks):
    """Aggregate per-task records into per-stage start/end/count summaries."""
    stages = {}
    for stage_name in ("ODS", "DWD", "DWS", "INDEX"):
        members = [t for t in tasks.values() if t["stage"] == stage_name]
        if not members:
            continue  # stages without any task are omitted from the result
        first_start = min((t["start"] for t in members if t["start"]), default=None)
        last_end = max((t["end"] for t in members if t["end"]), default=None)
        stages[stage_name] = {
            "start": first_start,
            "end": last_end,
            "task_count": len(members),
            "success": sum(1 for t in members if t["status"] == "success"),
            "failed": sum(1 for t in members if t["status"] == "failed"),
            # Wall-clock span from the earliest start to the latest end.
            "duration": (
                (last_end - first_start).total_seconds()
                if first_start and last_end
                else None
            ),
        }
    return stages


def parse_timing(log_text):
    """Extract task-level and stage-level timing from an ETL log.

    Only lines carrying a timestamp recognised by ``parse_ts`` are
    considered. For each task, the first start line and the first completion
    line seen while the task is still "running" are recorded; a failure line
    always overrides the recorded end time and status.

    Args:
        log_text: Full log text with lines separated by ``"\\n"``. ``None``
            or an empty string yields an empty result instead of raising.

    Returns:
        A dict with:
          * ``global_start`` / ``global_end``: first and last timestamp seen
            (None when the log has no timestamps).
          * ``total_duration``: seconds between the two, or 0 when missing.
          * ``tasks``: ``OrderedDict`` (in order of first appearance) mapping
            task name to ``{start, end, status, stage, duration}``.
          * ``stages``: per-stage summaries for ODS / DWD / DWS / INDEX with
            ``{start, end, task_count, success, failed, duration}``.
    """
    known_prefixes = tuple(KNOWN_PREFIXES)  # str.startswith accepts a tuple

    def is_known_task(name):
        # Only names carrying a known ETL prefix count as task starts.
        return name.startswith(known_prefixes)

    tasks = OrderedDict()  # name -> {start, end, status, stage}
    global_start = None
    global_end = None

    for line in (log_text or "").split("\n"):
        ts = parse_ts(line)
        if ts is None:
            continue  # every event below needs a timestamp

        if global_start is None:
            global_start = ts
        global_end = ts

        # Task start: only the first occurrence of a task is registered.
        started = _first_task_name(TASK_START_PATTERNS, line, is_known_task)
        if started is not None and started not in tasks:
            tasks[started] = {
                "start": ts,
                "end": None,
                "status": "running",
                "stage": classify_task(started),
            }

        # Task completion: ignored unless the task is currently running.
        finished = _first_task_name(TASK_DONE_PATTERNS, line)
        if finished in tasks and tasks[finished]["status"] == "running":
            tasks[finished]["end"] = ts
            tasks[finished]["status"] = "success"

        # Task failure: overrides whatever state was recorded before.
        failure = TASK_FAIL_RE.search(line)
        if failure and failure.group(1) in tasks:
            failed_task = tasks[failure.group(1)]
            failed_task["end"] = ts
            failed_task["status"] = "failed"

    # Per-task duration in seconds; None while a task has no end time.
    for info in tasks.values():
        if info["start"] and info["end"]:
            info["duration"] = (info["end"] - info["start"]).total_seconds()
        else:
            info["duration"] = None

    return {
        "global_start": global_start,
        "global_end": global_end,
        "total_duration": (
            (global_end - global_start).total_seconds()
            if global_start and global_end
            else 0
        ),
        "tasks": tasks,
        "stages": _summarize_stages(tasks),
    }
def fmt_dur(seconds):
    """Format a duration in seconds as ``"<h>h<mm>m<ss>s"`` or ``"<m>m<ss>s"``.

    Fractional seconds are truncated. The hour part is only shown when it is
    greater than zero. ``None`` is rendered as ``"N/A"``.
    """
    if seconds is None:
        return "N/A"

    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    if hours > 0:
        return f"{hours}h{minutes:02d}m{secs:02d}s"
    return f"{minutes}m{secs:02d}s"
# Console markers for the per-task listing; any other status prints "?".
_STATUS_ICONS = {"success": "✓", "failed": "✗"}


def _build_output(timing, ranked):
    """Convert parsed timing data into the JSON-serialisable report dict.

    Args:
        timing: Result dict of ``parse_timing``.
        ranked: ``(name, info)`` pairs sorted by duration, longest first.
    """
    return {
        "execution_id": EXEC_ID,
        "global_start": str(timing["global_start"]),
        "global_end": str(timing["global_end"]),
        "total_duration_sec": timing["total_duration"],
        "total_duration_fmt": fmt_dur(timing["total_duration"]),
        "stages": {
            stage: {
                "duration_sec": info["duration"],
                "duration_fmt": fmt_dur(info["duration"]),
                "task_count": info["task_count"],
                "success": info["success"],
                "failed": info["failed"],
            }
            for stage, info in timing["stages"].items()
        },
        "top5": [
            {
                "task": name,
                "stage": info["stage"],
                "duration_sec": info["duration"],
                "duration_fmt": fmt_dur(info["duration"]),
                "status": info["status"],
            }
            for name, info in ranked[:5]
        ],
        "all_tasks": {
            name: {
                "stage": info["stage"],
                "duration_sec": info["duration"],
                "duration_fmt": fmt_dur(info["duration"]),
                "status": info["status"],
            }
            for name, info in timing["tasks"].items()
        },
    }


def main():
    """Fetch the execution log, print a timing report and save it as JSON.

    The report is printed to stdout (overall span, per-stage timing, the five
    slowest tasks, then every task) and written to
    ``<SYSTEM_LOG_ROOT>/etl_timing_data.json``.
    """
    print("=== ETL 执行日志计时分析 ===\n")
    token = get_token()
    print("✓ JWT Token")
    # Guard against a missing log so len() below cannot fail on None.
    log_text = get_logs(token) or ""
    print(f"✓ 日志 {len(log_text)} 字符\n")
    timing = parse_timing(log_text)

    # Overall span
    print(f"执行: {timing['global_start']} ~ {timing['global_end']}")
    print(f"总耗时: {fmt_dur(timing['total_duration'])}\n")

    # Per-stage timing
    print("--- 阶段计时 ---")
    for stage in ["ODS", "DWD", "DWS", "INDEX"]:
        info = timing["stages"].get(stage)
        if not info:
            continue
        # The failure count is only mentioned when it is non-zero.
        failed_note = f", {info['failed']} 失败" if info["failed"] else ""
        print(f" {stage}: {fmt_dur(info['duration'])} "
              f"({info['success']}/{info['task_count']} 成功"
              f"{failed_note})")
    print()

    # Top-5: tasks without a measured duration cannot be ranked.
    ranked = sorted(
        ((n, t) for n, t in timing["tasks"].items() if t["duration"] is not None),
        key=lambda item: item[1]["duration"],
        reverse=True,
    )
    print("--- Top-5 耗时任务 ---")
    for rank, (name, info) in enumerate(ranked[:5], start=1):
        print(f" {rank}. {name}: {fmt_dur(info['duration'])} [{info['status']}]")
    print()

    # All tasks, in order of first appearance in the log
    print(f"--- 全部任务 ({len(timing['tasks'])} 个) ---")
    for name, info in timing["tasks"].items():
        # Distinct markers so success and failure are distinguishable.
        status_icon = _STATUS_ICONS.get(info["status"], "?")
        print(f" {status_icon} {name}: {fmt_dur(info['duration'])} [{info['stage']}]")

    # JSON output
    output = _build_output(timing, ranked)
    out_path = Path(SYSTEM_LOG_ROOT) / "etl_timing_data.json"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(output, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"\n✓ 计时数据 → {out_path}")


if __name__ == "__main__":
    main()