Files
Neo-ZQYY/scripts/ops/monitor_etl_run.py

324 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
监控当前 ETL 执行状态,完成后导出执行结果报告到 SYSTEM_LOG_ROOT。
通过后端 API 轮询执行历史,检测 run_uuid 对应的执行是否完成。
完成后从浏览器日志或 API 提取任务级结果,生成 Markdown 报告。
用法: python scripts/ops/monitor_etl_run.py
"""
from __future__ import annotations
import json
import sys
import time
from datetime import datetime
from pathlib import Path
import requests
sys.path.insert(0, str(Path(__file__).parent))
from _env_paths import get_output_path
# Base URL of the backend API; every request in this script is built from it.
BACKEND_URL = "http://localhost:8000"
# run_uuid of the ETL execution this script watches and reports on.
TARGET_RUN_UUID = "4ba9d2d365ee4a858f1c4104b1942dc2"
POLL_INTERVAL = 30 # seconds to sleep between two polls of the history API
def get_auth_token() -> str:
    """Return a JWT for the backend API, preferring a cached one that still works.

    A token cached in ``.monitor_token`` next to this script is probed against
    the execution-history endpoint and returned if the backend answers 200.
    Otherwise the user is prompted to paste a token; a non-empty answer is
    written back to the cache file.

    Returns:
        The token string, or ``""`` when the user leaves the prompt empty.
    """
    cache_path = Path(__file__).parent / ".monitor_token"

    if cache_path.exists():
        cached = cache_path.read_text(encoding="utf-8").strip()
        # Best-effort probe: any failure simply means "ask the user again".
        try:
            probe = requests.get(
                f"{BACKEND_URL}/api/execution/history",
                headers={"Authorization": f"Bearer {cached}"},
                params={"limit": 1},
                timeout=5,
            )
            still_valid = probe.status_code == 200
        except Exception:
            still_valid = False
        if still_valid:
            return cached

    # No usable cached token: fall back to an interactive prompt.
    for hint in (
        "需要登录后端获取 token。请在浏览器中登录后",
        "从浏览器 DevTools > Application > Local Storage 中复制 token",
        "或直接输入(留空跳过,使用无认证模式):",
    ):
        print(hint)

    entered = input("JWT Token: ").strip()
    if entered:
        cache_path.write_text(entered, encoding="utf-8")
    return entered
def poll_execution_status(token: str) -> dict | None:
    """Fetch recent execution history and pick the record to report on.

    Args:
        token: JWT sent as a Bearer token; an empty string sends no
            Authorization header.

    Returns:
        The history entry whose ``run_uuid`` equals ``TARGET_RUN_UUID``; if
        none of the fetched entries matches, the first entry of the response;
        ``None`` when the response is empty, non-200, or the request fails.
    """
    auth_headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        response = requests.get(
            f"{BACKEND_URL}/api/execution/history",
            headers=auth_headers,
            params={"limit": 5},
            timeout=10,
        )
        if response.status_code != 200:
            print(f" API 返回 {response.status_code}: {response.text[:200]}")
            return None

        payload = response.json()
        # The endpoint may answer with a bare list or wrap it in "items"/"data".
        if isinstance(payload, list):
            records = payload
        else:
            records = payload.get("items", payload.get("data", []))

        exact_match = next(
            (rec for rec in records if rec.get("run_uuid") == TARGET_RUN_UUID),
            None,
        )
        if exact_match is not None:
            return exact_match

        # NOTE(review): with no exact match the first returned entry is used,
        # which may belong to a different run than TARGET_RUN_UUID.
        return records[0] if records else None
    except requests.exceptions.ConnectionError:
        print(" 后端连接失败,可能已停止")
        return None
    except Exception as e:
        print(f" API 请求异常: {e}")
        return None
def extract_log_from_api(token: str) -> str | None:
    """Download the raw execution log of ``TARGET_RUN_UUID`` from the backend.

    The call stays best-effort (it never raises), but failures are reported on
    stdout, so a report without task-level results can be traced back to a
    failed log download.

    Args:
        token: JWT sent as a Bearer token; an empty string sends no
            Authorization header.

    Returns:
        The response body as text on HTTP 200, otherwise ``None``.
    """
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        r = requests.get(
            f"{BACKEND_URL}/api/execution/log/{TARGET_RUN_UUID}",
            headers=headers,
            timeout=30,
        )
        if r.status_code == 200:
            return r.text
        # Surface the reason instead of silently yielding an empty report.
        print(f" 日志 API 返回 {r.status_code}: {r.text[:200]}")
    except Exception as e:
        # Deliberately broad: the log is optional and must not abort the report.
        print(f" 获取执行日志失败: {e}")
    return None
_LAYER_TAGS = ("ODS", "DWS", "DWD")
_TASK_PREFIXES = ("ODS_", "DWS_", "DWD_")


def _find_task_name(line: str) -> str | None:
    """Return the first whitespace-separated word starting with a task prefix, or None."""
    for word in line.split():
        if word.startswith(_TASK_PREFIXES):
            return word.rstrip(":")
    return None


def parse_task_results_from_log(log_text: str) -> list[dict]:
    """Parse per-task outcomes out of the raw execution log.

    The log is scanned line by line for three kinds of events:

    * start: the line contains "开始执行" and one of ODS/DWS/DWD, plus a word
      starting with ``ODS_``/``DWS_``/``DWD_`` that is taken as the task name;
    * completion: the line contains "任务完成" and the name of the task
      currently being tracked;
    * failure: the line contains both "任务" and "失败" and a task-name word.

    Args:
        log_text: Full log text; ``None`` or ``""`` yields an empty list.

    Returns:
        One dict per completed or failed task, in log order. Successful entries
        carry ``task``, ``status`` ("success"), ``start``, ``end`` and
        ``stats``; failed entries carry ``task``, ``status`` ("failed"),
        ``start``, ``end`` and ``error``.
    """
    results = []
    lines = log_text.split("\n") if log_text else []
    current_task = None
    task_start_time = None

    for line in lines:
        # Task start. The layer test is grouped explicitly: without the
        # grouping, `and` binds tighter than `or`, so every line mentioning
        # DWS/DWD counted as a start and overwrote the recorded start time.
        if "开始执行" in line and any(tag in line for tag in _LAYER_TAGS):
            started = _find_task_name(line)
            if started is not None:
                current_task = started
                task_start_time = extract_timestamp(line)

        # Task completion for the task currently being tracked.
        if current_task and "任务完成" in line and current_task in line:
            results.append({
                "task": current_task,
                "status": "success",
                "start": task_start_time,
                "end": extract_timestamp(line),
                "stats": extract_stats(line),
            })
            current_task = None
            # A completion line is never also a failure, even if its text
            # (e.g. the stats) happens to contain "失败".
            continue

        # Task failure.
        if "任务" in line and "失败" in line:
            failed = _find_task_name(line)
            if failed is None:
                continue
            ts = extract_timestamp(line)
            error_msg = line.split("失败:")[-1].strip() if "失败:" in line else "未知错误"
            results.append({
                "task": failed,
                "status": "failed",
                # Without a tracked start for this task, fall back to the
                # timestamp of the failure line itself.
                "start": task_start_time if current_task == failed else ts,
                "end": ts,
                "error": error_msg,
            })
            if current_task == failed:
                current_task = None

    return results
def extract_timestamp(line: str) -> str:
    """Return the text between the first "[" and the next "]" of a log line.

    Log lines are expected to start with a bracketed timestamp such as
    ``[2026-02-21 15:29:21]``; the content is returned verbatim without being
    validated as a date.

    Args:
        line: A single log line.

    Returns:
        The bracketed text, or ``""`` when the line has no "[" or no "]" after
        it (including the case where the only "]" precedes the "[").
    """
    opening = line.find("[")
    if opening == -1:
        return ""
    # Search only after the opening bracket so a stray earlier "]" is ignored
    # rather than raising.
    closing = line.find("]", opening + 1)
    if closing == -1:
        return ""
    return line[opening + 1:closing]
def extract_stats(line: str) -> str:
    """Return the first ``{...}`` segment of a log line, braces included.

    Args:
        line: A single log line.

    Returns:
        The text from the first "{" up to and including the next "}", or
        ``""`` when the line has no "{" or no "}" after it.
    """
    opening = line.find("{")
    if opening == -1:
        return ""
    # Look for the closing brace after the opening one; a "}" earlier in the
    # line must not hide a valid segment.
    closing = line.find("}", opening)
    if closing == -1:
        return ""
    return line[opening:closing + 1]
def generate_report(execution: dict, task_results: list[dict]) -> str:
    """Render the Markdown report for one finished ETL execution.

    Args:
        execution: Execution record from the history API. The keys read are
            ``status``, ``started_at`` (fallback ``start_time``), ``ended_at``
            (fallback ``end_time``), ``duration`` and ``exit_code``; missing
            keys render as empty cells (``status`` as ``unknown``).
        task_results: Per-task dicts with ``task`` and ``status`` plus optional
            ``start``, ``end``, ``stats`` and ``error``. When empty, the
            task-level section is omitted.

    Returns:
        The complete report as one newline-joined string.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    status = execution.get("status", "unknown")
    start_time = execution.get("started_at", execution.get("start_time", ""))
    end_time = execution.get("ended_at", execution.get("end_time", ""))
    duration = execution.get("duration", "")
    exit_code = execution.get("exit_code", "")

    lines = [
        "# ETL 执行结果报告",
        "",
        f"> 生成时间:{now}",
        # Label and value are separated like the line above.
        f"> run_uuid:{TARGET_RUN_UUID}",
        "",
        "---",
        "",
        "## 执行概览",
        "",
        "| 项目 | 值 |",
        "|------|-----|",
        f"| 状态 | {status} |",
        f"| 开始时间 | {start_time} |",
        f"| 结束时间 | {end_time} |",
        f"| 时长 | {duration} |",
        f"| 退出码 | {exit_code} |",
        "",
    ]

    # Task-level results.
    if task_results:
        success_count = sum(1 for r in task_results if r["status"] == "success")
        failed_count = sum(1 for r in task_results if r["status"] == "failed")
        lines.extend([
            "## 任务级结果",
            "",
            f"成功:{success_count} | 失败:{failed_count} | 总计:{len(task_results)}",
            "",
            "| # | 任务 | 状态 | 开始 | 结束 | 备注 |",
            "|---|------|------|------|------|------|",
        ])
        for i, r in enumerate(task_results, 1):
            # Successful tasks carry "stats", failed ones carry "error".
            note = r.get("stats", r.get("error", ""))
            if len(note) > 80:
                note = note[:77] + "..."
            # A raw "|" would be read as a column separator and break the row.
            note = note.replace("|", "\\|")
            lines.append(
                f"| {i} | {r['task']} | {r['status']} | {r.get('start', '')} | {r.get('end', '')} | {note} |"
            )
        lines.append("")

    # Known issues (historical, resolved) and fixed follow-up steps.
    lines.extend([
        "## 已知问题",
        "",
        "无活跃已知问题。",
        "",
        "---",
        "",
        "## 下一步",
        "",
        "1. 重新提交包含失败任务的执行,验证修复",
        "2. 运行 ETL Unified Analysis统一分析",
        "3. 运行 /audit 审计",
    ])
    return "\n".join(lines)
def main():
    """Poll the backend until the monitored run finishes, then export the report.

    Reads an optional token from ``.monitor_token`` (no interactive prompt),
    polls up to ``max_polls`` times with ``POLL_INTERVAL`` seconds between
    polls, and on a terminal status writes the Markdown report plus the raw
    execution record as JSON into the ``SYSTEM_LOG_ROOT`` output directory.
    """
    out_dir = get_output_path("SYSTEM_LOG_ROOT")
    for banner_line in (
        "ETL 执行监控启动",
        f" 目标 run_uuid: {TARGET_RUN_UUID}",
        f" 轮询间隔: {POLL_INTERVAL}s",
        f" 输出目录: {out_dir}",
    ):
        print(banner_line)
    print()

    # Non-interactive: use the cached token if there is one, else go without.
    token_path = Path(__file__).parent / ".monitor_token"
    token = token_path.read_text(encoding="utf-8").strip() if token_path.exists() else ""

    terminal_states = ("success", "failed", "completed", "error", "stopped")
    max_polls = 120  # at the default 30 s interval this is about 60 minutes

    for attempt in range(1, max_polls + 1):
        stamp = datetime.now().strftime("%H:%M:%S")
        print(f"[{stamp}] 轮询 #{attempt}...", end=" ")

        execution = poll_execution_status(token)
        if execution is None:
            print("未获取到执行信息")
            time.sleep(POLL_INTERVAL)
            continue

        status = execution.get("status", "unknown")
        print(f"状态: {status}")
        if status not in terminal_states:
            time.sleep(POLL_INTERVAL)
            continue

        print(f"\n执行已完成,状态: {status}")

        # Task-level details are optional: no log means an empty task list.
        log_text = extract_log_from_api(token)
        task_results = parse_task_results_from_log(log_text) if log_text else []

        report_path = out_dir / "2026-02-21__etl_run_result.md"
        report_path.write_text(generate_report(execution, task_results), encoding="utf-8")
        print(f"\n执行结果报告已导出: {report_path}")

        # Keep the raw API record alongside the rendered report.
        raw_path = out_dir / "2026-02-21__etl_run_raw.json"
        raw_dump = json.dumps(execution, ensure_ascii=False, indent=2, default=str)
        raw_path.write_text(raw_dump, encoding="utf-8")
        print(f"原始数据已导出: {raw_path}")
        return

    print(f"\n超过最大轮询次数 ({max_polls}),退出监控")
# Start the monitor only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()