微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

205
scripts/ops/_etl_monitor.py Normal file
View File

@@ -0,0 +1,205 @@
"""
ETL execution monitor: polls the API logs, reports ERROR/WARNING lines, and waits for the task to finish.
Usage: python scripts/ops/_etl_monitor.py <execution_id> [--interval 30]
"""
import sys, time, re, json, os
from datetime import datetime
from pathlib import Path
import requests
# Connection settings. Each value can be overridden with an environment
# variable so that real credentials do not have to be committed; the defaults
# are the original local-development values.
# A trailing slash is stripped because callers build URLs as f"{API_BASE}/api/...".
API_BASE = os.environ.get("ETL_MONITOR_API_BASE", "http://localhost:8000").rstrip("/")
USERNAME = os.environ.get("ETL_MONITOR_USERNAME", "admin")
PASSWORD = os.environ.get("ETL_MONITOR_PASSWORD", "admin123")

# Keyword detection (case-insensitive).
# ERROR_KEYWORDS: whole-word ERROR/CRITICAL, or "Traceback"/"Exception" anywhere in the line.
ERROR_KEYWORDS = re.compile(r"\b(ERROR|CRITICAL)\b|Traceback|Exception", re.IGNORECASE)
# WARNING_KEYWORDS: whole-word WARNING only.
WARNING_KEYWORDS = re.compile(r"\bWARNING\b", re.IGNORECASE)
def login(timeout: float = 10.0) -> str:
    """Authenticate against the API and return the bearer access token.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            monitor if the API stalls.

    Returns:
        The value of ``"access_token"`` in the JSON login response.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status (via ``raise_for_status``).
        KeyError: If the response JSON has no ``"access_token"`` field.
    """
    response = requests.post(
        f"{API_BASE}/api/auth/login",
        json={"username": USERNAME, "password": PASSWORD},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["access_token"]
def get_logs(token: str, eid: str, timeout: float = 10.0) -> dict:
    """Fetch the log payload of execution ``eid``.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        eid: Execution id interpolated into the request path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of ``GET /api/execution/{eid}/logs``.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(
        f"{API_BASE}/api/execution/{eid}/logs",
        headers={"Authorization": f"Bearer {token}"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def get_history(token: str, timeout: float = 10.0) -> list:
    """Fetch the execution history list from the API.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of ``GET /api/execution/history``.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(
        f"{API_BASE}/api/execution/history",
        headers={"Authorization": f"Bearer {token}"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def find_execution_status(history: list, eid: str) -> dict | None:
for item in history:
# API 返回 "id" 字段
if item.get("id") == eid or item.get("execution_id") == eid:
return item
return None
def scan_log_lines(log_text: str, seen_count: int) -> tuple[list, list, int]:
    """Scan the not-yet-seen log lines for error and warning keywords.

    Args:
        log_text: The full log output fetched so far; ``None`` or ``""`` is
            treated as an empty log.
        seen_count: Number of lines already scanned by earlier calls, i.e. the
            third element returned by the previous call.

    Returns:
        ``(errors, warnings, new_seen_count)``: the stripped new lines that
        match ``ERROR_KEYWORDS``, the stripped new lines that match only
        ``WARNING_KEYWORDS``, and the number of complete lines in ``log_text``.
    """
    lines = log_text.split("\n") if log_text else []
    # A log that ends with a newline produces a trailing "" from split().
    # Counting that as a seen line would make the next poll skip the first
    # newly appended line, so it is not treated as a line.
    if lines and lines[-1] == "":
        lines.pop()

    errors = []
    warnings = []
    # max() keeps a negative seen_count meaning "scan everything" instead of
    # slicing from the end of the list.
    for line in lines[max(seen_count, 0):]:
        if ERROR_KEYWORDS.search(line):
            errors.append(line.strip())
        elif WARNING_KEYWORDS.search(line):
            warnings.append(line.strip())
    return errors, warnings, len(lines)
def get_last_timestamp(log_text: str) -> str | None:
"""提取日志中最后一个时间戳"""
matches = re.findall(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]", log_text or "")
return matches[-1] if matches else None
def get_current_task(log_text: str) -> str | None:
"""提取当前正在执行的任务名"""
matches = re.findall(r"开始执行(\w+)", log_text or "")
return matches[-1] if matches else None
def _parse_cli(argv: list[str]) -> tuple[str, int]:
    """Parse the arguments after the program name into (execution_id, interval)."""
    import argparse

    parser = argparse.ArgumentParser(
        prog="python scripts/ops/_etl_monitor.py",
        description="Poll an ETL execution until it finishes and report ERROR/WARNING log lines.",
    )
    parser.add_argument("execution_id", help="id of the execution to monitor")
    # Legacy form: the interval used to be read from the second positional argument.
    parser.add_argument(
        "legacy_interval",
        nargs="?",
        type=int,
        default=None,
        help="polling interval in seconds (legacy positional form)",
    )
    parser.add_argument(
        "--interval",
        type=int,
        default=None,
        help="polling interval in seconds (default: 30)",
    )
    args = parser.parse_args(argv)

    # --interval takes precedence over the positional form.
    interval = args.interval if args.interval is not None else args.legacy_interval
    if interval is None:
        interval = 30
    if interval < 0:
        # time.sleep() raises ValueError on negative values.
        parser.error("interval must not be negative")
    return args.execution_id, interval


def main():
    """Monitor one ETL execution until it reaches a terminal status.

    Command line: ``<execution_id> [--interval SECONDS]``. The older positional
    form ``<execution_id> SECONDS`` is still accepted. Exits with status 1
    when no argument is given.

    Each polling round fetches the execution history and the execution log,
    prints a one-line status, and prints any newly found ERROR/WARNING lines.
    A warning is printed once the newest log timestamp has not changed for
    more than 30 minutes. Polling stops when the status is ``success``,
    ``failed`` or ``cancelled``; a text summary and a JSON summary are then
    printed.

    Network errors inside the loop are reported and followed by a re-login
    attempt, since the token may have expired. A failure of the initial login
    propagates to the caller.
    """
    if len(sys.argv) < 2:
        print("用法: python scripts/ops/_etl_monitor.py <execution_id> [--interval 30]")
        sys.exit(1)
    eid, interval = _parse_cli(sys.argv[1:])

    print(f"[监控] execution_id={eid}, 轮询间隔={interval}s")
    print(f"[监控] 开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("-" * 60)

    token = login()
    print("[监控] 登录成功")

    all_errors: list[str] = []
    all_warnings: list[str] = []
    seen_lines = 0
    last_log_time = None
    no_new_log_since = None
    check_count = 0
    # Defined before the loop so the summary below never depends on which
    # statement inside the try block ran last.
    status = "unknown"
    exec_info = None

    while True:
        check_count += 1
        try:
            # Check the execution status.
            history = get_history(token)
            exec_info = find_execution_status(history, eid)
            status = exec_info.get("status", "unknown") if exec_info else "unknown"

            # Fetch the log.
            log_data = get_logs(token, eid)
            log_text = log_data.get("output_log", "") or ""

            # Scan only the lines that were not seen in earlier rounds.
            new_errors, new_warnings, seen_lines = scan_log_lines(log_text, seen_lines)
            all_errors.extend(new_errors)
            all_warnings.extend(new_warnings)

            # Extract progress information.
            current_task = get_current_task(log_text)
            last_ts = get_last_timestamp(log_text)

            # Stall detection: the clock starts the first time the newest
            # timestamp is found unchanged and resets when it changes.
            if last_ts:
                if last_ts != last_log_time:
                    last_log_time = last_ts
                    no_new_log_since = None
                else:
                    if no_new_log_since is None:
                        no_new_log_since = datetime.now()
                    elapsed = (datetime.now() - no_new_log_since).total_seconds()
                    if elapsed > 1800:  # 30 minutes
                        print(f"[超时警告] 连续 {elapsed/60:.0f} 分钟无新日志输出!")

            # Print the status line.
            log_line_count = len(log_text.split("\n")) if log_text else 0
            print(
                f"[检查 #{check_count}] {datetime.now().strftime('%H:%M:%S')} | "
                f"状态={status} | 日志行={log_line_count} | "
                f"当前任务={current_task or '?'} | "
                f"最后日志={last_ts or '?'} | "
                f"新ERROR={len(new_errors)} 新WARNING={len(new_warnings)}"
            )

            # Print newly found errors and warnings.
            for e in new_errors:
                print(f"  ❌ ERROR: {e[:200]}")
            for w in new_warnings:
                print(f"  ⚠️ WARNING: {w[:200]}")

            # Stop on a terminal status.
            if status in ("success", "failed", "cancelled"):
                print("-" * 60)
                print(f"[完成] 任务状态: {status}")
                if exec_info:
                    print(f"  开始时间: {exec_info.get('started_at', '?')}")
                    print(f"  结束时间: {exec_info.get('finished_at', '?')}")
                    dur_ms = exec_info.get("duration_ms")
                    if dur_ms:
                        print(f"  时长: {dur_ms/1000:.1f}s ({dur_ms/60000:.1f}m)")
                    print(f"  退出码: {exec_info.get('exit_code', '?')}")
                break
        except requests.exceptions.RequestException as e:
            print(f"[网络错误] {e}")
            # The token may have expired, so log in again. This stays
            # best-effort (the next round retries), but a failure is reported
            # instead of being silently dropped.
            try:
                token = login()
                print("[监控] 重新登录成功")
            except Exception as login_err:
                print(f"[监控] 重新登录失败: {login_err}")

        time.sleep(interval)

    # Final summary.
    print("\n" + "=" * 60)
    print("监控汇总")
    print("=" * 60)
    print(f"总检查次数: {check_count}")
    print(f"ERROR 总数: {len(all_errors)}")
    print(f"WARNING 总数: {len(all_warnings)}")

    if all_errors:
        print("\n--- 所有 ERROR ---")
        for i, e in enumerate(all_errors, 1):
            print(f"  {i}. {e[:300]}")
    if all_warnings:
        print("\n--- 所有 WARNING ---")
        for i, w in enumerate(all_warnings, 1):
            print(f"  {i}. {w[:300]}")

    # JSON summary for consumption by follow-up tasks.
    summary = {
        "execution_id": eid,
        "final_status": status,
        "total_checks": check_count,
        "error_count": len(all_errors),
        "warning_count": len(all_warnings),
        "errors": all_errors,
        "warnings": all_warnings,
        "exit_code": exec_info.get("exit_code") if exec_info else None,
        "started_at": exec_info.get("started_at") if exec_info else None,
        "ended_at": exec_info.get("finished_at") if exec_info else None,
        "duration_ms": exec_info.get("duration_ms") if exec_info else None,
    }
    print(f"\n[JSON摘要]\n{json.dumps(summary, ensure_ascii=False, indent=2)}")
# Run the monitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()