微信小程序页面迁移校验之前 P5任务处理之前
This commit is contained in:
185
scripts/ops/_parse_etl_log.py
Normal file
185
scripts/ops/_parse_etl_log.py
Normal file
@@ -0,0 +1,185 @@
|
||||
"""一次性脚本:解析 ETL 日志,提取每个任务的计时数据,生成联调报告。"""
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from the `.env` file found at parents[2] of this
# script's resolved path.
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# Output directory for the generated report; the script refuses to run without it.
SYSTEM_LOG_ROOT = os.getenv("SYSTEM_LOG_ROOT")
if not SYSTEM_LOG_ROOT:
    raise RuntimeError("SYSTEM_LOG_ROOT 环境变量未设置")

# The log file to parse is taken from the first command-line argument.
LOG_FILE = None
if len(sys.argv) > 1:
    LOG_FILE = Path(sys.argv[1])

# Print the usage line and exit with status 1 when no existing path was given.
if LOG_FILE is None or not LOG_FILE.exists():
    print(f"用法: python {sys.argv[0]} <log_file_path>")
    sys.exit(1)

# Entire log content, split into lines without their line terminators.
lines = LOG_FILE.read_text(encoding="utf-8").splitlines()
|
||||
|
||||
# Matches a "YYYY-MM-DD HH:MM:SS" timestamp at the very start of a line.
TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")


def parse_ts(line: str):
    """Extract the leading timestamp of a log line.

    Args:
        line: One raw log line.

    Returns:
        A naive ``datetime`` parsed from the ``YYYY-MM-DD HH:MM:SS`` prefix,
        or ``None`` when the line does not start with such a prefix or the
        prefix is not a real calendar date/time (for example month 13).
    """
    m = TS_RE.match(line)
    if m is None:
        return None
    try:
        return datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # The regex only validates the digit layout; out-of-range fields make
        # strptime raise. Treat such a line like one without a timestamp
        # instead of aborting the whole parse.
        return None
|
||||
|
||||
def classify_stage(code: str) -> str:
    """Map a task code to its pipeline stage by prefix.

    Args:
        code: Task code such as ``"ODS_XXX"``.

    Returns:
        ``"ODS"``, ``"DWD"``, ``"DWS"`` or ``"INDEX"`` when *code* starts with
        that name followed by an underscore; ``"OTHER"`` otherwise.
    """
    for stage in ("ODS", "DWD", "DWS", "INDEX"):
        if code.startswith(stage + "_"):
            return stage
    return "OTHER"
|
||||
|
||||
# Per-task records keyed by task code, filled in while scanning the log.
tasks = {}
# Stripped text of every log line that contains "[WARNING]".
warnings = []
# First and last timestamps seen anywhere in the log.
first_ts = last_ts = None

# Patterns that mark a task start; group 1 is the task code in every pattern.
START_PATTERNS = [
    re.compile(source)
    for source in (
        r"开始执行(\w+) \((\w+)\)",        # e.g. "开始执行ODS_XXX (ODS)"
        r"(\w+): 抓取阶段开始",             # e.g. "DWS_XXX: 抓取阶段开始"
        r"(\w+): ODS fetch\+load start",    # e.g. "ODS_XXX: ODS fetch+load start"
        r"(\w+): 开始执行工具类任务",       # e.g. "DWS_XXX: 开始执行工具类任务"
        r"(\w+): 本地清洗入库开始",         # e.g. "DWD_XXX: 本地清洗入库开始"
    )
]

# Patterns that mark a task completion; group 1 is the task code and, where
# present, group 2 is the raw statistics text.
END_PATTERNS = [
    re.compile(source)
    for source in (
        r"(\w+) (?:ODS|DWD|DWS) 任务完成: (.+)",
        r"(\w+): 完成,统计=(.+)",
        r"(\w+): 工具类任务执行成功",
        r"(\w+): 完成, 统计=(.+)",
    )
]
|
||||
|
||||
def _first_match(patterns, text):
    """Return the first successful ``search`` of *text* among *patterns*, else None."""
    for candidate in patterns:
        hit = candidate.search(text)
        if hit:
            return hit
    return None


# Single pass over the log: track the overall time span, collect warnings,
# and record the start/end timestamps of every task.
for line in lines:
    ts = parse_ts(line)
    if ts:
        last_ts = ts
        if first_ts is None:
            first_ts = ts

    if "[WARNING]" in line:
        warnings.append(line.strip())

    if not ts:
        # Start/end markers are only honoured on lines that carry a timestamp.
        continue

    # Task start: only the first start marker seen for a task code is kept.
    started = _first_match(START_PATTERNS, line)
    if started:
        code = started.group(1)
        if code not in tasks:
            tasks[code] = {"start": ts, "end": None, "stage": classify_stage(code), "stats_raw": ""}

    # Task end: each end marker overwrites the previous one, so the last wins.
    # Markers for task codes that never started are ignored.
    finished = _first_match(END_PATTERNS, line)
    if finished:
        code = finished.group(1)
        if code in tasks:
            record = tasks[code]
            record["end"] = ts
            # Only patterns with a second group carry the statistics text.
            if finished.lastindex and finished.lastindex >= 2:
                record["stats_raw"] = finished.group(2)
|
||||
|
||||
# Overall span between the first and last timestamped log lines.
if first_ts and last_ts:
    total_duration = last_ts - first_ts
else:
    total_duration = timedelta(0)

# Bucket tasks by stage as (code, start, end, duration_seconds, stats_raw).
stages = {name: [] for name in ("ODS", "DWD", "DWS", "INDEX", "OTHER")}
for code, info in tasks.items():
    begun, ended = info["start"], info["end"]
    if ended and begun:
        dur = (ended - begun).total_seconds()
    else:
        # A task without a recorded end gets a duration of 0.
        dur = 0
    stage = classify_stage(code)
    stages[stage].append((code, begun, ended, dur, info.get("stats_raw", "")))

# Order every bucket by start time; a missing start sorts first.
for bucket in stages.values():
    bucket.sort(key=lambda row: row[1] or datetime.min)
|
||||
|
||||
def stage_wall_time(task_list):
    """Return the wall-clock seconds spanned by one stage.

    Args:
        task_list: Sequence of task tuples where index 1 is the start
            timestamp, index 2 the end timestamp and index 3 the duration
            in seconds.

    Returns:
        ``0`` for an empty list. Otherwise the seconds between the earliest
        start and the latest end; when no start or no end is available, the
        sum of the individual durations instead.
    """
    if not task_list:
        return 0

    starts, ends = [], []
    for task in task_list:
        if task[1]:
            starts.append(task[1])
        if task[2]:
            ends.append(task[2])

    if not (starts and ends):
        return sum(task[3] for task in task_list)
    return (max(ends) - min(starts)).total_seconds()
|
||||
|
||||
def _task_seconds(info):
    """Seconds between a task's start and end, or 0 when either is missing."""
    if info["end"] and info["start"]:
        return (info["end"] - info["start"]).total_seconds()
    return 0


# Every task as (code, start, end, duration_seconds), longest first; the
# report takes the first five entries for its Top-5 table.
all_sorted = sorted(
    (
        (task_code, record["start"], record["end"], _task_seconds(record))
        for task_code, record in tasks.items()
    ),
    key=lambda row: row[3],
    reverse=True,
)
|
||||
|
||||
# Build the Markdown report; `r` holds one entry per output line.
r = []

# Derive the exit status from the log instead of always claiming success.
# NOTE(review): assumes level tags are written as "[ERROR]" / "[CRITICAL]",
# mirroring the "[WARNING]" tag used for warnings — confirm against the logger.
error_count = sum("[ERROR]" in log_line for log_line in lines)
critical_count = sum("[CRITICAL]" in log_line for log_line in lines)
if error_count == 0 and critical_count == 0:
    exit_status = "- 退出状态: 成功(0 ERROR / 0 CRITICAL)"
else:
    exit_status = f"- 退出状态: 存在错误({error_count} ERROR / {critical_count} CRITICAL)"

# Per-stage task counts. OTHER is appended only when non-empty so that the
# breakdown always adds up to the total task count.
stage_breakdown = " + ".join(
    f"{name} {len(stages[name])}" for name in ("ODS", "DWD", "DWS", "INDEX")
)
if stages["OTHER"]:
    stage_breakdown += f" + OTHER {len(stages['OTHER'])}"

r.append("# ETL 全流程联调报告")
r.append("")
r.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
r.append("")
r.append("## 1. 执行概要")
r.append("")
# NOTE: the flow, mode, window and force-full lines below are fixed text
# describing one specific run; they are not derived from the log.
r.append("- Flow: `api_full`(API → ODS → DWD → DWS → INDEX)")
r.append("- 处理模式: `full_window`(全窗口)")
r.append("- 时间窗口: 2025-11-01 ~ 2026-03-01(约 120 天)")
r.append("- 窗口切分: 30 天 × 5 段")
r.append("- 强制全量: `force_full=True`")
r.append(f"- 任务数: {len(tasks)} 个({stage_breakdown})")
r.append(f"- 开始时间: {first_ts.strftime('%Y-%m-%d %H:%M:%S') if first_ts else 'N/A'}")
r.append(f"- 结束时间: {last_ts.strftime('%Y-%m-%d %H:%M:%S') if last_ts else 'N/A'}")
r.append(f"- 总耗时: {int(total_duration.total_seconds() // 60)} 分 {int(total_duration.total_seconds() % 60)} 秒")
r.append(exit_status)
r.append(f"- WARNING 数: {len(warnings)}")
r.append("")
|
||||
r.append("## 2. 各阶段耗时")
r.append("")

# One table per non-empty stage. OTHER is included so that tasks with an
# unrecognised prefix, which are counted in the total and can appear in the
# Top-5 table, are not silently missing here.
for stage_name in ["ODS", "DWD", "DWS", "INDEX", "OTHER"]:
    tl = stages[stage_name]
    if not tl:
        continue
    wall = stage_wall_time(tl)
    r.append(f"### {stage_name} 阶段({len(tl)} 个任务,墙钟 {int(wall // 60)}分{int(wall % 60)}秒)")
    r.append("")
    r.append("| 任务 | 开始 | 结束 | 耗时(秒) |")
    r.append("|------|------|------|----------|")
    for code, start, end, dur, stats in tl:
        # Missing timestamps are rendered as "-".
        s_str = start.strftime("%H:%M:%S") if start else "-"
        e_str = end.strftime("%H:%M:%S") if end else "-"
        r.append(f"| {code} | {s_str} | {e_str} | {dur:.0f} |")
    r.append("")
|
||||
|
||||
# Section 3: the five longest-running tasks, taken from the head of all_sorted.
r.extend([
    "## 3. Top-5 耗时任务",
    "",
    "| 排名 | 任务 | 耗时(秒) | 阶段 |",
    "|------|------|----------|------|",
])
for rank, row in enumerate(all_sorted[:5], start=1):
    task_code, seconds = row[0], row[3]
    r.append(f"| {rank} | {task_code} | {seconds:.0f} | {classify_stage(task_code)} |")
r.append("")
|
||||
|
||||
# Section 4: list every collected warning, or state that there were none.
r.extend(["## 4. WARNING 分析", ""])
if not warnings:
    r.append("无 WARNING。")
else:
    r.append(f"共 {len(warnings)} 条 WARNING,全部来自 SPI 基数校准(中位数为 0 回退默认值,测试数据量少导致,属预期行为):")
    r.append("")
    for entry in warnings:
        # Drop the first 24 characters (presumably the timestamp prefix) when
        # the line is longer than that, then cap the text at 100 characters.
        body = entry if len(entry) <= 24 else entry[24:]
        r.append(f"- `{body[:100]}`")
r.append("")
|
||||
|
||||
# Section 5: placeholder heading and note, followed by a blank line.
r.extend([
    "## 5. 黑盒测试报告",
    "",
    "(待 Step 5 一致性测试完成后追加)",
    "",
])
|
||||
|
||||
# Write the report into SYSTEM_LOG_ROOT (created if missing) under a file
# name prefixed with today's date.
out_dir = Path(SYSTEM_LOG_ROOT)
out_dir.mkdir(parents=True, exist_ok=True)
date_str = f"{datetime.now():%Y-%m-%d}"
out_path = out_dir / (date_str + "__etl_integration_report.md")
out_path.write_text("\n".join(r), encoding="utf-8")

# Console summary: report location plus task counts overall and per stage.
print(f"报告已生成: {out_path}")
print(f"任务总数: {len(tasks)}")
for stage_name in ("ODS", "DWD", "DWS", "INDEX"):
    stage_count = len(stages[stage_name])
    print(f" {stage_name}: {stage_count} 个任务")
|
||||
Reference in New Issue
Block a user