微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

View File

@@ -0,0 +1,285 @@
"""ETL full-pipeline integration report generator.

Parses the ETL log file, extracts timing data plus errors/warnings, and
builds a combined Markdown report. The report is written to the directory
named by the SYSTEM_LOG_ROOT environment variable.
"""
import os
import re
import sys
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from dotenv import load_dotenv
# Load the root .env file, located in the directory two levels above the
# one that contains this script.
_ROOT_ENV_FILE = Path(__file__).resolve().parents[2] / ".env"
load_dotenv(_ROOT_ENV_FILE)

# Directory the report is written to; the script refuses to run without it.
SYSTEM_LOG_ROOT = os.getenv("SYSTEM_LOG_ROOT")
if not SYSTEM_LOG_ROOT:
    raise RuntimeError("SYSTEM_LOG_ROOT 环境变量未设置")

# Log file of the ETL run this report is built from (fixed path).
LOG_FILE = Path(r"C:\NeoZQYY\export\ETL-Connectors\feiqiu\LOGS\2681a85399e64c76a040163f956e1907.log")
if not LOG_FILE.exists():
    raise FileNotFoundError(f"日志文件不存在: {LOG_FILE}")
# ── Log parsing ───────────────────────────────────────────
# One log line: "<YYYY-MM-DD HH:MM:SS> [<LEVEL>] <logger>: <message>".
# Groups: timestamp text, level, message.
TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] [\w.]+: (.+)$")

# Task lifecycle messages. Group 1 is the task name in every pattern below.
TASK_START_RE = re.compile(r"^(\S+): 抓取阶段开始")
TASK_TOOL_START_RE = re.compile(r"^(\S+): 开始执行工具类任务")
TASK_DONE_RE = re.compile(r"^(\S+): 完成,统计=(.+)$")  # group 2: stats text
TASK_TOOL_DONE_RE = re.compile(r"^(\S+): 工具类任务执行成功$")

# Window-slice messages. Groups 2 and 3 are the "(index/total)" numbers.
WINDOW_START_RE = re.compile(r"^(\S+): 开始执行\((\d+)/(\d+)\),窗口\[(.+)\]$")
WINDOW_DONE_RE = re.compile(r"^(\S+): 完成\((\d+)/(\d+)\),已处理")

DWD_LOAD_RE = re.compile(r"^DWD_LOAD_FROM_ODS: (.+)$")

# Undecodable bytes are replaced rather than aborting the whole report.
lines = LOG_FILE.read_text(encoding="utf-8", errors="replace").splitlines()

# Accumulators filled while scanning the log.
errors, warnings = [], []  # each item: (timestamp text, message)
task_timings = {}  # task name -> {"start": ..., "end": ..., "stats": ...}
window_timings = defaultdict(list)  # task name -> [{"idx", "total", "start", "end"}, ...]
first_ts = last_ts = all_tasks_done_ts = None
# Single pass over the log: collect errors/warnings, the overall time span,
# and start/end timestamps per task and per window slice.
for line in lines:
    parsed = TS_RE.match(line)
    if parsed is None:
        # Lines without the timestamp/level prefix are ignored.
        continue
    ts_str, level, msg = parsed.groups()
    ts = datetime.strptime(ts_str, "%Y-%m-%d %H:%M:%S")

    # First and last timestamped lines bound the total duration.
    if first_ts is None:
        first_ts = ts
    last_ts = ts

    if level in ("ERROR", "CRITICAL"):
        errors.append((ts_str, msg))
    elif level == "WARNING":
        warnings.append((ts_str, msg))

    # Marker message that signals the whole run finished.
    if msg.strip() == "所有任务执行完成":
        all_tasks_done_ts = ts

    # Task start (fetch phase). A repeated start for a known task only moves
    # its start time; any recorded end/stats are left as they are.
    started = TASK_START_RE.match(msg)
    if started:
        tname = started.group(1)
        entry = task_timings.get(tname)
        if entry is None:
            task_timings[tname] = {"start": ts, "end": None, "stats": None}
        else:
            entry["start"] = ts

    # Tool-class task start: always replaces the whole record.
    tool_started = TASK_TOOL_START_RE.match(msg)
    if tool_started:
        task_timings[tool_started.group(1)] = {"start": ts, "end": None, "stats": None}

    # Task completion; only recorded for tasks whose start was seen.
    finished = TASK_DONE_RE.match(msg)
    if finished and finished.group(1) in task_timings:
        entry = task_timings[finished.group(1)]
        entry["end"] = ts
        entry["stats"] = finished.group(2)

    tool_finished = TASK_TOOL_DONE_RE.match(msg)
    if tool_finished and tool_finished.group(1) in task_timings:
        task_timings[tool_finished.group(1)]["end"] = ts

    # Window slices: open a slice on start ...
    slice_started = WINDOW_START_RE.match(msg)
    if slice_started:
        tname, idx = slice_started.group(1), int(slice_started.group(2))
        window_timings[tname].append(
            {"idx": idx, "total": int(slice_started.group(3)), "start": ts, "end": None}
        )

    # ... and close the earliest still-open slice with the same index on done.
    slice_done = WINDOW_DONE_RE.match(msg)
    if slice_done:
        tname, idx = slice_done.group(1), int(slice_done.group(2))
        open_slice = next(
            (w for w in window_timings[tname] if w["idx"] == idx and w["end"] is None),
            None,
        )
        if open_slice is not None:
            open_slice["end"] = ts
# DWD_LOAD_FROM_ODS special case: derive its span from every log line whose
# message mentions the task. A record is created at the first mention if none
# exists yet, and the end is moved forward on each mention, so it finishes at
# the last one.
for line in lines:
    m = TS_RE.match(line)
    if not m:
        continue
    ts_str, level, msg = m.groups()
    if "DWD_LOAD_FROM_ODS" not in msg:
        continue
    ts = datetime.strptime(ts_str, "%Y-%m-%d %H:%M:%S")
    # setdefault keeps an already-recorded start (and stats) untouched.
    dwd_entry = task_timings.setdefault(
        "DWD_LOAD_FROM_ODS", {"start": ts, "end": None, "stats": None}
    )
    dwd_entry["end"] = ts
# ── Per-layer statistics ──────────────────────────────────
def classify_layer(name: str) -> str:
    """Return the report layer a task name belongs to.

    The layer is decided purely by the (case-sensitive) name prefix:

    * ``ODS_``  -> ``"ODS"``
    * ``DWD_``  -> ``"DWD"``
    * ``DWS_``  -> ``"INDEX"`` when the name contains ``"INDEX"``,
      otherwise ``"DWS"``
    * anything else -> ``"OTHER"``

    Args:
        name: Task name as it appears in the log.

    Returns:
        One of ``"ODS"``, ``"DWD"``, ``"DWS"``, ``"INDEX"`` or ``"OTHER"``.
    """
    if name.startswith("ODS_"):
        return "ODS"
    if name.startswith("DWD_"):
        return "DWD"
    if name.startswith("DWS_"):
        # Only DWS_ tasks are split further into the INDEX group.
        return "INDEX" if "INDEX" in name else "DWS"
    return "OTHER"
# Bucket every (task name, timing record) pair under its layer.
layer_tasks = defaultdict(list)
for tname, info in task_timings.items():
    layer = classify_layer(tname)
    layer_tasks[layer].append((tname, info))
# ── Report generation ─────────────────────────────────────
# Wall-clock span between the first and last timestamped log lines
# (0 when the log contained no parsable line).
if first_ts and last_ts:
    total_duration = (last_ts - first_ts).total_seconds()
else:
    total_duration = 0
total_min = total_duration / 60

out_dir = Path(SYSTEM_LOG_ROOT)
out_dir.mkdir(parents=True, exist_ok=True)
report_path = out_dir / "20260227__etl_integration_report.md"

started_at = first_ts.strftime('%Y-%m-%d %H:%M:%S') if first_ts else 'N/A'
ended_at = last_ts.strftime('%Y-%m-%d %H:%M:%S') if last_ts else 'N/A'
# Success is reported only when the run-complete marker was seen in the log.
final_status = '✅ 成功' if all_tasks_done_ts else '❌ 未检测到完成标记'

# The report is accumulated as a list of Markdown lines and joined at the end.
rpt = [
    "# ETL 全流程联调报告",
    "",
    f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"日志文件: `{LOG_FILE}`",
    "",
    # ── Run summary ──
    "## 执行概要",
    "",
    "| 项目 | 值 |",
    "|------|-----|",
    # The next five rows are fixed values, not derived from the log.
    "| Flow | `api_full` |",
    "| 处理模式 | `full_window` |",
    "| 时间窗口 | 2025-11-01 ~ 2026-02-27 |",
    "| 窗口切分 | 30 天 |",
    "| force_full | ✅ |",
    f"| 开始时间 | {started_at} |",
    f"| 结束时间 | {ended_at} |",
    f"| 总耗时 | {total_min:.1f} 分钟 ({total_duration:.0f} 秒) |",
    f"| 任务总数 | {len(task_timings)} |",
    f"| 错误数 | {len(errors)} |",
    f"| 警告数 | {len(warnings)} |",
    f"| 最终状态 | {final_status} |",
    "",
]
# ── Performance report ──
def _short_stats(stats):
    """Condense a task's stats text into ``ins=N, del=N[, err=N]``.

    ``stats`` is the text captured after "统计=" in the task-done message
    (presumably a dict repr -- only the 'inserted', 'deleted' and 'errors'
    counters are looked for). Returns an empty string when ``stats`` is
    empty/None or none of the counters are present. The error counter is
    shown only when it is greater than zero.
    """
    if not stats:
        return ""
    parts = []
    ins_m = re.search(r"'inserted': (\d+)", stats)
    if ins_m:
        parts.append(f"ins={ins_m.group(1)}")
    del_m = re.search(r"'deleted': (\d+)", stats)
    if del_m:
        parts.append(f"del={del_m.group(1)}")
    err_m = re.search(r"'errors': (\d+)", stats)
    if err_m and int(err_m.group(1)) > 0:
        parts.append(f"err={err_m.group(1)}")
    return ", ".join(parts)


rpt.append("## 性能报告")
rpt.append("")

# One table per layer. "OTHER" is included so that tasks matching none of the
# layer prefixes are not silently dropped from the breakdown while still
# being counted in the task total; the section only appears when such tasks
# exist.
for layer in ["ODS", "DWD", "DWS", "INDEX", "OTHER"]:
    tasks = layer_tasks.get(layer, [])
    if not tasks:
        continue
    rpt.append(f"### {layer} 层({len(tasks)} 个任务)")
    rpt.append("")
    rpt.append("| 任务 | 开始 | 结束 | 耗时(秒) | 统计 |")
    rpt.append("|------|------|------|---------|------|")
    # Rows are ordered by start time; a missing start sorts first.
    for tname, info in sorted(tasks, key=lambda x: x[1]["start"] if x[1]["start"] else datetime.min):
        s = info["start"].strftime("%H:%M:%S") if info["start"] else "-"
        e = info["end"].strftime("%H:%M:%S") if info["end"] else "-"
        # Duration is left blank for tasks that never logged a completion.
        dur = ""
        if info["start"] and info["end"]:
            dur = f"{(info['end'] - info['start']).total_seconds():.0f}"
        stats_short = _short_stats(info["stats"])
        rpt.append(f"| {tname} | {s} | {e} | {dur} | {stats_short} |")
    rpt.append("")
# Top-5 longest-running tasks; only tasks with both a start and an end count.
rpt.append("### Top-5 耗时任务")
rpt.append("")
ranked = [
    (tname, (info["end"] - info["start"]).total_seconds())
    for tname, info in task_timings.items()
    if info["start"] and info["end"]
]
# Descending by duration; the sort is stable, so ties keep log order.
ranked.sort(key=lambda pair: pair[1], reverse=True)
rpt.extend([
    "| 排名 | 任务 | 耗时(秒) | 耗时(分) |",
    "|------|------|---------|---------|",
])
rpt.extend(
    f"| {rank} | {tname} | {dur:.0f} | {dur/60:.1f} |"
    for rank, (tname, dur) in enumerate(ranked[:5], 1)
)
rpt.append("")
# Window-slice timings. Only tasks that ran in more than one slice get a
# table, so the section header is gated on those same tasks -- otherwise a
# log where every task has a single slice would produce an empty section.
multi_slice_tasks = sorted(
    name for name, task_slices in window_timings.items() if len(task_slices) > 1
)
has_windows = bool(multi_slice_tasks)
if has_windows:
    rpt.append("### 窗口切片耗时(部分任务)")
    rpt.append("")
    for tname in multi_slice_tasks:
        slices = window_timings[tname]
        rpt.append(f"**{tname}** ({len(slices)} 个切片)")
        rpt.append("")
        rpt.append("| 切片 | 开始 | 结束 | 耗时(秒) |")
        rpt.append("|------|------|------|---------|")
        for w in slices:
            s = w["start"].strftime("%H:%M:%S")
            # A slice with no matching "done" line shows "-" for end and duration.
            e = w["end"].strftime("%H:%M:%S") if w["end"] else "-"
            dur = f"{(w['end'] - w['start']).total_seconds():.0f}" if w["end"] else "-"
            rpt.append(f"| {w['idx']}/{w['total']} | {s} | {e} | {dur} |")
        rpt.append("")
# ── DEBUG report ──
rpt.append("## DEBUG 报告")
rpt.append("")

# Errors (ERROR and CRITICAL log lines) are listed inline.
if not errors:
    rpt.extend(["### 错误", "", "无错误。", ""])
else:
    rpt.extend([f"### 错误({len(errors)} 条)", ""])
    rpt.extend(f"- `{ts_str}` {msg}" for ts_str, msg in errors)
    rpt.append("")

# Warnings go inside a collapsible <details> element.
if not warnings:
    rpt.extend(["### 警告", "", "无警告。", ""])
else:
    rpt.extend([
        f"### 警告({len(warnings)} 条)",
        "",
        "<details>",
        "<summary>展开查看全部警告</summary>",
        "",
    ])
    rpt.extend(f"- `{ts_str}` {msg}" for ts_str, msg in warnings)
    rpt.extend(["", "</details>", ""])
# ── Black-box test report placeholder ──
rpt.extend(["## 黑盒测试报告", "", "(待 Task 5.3 追加)", ""])

# Write the report as UTF-8, then print a short summary to the console.
report_text = "\n".join(rpt)
report_path.write_text(report_text, encoding="utf-8")

for summary_line in (
    f"报告已生成: {report_path}",
    f"总耗时: {total_min:.1f} 分钟",
    f"任务数: {len(task_timings)}",
    f"错误: {len(errors)}, 警告: {len(warnings)}",
):
    print(summary_line)