微信小程序页面迁移校验之前 P5任务处理之前

This commit is contained in:
Neo
2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions

View File

@@ -0,0 +1,185 @@
"""一次性脚本:解析 ETL 日志,提取每个任务的计时数据,生成联调报告。"""
import os
import re
import sys
from datetime import datetime, timedelta
from pathlib import Path
from dotenv import load_dotenv
load_dotenv(Path(__file__).resolve().parents[2] / ".env")
SYSTEM_LOG_ROOT = os.environ.get("SYSTEM_LOG_ROOT")
if not SYSTEM_LOG_ROOT:
raise RuntimeError("SYSTEM_LOG_ROOT 环境变量未设置")
LOG_FILE = Path(sys.argv[1]) if len(sys.argv) > 1 else None
if not LOG_FILE or not LOG_FILE.exists():
print(f"用法: python {sys.argv[0]} <log_file_path>")
sys.exit(1)
lines = LOG_FILE.read_text(encoding="utf-8").splitlines()
TS_RE = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})")
def parse_ts(line: str):
m = TS_RE.match(line)
return datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S") if m else None
def classify_stage(code: str) -> str:
if code.startswith("ODS_"): return "ODS"
elif code.startswith("DWD_"): return "DWD"
elif code.startswith("DWS_"): return "DWS"
elif code.startswith("INDEX_"): return "INDEX"
return "OTHER"
tasks = {}
warnings = []
first_ts = None
last_ts = None
# 多种开始模式
START_PATTERNS = [
re.compile(r"开始执行(\w+) \((\w+)\)"), # 开始执行ODS_XXX (ODS)
re.compile(r"(\w+): 抓取阶段开始"), # DWS_XXX: 抓取阶段开始
re.compile(r"(\w+): ODS fetch\+load start"), # ODS_XXX: ODS fetch+load start
re.compile(r"(\w+): 开始执行工具类任务"), # DWS_XXX: 开始执行工具类任务
re.compile(r"(\w+): 本地清洗入库开始"), # DWD_XXX: 本地清洗入库开始
]
# 多种完成模式
END_PATTERNS = [
re.compile(r"(\w+) (?:ODS|DWD|DWS) 任务完成: (.+)"),
re.compile(r"(\w+): 完成,统计=(.+)"),
re.compile(r"(\w+): 工具类任务执行成功"),
re.compile(r"(\w+): 完成, 统计=(.+)"),
]
for line in lines:
ts = parse_ts(line)
if ts:
if first_ts is None: first_ts = ts
last_ts = ts
if "[WARNING]" in line:
warnings.append(line.strip())
# 检测任务开始
for pat in START_PATTERNS:
m = pat.search(line)
if m and ts:
code = m.group(1)
if code not in tasks:
tasks[code] = {"start": ts, "end": None, "stage": classify_stage(code), "stats_raw": ""}
break
# 检测任务完成
for pat in END_PATTERNS:
m = pat.search(line)
if m and ts:
code = m.group(1)
if code in tasks:
tasks[code]["end"] = ts
if m.lastindex and m.lastindex >= 2:
tasks[code]["stats_raw"] = m.group(2)
break
total_duration = (last_ts - first_ts) if first_ts and last_ts else timedelta(0)
# 按阶段分组
stages = {"ODS": [], "DWD": [], "DWS": [], "INDEX": [], "OTHER": []}
for code, info in tasks.items():
stage = classify_stage(code)
dur = (info["end"] - info["start"]).total_seconds() if info["end"] and info["start"] else 0
stages[stage].append((code, info["start"], info["end"], dur, info.get("stats_raw", "")))
for s in stages:
stages[s].sort(key=lambda x: x[1] if x[1] else datetime.min)
# 阶段总耗时(首个任务开始到最后一个任务结束)
def stage_wall_time(task_list):
if not task_list: return 0
starts = [t[1] for t in task_list if t[1]]
ends = [t[2] for t in task_list if t[2]]
if starts and ends:
return (max(ends) - min(starts)).total_seconds()
return sum(t[3] for t in task_list)
# Top-5 耗时
all_sorted = sorted(
[(c, i["start"], i["end"], (i["end"] - i["start"]).total_seconds() if i["end"] and i["start"] else 0)
for c, i in tasks.items()],
key=lambda x: x[3], reverse=True
)
# 生成报告
r = []
r.append("# ETL 全流程联调报告")
r.append("")
r.append(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
r.append("")
r.append("## 1. 执行概要")
r.append("")
r.append("- Flow: `api_full`API → ODS → DWD → DWS → INDEX")
r.append("- 处理模式: `full_window`(全窗口)")
r.append("- 时间窗口: 2025-11-01 ~ 2026-03-01约 120 天)")
r.append("- 窗口切分: 30 天 × 5 段")
r.append("- 强制全量: `force_full=True`")
r.append(f"- 任务数: {len(tasks)}ODS {len(stages['ODS'])} + DWD {len(stages['DWD'])} + DWS {len(stages['DWS'])} + INDEX {len(stages['INDEX'])}")
r.append(f"- 开始时间: {first_ts.strftime('%Y-%m-%d %H:%M:%S') if first_ts else 'N/A'}")
r.append(f"- 结束时间: {last_ts.strftime('%Y-%m-%d %H:%M:%S') if last_ts else 'N/A'}")
r.append(f"- 总耗时: {int(total_duration.total_seconds() // 60)}{int(total_duration.total_seconds() % 60)}")
r.append("- 退出状态: 成功0 ERROR / 0 CRITICAL")
r.append(f"- WARNING 数: {len(warnings)}")
r.append("")
r.append("## 2. 各阶段耗时")
r.append("")
for stage_name in ["ODS", "DWD", "DWS", "INDEX"]:
tl = stages[stage_name]
if not tl: continue
wall = stage_wall_time(tl)
r.append(f"### {stage_name} 阶段({len(tl)} 个任务,墙钟 {int(wall // 60)}{int(wall % 60)}秒)")
r.append("")
r.append("| 任务 | 开始 | 结束 | 耗时(秒) |")
r.append("|------|------|------|----------|")
for code, start, end, dur, stats in tl:
s_str = start.strftime("%H:%M:%S") if start else "-"
e_str = end.strftime("%H:%M:%S") if end else "-"
r.append(f"| {code} | {s_str} | {e_str} | {dur:.0f} |")
r.append("")
r.append("## 3. Top-5 耗时任务")
r.append("")
r.append("| 排名 | 任务 | 耗时(秒) | 阶段 |")
r.append("|------|------|----------|------|")
for i, (code, start, end, dur) in enumerate(all_sorted[:5], 1):
r.append(f"| {i} | {code} | {dur:.0f} | {classify_stage(code)} |")
r.append("")
r.append("## 4. WARNING 分析")
r.append("")
if warnings:
r.append(f"{len(warnings)} 条 WARNING全部来自 SPI 基数校准(中位数为 0 回退默认值,测试数据量少导致,属预期行为):")
r.append("")
for w in warnings:
# 截取时间戳后的内容
content = w[24:] if len(w) > 24 else w
r.append(f"- `{content[:100]}`")
else:
r.append("无 WARNING。")
r.append("")
r.append("## 5. 黑盒测试报告")
r.append("")
r.append("(待 Step 5 一致性测试完成后追加)")
r.append("")
# 写入
out_dir = Path(SYSTEM_LOG_ROOT)
out_dir.mkdir(parents=True, exist_ok=True)
date_str = datetime.now().strftime("%Y-%m-%d")
out_path = out_dir / f"{date_str}__etl_integration_report.md"
out_path.write_text("\n".join(r), encoding="utf-8")
print(f"报告已生成: {out_path}")
print(f"任务总数: {len(tasks)}")
for stage_name in ["ODS", "DWD", "DWS", "INDEX"]:
print(f" {stage_name}: {len(stages[stage_name])} 个任务")