""" ETL 全流程联调报告生成脚本 从后端 API 获取执行日志,解析计时数据和错误信息,生成综合联调报告。 报告输出到 SYSTEM_LOG_ROOT 环境变量指定的目录。 """ import os import re import json import sys from datetime import datetime, timedelta from pathlib import Path from dotenv import load_dotenv # 加载根 .env load_dotenv(Path(__file__).resolve().parents[2] / ".env") SYSTEM_LOG_ROOT = os.environ.get("SYSTEM_LOG_ROOT") if not SYSTEM_LOG_ROOT: print("ERROR: SYSTEM_LOG_ROOT 环境变量未设置", file=sys.stderr) sys.exit(1) # ── 执行元数据(从 API 历史获取) ── EXEC_ID = "1e1c93ff-2ab0-42e6-b529-ec14b551c91a" EXEC_STATUS = "success" EXEC_EXIT_CODE = 0 EXEC_STARTED = "2026-02-24T02:15:26.689731+08:00" EXEC_FINISHED = "2026-02-24T02:50:39.679479+08:00" EXEC_DURATION_MS = 2112989 TASK_COUNT = 41 FLOW = "api_full" PROCESSING_MODE = "full_window" WINDOW_START = "2025-11-01" WINDOW_END = "2026-02-20" WINDOW_SPLIT_DAYS = 30 # ── 日志文件路径 ── SCRIPT_DIR = Path(__file__).resolve().parent ERROR_LOG_PATH = SCRIPT_DIR / "_tmp_error_log.txt" if not ERROR_LOG_PATH.exists(): print(f"ERROR: 日志文件不存在: {ERROR_LOG_PATH}", file=sys.stderr) sys.exit(1) log_text = ERROR_LOG_PATH.read_text(encoding="utf-8") lines = log_text.splitlines() # ── 1. 解析各任务的开始/结束时间 ── TS_RE = re.compile(r"^\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]") def parse_ts(line: str): m = TS_RE.match(line) if m: return datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S") return None # 提取每个任务的开始和结束时间 task_timings: dict[str, dict] = {} START_RE = re.compile(r"开始执行(\w+) \(ODS\)|(\w+): ODS fetch\+load start") COMPLETE_RE = re.compile(r"(\w+) ODS 任务完成:|(\w+): 完成,统计=|(\w+): 完成, 统计=|(\w+): 结果统计:") FAIL_RE = re.compile(r"任务 (\w+) 失败:") DWD_START_RE = re.compile(r"DWD_LOAD_FROM_ODS.*开始|开始运行.*DWD_LOAD_FROM_ODS") DWD_COMPLETE_RE = re.compile(r"DWD_LOAD_FROM_ODS: 完成") DWS_START_RE = re.compile(r"(DWS_\w+):.*开始|开始执行.*(DWS_\w+)") DWS_COMPLETE_RE = re.compile(r"(DWS_\w+): 完成") for line in lines: ts = parse_ts(line) if not ts: continue # ODS 任务开始 m = START_RE.search(line) if m: task = m.group(1) or m.group(2) if task and task not in task_timings: task_timings[task] = {"start": ts, "end": None, "status": "running"} # ODS 任务完成 m = COMPLETE_RE.search(line) if m: task = m.group(1) or m.group(2) or m.group(3) or m.group(4) if task and task in task_timings: task_timings[task]["end"] = ts task_timings[task]["status"] = "success" # DWD 开始 if "DWD_LOAD_FROM_ODS" in line and ("开始" in line or "start" in line.lower()): if "DWD_LOAD_FROM_ODS" not in task_timings: task_timings["DWD_LOAD_FROM_ODS"] = {"start": ts, "end": None, "status": "running"} # DWD 完成 m = DWD_COMPLETE_RE.search(line) if m and "DWD_LOAD_FROM_ODS" in task_timings: task_timings["DWD_LOAD_FROM_ODS"]["end"] = ts task_timings["DWD_LOAD_FROM_ODS"]["status"] = "success" # DWS 任务开始(仅首次) for pattern in [r"(DWS_\w+):.*(?:开始|start)", r"开始执行.*(DWS_\w+)"]: m2 = re.search(pattern, line) if m2: task = m2.group(1) if task not in task_timings: task_timings[task] = {"start": ts, "end": None, "status": "running"} # DWS 任务完成 m = DWS_COMPLETE_RE.search(line) if m: task = m.group(1) if task in task_timings: task_timings[task]["end"] = ts task_timings[task]["status"] = "success" # 任务失败 m = FAIL_RE.search(line) if m: task = m.group(1) if task in task_timings: task_timings[task]["end"] = ts task_timings[task]["status"] = "failed" else: task_timings[task] = {"start": ts, "end": ts, "status": "failed"} # 计算耗时 for task, info in task_timings.items(): if info["start"] and info["end"]: info["duration_s"] = (info["end"] - info["start"]).total_seconds() else: info["duration_s"] = 0 # ── 2. 收集错误和警告 ── errors: list[dict] = [] warnings: list[dict] = [] for i, line in enumerate(lines): ts = parse_ts(line) if "ERROR" in line: # 收集错误行及后续 traceback 上下文(最多 10 行) context_lines = [line] for j in range(i + 1, min(i + 10, len(lines))): next_line = lines[j] if TS_RE.match(next_line) and "Traceback" not in next_line: break context_lines.append(next_line) errors.append({"ts": ts, "line": line.strip(), "context": "\n".join(context_lines)}) elif "WARNING" in line: warnings.append({"ts": ts, "line": line.strip()}) # ── 3. 分类错误 ── error_categories: dict[str, list] = {} for err in errors: if "未知的任务类型" in err["line"]: cat = "任务未注册" elif "member_retention_clue" in err["context"] or "member_birthday_manual" in err["context"]: cat = "FDW 表缺失(根因)" elif "InFailedSqlTransaction" in err["context"]: cat = "事务级联失败" else: cat = "其他" error_categories.setdefault(cat, []).append(err) # ── 4. 按层分组计时 ── ods_tasks = {k: v for k, v in task_timings.items() if k.startswith("ODS_")} dwd_tasks = {k: v for k, v in task_timings.items() if k.startswith("DWD_")} dws_tasks = {k: v for k, v in task_timings.items() if k.startswith("DWS_")} # Top-5 耗时 all_with_duration = [(k, v) for k, v in task_timings.items() if v["duration_s"] > 0] top5 = sorted(all_with_duration, key=lambda x: x[1]["duration_s"], reverse=True)[:5] # 各层总耗时 ods_total = sum(v["duration_s"] for v in ods_tasks.values()) dwd_total = sum(v["duration_s"] for v in dwd_tasks.values()) dws_total = sum(v["duration_s"] for v in dws_tasks.values()) # 成功/失败统计 success_count = sum(1 for v in task_timings.values() if v["status"] == "success") failed_count = sum(1 for v in task_timings.values() if v["status"] == "failed") failed_tasks = [k for k, v in task_timings.items() if v["status"] == "failed"] # ── 5. 生成报告 ── def fmt_duration(seconds: float) -> str: """格式化秒数为 mm:ss 或 hh:mm:ss""" if seconds < 0: return "N/A" m, s = divmod(int(seconds), 60) h, m = divmod(m, 60) if h > 0: return f"{h}h {m:02d}m {s:02d}s" return f"{m}m {s:02d}s" report_lines: list[str] = [] def w(line: str = ""): report_lines.append(line) w("# ETL 全流程联调报告") w() w(f"> 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") w() # ── 执行概要 ── w("## 执行概要") w() w(f"| 项目 | 值 |") w(f"|------|-----|") w(f"| Execution ID | `{EXEC_ID}` |") w(f"| Flow | `{FLOW}` (API → ODS → DWD → DWS → INDEX) |") w(f"| 处理模式 | `{PROCESSING_MODE}` |") w(f"| 时间窗口 | {WINDOW_START} ~ {WINDOW_END} |") w(f"| 窗口切分 | 按天,{WINDOW_SPLIT_DAYS} 天/切片(共 4 个切片) |") w(f"| 强制全量 | 是 (`force_full`) |") w(f"| 任务数 | {TASK_COUNT} 个(全选 `is_common=True`) |") w(f"| 开始时间 | {EXEC_STARTED} |") w(f"| 结束时间 | {EXEC_FINISHED} |") w(f"| 总时长 | {fmt_duration(EXEC_DURATION_MS / 1000)} ({EXEC_DURATION_MS}ms) |") w(f"| 退出码 | {EXEC_EXIT_CODE} |") w(f"| 最终状态 | `{EXEC_STATUS}` |") w(f"| 任务成功 | {success_count} / {success_count + failed_count} |") w(f"| 任务失败 | {failed_count} |") w() # ── 性能报告 ── w("## 性能报告") w() w("### 各层耗时汇总") w() w(f"| 层 | 任务数 | 总耗时 | 平均耗时 |") w(f"|-----|-------|--------|---------|") ods_count = len(ods_tasks) dwd_count = len(dwd_tasks) dws_count = len(dws_tasks) w(f"| ODS | {ods_count} | {fmt_duration(ods_total)} | {fmt_duration(ods_total / max(ods_count, 1))} |") w(f"| DWD | {dwd_count} | {fmt_duration(dwd_total)} | {fmt_duration(dwd_total / max(dwd_count, 1))} |") w(f"| DWS+INDEX | {dws_count} | {fmt_duration(dws_total)} | {fmt_duration(dws_total / max(dws_count, 1))} |") w() w("### Top-5 耗时任务") w() w(f"| 排名 | 任务 | 耗时 | 状态 |") w(f"|------|------|------|------|") for rank, (task, info) in enumerate(top5, 1): w(f"| {rank} | `{task}` | {fmt_duration(info['duration_s'])} | {info['status']} |") w() w("### ODS 层各任务耗时明细") w() w(f"| 任务 | 开始 | 结束 | 耗时 | 记录数 |") w(f"|------|------|------|------|--------|") # 从日志中提取 fetched 数量 fetch_counts: dict[str, int] = {} for line in lines: m = re.search(r"(\w+) ODS 任务完成: \{'fetched': (\d+)", line) if m: fetch_counts[m.group(1)] = int(m.group(2)) for task in sorted(ods_tasks.keys()): info = ods_tasks[task] start_str = info["start"].strftime("%H:%M:%S") if info["start"] else "?" end_str = info["end"].strftime("%H:%M:%S") if info["end"] else "?" fetched = fetch_counts.get(task, "?") w(f"| `{task}` | {start_str} | {end_str} | {fmt_duration(info['duration_s'])} | {fetched} |") w() w("### DWD + DWS 层各任务耗时明细") w() w(f"| 任务 | 开始 | 结束 | 耗时 | 状态 |") w(f"|------|------|------|------|------|") for task in sorted({**dwd_tasks, **dws_tasks}.keys()): info = task_timings[task] start_str = info["start"].strftime("%H:%M:%S") if info["start"] else "?" end_str = info["end"].strftime("%H:%M:%S") if info["end"] else "?" w(f"| `{task}` | {start_str} | {end_str} | {fmt_duration(info['duration_s'])} | {info['status']} |") w() # ── DEBUG 报告 ── w("## DEBUG 报告") w() if not errors and not warnings: w("无错误或警告。") else: w(f"共发现 **{len(errors)}** 个 ERROR,**{len(warnings)}** 个 WARNING。") w() w("### 错误分类汇总") w() w(f"| 类别 | 数量 | 说明 |") w(f"|------|------|------|") for cat, errs in error_categories.items(): if cat == "FDW 表缺失(根因)": desc = "`fdw_app.member_birthday_manual` 关系不存在" elif cat == "事务级联失败": desc = "根因错误导致事务终止,后续 DWS 任务全部 `InFailedSqlTransaction`" else: desc = "未分类" w(f"| {cat} | {len(errs)} | {desc} |") w() w("### 错误详情") w() w("#### 错误 1:FDW 表缺失导致 DWS 级联失败(根因)") w() w("- 时间: 02:50:36") w("- 根因: `UndefinedTable: 关系 \"fdw_app.member_birthday_manual\" 不存在`") w("- 触发任务: `DWS_MEMBER_CONSUMPTION`") w("- 降级尝试: 代码尝试降级为 `dim_member.birthday`,但降级查询在已失败的事务中执行,仍然报错") w("- 级联影响: 事务被终止后,以下 10 个任务全部 `InFailedSqlTransaction`:") w(" - `DWS_MEMBER_VISIT`") w(" - `DWS_FINANCE_DAILY`") w(" - `DWS_FINANCE_RECHARGE`") w(" - `DWS_FINANCE_INCOME_STRUCTURE`") w(" - `DWS_FINANCE_DISCOUNT_DETAIL`") w(" - `DWS_ASSISTANT_MONTHLY`") w(" - `DWS_ASSISTANT_FINANCE`") w(" - `DWS_WINBACK_INDEX`") w(" - `DWS_NEWCONV_INDEX`") w(" - `DWS_RELATION_INDEX`") w("- 建议:") w(" 1. 在 `zqyy_app` 数据库中创建 `member_birthday_manual` 表(或对应的 FDW 映射)") w(" 2. 或修改 `DWS_MEMBER_CONSUMPTION` 的降级逻辑,在 FDW 失败时先 ROLLBACK 再重试降级查询") w(" 3. 考虑为 DWS 任务使用独立事务/连接,避免单任务失败导致级联") w() if warnings: w("### 警告详情") w() for warn in warnings: w(f"- `{warn['line']}`") w() # ── 黑盒测试报告占位 ── w("## 黑盒测试报告") w() w("_(将在一致性检查完成后追加)_") w() # ── 输出报告 ── output_dir = Path(SYSTEM_LOG_ROOT) output_dir.mkdir(parents=True, exist_ok=True) date_str = datetime.now().strftime("%Y-%m-%d") output_path = output_dir / f"{date_str}__etl_integration_report.md" report_content = "\n".join(report_lines) output_path.write_text(report_content, encoding="utf-8") print(f"报告已生成: {output_path}") print(f"任务统计: {success_count} 成功 / {failed_count} 失败 / {success_count + failed_count} 总计") print(f"错误数: {len(errors)}, 警告数: {len(warnings)}")