Commit all of these changes before preparing the environment.

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions


@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
"""修复数据库序列:将序列值同步到表中的最大主键值。
根因:序列被重置到 1但表中已有数据导致 INSERT 时主键冲突。
"""
import sys
from pathlib import Path
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
sys.path.insert(0, str(_FEIQIU_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
config = AppConfig.load()
db = DatabaseConnection(
dsn=config["db"]["dsn"],
connect_timeout=config["db"].get("connect_timeout_sec"),
)
# Sequences to fix: (sequence name, table name, primary key column)
SEQUENCES_TO_FIX = [
("meta.etl_run_run_id_seq", "meta.etl_run", "run_id"),
("dws.dws_index_percentile_history_history_id_seq", "dws.dws_index_percentile_history", "history_id"),
]
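# Note on setval() semantics (the PostgreSQL behaviour this fix relies on):
#   SELECT setval('s', 42);  -- last_value = 42, is_called = true
#   SELECT nextval('s');     -- returns 43
# so passing max_val below is enough to make the next generated key max_val + 1.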
for seq_name, table_name, pk_col in SEQUENCES_TO_FIX:
try:
# Current max primary key value in the table
rows = db.query(f"SELECT COALESCE(max({pk_col}), 0) as max_val FROM {table_name}")
max_val = rows[0]["max_val"]
# Current sequence value
rows2 = db.query(f"SELECT last_value, is_called FROM {seq_name}")
cur_val = rows2[0]["last_value"]
print(f"{seq_name}:")
print(f"{table_name} max {pk_col} = {max_val}")
print(f" sequence value = {cur_val}")
if max_val > cur_val:
# Fix: advance the sequence to max_val (the next nextval() then yields max_val + 1)
db.query(f"SELECT setval('{seq_name}', {max_val})")
db.commit()
# Verify
rows3 = db.query(f"SELECT last_value, is_called FROM {seq_name}")
print(f" ✓ fixed: new sequence value = {rows3[0]['last_value']}")
else:
print(" ✓ sequence is consistent, no fix needed")
print()
except Exception as e:
print(f" ✗ fix failed: {e}")
db.rollback()
db.close()
print("完成。")


@@ -0,0 +1,878 @@
# -*- coding: utf-8 -*-
"""ETL 架构分析脚本。
通过静态分析AST 解析、import 扫描、文件统计)评估 ETL 代码结构,
生成架构优化报告Markdown
分析维度:
1. 模块依赖关系 — 扫描 import构建依赖图识别循环依赖
2. 文件大小分析 — 统计行数,识别过大文件(>500 行)
3. 函数复杂度 — AST 分析圈复杂度(分支/嵌套深度)
4. 重复代码检测 — 比较函数签名和结构相似度
5. 耦合度评估 — 模块间导入关系密度
6. 任务分类分析 — 从 TaskRegistry 读取元数据,评估分类合理性
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.analyze_architecture
"""
from __future__ import annotations
import ast
import argparse
import logging
import os
import sys
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Iterator
# ── Ensure the project root is on sys.path ──
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
sys.path.insert(0, str(_FEIQIU_ROOT))
# ── Analysis scope: core ETL modules ──
_CORE_MODULES = [
"api", "cli", "config", "database", "loaders", "models",
"orchestration", "quality", "scd", "tasks", "utils",
]
# ═══════════════════════════════════════════════════════════════
# Data structures
# ═══════════════════════════════════════════════════════════════
@dataclass
class FileInfo:
"""单个 .py 文件的统计信息"""
path: Path
rel_path: str
lines: int = 0
code_lines: int = 0 # 非空非注释行
blank_lines: int = 0
comment_lines: int = 0
module: str = "" # 所属模块api/cli/...
@dataclass
class FunctionInfo:
"""函数/方法的分析信息"""
name: str
file: str
line: int
complexity: int = 1 # 圈复杂度
max_nesting: int = 0 # 最大嵌套深度
param_count: int = 0
lines: int = 0 # 函数体行数
is_method: bool = False
class_name: str = ""
@dataclass
class ImportEdge:
"""模块间的导入关系"""
source_module: str # 导入方
target_module: str # 被导入方
source_file: str
import_name: str # 具体导入的名称
@dataclass
class ArchitectureReport:
"""架构分析报告的完整数据"""
generated_at: datetime = field(default_factory=datetime.now)
# 文件统计
files: list[FileInfo] = field(default_factory=list)
# 函数分析
functions: list[FunctionInfo] = field(default_factory=list)
# 依赖关系
import_edges: list[ImportEdge] = field(default_factory=list)
circular_deps: list[tuple[str, str]] = field(default_factory=list)
# 任务分类
task_classification: dict = field(default_factory=dict)
# 重复代码
similar_functions: list[tuple[str, str, float]] = field(default_factory=list)
# ═══════════════════════════════════════════════════════════════
# Logging
# ═══════════════════════════════════════════════════════════════
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("analyze_architecture")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
# ═══════════════════════════════════════════════════════════════
# 1. File scanning and line counting
# ═══════════════════════════════════════════════════════════════
def _iter_py_files(root: Path) -> Iterator[Path]:
"""递归遍历核心模块下的 .py 文件,跳过 __pycache__ / .hypothesis 等。"""
skip_dirs = {"__pycache__", ".hypothesis", ".pytest_cache", "export", "Asia"}
for dirpath, dirnames, filenames in os.walk(root):
dirnames[:] = [d for d in dirnames if d not in skip_dirs]
for fn in filenames:
if fn.endswith(".py"):
yield Path(dirpath) / fn
def _classify_module(rel_path: str) -> str:
"""从相对路径提取所属模块名。"""
parts = Path(rel_path).parts
if parts:
top = parts[0]
if top in _CORE_MODULES:
return top
if top == "scripts":
return "scripts"
if top == "tests":
return "tests"
return "root"
def _count_lines(filepath: Path) -> FileInfo:
"""统计单个文件的行数分布。"""
info = FileInfo(path=filepath, rel_path="")
try:
text = filepath.read_text(encoding="utf-8", errors="replace")
except Exception:
return info
raw_lines = text.splitlines()
info.lines = len(raw_lines)
for line in raw_lines:
stripped = line.strip()
if not stripped:
info.blank_lines += 1
elif stripped.startswith("#"):
info.comment_lines += 1
else:
info.code_lines += 1
return info
def scan_files(root: Path, logger: logging.Logger) -> list[FileInfo]:
"""扫描所有 .py 文件并统计行数。"""
results: list[FileInfo] = []
for fp in _iter_py_files(root):
info = _count_lines(fp)
info.path = fp
info.rel_path = str(fp.relative_to(root)).replace("\\", "/")
info.module = _classify_module(info.rel_path)
results.append(info)
logger.info("扫描完成:共 %d 个 .py 文件", len(results))
return results
# ═══════════════════════════════════════════════════════════════
# 2. AST analysis: function complexity
# ═══════════════════════════════════════════════════════════════
# AST node types that increase cyclomatic complexity
_COMPLEXITY_NODES = (
ast.If, ast.For, ast.While, ast.ExceptHandler,
ast.With, ast.Assert, ast.BoolOp,
)
# comprehension nodes — only their if clauses count
_COMP_NODES = (ast.ListComp, ast.SetComp, ast.DictComp, ast.GeneratorExp)
def _calc_complexity(node: ast.AST) -> int:
"""计算函数体的圈复杂度McCabe"""
complexity = 1
for child in ast.walk(node):
if isinstance(child, _COMPLEXITY_NODES):
complexity += 1
# each extra and/or operand in a BoolOp adds 1
if isinstance(child, ast.BoolOp):
complexity += len(child.values) - 2 if len(child.values) > 2 else 0
elif isinstance(child, _COMP_NODES):
for gen in child.generators:
complexity += len(gen.ifs)
return complexity
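# Worked example (illustrative): for
#   def f(x):
#       if x and x > 1:
#           return 1
#       return 0
# _calc_complexity returns 1 (base) + 1 (If) + 1 (BoolOp, two operands so
# no extra) = 3.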
def _calc_max_nesting(node: ast.AST, depth: int = 0) -> int:
"""计算最大嵌套深度。"""
nesting_types = (ast.If, ast.For, ast.While, ast.With, ast.Try, ast.ExceptHandler)
max_depth = depth
for child in ast.iter_child_nodes(node):
if isinstance(child, nesting_types):
child_depth = _calc_max_nesting(child, depth + 1)
max_depth = max(max_depth, child_depth)
else:
child_depth = _calc_max_nesting(child, depth)
max_depth = max(max_depth, child_depth)
return max_depth
def _func_body_lines(node: ast.FunctionDef | ast.AsyncFunctionDef) -> int:
"""计算函数体行数。"""
if not node.body:
return 0
first_line = node.body[0].lineno
last_line = node.body[-1].end_lineno or node.body[-1].lineno
return last_line - first_line + 1
def _walk_with_parent(tree: ast.AST):
"""遍历 AST 并记录每个节点的父节点(避免 O(n²) 嵌套 walk"""
# 先给所有节点标记 parent
for node in ast.walk(tree):
for child in ast.iter_child_nodes(node):
child._parent = node # type: ignore[attr-defined]
def analyze_functions(files: list[FileInfo], logger: logging.Logger) -> list[FunctionInfo]:
"""对所有文件做 AST 分析,提取函数/方法信息。"""
results: list[FunctionInfo] = []
for fi in files:
try:
source = fi.path.read_text(encoding="utf-8", errors="replace")
tree = ast.parse(source, filename=fi.rel_path)
except (SyntaxError, UnicodeDecodeError):
continue
_walk_with_parent(tree)
for node in ast.walk(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
# use the _parent attribute to decide whether this is a method
parent = getattr(node, "_parent", None)
class_name = ""
is_method = False
if isinstance(parent, ast.ClassDef):
class_name = parent.name
is_method = True
param_count = len(node.args.args)
if is_method and param_count > 0:
param_count -= 1 # drop self/cls
info = FunctionInfo(
name=node.name,
file=fi.rel_path,
line=node.lineno,
complexity=_calc_complexity(node),
max_nesting=_calc_max_nesting(node),
param_count=param_count,
lines=_func_body_lines(node),
is_method=is_method,
class_name=class_name,
)
results.append(info)
logger.info("函数分析完成:共 %d 个函数/方法", len(results))
return results
# ═══════════════════════════════════════════════════════════════
# 3. Dependency analysis
# ═══════════════════════════════════════════════════════════════
def _extract_imports(filepath: Path, rel_path: str) -> list[ImportEdge]:
"""从单个文件提取 import 语句,映射到模块级别。"""
edges: list[ImportEdge] = []
try:
source = filepath.read_text(encoding="utf-8", errors="replace")
tree = ast.parse(source, filename=rel_path)
except (SyntaxError, UnicodeDecodeError):
return edges
source_module = _classify_module(rel_path)
for node in ast.walk(tree):
if isinstance(node, ast.Import):
for alias in node.names:
target = _resolve_import_module(alias.name)
if target and target != source_module:
edges.append(ImportEdge(
source_module=source_module,
target_module=target,
source_file=rel_path,
import_name=alias.name,
))
elif isinstance(node, ast.ImportFrom):
if node.module:
target = _resolve_import_module(node.module)
if target and target != source_module:
names = ", ".join(a.name for a in (node.names or []))
edges.append(ImportEdge(
source_module=source_module,
target_module=target,
source_file=rel_path,
import_name=f"{node.module}.{{{names}}}",
))
return edges
def _resolve_import_module(import_path: str) -> str | None:
"""将 import 路径映射到核心模块名。"""
parts = import_path.split(".")
top = parts[0]
if top in _CORE_MODULES:
return top
return None
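# Examples: _resolve_import_module("tasks.dwd.dwd_load_task") -> "tasks";
# a third-party path such as "psycopg2" is outside _CORE_MODULES -> None.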
def analyze_dependencies(files: list[FileInfo], logger: logging.Logger) -> tuple[list[ImportEdge], list[tuple[str, str]]]:
"""分析模块间依赖关系,检测循环依赖。"""
all_edges: list[ImportEdge] = []
for fi in files:
all_edges.extend(_extract_imports(fi.path, fi.rel_path))
# build a directed graph and look for cycles
graph: dict[str, set[str]] = defaultdict(set)
for edge in all_edges:
graph[edge.source_module].add(edge.target_module)
circular: list[tuple[str, str]] = []
for src, targets in graph.items():
for tgt in targets:
if src in graph.get(tgt, set()):
pair = tuple(sorted([src, tgt]))
if pair not in circular:
circular.append(pair)
logger.info("依赖分析完成:%d 条导入边,%d 对循环依赖", len(all_edges), len(circular))
return all_edges, circular
# ═══════════════════════════════════════════════════════════════
# 4. Duplicate-code detection (function-signature similarity)
# ═══════════════════════════════════════════════════════════════
def _func_signature_key(fn: FunctionInfo) -> str:
"""生成函数签名指纹:参数数量 + 行数范围 + 复杂度。"""
line_bucket = fn.lines // 10 * 10 # 按 10 行分桶
return f"p{fn.param_count}_l{line_bucket}_c{fn.complexity}"
def detect_similar_functions(
functions: list[FunctionInfo],
logger: logging.Logger,
min_lines: int = 15,
) -> list[tuple[str, str, float]]:
"""检测签名相似的函数对(可能是重复代码)。
只比较行数 >= min_lines 的函数,避免噪声。
"""
# bucket by signature
buckets: dict[str, list[FunctionInfo]] = defaultdict(list)
for fn in functions:
if fn.lines >= min_lines:
key = _func_signature_key(fn)
buckets[key].append(fn)
similar: list[tuple[str, str, float]] = []
for key, group in buckets.items():
if len(group) < 2:
continue
# pairwise comparison within a bucket
for i in range(len(group)):
for j in range(i + 1, len(group)):
a, b = group[i], group[j]
# skip overloads/variants within the same file
if a.file == b.file:
continue
# naive similarity: the smaller the line-count difference, the more similar
line_ratio = 1 - abs(a.lines - b.lines) / max(a.lines, b.lines)
if line_ratio >= 0.7:
label_a = f"{a.file}:{a.class_name}.{a.name}" if a.class_name else f"{a.file}:{a.name}"
label_b = f"{b.file}:{b.class_name}.{b.name}" if b.class_name else f"{b.file}:{b.name}"
similar.append((label_a, label_b, round(line_ratio, 2)))
logger.info("重复检测完成:%d 对相似函数", len(similar))
return similar
# ═══════════════════════════════════════════════════════════════
# 5. Task classification analysis
# ═══════════════════════════════════════════════════════════════
def analyze_task_classification(logger: logging.Logger) -> dict:
"""从 TaskRegistry 读取 52 个任务的元数据,分析分类合理性。"""
try:
from orchestration.task_registry import default_registry, TaskMeta
except ImportError:
logger.warning("无法导入 TaskRegistry跳过任务分类分析")
return {}
all_codes = default_registry.get_all_task_codes()
by_layer: dict[str, list[str]] = defaultdict(list)
by_type: dict[str, list[str]] = defaultdict(list)
anomalies: list[str] = []
for code in all_codes:
meta: TaskMeta | None = default_registry.get_metadata(code)
if not meta:
continue
layer = meta.layer or "NONE"
by_layer[layer].append(code)
by_type[meta.task_type].append(code)
# flag naming that disagrees with the classification
if code.startswith("DWS_") and layer not in ("DWS", "INDEX"):
anomalies.append(f"{code}: prefix DWS_ but classified as {layer}")
if code.startswith("ODS_") and layer != "ODS":
anomalies.append(f"{code}: prefix ODS_ but classified as {layer}")
if code.startswith("DWD_") and layer != "DWD":
anomalies.append(f"{code}: prefix DWD_ but classified as {layer}")
# flag INDEX-layer task naming
if layer == "INDEX" and not code.startswith("DWS_"):
anomalies.append(f"{code}: INDEX layer but not prefixed DWS_, potentially confusing")
# INDEX-layer tasks all carrying a DWS_ prefix
index_tasks = by_layer.get("INDEX", [])
if index_tasks and all(c.startswith("DWS_") for c in index_tasks):
anomalies.append(
f"All {len(index_tasks)} INDEX-layer tasks start with DWS_; "
"consider an IDX_ prefix to distinguish them from DWS aggregation tasks"
)
result = {
"total": len(all_codes),
"by_layer": {k: {"count": len(v), "tasks": sorted(v)} for k, v in sorted(by_layer.items())},
"by_type": {k: {"count": len(v), "tasks": sorted(v)} for k, v in sorted(by_type.items())},
"anomalies": anomalies,
}
logger.info("任务分类分析完成:共 %d 个任务,%d 个异常", len(all_codes), len(anomalies))
return result
# ═══════════════════════════════════════════════════════════════
# 6. Coupling evaluation
# ═══════════════════════════════════════════════════════════════
def evaluate_coupling(
edges: list[ImportEdge],
files: list[FileInfo],
) -> dict:
"""评估模块间耦合度。
指标:
- 传入耦合Ca有多少模块依赖本模块
- 传出耦合Ce本模块依赖多少其他模块
- 不稳定度 I = Ce / (Ca + Ce),越接近 1 越不稳定
"""
# core modules only
modules = set(m for m in _CORE_MODULES if any(f.module == m for f in files))
ca: Counter = Counter() # afferent
ce: Counter = Counter() # efferent
# dedupe: count each source_module → target_module pair once
seen = set()
for edge in edges:
pair = (edge.source_module, edge.target_module)
if pair in seen:
continue
seen.add(pair)
if edge.source_module in modules:
ce[edge.source_module] += 1
if edge.target_module in modules:
ca[edge.target_module] += 1
coupling: dict[str, dict] = {}
for m in sorted(modules):
ca_val = ca.get(m, 0)
ce_val = ce.get(m, 0)
total = ca_val + ce_val
instability = round(ce_val / total, 2) if total > 0 else 0.0
coupling[m] = {
"afferent_coupling": ca_val,
"efferent_coupling": ce_val,
"instability": instability,
}
return coupling
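# Worked example (illustrative): a module imported by 3 core modules (Ca=3)
# that itself imports 1 core module (Ce=1) gets I = 1 / (3 + 1) = 0.25,
# i.e. fairly stable and widely depended on.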
# ═══════════════════════════════════════════════════════════════
# 7. Markdown report generation
# ═══════════════════════════════════════════════════════════════
def generate_report(report: ArchitectureReport, coupling: dict) -> str:
"""生成 Markdown 格式的架构优化报告。"""
lines: list[str] = []
_a = lines.append
_a(f"# ETL 架构分析报告")
_a(f"")
_a(f"> 生成时间:{report.generated_at.strftime('%Y-%m-%d %H:%M:%S')}")
_a(f"> 分析范围:`apps/etl/connectors/feiqiu/` 核心模块")
_a("")
# ── 概览 ──
total_files = len(report.files)
total_lines = sum(f.lines for f in report.files)
total_code = sum(f.code_lines for f in report.files)
_a("## 1. 概览")
_a("")
_a(f"| 指标 | 值 |")
_a(f"|------|-----|")
_a(f"| Python 文件数 | {total_files} |")
_a(f"| 总行数 | {total_lines:,} |")
_a(f"| 代码行数 | {total_code:,} |")
_a(f"| 函数/方法数 | {len(report.functions):,} |")
_a(f"| 注册任务数 | {report.task_classification.get('total', 'N/A')} |")
_a(f"| 循环依赖数 | {len(report.circular_deps)} |")
_a(f"| 相似函数对数 | {len(report.similar_functions)} |")
_a("")
# ── 模块规模 ──
_a("## 2. 模块规模分析")
_a("")
module_stats: dict[str, dict] = defaultdict(lambda: {"files": 0, "lines": 0, "code_lines": 0})
for f in report.files:
ms = module_stats[f.module]
ms["files"] += 1
ms["lines"] += f.lines
ms["code_lines"] += f.code_lines
_a("| 模块 | 文件数 | 总行数 | 代码行数 |")
_a("|------|--------|--------|----------|")
for mod in sorted(module_stats, key=lambda m: module_stats[m]["lines"], reverse=True):
s = module_stats[mod]
_a(f"| `{mod}` | {s['files']} | {s['lines']:,} | {s['code_lines']:,} |")
_a("")
# ── large files ──
large_files = [f for f in report.files if f.lines > 500]
large_files.sort(key=lambda f: f.lines, reverse=True)
_a("## 3. Large Files (>500 lines)")
_a("")
if large_files:
_a("| File | Lines | Code lines | Module |")
_a("|------|------|--------|------|")
for f in large_files:
_a(f"| `{f.rel_path}` | {f.lines:,} | {f.code_lines:,} | {f.module} |")
_a("")
_a(f"> ⚠️ {len(large_files)} files exceed 500 lines; consider splitting them to reduce maintenance cost.")
else:
_a("All files are within 500 lines. ✅")
_a("")
# ── function complexity ──
_a("## 4. Function Complexity")
_a("")
high_complexity = [fn for fn in report.functions if fn.complexity >= 10]
high_complexity.sort(key=lambda fn: fn.complexity, reverse=True)
_a("### 4.1 High-complexity functions (cyclomatic complexity ≥ 10)")
_a("")
if high_complexity:
_a("| 函数 | 文件 | 行号 | 复杂度 | 嵌套深度 | 函数行数 |")
_a("|------|------|------|--------|----------|----------|")
for fn in high_complexity[:20]:
name = f"{fn.class_name}.{fn.name}" if fn.class_name else fn.name
_a(f"| `{name}` | `{fn.file}` | {fn.line} | {fn.complexity} | {fn.max_nesting} | {fn.lines} |")
if len(high_complexity) > 20:
_a(f"| ... | {len(high_complexity)} total | | | | |")
else:
_a("All function complexities are within a reasonable range. ✅")
_a("")
# long functions
long_funcs = [fn for fn in report.functions if fn.lines >= 80]
long_funcs.sort(key=lambda fn: fn.lines, reverse=True)
_a("### 4.2 Long functions (≥ 80 lines)")
_a("")
if long_funcs:
_a("| Function | File | Line | Body lines | Complexity |")
_a("|------|------|------|----------|--------|")
for fn in long_funcs[:15]:
name = f"{fn.class_name}.{fn.name}" if fn.class_name else fn.name
_a(f"| `{name}` | `{fn.file}` | {fn.line} | {fn.lines} | {fn.complexity} |")
if len(long_funcs) > 15:
_a(f"| ... | {len(long_funcs)} total | | | |")
else:
_a("All function lengths are within a reasonable range. ✅")
_a("")
# ── dependencies ──
_a("## 5. Module Dependencies")
_a("")
# dependency matrix
dep_matrix: dict[str, Counter] = defaultdict(Counter)
for edge in report.import_edges:
dep_matrix[edge.source_module][edge.target_module] += 1
all_modules = sorted(set(
list(dep_matrix.keys()) +
[t for counts in dep_matrix.values() for t in counts]
))
# keep core modules only
all_modules = [m for m in all_modules if m in _CORE_MODULES]
if all_modules:
_a("### 5.1 Dependency matrix (row→column = import count)")
_a("")
header = "| 模块 | " + " | ".join(f"`{m}`" for m in all_modules) + " |"
_a(header)
_a("|" + "------|" * (len(all_modules) + 1))
for src in all_modules:
row = f"| `{src}` |"
for tgt in all_modules:
count = dep_matrix.get(src, {}).get(tgt, 0)
row += f" {count or '·'} |"
_a(row)
_a("")
# circular dependencies
_a("### 5.2 Circular dependencies")
_a("")
if report.circular_deps:
for a, b in report.circular_deps:
_a(f"- ⚠️ `{a}` ↔ `{b}`")
_a("")
_a("> Circular dependencies increase coupling; consider decoupling via interface abstraction or dependency injection.")
else:
_a("No module-level circular dependencies detected. ✅")
_a("")
# ── coupling ──
_a("## 6. Coupling")
_a("")
_a("| Module | Afferent Ca | Efferent Ce | Instability I |")
_a("|------|-----------|-----------|-----------|")
for mod, vals in sorted(coupling.items(), key=lambda x: x[1]["instability"], reverse=True):
flag = " ⚠️" if vals["instability"] > 0.8 else ""
_a(f"| `{mod}` | {vals['afferent_coupling']} | {vals['efferent_coupling']} | {vals['instability']}{flag} |")
_a("")
_a("> Instability I = Ce/(Ca+Ce). I close to 1 means the module depends heavily on others and is risky to change.")
_a("> I close to 0 means the module is widely depended on: stable infrastructure.")
_a("")
# ── duplicate code ──
_a("## 7. Duplicate Code Detection")
_a("")
if report.similar_functions:
_a("The following pairs share similar signature features (parameter count, length, complexity) and may contain duplicated logic:")
_a("")
_a("| Function A | Function B | Similarity |")
_a("|--------|--------|--------|")
for a, b, sim in report.similar_functions[:20]:
_a(f"| `{a}` | `{b}` | {sim:.0%} |")
if len(report.similar_functions) > 20:
_a(f"| ... | {len(report.similar_functions)} pairs total | |")
_a("")
_a("> Review these pairs manually to decide whether common logic can be extracted.")
else:
_a("No obviously duplicated functions detected. ✅")
_a("")
# ── task classification ──
tc = report.task_classification
_a("## 8. Task Classification")
_a("")
if tc:
_a(f"### 8.1 By layer ({tc['total']} tasks)")
_a("")
_a("| Layer | Count | Tasks |")
_a("|-----|------|----------|")
for layer, info in tc.get("by_layer", {}).items():
tasks_str = ", ".join(f"`{t}`" for t in info["tasks"][:8])
if info["count"] > 8:
tasks_str += f" ... ({info['count']} total)"
_a(f"| {layer} | {info['count']} | {tasks_str} |")
_a("")
_a("### 8.2 By type")
_a("")
_a("| Type | Count |")
_a("|------|------|")
for ttype, info in tc.get("by_type", {}).items():
_a(f"| {ttype} | {info['count']} |")
_a("")
anomalies = tc.get("anomalies", [])
_a("### 8.3 Classification anomalies")
_a("")
if anomalies:
for a in anomalies:
_a(f"- ⚠️ {a}")
else:
_a("No classification anomalies found. ✅")
else:
_a("Task classification analysis was not run (TaskRegistry import failed).")
_a("")
# ── optimization suggestions ──
_a("## 9. Architecture Optimization Suggestions")
_a("")
suggestions = _generate_suggestions(report, coupling)
for i, s in enumerate(suggestions, 1):
_a(f"{i}. {s}")
_a("")
return "\n".join(lines)
def _generate_suggestions(report: ArchitectureReport, coupling: dict) -> list[str]:
"""基于分析结果生成具体优化建议。"""
suggestions: list[str] = []
# 大文件建议
large_files = [f for f in report.files if f.lines > 500]
if large_files:
biggest = max(large_files, key=lambda f: f.lines)
suggestions.append(
f"**拆分大文件**`{biggest.rel_path}`{biggest.lines:,} 行)是最大文件,"
"建议按职责拆分为多个子模块。"
)
# 高复杂度建议
high_cx = [fn for fn in report.functions if fn.complexity >= 15]
if high_cx:
worst = max(high_cx, key=lambda fn: fn.complexity)
name = f"{worst.class_name}.{worst.name}" if worst.class_name else worst.name
suggestions.append(
f"**降低函数复杂度**`{name}`(复杂度 {worst.complexity})建议提取子函数或使用策略模式。"
)
# 循环依赖建议
if report.circular_deps:
pairs = ", ".join(f"`{a}`↔`{b}`" for a, b in report.circular_deps)
suggestions.append(
f"**消除循环依赖**{pairs}。可通过引入接口层或依赖注入解耦。"
)
# 高不稳定模块
unstable = [m for m, v in coupling.items() if v["instability"] > 0.8]
if unstable:
suggestions.append(
f"**稳定化高不稳定模块**{', '.join(f'`{m}`' for m in unstable)} "
"的不稳定度 > 0.8,建议减少对外部模块的依赖。"
)
# 任务命名建议
tc = report.task_classification
if tc:
anomalies = tc.get("anomalies", [])
if any("INDEX" in a for a in anomalies):
suggestions.append(
"**统一 INDEX 层任务命名**:当前 INDEX 层任务以 `DWS_` 开头,"
"建议改为 `IDX_` 前缀以避免与 DWS 汇总任务混淆。"
)
# 重复代码建议
if len(report.similar_functions) > 5:
suggestions.append(
f"**消除重复代码**:检测到 {len(report.similar_functions)} 对相似函数,"
"建议提取公共基类或工具函数。"
)
if not suggestions:
suggestions.append("当前架构整体健康,未发现需要立即优化的问题。")
return suggestions
# ═══════════════════════════════════════════════════════════════
# Main flow
# ═══════════════════════════════════════════════════════════════
def run_analysis(root: Path, logger: logging.Logger) -> tuple[ArchitectureReport, dict]:
"""执行完整架构分析,返回报告数据和耦合度评估。"""
report = ArchitectureReport()
logger.info("=" * 60)
logger.info("ETL 架构分析开始")
logger.info("分析根目录: %s", root)
logger.info("=" * 60)
# 1. 文件扫描
logger.info("── 阶段 1/6文件扫描 ──")
report.files = scan_files(root, logger)
# 2. 函数复杂度
logger.info("── 阶段 2/6函数复杂度分析 ──")
report.functions = analyze_functions(report.files, logger)
# 3. 依赖关系
logger.info("── 阶段 3/6依赖关系分析 ──")
report.import_edges, report.circular_deps = analyze_dependencies(report.files, logger)
# 4. 重复代码
logger.info("── 阶段 4/6重复代码检测 ──")
report.similar_functions = detect_similar_functions(report.functions, logger)
# 5. 任务分类
logger.info("── 阶段 5/6任务分类分析 ──")
report.task_classification = analyze_task_classification(logger)
# 6. 耦合度
logger.info("── 阶段 6/6耦合度评估 ──")
coupling = evaluate_coupling(report.import_edges, report.files)
logger.info("=" * 60)
logger.info("分析完成")
logger.info("=" * 60)
return report, coupling
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="ETL 架构分析")
parser.add_argument(
"--output", "-o",
help="报告输出路径(默认自动生成带日期的文件名)",
default=None,
)
return parser.parse_args()
def main():
logger = _setup_logging()
args = parse_args()
root = _FEIQIU_ROOT
report, coupling = run_analysis(root, logger)
# render the Markdown report
md_content = generate_report(report, coupling)
# resolve the output path
reports_dir = root / "docs" / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
if args.output:
output_path = Path(args.output)
else:
date_str = datetime.now().strftime("%Y%m%d")
output_path = reports_dir / f"architecture_report_{date_str}.md"
output_path.write_text(md_content, encoding="utf-8")
logger.info("Report saved: %s", output_path)
# print a summary
total_files = len(report.files)
total_lines = sum(f.lines for f in report.files)
large_count = sum(1 for f in report.files if f.lines > 500)
high_cx = sum(1 for fn in report.functions if fn.complexity >= 10)
logger.info("")
logger.info("═══ Summary ═══")
logger.info(" files: %d", total_files)
logger.info(" total lines: %s", f"{total_lines:,}")
logger.info(" large files (>500 lines): %d", large_count)
logger.info(" high-complexity functions (≥10): %d", high_cx)
logger.info(" circular dependencies: %d", len(report.circular_deps))
logger.info(" similar function pairs: %d", len(report.similar_functions))
logger.info(" registered tasks: %s", report.task_classification.get("total", "N/A"))
if __name__ == "__main__":
main()


@@ -0,0 +1,928 @@
"""
性能分析脚本 — 读取全量刷新阶段采集的计时 JSON统计耗时、识别瓶颈、生成优化报告。
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.analyze_performance [--input <json>] [--output <md>] [--skip-sql]
功能:
1. 层级耗时统计:各层总耗时、平均耗时、任务数
2. 任务耗时排名Top 5 瓶颈任务,含 fetched/inserted 等指标
3. API 调用分析:响应时间、分页效率(每页记录数 vs 请求次数)
4. SQL 查询分析:连接数据库执行 EXPLAIN ANALYZE 分析关键查询
5. 优化建议:基于分析结果给出具体优化建议
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Path constants
# ---------------------------------------------------------------------------
_SCRIPT_DIR = Path(__file__).resolve().parent
_FEIQIU_ROOT = _SCRIPT_DIR.parent.parent # apps/etl/connectors/feiqiu
_OUTPUT_DIR = _SCRIPT_DIR / "output"
_REPORTS_DIR = _FEIQIU_ROOT / "docs" / "reports"
# ---------------------------------------------------------------------------
# Data structures
# ---------------------------------------------------------------------------
@dataclass
class TaskTiming:
"""单个任务的计时数据。"""
task_code: str
layer: str
duration_sec: float
status: str
counts: dict[str, int]
error: str | None
api_calls: int
api_total_sec: float
@property
def fetched(self) -> int:
return self.counts.get("fetched", 0)
@property
def inserted(self) -> int:
return self.counts.get("inserted", 0)
@property
def updated(self) -> int:
return self.counts.get("updated", 0)
@property
def skipped(self) -> int:
return self.counts.get("skipped", 0)
@property
def throughput(self) -> float:
"""每秒处理记录数fetched / duration"""
if self.duration_sec <= 0:
return 0.0
return self.fetched / self.duration_sec
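# Example: a task that fetched 12,000 rows in 240 s reports
# TaskTiming.throughput == 50.0 rows/s.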
@dataclass
class LayerTiming:
"""单层的汇总计时。"""
layer: str
duration_sec: float
status: str
task_count: int
success_count: int
fail_count: int
skip_count: int
total_fetched: int
total_inserted: int
total_updated: int
total_errors: int
tasks: list[TaskTiming]
@dataclass
class VerificationSummary:
"""校验阶段摘要。"""
status: str
duration_sec: float
total_tables: int
consistent_tables: int
total_backfilled: int
error_tables: int
layers: dict[str, Any]
@dataclass
class PerformanceData:
"""完整的性能数据。"""
flow: str
window_start: str
window_end: str
overall_duration_sec: float
overall_status: str
layers: list[LayerTiming]
verification: VerificationSummary | None
@dataclass
class SQLAnalysisResult:
"""SQL EXPLAIN ANALYZE 分析结果。"""
query_name: str
table_name: str
plan_summary: str
total_cost: float
actual_time_ms: float
rows_processed: int
seq_scans: list[str]
missing_indexes: list[str]
recommendations: list[str]
@dataclass
class PerformanceReport:
"""性能分析报告数据。"""
data: PerformanceData
bottleneck_tasks: list[TaskTiming]
layer_stats: list[dict[str, Any]]
api_analysis: list[dict[str, Any]]
sql_analysis: list[SQLAnalysisResult] = field(default_factory=list)
recommendations: list[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# 日志
# ---------------------------------------------------------------------------
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("analyze_performance")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
def load_timing_data(json_path: Path, logger: logging.Logger) -> PerformanceData:
"""从 JSON 文件加载计时数据。"""
logger.info("加载计时数据: %s", json_path)
raw = json.loads(json_path.read_text(encoding="utf-8"))
layers: list[LayerTiming] = []
for layer_raw in raw.get("layers", []):
tasks = [
TaskTiming(
task_code=t["task_code"],
layer=t.get("layer", layer_raw["layer"]),
duration_sec=t.get("duration_sec", 0),
status=t.get("status", "UNKNOWN"),
counts=t.get("counts", {}),
error=t.get("error"),
api_calls=t.get("api_calls", 0),
api_total_sec=t.get("api_total_sec", 0.0),
)
for t in layer_raw.get("tasks", [])
]
layers.append(LayerTiming(
layer=layer_raw["layer"],
duration_sec=layer_raw.get("duration_sec", 0),
status=layer_raw.get("status", "UNKNOWN"),
task_count=layer_raw.get("task_count", len(tasks)),
success_count=layer_raw.get("success_count", 0),
fail_count=layer_raw.get("fail_count", 0),
skip_count=layer_raw.get("skip_count", 0),
total_fetched=layer_raw.get("total_fetched", 0),
total_inserted=layer_raw.get("total_inserted", 0),
total_updated=layer_raw.get("total_updated", 0),
total_errors=layer_raw.get("total_errors", 0),
tasks=tasks,
))
verification = None
if "verification" in raw:
v = raw["verification"]
verification = VerificationSummary(
status=v.get("status", "UNKNOWN"),
duration_sec=v.get("duration_sec", 0),
total_tables=v.get("total_tables", 0),
consistent_tables=v.get("consistent_tables", 0),
total_backfilled=v.get("total_backfilled", 0),
error_tables=v.get("error_tables", 0),
layers=v.get("layers", {}),
)
return PerformanceData(
flow=raw.get("flow", ""),
window_start=raw.get("window_start", ""),
window_end=raw.get("window_end", ""),
overall_duration_sec=raw.get("overall_duration_sec", 0),
overall_status=raw.get("overall_status", "UNKNOWN"),
layers=layers,
verification=verification,
)
# ---------------------------------------------------------------------------
# Analysis functions
# ---------------------------------------------------------------------------
def analyze_layer_stats(data: PerformanceData) -> list[dict[str, Any]]:
"""统计各层耗时、任务数、吞吐量。"""
stats = []
for layer in data.layers:
executed = [t for t in layer.tasks if t.status != "SKIP"]
durations = [t.duration_sec for t in executed] if executed else [0]
avg_dur = sum(durations) / len(durations) if durations else 0
total_fetched = sum(t.fetched for t in executed)
throughput = total_fetched / layer.duration_sec if layer.duration_sec > 0 else 0
stats.append({
"layer": layer.layer,
"duration_sec": layer.duration_sec,
"pct_of_total": (layer.duration_sec / data.overall_duration_sec * 100
if data.overall_duration_sec > 0 else 0),
"task_count": layer.task_count,
"executed_count": len(executed),
"success_count": layer.success_count,
"fail_count": layer.fail_count,
"skip_count": layer.skip_count,
"avg_task_sec": round(avg_dur, 2),
"max_task_sec": round(max(durations), 2),
"min_task_sec": round(min(durations), 2),
"total_fetched": total_fetched,
"total_inserted": layer.total_inserted,
"total_updated": layer.total_updated,
"throughput_per_sec": round(throughput, 1),
"status": layer.status,
})
return stats
def find_bottleneck_tasks(data: PerformanceData, top_n: int = 5) -> list[TaskTiming]:
"""识别耗时最长的前 N 个任务。"""
all_tasks: list[TaskTiming] = []
for layer in data.layers:
all_tasks.extend(layer.tasks)
# sort by duration descending, excluding SKIP tasks
active = [t for t in all_tasks if t.status != "SKIP"]
active.sort(key=lambda t: t.duration_sec, reverse=True)
return active[:top_n]
def analyze_api_calls(data: PerformanceData) -> list[dict[str, Any]]:
"""分析 API 调用的响应时间和分页效率。"""
results = []
for layer in data.layers:
for task in layer.tasks:
if task.status == "SKIP":
continue
fetched = task.fetched
# estimate the page count assuming the default API_PAGE_SIZE=200
page_size = 200
estimated_pages = max(1, (fetched + page_size - 1) // page_size) if fetched > 0 else 0
# DB processing time (total minus API time)
db_time = max(0, task.duration_sec - task.api_total_sec)
# per-record processing time
per_record_ms = (task.duration_sec / fetched * 1000) if fetched > 0 else 0
results.append({
"task_code": task.task_code,
"layer": task.layer,
"fetched": fetched,
"api_calls": task.api_calls,
"api_total_sec": task.api_total_sec,
"estimated_pages": estimated_pages,
"avg_page_time_ms": (task.api_total_sec / estimated_pages * 1000
if estimated_pages > 0 and task.api_total_sec > 0 else 0),
"records_per_page": (fetched / estimated_pages
if estimated_pages > 0 else 0),
"db_time_sec": round(db_time, 2),
"per_record_ms": round(per_record_ms, 2),
"total_sec": task.duration_sec,
"status": task.status,
})
return results
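# The page estimate in analyze_api_calls is ceiling division: with
# page_size=200, fetched=450 gives (450 + 199) // 200 == 3 pages.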
def analyze_sql_queries(
dsn: str,
logger: logging.Logger,
) -> list[SQLAnalysisResult]:
"""连接数据库执行 EXPLAIN ANALYZE 分析关键查询。"""
try:
import psycopg2 # noqa: F811
except ImportError:
logger.warning("psycopg2 未安装,跳过 SQL 分析")
return []
# key queries: ODS INSERT, DWD MERGE, DWS aggregation
queries = [
{
"name": "ODS 批量 INSERTpayment_transactions",
"table": "ods.payment_transactions",
"sql": """
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM ods.payment_transactions
WHERE fetched_at >= NOW() - INTERVAL '7 days'
LIMIT 100
""",
},
{
"name": "ODS 批量 INSERTplatform_coupon_redemption_records",
"table": "ods.platform_coupon_redemption_records",
"sql": """
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM ods.platform_coupon_redemption_records
WHERE fetched_at >= NOW() - INTERVAL '7 days'
LIMIT 100
""",
},
{
"name": "ODS content_hash 去重查询",
"table": "ods.member_balance_changes",
"sql": """
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT id, content_hash FROM ods.member_balance_changes
WHERE fetched_at >= NOW() - INTERVAL '7 days'
""",
},
{
"name": "DWD SCD2 合并dim_table",
"table": "dwd.dim_table",
"sql": """
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM dwd.dim_table
WHERE scd2_is_current = 1
""",
},
{
"name": "DWS 订单汇总查询",
"table": "dws.dws_order_summary",
"sql": """
EXPLAIN (ANALYZE, BUFFERS, FORMAT TEXT)
SELECT * FROM dws.dws_order_summary
WHERE order_date >= CURRENT_DATE - INTERVAL '30 days'
LIMIT 100
""",
},
]
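# Caution: EXPLAIN (ANALYZE, ...) actually executes the statement. The probes
# above are read-only SELECTs, so this is safe; never profile DML this way
# against production data.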
results: list[SQLAnalysisResult] = []
conn = None
try:
conn = psycopg2.connect(dsn, connect_timeout=10)
conn.autocommit = True
cur = conn.cursor()
for q in queries:
try:
cur.execute(q["sql"])
rows = cur.fetchall()
plan_text = "\n".join(r[0] for r in rows)
# parse the execution plan
result = _parse_explain_plan(q["name"], q["table"], plan_text)
results.append(result)
logger.info("%s: %.1fms", q["name"], result.actual_time_ms)
except Exception as e:
logger.warning("%s: %s", q["name"], e)
results.append(SQLAnalysisResult(
query_name=q["name"],
table_name=q["table"],
plan_summary=f"执行失败: {e}",
total_cost=0,
actual_time_ms=0,
rows_processed=0,
seq_scans=[],
missing_indexes=[],
recommendations=[f"查询执行失败,需检查表是否存在: {e}"],
))
except Exception as e:
logger.error("数据库连接失败: %s", e)
finally:
if conn:
conn.close()
return results
def _parse_explain_plan(
query_name: str,
table_name: str,
plan_text: str,
) -> SQLAnalysisResult:
"""解析 EXPLAIN ANALYZE 输出,提取关键指标。"""
import re
seq_scans: list[str] = []
missing_indexes: list[str] = []
recommendations: list[str] = []
total_cost = 0.0
actual_time_ms = 0.0
rows_processed = 0
for line in plan_text.split("\n"):
# total elapsed time
m = re.search(r"actual time=([\d.]+)\.\.([\d.]+)", line)
if m:
actual_time_ms = max(actual_time_ms, float(m.group(2)))
# cost
m = re.search(r"cost=([\d.]+)\.\.([\d.]+)", line)
if m:
total_cost = max(total_cost, float(m.group(2)))
# row count
m = re.search(r"rows=(\d+)", line)
if m:
rows_processed = max(rows_processed, int(m.group(1)))
# detect sequential scans
if "Seq Scan" in line:
m_tbl = re.search(r"Seq Scan on (\S+)", line)
tbl = m_tbl.group(1) if m_tbl else "unknown"
seq_scans.append(tbl)
# derive recommendations from the findings
if seq_scans:
for tbl in seq_scans:
missing_indexes.append(tbl)
recommendations.append(f"{tbl} is scanned sequentially; consider adding an index")
if actual_time_ms > 100:
recommendations.append(f"Query took {actual_time_ms:.1f}ms; consider optimizing it or adding an index")
# keep the first 10 plan lines as the summary
summary_lines = plan_text.strip().split("\n")[:10]
plan_summary = "\n".join(summary_lines)
return SQLAnalysisResult(
query_name=query_name,
table_name=table_name,
plan_summary=plan_summary,
total_cost=total_cost,
actual_time_ms=actual_time_ms,
rows_processed=rows_processed,
seq_scans=seq_scans,
missing_indexes=missing_indexes,
recommendations=recommendations,
)
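# Example (illustrative plan line):
#   Seq Scan on ods.foo  (cost=0.00..431.00 rows=21000 width=8)
#     (actual time=0.010..2.300 rows=21000 loops=1)
# parses to total_cost=431.0, actual_time_ms=2.3, rows_processed=21000,
# and records a sequential scan on "ods.foo".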
def generate_recommendations(
report: PerformanceReport,
logger: logging.Logger,
) -> list[str]:
"""基于分析结果生成优化建议。"""
recs: list[str] = []
# 1. 基于瓶颈任务的建议
for task in report.bottleneck_tasks:
if task.duration_sec > 100:
skip_ratio = task.skipped / task.fetched * 100 if task.fetched > 0 else 0
if skip_ratio > 90:
recs.append(
f"**{task.task_code}** ({task.duration_sec:.1f}s): "
f"skip ratio {skip_ratio:.0f}%; optimize the content_hash dedup logic, "
f"speed up hash comparison with a database index, or add incremental filter parameters on the API side to avoid useless fetches"
)
elif task.fetched > 10000:
recs.append(
f"**{task.task_code}** ({task.duration_sec:.1f}s): "
f"fetched {task.fetched:,} records; increase API_PAGE_SIZE or enable parallel pagination"
)
else:
recs.append(
f"**{task.task_code}** ({task.duration_sec:.1f}s): "
f"profile where the time goes (API vs DB) and optimize accordingly"
)
# 2. suggestions from layer statistics
for stat in report.layer_stats:
if stat["pct_of_total"] > 80:
recs.append(
f"**{stat['layer']} layer** accounts for {stat['pct_of_total']:.1f}% of total runtime; "
f"it is the main bottleneck layer and should be optimized first"
)
if stat["skip_count"] > stat["task_count"] * 0.5:
recs.append(
f"**{stat['layer']} layer**: {stat['skip_count']}/{stat['task_count']} "
f"tasks were skipped; check whether the skip conditions are appropriate"
)
# 3. suggestions from API analysis
high_per_record = [a for a in report.api_analysis
if a["per_record_ms"] > 5 and a["fetched"] > 1000]
if high_per_record:
recs.append(
"The following tasks spend >5ms per record; optimize their batch-write logic: " +
", ".join(f"{a['task_code']}({a['per_record_ms']:.1f}ms/rec)"
for a in high_per_record[:5])
)
# 4. suggestions from SQL analysis
for sql_r in report.sql_analysis:
recs.extend(sql_r.recommendations)
# 5. general suggestions
if report.data.overall_duration_sec > 600:
recs.append(
f"The full refresh took {report.data.overall_duration_sec:.0f}s ({report.data.overall_duration_sec/60:.1f} min); "
"consider the following general optimizations:"
)
recs.append(" - ODS tasks have no inter-dependencies and can run in parallel to cut total runtime substantially")
recs.append(" - For high skip-ratio tasks, add time filters to API requests to avoid transferring useless data")
recs.append(" - For large-table INSERTs, use the COPY protocol instead of row-by-row INSERT")
recs.append(" - Consider an index on the content_hash column to speed up dedup checks")
return recs
# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------
def generate_report(report: PerformanceReport) -> str:
"""生成 Markdown 格式的性能分析报告。"""
lines: list[str] = []
_w = lines.append
_w("# ETL 性能分析报告")
_w("")
_w(f"> 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
_w(f"> 数据来源: 全量刷新计时数据({report.data.flow}")
_w(f"> 时间窗口: {report.data.window_start} ~ {report.data.window_end}")
_w(f"> 总耗时: {report.data.overall_duration_sec:.1f}s "
f"({report.data.overall_duration_sec/60:.1f}分钟)")
_w(f"> 状态: {report.data.overall_status}")
_w("")
# ── table of contents ──
_w("## Contents")
_w("")
_w("1. [Execution Overview](#1-execution-overview)")
_w("2. [Per-layer Timing](#2-per-layer-timing)")
_w("3. [Top 5 Bottlenecks](#3-top-5-bottlenecks)")
_w("4. [Task Timing Detail](#4-task-timing-detail)")
_w("5. [API Call Analysis](#5-api-call-analysis)")
_w("6. [SQL Query Analysis](#6-sql-query-analysis)")
_w("7. [Verification Phase](#7-verification-phase)")
_w("8. [Optimization Suggestions](#8-optimization-suggestions)")
_w("")
# ── 1. execution overview ──
_w("## 1. Execution Overview")
_w("")
_w("| Metric | Value |")
_w("|------|-----|")
_w(f"| Flow | `{report.data.flow}` |")
_w(f"| Window | {report.data.window_start} ~ {report.data.window_end} |")
_w(f"| Total duration | {report.data.overall_duration_sec:.1f}s "
f"({report.data.overall_duration_sec/60:.1f} min) |")
_w(f"| Status | {report.data.overall_status} |")
total_tasks = sum(s["task_count"] for s in report.layer_stats)
total_success = sum(s["success_count"] for s in report.layer_stats)
total_fail = sum(s["fail_count"] for s in report.layer_stats)
total_skip = sum(s["skip_count"] for s in report.layer_stats)
_w(f"| Total tasks | {total_tasks} |")
_w(f"| Success/fail/skip | {total_success}/{total_fail}/{total_skip} |")
total_fetched = sum(s["total_fetched"] for s in report.layer_stats)
total_inserted = sum(s["total_inserted"] for s in report.layer_stats)
_w(f"| Records fetched | {total_fetched:,} |")
_w(f"| Records written | {total_inserted:,} |")
if report.data.verification:
v = report.data.verification
_w(f"| Verification time | {v.duration_sec:.1f}s |")
_w(f"| Verified tables | {v.total_tables} (consistent {v.consistent_tables}, "
f"backfilled {v.total_backfilled}, errors {v.error_tables}) |")
_w("")
# ── 2. per-layer timing ──
_w("## 2. Per-layer Timing")
_w("")
_w("| Layer | Time(s) | Share | Tasks | Executed | OK | Fail | Skip | "
"Avg(s) | Max(s) | Fetched | Written | Throughput(rec/s) |")
_w("|-----|---------|------|--------|--------|------|------|------|"
"---------|---------|------|------|------------|")
for s in report.layer_stats:
_w(f"| {s['layer']} | {s['duration_sec']:.1f} | {s['pct_of_total']:.1f}% | "
f"{s['task_count']} | {s['executed_count']} | {s['success_count']} | "
f"{s['fail_count']} | {s['skip_count']} | {s['avg_task_sec']} | "
f"{s['max_task_sec']} | {s['total_fetched']:,} | {s['total_inserted']:,} | "
f"{s['throughput_per_sec']} |")
_w("")
# duration distribution (text bar chart)
_w("### Duration distribution")
_w("")
_w("```")
max_dur = max(s["duration_sec"] for s in report.layer_stats) if report.layer_stats else 1
for s in report.layer_stats:
bar_len = int(s["duration_sec"] / max_dur * 40) if max_dur > 0 else 0
bar = "█" * bar_len
_w(f" {s['layer']:>5} {bar} {s['duration_sec']:.1f}s ({s['pct_of_total']:.1f}%)")
_w("```")
_w("")
# ── 3. top 5 bottlenecks ──
_w("## 3. Top 5 Bottlenecks")
_w("")
_w("| Rank | Task | Layer | Time(s) | Status | Fetched | Inserted | Updated | Skipped | "
"Throughput(rec/s) | Per-record(ms) |")
_w("|------|------|-----|---------|------|------|------|------|------|"
"------------|-------------|")
for i, t in enumerate(report.bottleneck_tasks, 1):
per_rec = (t.duration_sec / t.fetched * 1000) if t.fetched > 0 else 0
_w(f"| {i} | `{t.task_code}` | {t.layer} | {t.duration_sec:.1f} | "
f"{t.status} | {t.fetched:,} | {t.inserted:,} | {t.updated:,} | "
f"{t.skipped:,} | {t.throughput:.1f} | {per_rec:.2f} |")
_w("")
# bottleneck breakdown
_w("### Bottleneck breakdown")
_w("")
for i, t in enumerate(report.bottleneck_tasks, 1):
_w(f"**{i}. {t.task_code}** ({t.duration_sec:.1f}s)")
if t.fetched > 0:
skip_ratio = t.skipped / t.fetched * 100
_w(f"- fetched {t.fetched:,}, skipped {t.skipped:,} (skip ratio {skip_ratio:.0f}%)")
_w(f"- wrote {t.inserted:,} rows, write ratio {t.inserted/t.fetched*100:.1f}%")
_w(f"- {t.duration_sec/t.fetched*1000:.2f}ms per record")
if skip_ratio > 90:
_w("- ⚠️ very high skip ratio; most time goes into content_hash comparison")
if t.error:
_w(f"- ❌ error: {t.error}")
_w("")
# ── 4. task timing detail ──
_w("## 4. Task Timing Detail")
_w("")
for layer in report.data.layers:
_w(f"### {layer.layer}")
_w("")
_w("| Task | Time(s) | Status | Fetched | Inserted | Skipped | Errors |")
_w("|------|---------|------|------|------|------|------|")
sorted_tasks = sorted(layer.tasks, key=lambda t: t.duration_sec, reverse=True)
for t in sorted_tasks:
_w(f"| `{t.task_code}` | {t.duration_sec:.1f} | {t.status} | "
f"{t.fetched:,} | {t.inserted:,} | {t.skipped:,} | "
f"{t.counts.get('errors', 0)} |")
_w("")
# ── 5. API call analysis ──
_w("## 5. API Call Analysis")
_w("")
# only tasks that actually fetched data
api_with_data = [a for a in report.api_analysis if a["fetched"] > 0]
if api_with_data:
api_with_data.sort(key=lambda a: a["total_sec"], reverse=True)
_w("| Task | Fetched | Est. pages | Total(s) | DB time(s) | Per-record(ms) |")
_w("|------|------|----------|-----------|-----------|----------|")
for a in api_with_data:
_w(f"| `{a['task_code']}` | {a['fetched']:,} | {a['estimated_pages']} | "
f"{a['total_sec']:.1f} | {a['db_time_sec']} | {a['per_record_ms']} |")
_w("")
# pagination efficiency
_w("### Pagination efficiency")
_w("")
total_records = sum(a["fetched"] for a in api_with_data)
total_pages = sum(a["estimated_pages"] for a in api_with_data)
avg_per_page = total_records / total_pages if total_pages > 0 else 0
_w(f"- Records fetched: {total_records:,}")
_w(f"- Estimated pages: {total_pages:,}")
_w(f"- Average records per page: {avg_per_page:.1f}")
_w("- Current API_PAGE_SIZE: 200")
_w("")
if avg_per_page < 150:
_w("> ⚠️ Actual records per page are below PAGE_SIZE; some endpoints may return partial pages.")
_w("")
else:
_w("API call timing data is zero for this run (API time may not have been captured separately).")
_w("")
_w("> Note: `api_calls` and `api_total_sec` are both 0 in the current timing data, "
"meaning the full-refresh script did not time API calls separately.")
_w("> Consider adding dedicated API-call timing in a future version to separate API wait time from DB write time.")
_w("")
# ── 6. SQL query analysis ──
_w("## 6. SQL Query Analysis")
_w("")
if report.sql_analysis:
for sql_r in report.sql_analysis:
_w(f"### {sql_r.query_name}")
_w("")
_w(f"- Table: `{sql_r.table_name}`")
_w(f"- Actual time: {sql_r.actual_time_ms:.1f}ms")
_w(f"- Estimated cost: {sql_r.total_cost:.1f}")
_w(f"- Rows processed: {sql_r.rows_processed:,}")
if sql_r.seq_scans:
_w(f"- ⚠️ sequential scans: {', '.join(sql_r.seq_scans)}")
if sql_r.missing_indexes:
_w(f"- 🔍 suggested indexes: {', '.join(sql_r.missing_indexes)}")
_w("")
_w("```")
_w(sql_r.plan_summary)
_w("```")
_w("")
else:
_w("SQL analysis was not run (`--skip-sql` given or database connection failed).")
_w("")
# ── 7. verification phase ──
_w("## 7. Verification Phase")
_w("")
if report.data.verification:
v = report.data.verification
_w(f"- Status: {v.status}")
_w(f"- Duration: {v.duration_sec:.1f}s")
_w(f"- Tables verified: {v.total_tables}")
_w(f"- Consistent tables: {v.consistent_tables}")
_w(f"- Backfilled records: {v.total_backfilled}")
_w(f"- Error tables: {v.error_tables}")
_w("")
for layer_name, layer_v in v.layers.items():
_w(f"### {layer_name} layer verification")
_w("")
_w(f"- Status: {layer_v.get('status', 'N/A')}")
_w(f"- Tables: {layer_v.get('total_tables', 0)}")
_w(f"- Consistent: {layer_v.get('consistent_tables', 0)}")
_w(f"- Inconsistent: {layer_v.get('inconsistent_tables', 0)}")
_w(f"- Source records: {layer_v.get('total_source_count', 0):,}")
_w(f"- Target records: {layer_v.get('total_target_count', 0):,}")
_w(f"- Backfilled: {layer_v.get('total_backfilled', 0)}")
_w(f"- Elapsed: {layer_v.get('elapsed_seconds', 0):.1f}s")
_w("")
else:
_w("No verification data.")
_w("")
# ── 8. optimization suggestions ──
_w("## 8. Optimization Suggestions")
_w("")
if report.recommendations:
for i, rec in enumerate(report.recommendations, 1):
_w(f"{i}. {rec}")
_w("")
else:
_w("No suggestions.")
_w("")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Main flow
# ---------------------------------------------------------------------------
def find_latest_timing_file(output_dir: Path) -> Path | None:
"""查找最新的计时 JSON 文件(排除 intermediate/checkpoint 中间文件)。"""
exclude = {"intermediate", "checkpoint"}
json_files = sorted(
[f for f in output_dir.glob("full_refresh_*.json")
if not any(kw in f.name for kw in exclude)],
reverse=True,
)
return json_files[0] if json_files else None
def run_analysis(
json_path: Path,
logger: logging.Logger,
skip_sql: bool = False,
) -> PerformanceReport:
"""执行完整的性能分析。"""
# 1. 加载数据
data = load_timing_data(json_path, logger)
logger.info("数据加载完成: %s, 总耗时 %.1fs, %d",
data.flow, data.overall_duration_sec, len(data.layers))
# 2. 层级统计
logger.info("分析层级耗时...")
layer_stats = analyze_layer_stats(data)
for s in layer_stats:
logger.info(" %s: %.1fs (%.1f%%), %d 任务",
s["layer"], s["duration_sec"], s["pct_of_total"], s["task_count"])
# 3. 瓶颈识别
logger.info("识别性能瓶颈...")
bottlenecks = find_bottleneck_tasks(data, top_n=5)
for i, t in enumerate(bottlenecks, 1):
logger.info(" Top %d: %s (%.1fs, %s)", i, t.task_code, t.duration_sec, t.layer)
# 4. API 分析
logger.info("分析 API 调用...")
api_analysis = analyze_api_calls(data)
# 5. SQL 分析
sql_analysis: list[SQLAnalysisResult] = []
if not skip_sql:
logger.info("分析 SQL 查询执行计划...")
dsn = _load_dsn()
if dsn:
sql_analysis = analyze_sql_queries(dsn, logger)
logger.info("SQL 分析完成: %d 个查询", len(sql_analysis))
else:
logger.warning("未找到数据库 DSN跳过 SQL 分析")
else:
logger.info("跳过 SQL 分析(--skip-sql")
# 6. 构建报告
report = PerformanceReport(
data=data,
bottleneck_tasks=bottlenecks,
layer_stats=layer_stats,
api_analysis=api_analysis,
sql_analysis=sql_analysis,
)
# 7. 生成建议
logger.info("生成优化建议...")
report.recommendations = generate_recommendations(report, logger)
return report
def _load_dsn() -> str | None:
"""从 .env 加载数据库 DSN。"""
env_path = _FEIQIU_ROOT / ".env"
if not env_path.exists():
return None
try:
from dotenv import dotenv_values
values = dotenv_values(env_path)
return values.get("PG_DSN")
except ImportError:
# fall back to manual parsing
for line in env_path.read_text(encoding="utf-8").splitlines():
line = line.strip()
if line.startswith("PG_DSN="):
return line.split("=", 1)[1].strip()
return None
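# Expected .env entry (a libpq URI, for example — hostname and credentials
# here are placeholders):
#   PG_DSN=postgresql://etl_user:secret@localhost:5432/feiqiu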
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="ETL 性能分析")
parser.add_argument(
"--input", "-i",
help="计时 JSON 文件路径(默认自动查找最新文件)",
)
parser.add_argument(
"--output", "-o",
help="报告输出路径(默认 docs/reports/performance_report_YYYYMMDD.md",
)
parser.add_argument(
"--skip-sql",
action="store_true",
help="跳过 SQL EXPLAIN ANALYZE 分析",
)
return parser.parse_args()
def main():
logger = _setup_logging()
args = parse_args()
# resolve the input file
if args.input:
json_path = Path(args.input)
else:
json_path = find_latest_timing_file(_OUTPUT_DIR)
if not json_path:
logger.error("No timing JSON found; pass --input")
sys.exit(1)
if not json_path.exists():
logger.error("File not found: %s", json_path)
sys.exit(1)
logger.info("═══ ETL performance analysis ═══")
# run the analysis
report = run_analysis(json_path, logger, skip_sql=args.skip_sql)
# render the report
md_content = generate_report(report)
# resolve the output path
_REPORTS_DIR.mkdir(parents=True, exist_ok=True)
if args.output:
output_path = Path(args.output)
else:
date_str = datetime.now().strftime("%Y%m%d")
output_path = _REPORTS_DIR / f"performance_report_{date_str}.md"
output_path.write_text(md_content, encoding="utf-8")
logger.info("Report saved: %s", output_path)
# print a summary
logger.info("")
logger.info("═══ Summary ═══")
logger.info(" total: %.1fs (%.1f min)",
report.data.overall_duration_sec,
report.data.overall_duration_sec / 60)
logger.info(" layers: %d", len(report.layer_stats))
logger.info(" bottlenecks: %s",
", ".join(f"{t.task_code}({t.duration_sec:.0f}s)"
for t in report.bottleneck_tasks))
logger.info(" SQL probes: %d", len(report.sql_analysis))
logger.info(" suggestions: %d", len(report.recommendations))
if __name__ == "__main__":
main()

File diff suppressed because it is too large


@@ -0,0 +1,790 @@
# -*- coding: utf-8 -*-
"""DWD 层调试脚本。
执行 DWD_LOAD_FROM_ODS 任务,验证 TABLE_MAP 中每对 DWD→ODS 映射的处理结果,
检查维度表 SCD2 版本链完整性、事实表时间窗口增量写入正确性、FACT_MAPPINGS 列映射。
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.debug_dwd [--hours 2] [--tables dwd.dim_member,dwd.dwd_payment]
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
import traceback
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
# ── Ensure the project root is on sys.path ──
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
sys.path.insert(0, str(_FEIQIU_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
from api.client import APIClient
from orchestration.task_registry import default_registry
from orchestration.cursor_manager import CursorManager
from orchestration.run_tracker import RunTracker
from orchestration.task_executor import TaskExecutor
from tasks.dwd.dwd_load_task import DwdLoadTask
# Candidate time columns (formerly DwdLoadTask.FACT_ORDER_CANDIDATES, inlined after refactoring)
_TIME_COLUMN_CANDIDATES = [
"pay_time", "create_time", "update_time",
"occur_time", "settle_time", "start_use_time", "fetched_at",
]
@dataclass
class DebugResult:
"""单个 DWD 表的调试结果"""
layer: str = "DWD"
task_code: str = "DWD_LOAD_FROM_ODS"
table_name: str = ""
ods_source: str = ""
mode: str = "" # SCD2 / INCREMENT / TYPE1_UPSERT
status: str = "" # PASS / FAIL / WARN / ERROR
message: str = ""
counts: dict = field(default_factory=dict)
dwd_row_count: int | None = None
ods_row_count: int | None = None
scd2_check: dict | None = None
fact_window_check: dict | None = None
mapping_check: dict | None = None
duration_sec: float = 0.0
error_detail: str | None = None
fix_applied: str | None = None
# ── Helpers ──────────────────────────────────────────────────
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("debug_dwd")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
def _build_components(config: AppConfig, logger: logging.Logger):
"""构建 DB / API / TaskExecutor 等组件。"""
db_conn = DatabaseConnection(
dsn=config["db"]["dsn"],
session=config["db"].get("session"),
connect_timeout=config["db"].get("connect_timeout_sec"),
)
api_client = APIClient(
base_url=config["api"]["base_url"],
token=config["api"]["token"],
timeout=config["api"].get("timeout_sec", 20),
retry_max=config["api"].get("retries", {}).get("max_attempts", 3),
headers_extra=config["api"].get("headers_extra"),
)
db_ops = DatabaseOperations(db_conn)
cursor_mgr = CursorManager(db_conn)
run_tracker = RunTracker(db_conn)
executor = TaskExecutor(
config, db_ops, api_client,
cursor_mgr, run_tracker, default_registry, logger,
)
return db_conn, api_client, db_ops, executor
def _query_count(db_conn: DatabaseConnection, table: str) -> int:
"""查询表的总行数。"""
rows = db_conn.query(f"SELECT COUNT(*) AS cnt FROM {table}")
return int(rows[0]["cnt"]) if rows else 0
def _query_count_windowed(db_conn: DatabaseConnection, table: str,
col: str, start: datetime, end: datetime) -> int:
"""查询表在指定时间窗口内的行数。"""
sql = f'SELECT COUNT(*) AS cnt FROM {table} WHERE "{col}" >= %s AND "{col}" < %s'
rows = db_conn.query(sql, (start, end))
return int(rows[0]["cnt"]) if rows else 0
def _has_column(db_conn: DatabaseConnection, table: str, column: str) -> bool:
"""检查表是否包含指定列。"""
sql = """
SELECT 1 FROM information_schema.columns
WHERE table_schema || '.' || table_name = %s
AND column_name = %s
LIMIT 1
"""
rows = db_conn.query(sql, (table, column))
return bool(rows)
def _is_dim_table(table_name: str) -> bool:
"""判断是否为维度表dim_ 前缀)。"""
base = table_name.split(".")[-1] if "." in table_name else table_name
return base.startswith("dim_")
# ── SCD2 version-chain integrity checks ─────────────────────
def _check_scd2_integrity(db_conn: DatabaseConnection, dwd_table: str,
logger: logging.Logger) -> dict:
"""检查维度表 SCD2 版本链完整性。
验证项:
- 每个业务主键至多一条 scd2_is_current=1 的记录
- scd2_version 连续递增(无跳号)
- scd2_end_time 与下一版本的 scd2_start_time 一致
"""
result = {"has_scd2": False, "checks": []}
# confirm the table has SCD2 columns at all
if not _has_column(db_conn, dwd_table, "scd2_is_current"):
result["checks"].append("no SCD2 columns; check skipped")
return result
result["has_scd2"] = True
# business primary key (excluding SCD2 columns)
pk_sql = """
SELECT a.attname
FROM pg_index i
JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
WHERE i.indrelid = %s::regclass AND i.indisprimary
ORDER BY array_position(i.indkey, a.attnum)
"""
pk_rows = db_conn.query(pk_sql, (dwd_table,))
scd_cols = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
business_keys = [r["attname"] for r in pk_rows if r["attname"] not in scd_cols]
if not business_keys:
result["checks"].append("未找到业务主键")
return result
bk_sql = ", ".join(f'"{k}"' for k in business_keys)
# check 1: at most one current row per business key
dup_current_sql = f"""
SELECT {bk_sql}, COUNT(*) AS cnt
FROM {dwd_table}
WHERE COALESCE(scd2_is_current, 1) = 1
GROUP BY {bk_sql}
HAVING COUNT(*) > 1
LIMIT 10
"""
try:
dup_rows = db_conn.query(dup_current_sql)
dup_count = len(dup_rows) if dup_rows else 0
if dup_count > 0:
result["checks"].append(f"⚠ 发现 {dup_count} 个业务主键有多条 current 记录")
else:
result["checks"].append("✓ 每个业务主键至多一条 current 记录")
except Exception as exc:
result["checks"].append(f"✗ 检查 current 唯一性失败: {exc}")
# check 2: scd2_version contiguity (sample the first 100 multi-version keys)
version_gap_sql = f"""
WITH multi_ver AS (
SELECT {bk_sql}
FROM {dwd_table}
GROUP BY {bk_sql}
HAVING COUNT(*) > 1
LIMIT 100
),
versioned AS (
SELECT t.{business_keys[0]},
t.scd2_version,
LAG(t.scd2_version) OVER (
PARTITION BY {', '.join(f't."{k}"' for k in business_keys)}
ORDER BY t.scd2_version
) AS prev_version
FROM {dwd_table} t
INNER JOIN multi_ver m ON {' AND '.join(f't."{k}" = m."{k}"' for k in business_keys)}
)
SELECT COUNT(*) AS gap_count
FROM versioned
WHERE prev_version IS NOT NULL AND scd2_version - prev_version != 1
"""
try:
gap_rows = db_conn.query(version_gap_sql)
gap_count = int(gap_rows[0]["gap_count"]) if gap_rows else 0
if gap_count > 0:
result["checks"].append(f"⚠ 发现 {gap_count} 处版本号跳号")
else:
result["checks"].append("✓ 版本号连续递增")
except Exception as exc:
result["checks"].append(f"✗ 检查版本连续性失败: {exc}")
# check 3: total row count vs. current row count
try:
total = _query_count(db_conn, dwd_table)
current_sql = f"SELECT COUNT(*) AS cnt FROM {dwd_table} WHERE COALESCE(scd2_is_current, 1) = 1"
current_rows = db_conn.query(current_sql)
current_count = int(current_rows[0]["cnt"]) if current_rows else 0
result["total_rows"] = total
result["current_rows"] = current_count
result["historical_rows"] = total - current_count
result["checks"].append(f"✓ 总行数={total}, current={current_count}, 历史={total - current_count}")
except Exception as exc:
result["checks"].append(f"✗ 查询行数失败: {exc}")
return result
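# A healthy SCD2 chain for one business key looks like (illustrative):
#   version 1: scd2_start_time=t0, scd2_end_time=t1, scd2_is_current=0
#   version 2: scd2_start_time=t1, scd2_end_time=t2, scd2_is_current=0
#   version 3: scd2_start_time=t2, scd2_end_time=NULL, scd2_is_current=1
# i.e. exactly one current row, contiguous versions, each end == next start.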
# ── Fact-table windowed incremental-write checks ─────────────
def _check_fact_window(db_conn: DatabaseConnection, dwd_table: str, ods_table: str,
window_start: datetime, window_end: datetime,
logger: logging.Logger) -> dict:
"""检查事实表时间窗口增量写入正确性。
验证项:
- DWD 表在窗口内的行数 vs ODS 表在窗口内的行数
- 主键无重复
- fetched_at 范围合理
"""
result = {"checks": []}
# pick the time column: first match from _TIME_COLUMN_CANDIDATES
order_col = None
for candidate in _TIME_COLUMN_CANDIDATES:
if _has_column(db_conn, dwd_table, candidate):
order_col = candidate
break
if not order_col:
result["checks"].append("⚠ 未找到可用的时间列,跳过窗口检查")
return result
# DWD 窗口内行数
try:
dwd_count = _query_count_windowed(db_conn, dwd_table, order_col, window_start, window_end)
result["dwd_window_count"] = dwd_count
result["order_column"] = order_col
except Exception as exc:
result["checks"].append(f"✗ 查询 DWD 窗口行数失败: {exc}")
return result
# ODS 窗口内行数(用 fetched_at
try:
ods_count = _query_count_windowed(db_conn, ods_table, "fetched_at", window_start, window_end)
result["ods_window_count"] = ods_count
except Exception as exc:
result["checks"].append(f"✗ 查询 ODS 窗口行数失败: {exc}")
ods_count = None
if ods_count is not None:
# 事实表可能因去重/映射导致行数不完全一致,但差异不应过大
if ods_count > 0:
            ratio = dwd_count / ods_count
result["ratio"] = round(ratio, 4)
if ratio < 0.5:
result["checks"].append(f"⚠ DWD/ODS 比率偏低: {ratio:.2%} (DWD={dwd_count}, ODS={ods_count})")
else:
result["checks"].append(f"✓ DWD/ODS 比率正常: {ratio:.2%} (DWD={dwd_count}, ODS={ods_count})")
else:
result["checks"].append(f" ODS 窗口内无数据 (DWD={dwd_count})")
# 主键重复检查
pk_sql = """
SELECT a.attname
FROM pg_index i
JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
WHERE i.indrelid = %s::regclass AND i.indisprimary
ORDER BY array_position(i.indkey, a.attnum)
"""
try:
pk_rows = db_conn.query(pk_sql, (dwd_table,))
pk_cols = [r["attname"] for r in pk_rows]
if pk_cols:
pk_list = ", ".join(f'"{c}"' for c in pk_cols)
dup_sql = f"""
SELECT {pk_list}, COUNT(*) AS cnt
FROM {dwd_table}
GROUP BY {pk_list}
HAVING COUNT(*) > 1
LIMIT 5
"""
dup_rows = db_conn.query(dup_sql)
dup_count = len(dup_rows) if dup_rows else 0
if dup_count > 0:
result["checks"].append(f"⚠ 发现 {dup_count} 组主键重复")
else:
result["checks"].append("✓ 主键无重复")
except Exception as exc:
result["checks"].append(f"✗ 主键重复检查失败: {exc}")
return result
# ── FACT_MAPPINGS 列映射检查 ──────────────────────────────────
def _check_fact_mappings(db_conn: DatabaseConnection, dwd_table: str, ods_table: str,
logger: logging.Logger) -> dict:
"""验证 FACT_MAPPINGS 中的列映射和类型转换。
验证项:
- 映射中的 DWD 目标列确实存在于 DWD 表
- 简单列名映射的 ODS 源列确实存在于 ODS 表
- 类型转换标注合理cast_type 非空时目标列类型匹配)
"""
result = {"checks": [], "mapping_count": 0, "issues": []}
mappings = DwdLoadTask.FACT_MAPPINGS.get(dwd_table, [])
if not mappings:
result["checks"].append(" 无显式 FACT_MAPPINGS 条目")
return result
result["mapping_count"] = len(mappings)
# 获取 DWD 和 ODS 的列集合
dwd_cols_sql = """
SELECT column_name FROM information_schema.columns
WHERE table_schema = %s AND table_name = %s
"""
ods_cols_sql = dwd_cols_sql
dwd_schema, dwd_name = dwd_table.split(".", 1)
ods_schema, ods_name = ods_table.split(".", 1)
try:
dwd_col_rows = db_conn.query(dwd_cols_sql, (dwd_schema, dwd_name))
dwd_cols = {r["column_name"].lower() for r in dwd_col_rows}
except Exception as exc:
result["checks"].append(f"✗ 获取 DWD 列信息失败: {exc}")
return result
try:
ods_col_rows = db_conn.query(ods_cols_sql, (ods_schema, ods_name))
ods_cols = {r["column_name"].lower() for r in ods_col_rows}
except Exception as exc:
result["checks"].append(f"✗ 获取 ODS 列信息失败: {exc}")
return result
missing_dwd = []
missing_ods = []
for dwd_col, ods_expr, cast_type in mappings:
# 检查 DWD 目标列
if dwd_col.lower() not in dwd_cols:
missing_dwd.append(dwd_col)
# 检查 ODS 源列(仅简单列名,跳过表达式如 JSON 提取、CASE 等)
is_simple_col = (
ods_expr.isidentifier()
or (ods_expr.startswith('"') and ods_expr.endswith('"'))
)
if is_simple_col:
col_name = ods_expr.strip('"').lower()
if col_name not in ods_cols:
missing_ods.append((dwd_col, ods_expr))
if missing_dwd:
result["issues"].extend([f"DWD 列不存在: {c}" for c in missing_dwd])
result["checks"].append(f"{len(missing_dwd)} 个 DWD 目标列不存在: {missing_dwd}")
else:
result["checks"].append(f"✓ 所有 {len(mappings)} 个 DWD 目标列均存在")
if missing_ods:
result["issues"].extend([f"ODS 列不存在: {dwd}{ods}" for dwd, ods in missing_ods])
result["checks"].append(f"{len(missing_ods)} 个 ODS 源列不存在: {missing_ods}")
else:
simple_count = sum(
1 for _, expr, _ in mappings
if expr.isidentifier() or (expr.startswith('"') and expr.endswith('"'))
)
result["checks"].append(f"✓ 所有 {simple_count} 个简单列名映射的 ODS 源列均存在")
return result
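# FACT_MAPPINGS 条目为 (dwd_col, ods_expr, cast_type) 三元组(结构由上方解包
# 推断,具体条目以 DwdLoadTask.FACT_MAPPINGS 定义为准);示意(列名为假设):
#   ("pay_amount", "amount", "numeric")          # 简单列名映射,会校验 ODS 源列存在
#   ("member_name", "payload ->> 'name'", None)  # 表达式映射,跳过源列存在性检查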
# ── 单表调试 ──────────────────────────────────────────────────
def _debug_single_table(
dwd_table: str,
ods_table: str,
db_conn: DatabaseConnection,
window_start: datetime,
window_end: datetime,
logger: logging.Logger,
) -> DebugResult:
"""对单张 DWD 表执行调试检查(不执行装载,仅验证现有数据)。"""
result = DebugResult(table_name=dwd_table, ods_source=ods_table)
is_dim = _is_dim_table(dwd_table)
result.mode = "SCD2" if is_dim else "INCREMENT"
logger.info("" * 60)
logger.info("▶ 检查: %s%s (%s)", dwd_table, ods_table, result.mode)
t0 = time.monotonic()
issues = []
# 1) 基本行数
try:
dwd_total = _query_count(db_conn, dwd_table)
ods_total = _query_count(db_conn, ods_table)
result.dwd_row_count = dwd_total
result.ods_row_count = ods_total
logger.info(" 行数: DWD=%d, ODS=%d", dwd_total, ods_total)
except Exception as exc:
result.status = "ERROR"
result.message = f"查询行数失败: {exc}"
result.error_detail = traceback.format_exc()
result.duration_sec = round(time.monotonic() - t0, 2)
logger.error("%s", result.message)
return result
# 2) FACT_MAPPINGS 列映射检查
try:
mapping_check = _check_fact_mappings(db_conn, dwd_table, ods_table, logger)
result.mapping_check = mapping_check
for check in mapping_check.get("checks", []):
logger.info(" 映射: %s", check)
if mapping_check.get("issues"):
issues.extend(mapping_check["issues"])
except Exception as exc:
logger.warning(" ⚠ 列映射检查异常: %s", exc)
# 3) 维度表 SCD2 检查 / 事实表窗口检查
if is_dim:
try:
scd2_check = _check_scd2_integrity(db_conn, dwd_table, logger)
result.scd2_check = scd2_check
for check in scd2_check.get("checks", []):
logger.info(" SCD2: %s", check)
# 含 ⚠ 的检查项视为 issue
            issues.extend(c for c in scd2_check.get("checks", []) if "⚠" in c)
except Exception as exc:
logger.warning(" ⚠ SCD2 检查异常: %s", exc)
else:
try:
fact_check = _check_fact_window(
db_conn, dwd_table, ods_table, window_start, window_end, logger,
)
result.fact_window_check = fact_check
for check in fact_check.get("checks", []):
logger.info(" 窗口: %s", check)
            issues.extend(c for c in fact_check.get("checks", []) if "⚠" in c)
except Exception as exc:
logger.warning(" ⚠ 窗口检查异常: %s", exc)
# 4) 最终状态
result.duration_sec = round(time.monotonic() - t0, 2)
if issues:
result.status = "WARN"
result.message = f"{len(issues)} 个问题: {issues[0]}"
elif dwd_total == 0:
result.status = "WARN"
result.message = "DWD 表为空"
else:
result.status = "PASS"
result.message = f"检查通过 (DWD={dwd_total}行)"
icon = {"PASS": "", "WARN": "", "ERROR": "", "FAIL": ""}.get(result.status, "?")
logger.info(" %s 结果: %s - %s (%.1fs)", icon, result.status, result.message, result.duration_sec)
return result
# ── 执行 DWD_LOAD_FROM_ODS 任务 ──────────────────────────────
def _execute_dwd_load(
executor: TaskExecutor,
config: AppConfig,
logger: logging.Logger,
) -> dict:
"""执行 DWD_LOAD_FROM_ODS 任务并返回结果。"""
store_id = int(config.get("app.store_id"))
run_uuid = f"debug-dwd-load-{int(time.time())}"
logger.info("" * 60)
logger.info("▶ 执行 DWD_LOAD_FROM_ODS 任务")
t0 = time.monotonic()
try:
task_result = executor.run_single_task(
task_code="DWD_LOAD_FROM_ODS",
run_uuid=run_uuid,
store_id=store_id,
data_source="online",
)
elapsed = round(time.monotonic() - t0, 2)
logger.info(" 执行完成,耗时 %.1fs", elapsed)
# 解析结果
tables = task_result.get("tables", [])
errors = task_result.get("errors", [])
logger.info(" 处理表数: %d, 错误表数: %d", len(tables), len(errors))
for t in tables:
tbl = t.get("table", "")
mode = t.get("mode", "")
ins = t.get("inserted", 0)
upd = t.get("updated", 0)
proc = t.get("processed", 0)
logger.info(" %s [%s]: processed=%d, inserted=%d, updated=%d", tbl, mode, proc, ins, upd)
for e in errors:
logger.error("%s: %s", e.get("table", ""), e.get("error", ""))
return {
"status": "SUCCESS" if not errors else "PARTIAL",
"tables": tables,
"errors": errors,
"duration_sec": elapsed,
}
except Exception as exc:
elapsed = round(time.monotonic() - t0, 2)
logger.error(" ✗ 执行异常: %s", exc)
return {
"status": "ERROR",
"tables": [],
"errors": [{"table": "DWD_LOAD_FROM_ODS", "error": str(exc)}],
"duration_sec": elapsed,
"traceback": traceback.format_exc(),
}
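# 返回值结构示意(与上方两个分支一致ERROR 分支额外携带 "traceback"
#   {"status": "SUCCESS" | "PARTIAL" | "ERROR",
#    "tables": [{"table": ..., "mode": ..., "processed": ..., "inserted": ..., "updated": ...}],
#    "errors": [{"table": ..., "error": ...}],
#    "duration_sec": 12.3}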
# ── 主流程 ────────────────────────────────────────────────────
def run_dwd_debug(
hours: float = 2.0,
table_filter: list[str] | None = None,
skip_load: bool = False,
) -> list[DebugResult]:
"""执行 DWD 层全量调试。
Args:
hours: 回溯窗口小时数(默认 2 小时)
table_filter: 仅调试指定的 DWD 表名列表None 表示全部
skip_load: 跳过 DWD_LOAD_FROM_ODS 执行,仅做数据检查
Returns:
所有表的 DebugResult 列表
"""
logger = _setup_logging()
logger.info("=" * 60)
logger.info("DWD 层调试开始")
logger.info("=" * 60)
# 加载配置
config = AppConfig.load()
tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
window_end = datetime.now(tz)
window_start = window_end - timedelta(hours=hours)
logger.info("门店 ID: %s", config.get("app.store_id"))
logger.info("数据库: %s", config.get("db.name", ""))
logger.info("时间窗口: %s ~ %s (%.1f 小时)", window_start, window_end, hours)
# 设置 window_override
config.config.setdefault("run", {}).setdefault("window_override", {})
config.config["run"]["window_override"]["start"] = window_start
config.config["run"]["window_override"]["end"] = window_end
# 构建组件
db_conn, api_client, db_ops, executor = _build_components(config, logger)
# 步骤1执行 DWD_LOAD_FROM_ODS可选
load_result = None
if not skip_load:
load_result = _execute_dwd_load(executor, config, logger)
logger.info("")
# 步骤2逐表检查 TABLE_MAP 中的映射
table_map = DwdLoadTask.TABLE_MAP
if table_filter:
filter_set = {t.lower() for t in table_filter}
filtered_map = {
k: v for k, v in table_map.items()
if k.lower() in filter_set or k.split(".")[-1].lower() in filter_set
}
        matched = {k.lower() for k in filtered_map} | {
            k.split(".")[-1].lower() for k in filtered_map
        }
        skipped = filter_set - matched
if skipped:
logger.warning("以下表不在 TABLE_MAP 中,已跳过: %s", skipped)
table_map = filtered_map
logger.info("")
logger.info("=" * 60)
logger.info("逐表数据检查 (%d 张表)", len(table_map))
logger.info("=" * 60)
results: list[DebugResult] = []
for idx, (dwd_table, ods_table) in enumerate(table_map.items(), start=1):
logger.info("[%d/%d] %s", idx, len(table_map), dwd_table)
try:
r = _debug_single_table(
dwd_table=dwd_table,
ods_table=ods_table,
db_conn=db_conn,
window_start=window_start,
window_end=window_end,
logger=logger,
)
# 补充装载结果中的 counts
if load_result and load_result.get("tables"):
for t in load_result["tables"]:
if t.get("table") == dwd_table:
r.counts = {
k: v for k, v in t.items() if k != "table"
}
break
# 补充装载错误
if load_result and load_result.get("errors"):
for e in load_result["errors"]:
if e.get("table") == dwd_table:
r.status = "ERROR"
r.message = f"装载失败: {e.get('error', '')}"
r.error_detail = e.get("error", "")
break
except Exception as exc:
r = DebugResult(
table_name=dwd_table,
ods_source=ods_table,
status="ERROR",
message=f"未捕获异常: {exc}",
error_detail=traceback.format_exc(),
)
logger.error(" ✗ 未捕获异常: %s", exc)
results.append(r)
db_conn.ensure_open()
# 汇总
_print_summary(results, load_result, logger)
# 输出 JSON
output_dir = _FEIQIU_ROOT / "scripts" / "debug" / "output"
output_dir.mkdir(parents=True, exist_ok=True)
ts = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
output_file = output_dir / f"debug_dwd_{ts}.json"
_save_results(results, load_result, output_file)
logger.info("结果已保存: %s", output_file)
db_conn.close()
return results
# ── 汇总与输出 ────────────────────────────────────────────────
def _print_summary(results: list[DebugResult], load_result: dict | None,
logger: logging.Logger):
"""打印调试汇总。"""
logger.info("")
logger.info("=" * 60)
logger.info("DWD 层调试汇总")
logger.info("=" * 60)
# 装载结果
if load_result:
logger.info("DWD_LOAD_FROM_ODS 执行: %s (耗时 %.1fs)",
load_result.get("status", "N/A"),
load_result.get("duration_sec", 0))
tables = load_result.get("tables", [])
errors = load_result.get("errors", [])
total_inserted = sum(t.get("inserted", 0) for t in tables)
total_updated = sum(t.get("updated", 0) for t in tables)
logger.info(" 处理表数: %d, 错误表数: %d", len(tables), len(errors))
logger.info(" 总计: inserted=%d, updated=%d", total_inserted, total_updated)
# 逐表检查结果
pass_count = sum(1 for r in results if r.status == "PASS")
warn_count = sum(1 for r in results if r.status == "WARN")
error_count = sum(1 for r in results if r.status in ("ERROR", "FAIL"))
total_duration = sum(r.duration_sec for r in results)
logger.info("")
logger.info("逐表检查: %d 张表", len(results))
logger.info(" ✓ PASS: %d", pass_count)
logger.info(" ⚠ WARN: %d", warn_count)
logger.info(" ✗ ERROR: %d", error_count)
logger.info(" 总耗时: %.1f", total_duration)
# 维度表 vs 事实表统计
dim_results = [r for r in results if r.mode == "SCD2"]
fact_results = [r for r in results if r.mode == "INCREMENT"]
logger.info("")
logger.info("维度表: %d 张 (PASS=%d, WARN=%d, ERROR=%d)",
len(dim_results),
sum(1 for r in dim_results if r.status == "PASS"),
sum(1 for r in dim_results if r.status == "WARN"),
sum(1 for r in dim_results if r.status in ("ERROR", "FAIL")))
logger.info("事实表: %d 张 (PASS=%d, WARN=%d, ERROR=%d)",
len(fact_results),
sum(1 for r in fact_results if r.status == "PASS"),
sum(1 for r in fact_results if r.status == "WARN"),
sum(1 for r in fact_results if r.status in ("ERROR", "FAIL")))
# 列出非 PASS 的表
non_pass = [r for r in results if r.status != "PASS"]
if non_pass:
logger.info("")
logger.info("需关注的表:")
for r in non_pass:
logger.info(" [%s] %s: %s", r.status, r.table_name, r.message)
else:
logger.info("")
logger.info("所有表均通过 ✓")
def _save_results(results: list[DebugResult], load_result: dict | None, path: Path):
"""将结果序列化为 JSON。"""
data = {
"load_result": _sanitize_for_json(load_result) if load_result else None,
"table_checks": [_sanitize_for_json(asdict(r)) for r in results],
}
path.write_text(
json.dumps(data, ensure_ascii=False, indent=2, default=str),
encoding="utf-8",
)
def _sanitize_for_json(obj):
"""递归处理不可序列化的值。"""
if isinstance(obj, dict):
return {k: _sanitize_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [_sanitize_for_json(v) for v in obj]
if isinstance(obj, datetime):
return obj.isoformat()
return obj
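# 用法示意datetime 转 ISO 字符串、元组转列表、嵌套结构递归处理:
#   _sanitize_for_json({"t": datetime(2026, 1, 1), "xs": (1, 2)})
#   -> {"t": "2026-01-01T00:00:00", "xs": [1, 2]}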
# ── CLI 入口 ──────────────────────────────────────────────────
def parse_args():
parser = argparse.ArgumentParser(description="DWD 层调试脚本")
parser.add_argument("--hours", type=float, default=2.0,
help="回溯窗口小时数(默认 2")
parser.add_argument("--tables", type=str, default=None,
help="仅调试指定 DWD 表,逗号分隔(如 dwd.dim_member,dwd.dwd_payment")
parser.add_argument("--skip-load", action="store_true",
help="跳过 DWD_LOAD_FROM_ODS 执行,仅做数据检查")
return parser.parse_args()
def main():
args = parse_args()
table_filter = None
if args.tables:
table_filter = [t.strip() for t in args.tables.split(",") if t.strip()]
results = run_dwd_debug(
hours=args.hours,
table_filter=table_filter,
skip_load=args.skip_load,
)
has_error = any(r.status in ("ERROR", "FAIL") for r in results)
sys.exit(1 if has_error else 0)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,575 @@
# -*- coding: utf-8 -*-
"""DWS 层逐任务调试脚本。
连接真实数据库,逐个执行 15 个 DWS 汇总任务,
验证返回结果和 DWS 表写入情况,抽样检查汇总数据与 DWD 明细数据的一致性。
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.debug_dws [--hours 48] [--tasks DWS_FINANCE_DAILY,DWS_ASSISTANT_DAILY]
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
import traceback
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
# ── 确保项目根目录在 sys.path ──
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
sys.path.insert(0, str(_FEIQIU_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
from api.client import APIClient
from orchestration.task_registry import default_registry
from orchestration.cursor_manager import CursorManager
from orchestration.run_tracker import RunTracker
from orchestration.task_executor import TaskExecutor
@dataclass
class DebugResult:
"""单个 DWS 任务的调试结果"""
layer: str = "DWS"
task_code: str = ""
status: str = "" # PASS / FAIL / WARN / ERROR
message: str = ""
counts: dict = field(default_factory=dict)
target_table: str = ""
pre_row_count: int | None = None
post_row_count: int | None = None
consistency_check: dict | None = None
duration_sec: float = 0.0
error_detail: str | None = None
fix_applied: str | None = None
# ── 工具函数 ──────────────────────────────────────────────────
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("debug_dws")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
def _build_components(config: AppConfig, logger: logging.Logger):
"""构建 DB / API / TaskExecutor 等组件。"""
db_conn = DatabaseConnection(
dsn=config["db"]["dsn"],
session=config["db"].get("session"),
connect_timeout=config["db"].get("connect_timeout_sec"),
)
api_client = APIClient(
base_url=config["api"]["base_url"],
token=config["api"]["token"],
timeout=config["api"].get("timeout_sec", 20),
retry_max=config["api"].get("retries", {}).get("max_attempts", 3),
headers_extra=config["api"].get("headers_extra"),
)
db_ops = DatabaseOperations(db_conn)
cursor_mgr = CursorManager(db_conn)
run_tracker = RunTracker(db_conn)
executor = TaskExecutor(
config, db_ops, api_client,
cursor_mgr, run_tracker, default_registry, logger,
)
return db_conn, api_client, db_ops, executor
def _get_dws_target_table(task_code: str, config, db_conn, api_client, logger) -> str | None:
"""通过临时实例获取 DWS 任务的目标表名。"""
meta = default_registry.get_metadata(task_code)
if meta is None:
return None
try:
task_instance = meta.task_class(config, db_conn, api_client, logger)
raw_name = task_instance.get_target_table()
# 目标表名不含 schema 前缀时补上 dws.
if raw_name and "." not in raw_name:
return f"dws.{raw_name}"
return raw_name
except Exception:
return None
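# 表名归一化示意(返回值为假设示例):
#   get_target_table() 返回 "dws_finance_daily_summary" -> 补全为 "dws.dws_finance_daily_summary"
#   返回值已含 schema"dws.xxx" -> 原样使用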
def _query_count(db_conn: DatabaseConnection, table: str) -> int:
"""查询表的总行数。"""
rows = db_conn.query(f"SELECT COUNT(*) AS cnt FROM {table}")
return int(rows[0]["cnt"]) if rows else 0
def _has_column(db_conn: DatabaseConnection, table: str, column: str) -> bool:
"""检查表是否包含指定列。"""
sql = """
SELECT 1 FROM information_schema.columns
WHERE table_schema || '.' || table_name = %s
AND column_name = %s
LIMIT 1
"""
rows = db_conn.query(sql, (table, column))
return bool(rows)
def _table_exists(db_conn: DatabaseConnection, table: str) -> bool:
"""检查表/视图是否存在。"""
rows = db_conn.query("SELECT to_regclass(%s) AS reg", (table,))
return bool(rows and rows[0].get("reg"))
# ── DWS 与 DWD 一致性抽样验证 ────────────────────────────────
# 已知的 DWS→DWD 聚合关系映射(用于抽样验证)
# 格式: dws_table -> {dwd_source, dws_date_col, dwd_date_col, amount_cols}
_DWS_DWD_CONSISTENCY_MAP: dict[str, dict] = {
"dws.dws_assistant_daily_detail": {
"dwd_source": "dwd.dwd_assistant_service_log",
"dws_date_col": "stat_date",
"dwd_date_col": "service_date",
"group_cols": ["site_id", "assistant_id"],
"dws_count_col": "service_count",
"dwd_count_expr": "COUNT(*)",
"description": "助教日度服务次数 vs DWD 服务流水",
},
"dws.dws_finance_daily_summary": {
"dwd_source": "dwd.dwd_order",
"dws_date_col": "stat_date",
"dwd_date_col": "order_date",
"group_cols": ["site_id"],
"dws_count_col": "order_count",
"dwd_count_expr": "COUNT(*)",
"description": "财务日度订单数 vs DWD 订单表",
},
"dws.dws_member_visit_detail": {
"dwd_source": "dwd.dwd_order",
"dws_date_col": "visit_date",
"dwd_date_col": "order_date",
"group_cols": ["site_id", "member_id"],
"dws_count_col": None, # 无直接计数列,仅做行数对比
"dwd_count_expr": None,
"description": "会员到店明细 vs DWD 订单表",
},
}
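# 若要为新的 DWS 表接入抽样验证,按同样结构补充条目即可(示意,表名与列名
# 均为假设,需以实际 schema 为准):
# _DWS_DWD_CONSISTENCY_MAP["dws.dws_goods_daily_summary"] = {
#     "dwd_source": "dwd.dwd_goods_sale",
#     "dws_date_col": "stat_date",
#     "dwd_date_col": "sale_date",
#     "group_cols": ["site_id", "goods_id"],
#     "dws_count_col": "sale_count",
#     "dwd_count_expr": "COUNT(*)",
#     "description": "商品日度销量 vs DWD 销售流水(示例)",
# }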
def _check_dws_dwd_consistency(
db_conn: DatabaseConnection,
dws_table: str,
logger: logging.Logger,
) -> dict:
"""抽样验证 DWS 汇总数据与 DWD 明细数据的一致性。
对已知映射关系的表,抽取最近 3 天的数据做聚合对比。
对未知映射的表,仅做基本行数检查。
"""
result = {"checks": [], "has_mapping": False}
mapping = _DWS_DWD_CONSISTENCY_MAP.get(dws_table)
if not mapping:
result["checks"].append(" 无已知 DWS→DWD 映射,跳过一致性验证")
return result
result["has_mapping"] = True
result["description"] = mapping["description"]
dwd_source = mapping["dwd_source"]
dws_date_col = mapping["dws_date_col"]
dwd_date_col = mapping["dwd_date_col"]
# 检查 DWD 源表是否存在
if not _table_exists(db_conn, dwd_source):
result["checks"].append(f"⚠ DWD 源表不存在: {dwd_source}")
return result
# 抽样:取 DWS 表中最近 3 个不同日期
try:
sample_sql = f"""
SELECT DISTINCT "{dws_date_col}" AS d
FROM {dws_table}
ORDER BY d DESC
LIMIT 3
"""
date_rows = db_conn.query(sample_sql)
if not date_rows:
result["checks"].append(" DWS 表无数据,跳过一致性验证")
return result
sample_dates = [r["d"] for r in date_rows]
except Exception as exc:
result["checks"].append(f"✗ 查询 DWS 日期失败: {exc}")
return result
# 对比每个抽样日期的行数
mismatches = []
for sample_date in sample_dates:
try:
dws_count_sql = f"""
SELECT COUNT(*) AS cnt FROM {dws_table}
WHERE "{dws_date_col}" = %s
"""
dws_rows = db_conn.query(dws_count_sql, (sample_date,))
dws_count = int(dws_rows[0]["cnt"]) if dws_rows else 0
# DWD 侧:检查对应日期列是否存在
if not _has_column(db_conn, dwd_source, dwd_date_col):
result["checks"].append(f"⚠ DWD 表缺少日期列 {dwd_date_col}")
break
dwd_count_sql = f"""
SELECT COUNT(*) AS cnt FROM {dwd_source}
WHERE "{dwd_date_col}" = %s
"""
dwd_rows = db_conn.query(dwd_count_sql, (sample_date,))
dwd_count = int(dwd_rows[0]["cnt"]) if dwd_rows else 0
# DWS 是聚合表,行数通常 <= DWD 行数(按 group_cols 聚合)
if dws_count > 0 and dwd_count == 0:
mismatches.append(
f"日期 {sample_date}: DWS={dws_count} 但 DWD=0DWD 无对应数据)"
)
elif dws_count == 0 and dwd_count > 0:
mismatches.append(
f"日期 {sample_date}: DWS=0 但 DWD={dwd_count}DWS 未汇总)"
)
else:
result["checks"].append(
f"✓ 日期 {sample_date}: DWS={dws_count}行, DWD={dwd_count}"
)
except Exception as exc:
result["checks"].append(f"✗ 日期 {sample_date} 对比失败: {exc}")
if mismatches:
result["checks"].extend(f"{m}" for m in mismatches)
result["mismatch_count"] = len(mismatches)
else:
result["mismatch_count"] = 0
return result
# ── 核心调试逻辑 ──────────────────────────────────────────────
def debug_single_dws_task(
task_code: str,
executor: TaskExecutor,
db_conn: DatabaseConnection,
config: AppConfig,
api_client,
logger: logging.Logger,
window_start: datetime,
window_end: datetime,
) -> DebugResult:
"""执行单个 DWS 任务并验证结果。"""
result = DebugResult(task_code=task_code)
# 获取目标表名
target_table = _get_dws_target_table(task_code, config, db_conn, api_client, logger)
result.target_table = target_table or ""
store_id = int(config.get("app.store_id"))
run_uuid = f"debug-dws-{task_code.lower()}-{int(time.time())}"
logger.info("" * 60)
logger.info("▶ 开始调试: %s (表: %s)", task_code, target_table or "未知")
# 执行前查询表行数
if target_table and _table_exists(db_conn, target_table):
try:
result.pre_row_count = _query_count(db_conn, target_table)
logger.info(" 执行前表行数: %d", result.pre_row_count)
except Exception as exc:
logger.warning(" 查询执行前行数失败: %s", exc)
# 执行任务
t0 = time.monotonic()
try:
task_result = executor.run_single_task(
task_code=task_code,
run_uuid=run_uuid,
store_id=store_id,
data_source="online",
)
result.duration_sec = round(time.monotonic() - t0, 2)
except Exception as exc:
result.duration_sec = round(time.monotonic() - t0, 2)
result.status = "ERROR"
result.message = f"任务执行异常: {exc}"
result.error_detail = traceback.format_exc()
logger.error(" ✗ 执行异常: %s", exc)
return result
# 解析返回结果
task_status = (task_result.get("status") or "").upper()
counts = task_result.get("counts") or {}
result.counts = counts
logger.info(" 返回状态: %s", task_status)
logger.info(" counts: %s", counts)
# 执行后查询表行数
if target_table and _table_exists(db_conn, target_table):
try:
result.post_row_count = _query_count(db_conn, target_table)
logger.info(" 执行后表行数: %d", result.post_row_count)
if result.pre_row_count is not None:
delta = result.post_row_count - result.pre_row_count
logger.info(" 行数变化: %+d", delta)
except Exception as exc:
logger.warning(" 查询执行后行数失败: %s", exc)
# 抽样验证 DWS 与 DWD 一致性
if target_table and _table_exists(db_conn, target_table):
try:
consistency = _check_dws_dwd_consistency(db_conn, target_table, logger)
result.consistency_check = consistency
for check in consistency.get("checks", []):
logger.info(" 一致性: %s", check)
except Exception as exc:
logger.warning(" ⚠ 一致性检查异常: %s", exc)
# 最终状态判定
issues = []
errors_count = counts.get("errors", 0)
if errors_count:
issues.append(f"执行有 {errors_count} 个错误")
if result.consistency_check and result.consistency_check.get("mismatch_count", 0) > 0:
issues.append(f"一致性检查有 {result.consistency_check['mismatch_count']} 处不一致")
if result.post_row_count is not None and result.post_row_count == 0:
issues.append("执行后表为空")
if issues:
result.status = "WARN"
result.message = "; ".join(issues)
elif task_status in ("SUCCESS", "PARTIAL", "COMPLETE"):
result.status = "PASS"
result.message = f"执行成功, counts={counts}"
elif task_status == "SKIP":
result.status = "WARN"
result.message = "任务被跳过(未启用或不存在)"
else:
result.status = "WARN"
result.message = f"未知状态: {task_status}"
icon = {"PASS": "", "WARN": "", "ERROR": "", "FAIL": ""}.get(result.status, "?")
logger.info(" %s 结果: %s - %s (耗时 %.1fs)", icon, result.status, result.message, result.duration_sec)
return result
# ── 主流程 ────────────────────────────────────────────────────
def run_dws_debug(
hours: float = 48.0,
task_filter: list[str] | None = None,
) -> list[DebugResult]:
"""执行 DWS 层全量调试。
Args:
        hours: 回溯窗口小时数(默认 48 小时DWS 汇总通常按天粒度)
task_filter: 仅调试指定的任务代码列表None 表示全部
Returns:
所有任务的 DebugResult 列表
"""
logger = _setup_logging()
logger.info("=" * 60)
logger.info("DWS 层调试开始")
logger.info("=" * 60)
    # 加载配置(从 .env
config = AppConfig.load()
tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
window_end = datetime.now(tz)
window_start = window_end - timedelta(hours=hours)
logger.info("门店 ID: %s", config.get("app.store_id"))
logger.info("数据库: %s", config.get("db.name", ""))
logger.info("API: %s", config.get("api.base_url", ""))
logger.info("时间窗口: %s ~ %s (%.1f 小时)", window_start, window_end, hours)
# 设置 window_override 让所有任务使用统一窗口
config.config.setdefault("run", {}).setdefault("window_override", {})
config.config["run"]["window_override"]["start"] = window_start
config.config["run"]["window_override"]["end"] = window_end
# 构建组件
db_conn, api_client, db_ops, executor = _build_components(config, logger)
# 获取所有 DWS 层任务
all_dws_codes = sorted(default_registry.get_tasks_by_layer("DWS"))
if task_filter:
filter_set = {t.upper() for t in task_filter}
dws_codes = [c for c in all_dws_codes if c in filter_set]
skipped = filter_set - set(dws_codes)
if skipped:
logger.warning("以下任务不在 DWS 层注册表中,已跳过: %s", skipped)
else:
dws_codes = all_dws_codes
logger.info("待调试 DWS 任务: %d", len(dws_codes))
logger.info("任务列表: %s", ", ".join(dws_codes))
logger.info("")
# 逐个执行
results: list[DebugResult] = []
for idx, task_code in enumerate(dws_codes, start=1):
logger.info("[%d/%d] %s", idx, len(dws_codes), task_code)
try:
r = debug_single_dws_task(
task_code=task_code,
executor=executor,
db_conn=db_conn,
config=config,
api_client=api_client,
logger=logger,
window_start=window_start,
window_end=window_end,
)
except Exception as exc:
r = DebugResult(
task_code=task_code,
status="ERROR",
message=f"未捕获异常: {exc}",
error_detail=traceback.format_exc(),
)
logger.error(" ✗ 未捕获异常: %s", exc)
results.append(r)
# 确保连接可用
db_conn.ensure_open()
# 汇总
_print_summary(results, logger)
# 输出 JSON 结果
output_dir = _FEIQIU_ROOT / "scripts" / "debug" / "output"
output_dir.mkdir(parents=True, exist_ok=True)
ts = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
output_file = output_dir / f"debug_dws_{ts}.json"
_save_results(results, output_file)
logger.info("结果已保存: %s", output_file)
# 清理
db_conn.close()
return results
# ── 汇总与输出 ────────────────────────────────────────────────
def _print_summary(results: list[DebugResult], logger: logging.Logger):
"""打印调试汇总。"""
logger.info("")
logger.info("=" * 60)
logger.info("DWS 层调试汇总")
logger.info("=" * 60)
pass_count = sum(1 for r in results if r.status == "PASS")
warn_count = sum(1 for r in results if r.status == "WARN")
error_count = sum(1 for r in results if r.status in ("ERROR", "FAIL"))
total_duration = sum(r.duration_sec for r in results)
logger.info("总计: %d 个任务", len(results))
logger.info(" ✓ PASS: %d", pass_count)
logger.info(" ⚠ WARN: %d", warn_count)
logger.info(" ✗ ERROR: %d", error_count)
logger.info(" 总耗时: %.1f", total_duration)
logger.info("")
# 按任务类型分组统计
    utility_codes = ("DWS_RETENTION_CLEANUP", "DWS_BUILD_ORDER_SUMMARY")
    regular_tasks = [r for r in results
                     if not r.task_code.startswith("DWS_MV_")
                     and r.task_code not in utility_codes]
    mv_tasks = [r for r in results if r.task_code.startswith("DWS_MV_")]
    utility_tasks = [r for r in results if r.task_code in utility_codes]
if regular_tasks:
logger.info("业务汇总任务: %d 个 (PASS=%d, WARN=%d, ERROR=%d)",
len(regular_tasks),
sum(1 for r in regular_tasks if r.status == "PASS"),
sum(1 for r in regular_tasks if r.status == "WARN"),
sum(1 for r in regular_tasks if r.status in ("ERROR", "FAIL")))
if mv_tasks:
logger.info("物化视图刷新: %d 个 (PASS=%d, WARN=%d, ERROR=%d)",
len(mv_tasks),
sum(1 for r in mv_tasks if r.status == "PASS"),
sum(1 for r in mv_tasks if r.status == "WARN"),
sum(1 for r in mv_tasks if r.status in ("ERROR", "FAIL")))
if utility_tasks:
logger.info("工具类任务: %d 个 (PASS=%d, WARN=%d, ERROR=%d)",
len(utility_tasks),
sum(1 for r in utility_tasks if r.status == "PASS"),
sum(1 for r in utility_tasks if r.status == "WARN"),
sum(1 for r in utility_tasks if r.status in ("ERROR", "FAIL")))
# 列出非 PASS 的任务
non_pass = [r for r in results if r.status != "PASS"]
if non_pass:
logger.info("")
logger.info("需关注的任务:")
for r in non_pass:
logger.info(" [%s] %s: %s", r.status, r.task_code, r.message)
else:
logger.info("")
logger.info("所有任务均通过 ✓")
def _save_results(results: list[DebugResult], path: Path):
"""将结果序列化为 JSON。"""
data = [_sanitize_for_json(asdict(r)) for r in results]
path.write_text(
json.dumps(data, ensure_ascii=False, indent=2, default=str),
encoding="utf-8",
)
def _sanitize_for_json(obj):
"""递归处理不可序列化的值。"""
if isinstance(obj, dict):
return {k: _sanitize_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [_sanitize_for_json(v) for v in obj]
if isinstance(obj, datetime):
return obj.isoformat()
return obj
# ── CLI 入口 ──────────────────────────────────────────────────
def parse_args():
parser = argparse.ArgumentParser(description="DWS 层逐任务调试")
parser.add_argument("--hours", type=float, default=48.0,
help="回溯窗口小时数(默认 48DWS 按天粒度汇总)")
parser.add_argument("--tasks", type=str, default=None,
help="仅调试指定任务,逗号分隔(如 DWS_FINANCE_DAILY,DWS_ASSISTANT_DAILY")
return parser.parse_args()
def main():
args = parse_args()
task_filter = None
if args.tasks:
task_filter = [t.strip().upper() for t in args.tasks.split(",") if t.strip()]
results = run_dws_debug(hours=args.hours, task_filter=task_filter)
# 退出码: 有 ERROR 则非零
has_error = any(r.status in ("ERROR", "FAIL") for r in results)
sys.exit(1 if has_error else 0)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,576 @@
# -*- coding: utf-8 -*-
"""INDEX 层逐任务调试脚本。
连接真实数据库,逐个执行 4 个 INDEX 层指数任务WBI/NCI/RS/ML
验证指数计算结果的合理性(非空、范围检查)。
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.debug_index [--hours 720] [--tasks DWS_WINBACK_INDEX,DWS_NEWCONV_INDEX]
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
import traceback
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
# ── 确保项目根目录在 sys.path ──
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
sys.path.insert(0, str(_FEIQIU_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
from api.client import APIClient
from orchestration.task_registry import default_registry
from orchestration.cursor_manager import CursorManager
from orchestration.run_tracker import RunTracker
from orchestration.task_executor import TaskExecutor
@dataclass
class DebugResult:
"""单个 INDEX 任务的调试结果"""
layer: str = "INDEX"
task_code: str = ""
status: str = "" # PASS / FAIL / WARN / ERROR
message: str = ""
counts: dict = field(default_factory=dict)
target_table: str = ""
pre_row_count: int | None = None
post_row_count: int | None = None
range_check: dict | None = None
duration_sec: float = 0.0
error_detail: str | None = None
fix_applied: str | None = None
# ── INDEX 任务 → 目标表 + 指数列 映射 ──
# 用于执行后的范围检查:指数列值应在 [0, 100] 或合理范围内
_INDEX_TABLE_META: dict[str, dict] = {
"DWS_WINBACK_INDEX": {
"target_table": "dws.dws_member_winback_index",
"score_columns": ["display_score", "raw_score"],
"display_range": (0, 100),
"description": "老客挽回指数WBI",
},
"DWS_NEWCONV_INDEX": {
"target_table": "dws.dws_member_newconv_index",
"score_columns": ["display_score", "raw_score"],
"display_range": (0, 100),
"description": "新客转化指数NCI",
},
"DWS_RELATION_INDEX": {
"target_table": "dws.dws_member_assistant_relation_index",
"score_columns": ["rs_display", "os_display", "ms_display", "ml_display"],
"display_range": (0, 100),
"description": "关系指数RS/OS/MS/ML",
},
"DWS_ML_MANUAL_IMPORT": {
"target_table": "dws.dws_ml_manual_order_source",
"score_columns": [], # ML 导入无指数列,仅检查行数
"display_range": None,
"description": "ML 人工台账导入",
},
}
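# 新增指数任务时在此补充元数据即可(示意,任务代码与表名均为假设):
# _INDEX_TABLE_META["DWS_CHURN_INDEX"] = {
#     "target_table": "dws.dws_member_churn_index",
#     "score_columns": ["display_score"],
#     "display_range": (0, 100),
#     "description": "流失风险指数(示例)",
# }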
# ── 工具函数 ──────────────────────────────────────────────────
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("debug_index")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
def _build_components(config: AppConfig, logger: logging.Logger):
"""构建 DB / API / TaskExecutor 等组件。"""
db_conn = DatabaseConnection(
dsn=config["db"]["dsn"],
session=config["db"].get("session"),
connect_timeout=config["db"].get("connect_timeout_sec"),
)
api_client = APIClient(
base_url=config["api"]["base_url"],
token=config["api"]["token"],
timeout=config["api"].get("timeout_sec", 20),
retry_max=config["api"].get("retries", {}).get("max_attempts", 3),
headers_extra=config["api"].get("headers_extra"),
)
db_ops = DatabaseOperations(db_conn)
cursor_mgr = CursorManager(db_conn)
run_tracker = RunTracker(db_conn)
executor = TaskExecutor(
config, db_ops, api_client,
cursor_mgr, run_tracker, default_registry, logger,
)
return db_conn, api_client, db_ops, executor
def _query_count(db_conn: DatabaseConnection, table: str) -> int:
"""查询表的总行数。"""
rows = db_conn.query(f"SELECT COUNT(*) AS cnt FROM {table}")
return int(rows[0]["cnt"]) if rows else 0
def _table_exists(db_conn: DatabaseConnection, table: str) -> bool:
"""检查表/视图是否存在。"""
rows = db_conn.query("SELECT to_regclass(%s) AS reg", (table,))
return bool(rows and rows[0].get("reg"))
def _has_column(db_conn: DatabaseConnection, table: str, column: str) -> bool:
"""检查表是否包含指定列。"""
sql = """
SELECT 1 FROM information_schema.columns
WHERE table_schema || '.' || table_name = %s
AND column_name = %s
LIMIT 1
"""
rows = db_conn.query(sql, (table, column))
return bool(rows)
# ── 指数范围检查 ──────────────────────────────────────────────
def _check_index_range(
db_conn: DatabaseConnection,
table: str,
score_columns: list[str],
display_range: tuple[float, float] | None,
logger: logging.Logger,
) -> dict:
"""检查指数列的值是否在合理范围内,并统计基本分布。
返回:
{
"columns_checked": [...],
"issues": [...],
"stats": {col: {min, max, avg, null_count, out_of_range_count, total}}
}
"""
result: dict = {"columns_checked": [], "issues": [], "stats": {}}
if not score_columns:
result["issues"].append(" 该任务无指数列,跳过范围检查")
return result
for col in score_columns:
if not _has_column(db_conn, table, col):
result["issues"].append(f"⚠ 列 {col} 不存在于 {table}")
continue
result["columns_checked"].append(col)
# 统计基本分布
stats_sql = f"""
SELECT
COUNT(*) AS total,
COUNT("{col}") AS non_null,
COUNT(*) - COUNT("{col}") AS null_count,
ROUND(MIN("{col}")::numeric, 4) AS min_val,
ROUND(MAX("{col}")::numeric, 4) AS max_val,
ROUND(AVG("{col}")::numeric, 4) AS avg_val
FROM {table}
"""
try:
rows = db_conn.query(stats_sql)
if not rows:
result["issues"].append(f"{col}: 查询统计失败(无返回行)")
continue
row = rows[0]
total = int(row["total"])
non_null = int(row["non_null"])
null_count = int(row["null_count"])
min_val = row["min_val"]
max_val = row["max_val"]
avg_val = row["avg_val"]
col_stats = {
"total": total,
"non_null": non_null,
"null_count": null_count,
"min": float(min_val) if min_val is not None else None,
"max": float(max_val) if max_val is not None else None,
"avg": float(avg_val) if avg_val is not None else None,
}
# 范围检查
if display_range and non_null > 0:
lo, hi = display_range
oor_sql = f"""
SELECT COUNT(*) AS cnt FROM {table}
WHERE "{col}" IS NOT NULL
AND ("{col}" < {lo} OR "{col}" > {hi})
"""
oor_rows = db_conn.query(oor_sql)
oor_count = int(oor_rows[0]["cnt"]) if oor_rows else 0
col_stats["out_of_range_count"] = oor_count
if oor_count > 0:
result["issues"].append(
f"{col}: {oor_count}/{non_null} 条记录超出 [{lo}, {hi}] 范围"
)
# 全 NULL 检查
if total > 0 and non_null == 0:
result["issues"].append(f"{col}: 全部为 NULL{total} 行)")
result["stats"][col] = col_stats
except Exception as exc:
result["issues"].append(f"{col}: 统计查询异常: {exc}")
return result
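# 范围判定的纯 Python 等价示意(帮助理解上方 SQL 的口径,非生产路径):
# NULLNone不计入超范围与上方 "col" IS NOT NULL 条件一致。
def _count_out_of_range(values: list[float | None], lo: float, hi: float) -> int:
    """示例:[50, 120, None] 在 [0, 100] 下返回 1。"""
    return sum(1 for v in values if v is not None and (v < lo or v > hi))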
# ── 核心调试逻辑 ──────────────────────────────────────────────
def debug_single_index_task(
task_code: str,
executor: TaskExecutor,
db_conn: DatabaseConnection,
config: AppConfig,
api_client,
logger: logging.Logger,
window_start: datetime,
window_end: datetime,
) -> DebugResult:
"""执行单个 INDEX 任务并验证结果。"""
result = DebugResult(task_code=task_code)
meta = _INDEX_TABLE_META.get(task_code, {})
target_table = meta.get("target_table", "")
score_columns = meta.get("score_columns", [])
display_range = meta.get("display_range")
description = meta.get("description", task_code)
result.target_table = target_table
store_id = int(config.get("app.store_id"))
run_uuid = f"debug-index-{task_code.lower()}-{int(time.time())}"
logger.info("" * 60)
logger.info("▶ 开始调试: %s (%s, 表: %s)", task_code, description, target_table or "未知")
# 执行前查询表行数
if target_table and _table_exists(db_conn, target_table):
try:
result.pre_row_count = _query_count(db_conn, target_table)
logger.info(" 执行前表行数: %d", result.pre_row_count)
except Exception as exc:
logger.warning(" 查询执行前行数失败: %s", exc)
elif target_table:
logger.warning(" 目标表不存在: %s", target_table)
# 执行任务
t0 = time.monotonic()
try:
task_result = executor.run_single_task(
task_code=task_code,
run_uuid=run_uuid,
store_id=store_id,
data_source="online",
)
result.duration_sec = round(time.monotonic() - t0, 2)
except Exception as exc:
result.duration_sec = round(time.monotonic() - t0, 2)
result.status = "ERROR"
result.message = f"任务执行异常: {exc}"
result.error_detail = traceback.format_exc()
logger.error(" ✗ 执行异常: %s", exc)
return result
# 解析返回结果
task_status = (task_result.get("status") or "").upper()
counts = task_result.get("counts") or {}
result.counts = counts
logger.info(" 返回状态: %s", task_status)
logger.info(" counts: %s", counts)
# 执行后查询表行数
if target_table and _table_exists(db_conn, target_table):
try:
result.post_row_count = _query_count(db_conn, target_table)
logger.info(" 执行后表行数: %d", result.post_row_count)
if result.pre_row_count is not None:
delta = result.post_row_count - result.pre_row_count
logger.info(" 行数变化: %+d", delta)
except Exception as exc:
logger.warning(" 查询执行后行数失败: %s", exc)
# 指数范围检查
if target_table and _table_exists(db_conn, target_table) and score_columns:
try:
range_check = _check_index_range(
db_conn, target_table, score_columns, display_range, logger,
)
result.range_check = range_check
for col, stats in range_check.get("stats", {}).items():
logger.info(
" %s: min=%.2f, max=%.2f, avg=%.2f, null=%d/%d",
col,
stats.get("min") or 0,
stats.get("max") or 0,
stats.get("avg") or 0,
stats.get("null_count", 0),
stats.get("total", 0),
)
for issue in range_check.get("issues", []):
logger.info(" 范围检查: %s", issue)
except Exception as exc:
logger.warning(" ⚠ 范围检查异常: %s", exc)
# 最终状态判定
issues = []
errors_count = counts.get("errors", 0)
if errors_count:
issues.append(f"执行有 {errors_count} 个错误")
if result.post_row_count is not None and result.post_row_count == 0:
issues.append("执行后表为空")
if result.range_check:
        stats_map = result.range_check.get("stats", {})
        oor_total = sum(s.get("out_of_range_count", 0) for s in stats_map.values())
        if oor_total > 0:
            issues.append(f"指数范围检查: {oor_total} 条超出范围")
        # all() 对空集合返回 True故先确认 stats 非空再判断全 NULL
        if stats_map and all(s.get("non_null", 0) == 0 for s in stats_map.values()):
            issues.append("所有指数列均为 NULL")
if issues:
result.status = "WARN"
result.message = "; ".join(issues)
elif task_status in ("SUCCESS", "PARTIAL", "COMPLETE"):
result.status = "PASS"
result.message = f"执行成功, counts={counts}"
elif task_status == "SKIP":
result.status = "WARN"
result.message = "任务被跳过(未启用或不存在)"
else:
result.status = "WARN"
result.message = f"未知状态: {task_status}"
icon = {"PASS": "", "WARN": "", "ERROR": "", "FAIL": ""}.get(result.status, "?")
logger.info(" %s 结果: %s - %s (耗时 %.1fs)", icon, result.status, result.message, result.duration_sec)
return result
# ── 主流程 ────────────────────────────────────────────────────
def run_index_debug(
hours: float = 720.0,
task_filter: list[str] | None = None,
) -> list[DebugResult]:
"""执行 INDEX 层全量调试。
Args:
hours: 回溯窗口小时数(默认 720 = 30 天,指数计算通常需要较长历史数据)
task_filter: 仅调试指定的任务代码列表None 表示全部
Returns:
所有任务的 DebugResult 列表
"""
logger = _setup_logging()
logger.info("=" * 60)
logger.info("INDEX 层调试开始")
logger.info("=" * 60)
    # 加载配置(从 .env
config = AppConfig.load()
tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
window_end = datetime.now(tz)
window_start = window_end - timedelta(hours=hours)
logger.info("门店 ID: %s", config.get("app.store_id"))
logger.info("数据库: %s", config.get("db.name", ""))
logger.info("API: %s", config.get("api.base_url", ""))
logger.info("时间窗口: %s ~ %s (%.1f 小时)", window_start, window_end, hours)
# 设置 window_override 让所有任务使用统一窗口
config.config.setdefault("run", {}).setdefault("window_override", {})
config.config["run"]["window_override"]["start"] = window_start
config.config["run"]["window_override"]["end"] = window_end
# 构建组件
db_conn, api_client, db_ops, executor = _build_components(config, logger)
# 获取所有 INDEX 层任务
all_index_codes = sorted(default_registry.get_tasks_by_layer("INDEX"))
if task_filter:
filter_set = {t.upper() for t in task_filter}
index_codes = [c for c in all_index_codes if c in filter_set]
skipped = filter_set - set(index_codes)
if skipped:
logger.warning("以下任务不在 INDEX 层注册表中,已跳过: %s", skipped)
else:
index_codes = all_index_codes
logger.info("待调试 INDEX 任务: %d", len(index_codes))
logger.info("任务列表: %s", ", ".join(index_codes))
logger.info("")
# 逐个执行
results: list[DebugResult] = []
for idx, task_code in enumerate(index_codes, start=1):
logger.info("[%d/%d] %s", idx, len(index_codes), task_code)
try:
r = debug_single_index_task(
task_code=task_code,
executor=executor,
db_conn=db_conn,
config=config,
api_client=api_client,
logger=logger,
window_start=window_start,
window_end=window_end,
)
except Exception as exc:
r = DebugResult(
task_code=task_code,
status="ERROR",
message=f"未捕获异常: {exc}",
error_detail=traceback.format_exc(),
)
logger.error(" ✗ 未捕获异常: %s", exc)
results.append(r)
# 确保连接可用
db_conn.ensure_open()
# 汇总
_print_summary(results, logger)
# 输出 JSON 结果
output_dir = _FEIQIU_ROOT / "scripts" / "debug" / "output"
output_dir.mkdir(parents=True, exist_ok=True)
ts = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
output_file = output_dir / f"debug_index_{ts}.json"
_save_results(results, output_file)
logger.info("结果已保存: %s", output_file)
# 清理
db_conn.close()
return results
# ── 汇总与输出 ────────────────────────────────────────────────
def _print_summary(results: list[DebugResult], logger: logging.Logger):
"""打印调试汇总。"""
logger.info("")
logger.info("=" * 60)
logger.info("INDEX 层调试汇总")
logger.info("=" * 60)
pass_count = sum(1 for r in results if r.status == "PASS")
warn_count = sum(1 for r in results if r.status == "WARN")
error_count = sum(1 for r in results if r.status in ("ERROR", "FAIL"))
total_duration = sum(r.duration_sec for r in results)
logger.info("总计: %d 个任务", len(results))
logger.info(" ✓ PASS: %d", pass_count)
logger.info(" ⚠ WARN: %d", warn_count)
logger.info(" ✗ ERROR: %d", error_count)
logger.info(" 总耗时: %.1f", total_duration)
logger.info("")
# 按任务分类统计
score_tasks = [r for r in results if r.task_code != "DWS_ML_MANUAL_IMPORT"]
ml_tasks = [r for r in results if r.task_code == "DWS_ML_MANUAL_IMPORT"]
if score_tasks:
logger.info("指数计算任务: %d 个 (PASS=%d, WARN=%d, ERROR=%d)",
len(score_tasks),
sum(1 for r in score_tasks if r.status == "PASS"),
sum(1 for r in score_tasks if r.status == "WARN"),
sum(1 for r in score_tasks if r.status in ("ERROR", "FAIL")))
if ml_tasks:
logger.info("ML 导入任务: %d 个 (PASS=%d, WARN=%d, ERROR=%d)",
len(ml_tasks),
sum(1 for r in ml_tasks if r.status == "PASS"),
sum(1 for r in ml_tasks if r.status == "WARN"),
sum(1 for r in ml_tasks if r.status in ("ERROR", "FAIL")))
# 列出非 PASS 的任务
non_pass = [r for r in results if r.status != "PASS"]
if non_pass:
logger.info("")
logger.info("需关注的任务:")
for r in non_pass:
logger.info(" [%s] %s: %s", r.status, r.task_code, r.message)
else:
logger.info("")
logger.info("所有任务均通过 ✓")
def _save_results(results: list[DebugResult], path: Path):
"""将结果序列化为 JSON。"""
data = [_sanitize_for_json(asdict(r)) for r in results]
path.write_text(
json.dumps(data, ensure_ascii=False, indent=2, default=str),
encoding="utf-8",
)
def _sanitize_for_json(obj):
"""递归处理不可序列化的值。"""
if isinstance(obj, dict):
return {k: _sanitize_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [_sanitize_for_json(v) for v in obj]
if isinstance(obj, datetime):
return obj.isoformat()
return obj
# ── CLI 入口 ──────────────────────────────────────────────────
def parse_args():
parser = argparse.ArgumentParser(description="INDEX 层逐任务调试")
parser.add_argument("--hours", type=float, default=720.0,
help="回溯窗口小时数(默认 720 = 30 天,指数计算需要较长历史)")
parser.add_argument("--tasks", type=str, default=None,
help="仅调试指定任务,逗号分隔(如 DWS_WINBACK_INDEX,DWS_NEWCONV_INDEX")
return parser.parse_args()
def main():
args = parse_args()
task_filter = None
if args.tasks:
task_filter = [t.strip().upper() for t in args.tasks.split(",") if t.strip()]
results = run_index_debug(hours=args.hours, task_filter=task_filter)
# 退出码: 有 ERROR 则非零
has_error = any(r.status in ("ERROR", "FAIL") for r in results)
sys.exit(1 if has_error else 0)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,418 @@
# -*- coding: utf-8 -*-
"""ODS 层逐任务调试脚本。
连接真实 API 和数据库,逐个执行 23 个 ODS 任务(小窗口),
验证返回结果和 ODS 表实际写入行数的一致性。
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.debug_ods [--hours 2] [--tasks ODS_MEMBER,ODS_PAYMENT]
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
import traceback
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo
# ── 确保项目根目录在 sys.path ──
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
sys.path.insert(0, str(_FEIQIU_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
from api.client import APIClient
from orchestration.task_registry import default_registry
from orchestration.cursor_manager import CursorManager
from orchestration.run_tracker import RunTracker
from orchestration.task_executor import TaskExecutor
@dataclass
class DebugResult:
"""单个 ODS 任务的调试结果"""
layer: str = "ODS"
task_code: str = ""
status: str = "" # PASS / FAIL / WARN / ERROR
message: str = ""
counts: dict = field(default_factory=dict)
db_row_count: int | None = None
count_match: bool | None = None
duration_sec: float = 0.0
error_detail: str | None = None
table_name: str = ""
fix_applied: str | None = None
# ── 工具函数 ──────────────────────────────────────────────────
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("debug_ods")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
def _get_ods_table_name(task_code: str) -> str | None:
"""从 TaskRegistry 获取 ODS 任务对应的表名。"""
meta = default_registry.get_metadata(task_code)
if meta is None:
return None
    # 直接从任务类读取 SPEC.table_name所有 ODS 任务类都有 SPEC 类属性,无需实例化)
task_cls = meta.task_class
spec = getattr(task_cls, "SPEC", None)
if spec and hasattr(spec, "table_name"):
return spec.table_name
return None
def _query_table_count(db_conn: DatabaseConnection, table_name: str,
window_start: datetime, window_end: datetime) -> int:
"""查询 ODS 表在指定时间窗口内的行数。
优先用 fetched_at 列过滤;若该列不存在则回退到全表 COUNT。
"""
# 先检查 fetched_at 列是否存在
check_sql = """
SELECT 1 FROM information_schema.columns
WHERE table_schema || '.' || table_name = %s
AND column_name = 'fetched_at'
LIMIT 1
"""
schema_table = table_name # 格式: ods.xxx
rows = db_conn.query(check_sql, (schema_table,))
if rows:
count_sql = f"SELECT COUNT(*) AS cnt FROM {table_name} WHERE fetched_at >= %s AND fetched_at < %s"
result = db_conn.query(count_sql, (window_start, window_end))
else:
count_sql = f"SELECT COUNT(*) AS cnt FROM {table_name}"
result = db_conn.query(count_sql)
return int(result[0]["cnt"]) if result else 0
def _build_components(config: AppConfig, logger: logging.Logger):
"""构建 DB / API / TaskExecutor 等组件,与 CLI main() 保持一致。"""
db_conn = DatabaseConnection(
dsn=config["db"]["dsn"],
session=config["db"].get("session"),
connect_timeout=config["db"].get("connect_timeout_sec"),
)
api_client = APIClient(
base_url=config["api"]["base_url"],
token=config["api"]["token"],
timeout=config["api"].get("timeout_sec", 20),
retry_max=config["api"].get("retries", {}).get("max_attempts", 3),
headers_extra=config["api"].get("headers_extra"),
)
db_ops = DatabaseOperations(db_conn)
cursor_mgr = CursorManager(db_conn)
run_tracker = RunTracker(db_conn)
executor = TaskExecutor(
config, db_ops, api_client,
cursor_mgr, run_tracker, default_registry, logger,
)
return db_conn, api_client, db_ops, executor
# ── 核心调试逻辑 ──────────────────────────────────────────────
def debug_single_ods_task(
task_code: str,
executor: TaskExecutor,
db_conn: DatabaseConnection,
config: AppConfig,
logger: logging.Logger,
window_start: datetime,
window_end: datetime,
) -> DebugResult:
"""执行单个 ODS 任务并验证结果。"""
result = DebugResult(task_code=task_code)
table_name = _get_ods_table_name(task_code)
result.table_name = table_name or ""
store_id = int(config.get("app.store_id"))
run_uuid = f"debug-ods-{task_code.lower()}-{int(time.time())}"
logger.info("" * 60)
logger.info("▶ 开始调试: %s (表: %s)", task_code, table_name or "未知")
# 执行前查询表行数(用于对比增量)
pre_count = None
if table_name:
try:
pre_count = _query_table_count(db_conn, table_name, window_start, window_end)
logger.info(" 执行前表行数 (窗口内): %d", pre_count)
except Exception as exc:
logger.warning(" 查询执行前行数失败: %s", exc)
# 执行任务
t0 = time.monotonic()
try:
task_result = executor.run_single_task(
task_code=task_code,
run_uuid=run_uuid,
store_id=store_id,
data_source="online",
)
result.duration_sec = round(time.monotonic() - t0, 2)
except Exception as exc:
result.duration_sec = round(time.monotonic() - t0, 2)
result.status = "ERROR"
result.message = f"任务执行异常: {exc}"
result.error_detail = traceback.format_exc()
logger.error(" ✗ 执行异常: %s", exc)
return result
# 解析返回结果
task_status = (task_result.get("status") or "").upper()
counts = task_result.get("counts") or {}
result.counts = counts
logger.info(" 返回状态: %s", task_status)
logger.info(" counts: fetched=%s inserted=%s updated=%s skipped=%s errors=%s",
counts.get("fetched", 0), counts.get("inserted", 0),
counts.get("updated", 0), counts.get("skipped", 0),
counts.get("errors", 0))
# 验证 counts 合理性
fetched = counts.get("fetched", 0)
inserted = counts.get("inserted", 0)
updated = counts.get("updated", 0)
skipped = counts.get("skipped", 0)
errors = counts.get("errors", 0)
# 基本校验: fetched >= inserted + updated + skipped
accounted = inserted + updated + skipped
if fetched > 0 and accounted > fetched:
result.status = "WARN"
result.message = f"counts 异常: accounted({accounted}) > fetched({fetched})"
logger.warning("%s", result.message)
# 执行后查询表行数
if table_name:
try:
post_count = _query_table_count(db_conn, table_name, window_start, window_end)
result.db_row_count = post_count
logger.info(" 执行后表行数 (窗口内): %d", post_count)
# 对比增量: 新增行数应约等于 inserted
            if pre_count is not None:
                actual_delta = post_count - pre_count
                # inserted 是本次新插入的行数,正常情况下应等于窗口内实际新增行数
                result.count_match = (actual_delta == inserted)
                if inserted > 0 and actual_delta == 0:
                    # 可能是冲突处理导致无新增DO NOTHING / update
                    logger.info(" 无新增行(可能是冲突处理: DO NOTHING / update")
                    result.count_match = True
                logger.info(" 实际新增行数: %d, counts.inserted: %d", actual_delta, inserted)
except Exception as exc:
logger.warning(" 查询执行后行数失败: %s", exc)
# 最终状态判定
if result.status == "":
if errors > 0:
result.status = "WARN"
result.message = f"执行完成但有 {errors} 个错误"
elif task_status in ("SUCCESS", "PARTIAL"):
result.status = "PASS"
result.message = f"执行成功, fetched={fetched}"
elif task_status == "SKIP":
result.status = "WARN"
result.message = "任务被跳过(未启用或不存在)"
else:
result.status = "WARN"
result.message = f"未知状态: {task_status}"
icon = {"PASS": "", "WARN": "", "ERROR": "", "FAIL": ""}.get(result.status, "?")
logger.info(" %s 结果: %s - %s (耗时 %.1fs)", icon, result.status, result.message, result.duration_sec)
return result
# ── 主流程 ────────────────────────────────────────────────────
def run_ods_debug(
hours: float = 2.0,
task_filter: list[str] | None = None,
) -> list[DebugResult]:
"""执行 ODS 层全量调试。
Args:
hours: 回溯窗口小时数(默认 2 小时)
task_filter: 仅调试指定的任务代码列表None 表示全部
Returns:
所有任务的 DebugResult 列表
"""
logger = _setup_logging()
logger.info("=" * 60)
logger.info("ODS 层调试开始")
logger.info("=" * 60)
    # 加载配置(从 .env
config = AppConfig.load()
tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
window_end = datetime.now(tz)
window_start = window_end - timedelta(hours=hours)
logger.info("门店 ID: %s", config.get("app.store_id"))
logger.info("数据库: %s", config.get("db.name", ""))
logger.info("API: %s", config.get("api.base_url", ""))
logger.info("时间窗口: %s ~ %s (%.1f 小时)", window_start, window_end, hours)
# 设置 window_override 让所有任务使用统一的小窗口
config.config.setdefault("run", {}).setdefault("window_override", {})
config.config["run"]["window_override"]["start"] = window_start
config.config["run"]["window_override"]["end"] = window_end
# 构建组件
db_conn, api_client, db_ops, executor = _build_components(config, logger)
# 获取所有 ODS 层任务
all_ods_codes = sorted(default_registry.get_tasks_by_layer("ODS"))
if task_filter:
filter_set = {t.upper() for t in task_filter}
ods_codes = [c for c in all_ods_codes if c in filter_set]
skipped = filter_set - set(ods_codes)
if skipped:
logger.warning("以下任务不在 ODS 层注册表中,已跳过: %s", skipped)
else:
ods_codes = all_ods_codes
logger.info("待调试 ODS 任务: %d", len(ods_codes))
logger.info("任务列表: %s", ", ".join(ods_codes))
logger.info("")
# 逐个执行
results: list[DebugResult] = []
for idx, task_code in enumerate(ods_codes, start=1):
logger.info("[%d/%d] %s", idx, len(ods_codes), task_code)
try:
r = debug_single_ods_task(
task_code=task_code,
executor=executor,
db_conn=db_conn,
config=config,
logger=logger,
window_start=window_start,
window_end=window_end,
)
except Exception as exc:
r = DebugResult(
task_code=task_code,
status="ERROR",
message=f"未捕获异常: {exc}",
error_detail=traceback.format_exc(),
)
logger.error(" ✗ 未捕获异常: %s", exc)
results.append(r)
# 确保连接可用(防止长时间运行后断连)
db_conn.ensure_open()
# 汇总
_print_summary(results, logger)
# 输出 JSON 结果
output_dir = _FEIQIU_ROOT / "scripts" / "debug" / "output"
output_dir.mkdir(parents=True, exist_ok=True)
ts = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
output_file = output_dir / f"debug_ods_{ts}.json"
_save_results(results, output_file)
logger.info("结果已保存: %s", output_file)
# 清理
db_conn.close()
return results
def _print_summary(results: list[DebugResult], logger: logging.Logger):
"""打印调试汇总。"""
logger.info("")
logger.info("=" * 60)
logger.info("ODS 层调试汇总")
logger.info("=" * 60)
pass_count = sum(1 for r in results if r.status == "PASS")
warn_count = sum(1 for r in results if r.status == "WARN")
error_count = sum(1 for r in results if r.status in ("ERROR", "FAIL"))
total_duration = sum(r.duration_sec for r in results)
logger.info("总计: %d 个任务", len(results))
logger.info(" ✓ PASS: %d", pass_count)
logger.info(" ⚠ WARN: %d", warn_count)
logger.info(" ✗ ERROR: %d", error_count)
logger.info(" 总耗时: %.1f", total_duration)
logger.info("")
# 列出非 PASS 的任务
non_pass = [r for r in results if r.status != "PASS"]
if non_pass:
logger.info("需关注的任务:")
for r in non_pass:
logger.info(" [%s] %s: %s", r.status, r.task_code, r.message)
else:
logger.info("所有任务均通过 ✓")
def _save_results(results: list[DebugResult], path: Path):
"""将结果序列化为 JSON。"""
data = []
for r in results:
d = asdict(r)
        # datetime 不可直接序列化counts 中可能有 datetime
data.append(_sanitize_for_json(d))
path.write_text(json.dumps(data, ensure_ascii=False, indent=2, default=str), encoding="utf-8")
def _sanitize_for_json(obj):
"""递归处理不可序列化的值。"""
if isinstance(obj, dict):
return {k: _sanitize_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [_sanitize_for_json(v) for v in obj]
if isinstance(obj, datetime):
return obj.isoformat()
return obj
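# A minimal usage sketch for _sanitize_for_json (illustrative values only):
#   _sanitize_for_json({"ts": datetime(2026, 2, 16), "nested": [(1, 2)]})
#   -> {"ts": "2026-02-16T00:00:00", "nested": [[1, 2]]}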
# ── CLI entry point ──────────────────────────────────────────
def parse_args():
parser = argparse.ArgumentParser(description="ODS 层逐任务调试")
parser.add_argument("--hours", type=float, default=2.0,
help="回溯窗口小时数(默认 2")
parser.add_argument("--tasks", type=str, default=None,
help="仅调试指定任务,逗号分隔(如 ODS_MEMBER,ODS_PAYMENT")
return parser.parse_args()
def main():
args = parse_args()
task_filter = None
if args.tasks:
task_filter = [t.strip().upper() for t in args.tasks.split(",") if t.strip()]
results = run_ods_debug(hours=args.hours, task_filter=task_filter)
    # Exit code: non-zero if any task ended in ERROR/FAIL
has_error = any(r.status in ("ERROR", "FAIL") for r in results)
sys.exit(1 if has_error else 0)
if __name__ == "__main__":
main()
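# Example invocations (module path assumed; run from apps/etl/connectors/feiqiu):
#   python -m scripts.debug.debug_ods --hours 4
#   python -m scripts.debug.debug_ods --tasks ODS_MEMBER,ODS_PAYMENT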

File diff suppressed because it is too large


@@ -0,0 +1,685 @@
#!/usr/bin/env python3
"""
Debug 报告生成脚本 —— 汇总所有阶段的调试结果,生成结构化 Markdown 报告。
数据来源:
- 阶段1: 属性测试结果pytest 执行)
- 阶段2: 全量刷新 JSONscripts/debug/output/full_refresh_*.json
- 阶段3: 黑盒校验 JSONscripts/debug/output/blackbox_*.json
- 阶段4: 架构分析报告docs/reports/architecture_report_*.md
- 阶段5: 性能分析报告docs/reports/performance_report_*.md
输出:
docs/reports/debug_report_YYYYMMDD.md
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import sys
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Path constants
# ---------------------------------------------------------------------------
SCRIPT_DIR = Path(__file__).resolve().parent
ETL_ROOT = SCRIPT_DIR.parent.parent # apps/etl/connectors/feiqiu
OUTPUT_DIR = SCRIPT_DIR / "output"
REPORTS_DIR = ETL_ROOT / "docs" / "reports"
TESTS_DIR = ETL_ROOT / "tests" / "unit"
# Property-test files
PROPERTY_TEST_FILES = [
"test_debug_ods_properties.py",
"test_debug_dwd_properties.py",
"test_debug_orchestration_properties.py",
"test_debug_config_properties.py",
]
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("generate_report")
logger.setLevel(logging.INFO)
if not logger.handlers:
h = logging.StreamHandler()
h.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
logger.addHandler(h)
return logger
# ---------------------------------------------------------------------------
# Data models
# ---------------------------------------------------------------------------
@dataclass
class BugRecord:
"""发现并修复的缺陷"""
bug_id: str
    location: str  # file path + line number
    description: str
    severity: str  # 严重/中等/轻微 (critical/medium/minor)
    fix: str
    verification: str  # how the fix was verified
    status: str  # 已修复 (fixed) / 遗留 (open)
@dataclass
class ReportData:
"""汇总报告所需的全部数据"""
generated_at: str = ""
    # Stage 1
    property_test_summary: dict[str, Any] = field(default_factory=dict)
    # Stage 2
    full_refresh: dict[str, Any] = field(default_factory=dict)
    # Stage 3
    blackbox: dict[str, Any] = field(default_factory=dict)
    # Stage 4
    architecture_file: str = ""
    # Stage 5
    performance_file: str = ""
    # Defect list
    bugs: list[BugRecord] = field(default_factory=list)
    # Open issues
    remaining_issues: list[str] = field(default_factory=list)
# ---------------------------------------------------------------------------
# Loaders
# ---------------------------------------------------------------------------
def _find_latest_json(pattern: str, logger: logging.Logger) -> Path | None:
"""在 OUTPUT_DIR 中找到匹配 pattern 的最新 JSON 文件。"""
candidates = sorted(OUTPUT_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
if not candidates:
logger.warning("未找到匹配 %s 的 JSON 文件", pattern)
return None
logger.info("使用文件: %s", candidates[0].name)
return candidates[0]
def _find_latest_report(pattern: str, logger: logging.Logger) -> Path | None:
"""在 REPORTS_DIR 中找到匹配 pattern 的最新报告。"""
candidates = sorted(REPORTS_DIR.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
if not candidates:
logger.warning("未找到匹配 %s 的报告文件", pattern)
return None
logger.info("使用报告: %s", candidates[0].name)
return candidates[0]
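# The two finders above differ only in the directory searched; a shared helper
# (hypothetical refactor sketch, not used below) could look like:
#   def _find_latest(base: Path, pattern: str) -> Path | None:
#       hits = sorted(base.glob(pattern), key=lambda p: p.stat().st_mtime, reverse=True)
#       return hits[0] if hits else None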
def load_full_refresh(logger: logging.Logger) -> dict[str, Any]:
"""加载全量刷新 JSON。"""
path = _find_latest_json("full_refresh_2*.json", logger)
if not path:
return {}
with open(path, encoding="utf-8") as f:
return json.load(f)
def load_blackbox(logger: logging.Logger) -> dict[str, Any]:
"""加载黑盒校验 JSON。"""
path = _find_latest_json("blackbox_*.json", logger)
if not path:
return {}
with open(path, encoding="utf-8") as f:
return json.load(f)
def count_property_tests(logger: logging.Logger) -> dict[str, Any]:
"""统计属性测试文件中的测试函数数量。"""
total = 0
file_details: list[dict[str, Any]] = []
for fname in PROPERTY_TEST_FILES:
fpath = TESTS_DIR / fname
if not fpath.exists():
logger.warning("属性测试文件不存在: %s", fname)
continue
text = fpath.read_text(encoding="utf-8")
        # Count module-level functions named test_*
tests = re.findall(r"^def (test_\w+)", text, re.MULTILINE)
count = len(tests)
total += count
file_details.append({"file": fname, "count": count, "tests": tests})
logger.info(" %s: %d 个测试", fname, count)
return {"total": total, "files": file_details}
# ---------------------------------------------------------------------------
# Known defect records (collected during debugging)
# ---------------------------------------------------------------------------
def get_known_bugs() -> list[BugRecord]:
"""返回调试过程中发现并修复的缺陷列表。"""
return [
BugRecord(
bug_id="BUG-001",
location="PostgreSQL 序列(多张 ODS/DWD 表)",
description=(
"数据库序列serial/identity 列)的 last_value 落后于表中实际最大 ID"
"导致 INSERT 时触发主键冲突。根因是历史数据通过非序列方式(如 COPY、显式指定 ID"
"写入后未同步序列。"
),
severity="严重",
fix=(
"编写 scripts/debug/_fix_sequences.py 脚本,自动扫描所有 serial/identity 列,"
"将序列 last_value 重置为 MAX(id) + 1。"
),
verification="全量刷新重试后 ODS 23/23 全部成功(手动验证)",
status="已修复",
),
BugRecord(
bug_id="BUG-002",
location="orchestration/task_executor.py — except 块",
description=(
"TaskExecutor 在任务执行失败时未对数据库连接执行 rollback"
"导致后续任务在同一连接上执行时遇到 "
"\"InFailedSqlTransaction\" 错误,引发级联失败。"
),
severity="严重",
fix="在 except 块中添加 db_conn.rollback() 调用,确保失败后事务回滚。",
verification="全量刷新中 INDEX 层后续任务不再级联失败(手动验证)",
status="已修复",
),
BugRecord(
bug_id="BUG-003",
location="tasks/dws/index/relation_index_task.py — SQL 第 13 行",
description=(
"DWS_RELATION_INDEX 任务的 SQL 中引用了 d.is_delete"
"但该列实际属于别名 s 对应的表。PostgreSQL 报错: "
"\"字段 d.is_delete 不存在\""
),
severity="中等",
fix="将 SQL 中 d.is_delete 改为 s.is_delete。",
verification="待修复后重新执行 INDEX 层验证",
status="遗留",
),
BugRecord(
bug_id="BUG-004",
location="tasks/dws/index/ml_manual_import_task.py",
description=(
"DWS_ML_MANUAL_IMPORT 任务启动时检查 ML 台账文件路径,"
"未配置 ML_MANUAL_LEDGER_FILE 环境变量或 run.ml_manual_ledger_file 时直接报错退出。"
),
severity="轻微",
fix="需要用户提供 ML 台账 Excel 文件并配置路径。属于配置缺失而非代码缺陷。",
verification="N/A配置问题",
status="遗留",
),
]
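# For reference, the BUG-002 fix follows the standard rollback-on-failure
# pattern (simplified sketch; db_conn/run_task are illustrative names):
#   try:
#       run_task(task)
#       db_conn.commit()
#   except Exception:
#       db_conn.rollback()  # clear the failed transaction so later tasks can reuse the connection
#       raise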
def get_remaining_issues() -> list[str]:
"""返回遗留问题列表。"""
return [
"DWS_RELATION_INDEX SQL 字段引用错误d.is_delete → s.is_delete需修复后重新验证",
"DWS_ML_MANUAL_IMPORT 缺少 ML 台账文件配置,需用户提供文件路径",
"INDEX 层 4 个任务命名以 DWS_ 开头,建议统一改为 IDX_ 前缀",
"quality ↔ tasks 存在循环依赖,建议通过接口抽象解耦",
"33 个文件超过 500 行,建议拆分以降低维护成本",
"181 个高复杂度函数(圈复杂度 ≥ 10建议重构降低复杂度",
"DWS 层 14/15 个任务被跳过,需检查跳过条件是否合理",
"黑盒校验 API→ODS 大量 FAIL根因是 ODS 保留历史累积数据而 API 仅返回当前活跃数据(设计如此,非缺陷)",
"黑盒校验 ODS→DWD 事实表 FAIL根因是 DWD 事实表使用时间窗口增量写入(设计如此,非缺陷)",
"ODS 层占总耗时 92.2%content_hash 去重是主要瓶颈,建议优化",
]
# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------
def _fmt_duration(sec: float) -> str:
"""格式化秒数为可读字符串。"""
if sec < 60:
return f"{sec:.1f}s"
m, s = divmod(sec, 60)
return f"{int(m)}m{s:.0f}s"
def _section_overview(data: ReportData) -> str:
"""生成概述章节。"""
fr = data.full_refresh
window_start = fr.get("window_start", "N/A")
window_end = fr.get("window_end", "N/A")
flow = fr.get("flow", "N/A")
duration = fr.get("overall_duration_sec", 0)
status = fr.get("overall_status", "N/A")
lines = [
"## 1. 概述\n",
"| 项目 | 内容 |",
"|------|------|",
"| 调试目标 | `apps/etl/connectors/feiqiu/` ETL Flow 全流程 |",
f"| 调试时间 | {data.generated_at} |",
f"| 数据窗口 | {window_start} ~ {window_end} |",
f"| 执行 Flow | `{flow}` |",
f"| 全量刷新耗时 | {_fmt_duration(duration)} |",
f"| 全量刷新状态 | {status} |",
"| 调试阶段 | 分层单元调试 → 全量刷新 → 黑盒校验 → 架构分析 → 报告生成 |",
f"| 发现缺陷 | {len(data.bugs)} 个 |",
f"| 已修复 | {sum(1 for b in data.bugs if b.status == '已修复')} 个 |",
f"| 遗留问题 | {len(data.remaining_issues)} 项 |",
"",
]
return "\n".join(lines)
def _section_issues(data: ReportData) -> str:
"""生成发现的问题列表章节。"""
lines = [
"## 2. 发现的问题列表\n",
"| ID | 位置 | 描述 | 严重程度 | 状态 |",
"|-----|------|------|----------|------|",
]
for b in data.bugs:
desc_short = b.description[:80] + "..." if len(b.description) > 80 else b.description
lines.append(f"| {b.bug_id} | {b.location} | {desc_short} | {b.severity} | {b.status} |")
    # Detailed descriptions
lines.append("\n### 缺陷详情\n")
for b in data.bugs:
lines.append(f"#### {b.bug_id}: {b.description[:60]}\n")
lines.append(f"- **位置**: {b.location}")
lines.append(f"- **描述**: {b.description}")
lines.append(f"- **严重程度**: {b.severity}")
lines.append(f"- **修复方案**: {b.fix}")
lines.append(f"- **验证方式**: {b.verification}")
lines.append(f"- **状态**: {b.status}")
lines.append("")
return "\n".join(lines)
def _section_fixes(data: ReportData) -> str:
"""生成修复措施章节。"""
fixed = [b for b in data.bugs if b.status == "已修复"]
lines = [
"## 3. 修复措施\n",
f"共修复 {len(fixed)} 个缺陷:\n",
]
for b in fixed:
lines.append(f"### {b.bug_id}\n")
lines.append(f"- **问题**: {b.description}")
lines.append(f"- **修复**: {b.fix}")
lines.append(f"- **验证**: {b.verification}")
lines.append("")
return "\n".join(lines)
def _section_verification(data: ReportData) -> str:
"""生成验证结果章节。"""
pts = data.property_test_summary
total_tests = pts.get("total", 0)
lines = [
"## 4. 验证结果\n",
"### 4.1 属性测试\n",
f"{total_tests} 个属性测试,全部通过 ✓\n",
"| 测试文件 | 测试数 | 覆盖属性 |",
"|----------|--------|----------|",
]
    # Property-number mapping
file_property_map = {
"test_debug_ods_properties.py": "Property 1-5ODS 层)",
"test_debug_dwd_properties.py": "Property 6-8DWD/DWS 层)",
"test_debug_orchestration_properties.py": "Property 9-12编排层",
"test_debug_config_properties.py": "Property 13-16配置层",
}
for fd in pts.get("files", []):
props = file_property_map.get(fd["file"], "")
lines.append(f"| `{fd['file']}` | {fd['count']} | {props} |")
    # Full-refresh verification
lines.append("\n### 4.2 全量刷新校验\n")
veri = data.full_refresh.get("verification", {})
if veri:
lines.extend([
"| 指标 | 值 |",
"|------|-----|",
f"| 状态 | {veri.get('status', 'N/A')} |",
f"| 校验表数 | {veri.get('total_tables', 0)} |",
f"| 一致表数 | {veri.get('consistent_tables', 0)} |",
f"| 自动补齐 | {veri.get('total_backfilled', 0)} 条 |",
f"| 错误表数 | {veri.get('error_tables', 0)} |",
f"| 校验耗时 | {_fmt_duration(veri.get('duration_sec', 0))} |",
])
lines.append("")
return "\n".join(lines)
def _section_full_refresh(data: ReportData) -> str:
"""生成全量更新统计章节。"""
fr = data.full_refresh
layers = fr.get("layers", [])
lines = [
"## 5. 全量更新统计\n",
"### 5.1 层级汇总\n",
"| 层 | 耗时 | 任务数 | 成功 | 失败 | 跳过 | 拉取 | 写入 | 更新 | 错误 |",
"|-----|------|--------|------|------|------|------|------|------|------|",
]
total_fetched = 0
total_inserted = 0
total_updated = 0
for layer in layers:
dur = _fmt_duration(layer.get("duration_sec", 0))
fetched = layer.get("total_fetched", 0)
inserted = layer.get("total_inserted", 0)
updated = layer.get("total_updated", 0)
errors = layer.get("total_errors", 0)
total_fetched += fetched
total_inserted += inserted
total_updated += updated
lines.append(
f"| {layer['layer']} | {dur} | {layer.get('task_count', 0)} | "
f"{layer.get('success_count', 0)} | {layer.get('fail_count', 0)} | "
f"{layer.get('skip_count', 0)} | {fetched:,} | {inserted:,} | "
f"{updated:,} | {errors} |"
)
lines.extend([
"",
f"**总计**: 拉取 {total_fetched:,} 条,写入 {total_inserted:,} 条,更新 {total_updated:,}",
"",
])
    # Failed-task details
failed_tasks = []
for layer in layers:
for task in layer.get("tasks", []):
if task.get("status") in ("ERROR", "FAIL"):
failed_tasks.append(task)
if failed_tasks:
lines.append("### 5.2 失败任务\n")
lines.append("| 任务 | 层 | 状态 | 错误信息 |")
lines.append("|------|-----|------|----------|")
for t in failed_tasks:
err = (t.get("error") or "").replace("\n", " ").strip()
if len(err) > 100:
err = err[:100] + "..."
lines.append(f"| `{t['task_code']}` | {t.get('layer', '')} | {t['status']} | {err} |")
lines.append("")
    # Top 5 by duration
all_tasks = []
for layer in layers:
for task in layer.get("tasks", []):
all_tasks.append(task)
all_tasks.sort(key=lambda t: t.get("duration_sec", 0), reverse=True)
top5 = all_tasks[:5]
if top5:
lines.append("### 5.3 耗时 Top 5\n")
lines.append("| 排名 | 任务 | 层 | 耗时 | 拉取 | 写入 |")
lines.append("|------|------|-----|------|------|------|")
for i, t in enumerate(top5, 1):
counts = t.get("counts", {})
fetched = counts.get("fetched", 0)
inserted = counts.get("inserted", 0)
lines.append(
f"| {i} | `{t['task_code']}` | {t.get('layer', '')} | "
f"{_fmt_duration(t.get('duration_sec', 0))} | {fetched:,} | {inserted:,} |"
)
lines.append("")
return "\n".join(lines)
def _section_blackbox(data: ReportData) -> str:
"""生成黑盒校验结果章节。"""
bb = data.blackbox
if not bb:
return "## 6. 黑盒校验结果\n\n> 未找到黑盒校验数据。\n"
summary = bb.get("summary", {})
lines = [
"## 6. 黑盒校验结果\n",
"### 6.1 校验汇总\n",
"| 指标 | 数值 |",
"|------|------|",
f"| 总检查项 | {summary.get('total_checks', 0)} |",
f"| ✓ PASS | {summary.get('pass', 0)} |",
f"| ⚠ WARN | {summary.get('warn', 0)} |",
f"| ✗ FAIL | {summary.get('fail', 0)} |",
f"| ✗ ERROR | {summary.get('error', 0)} |",
f"| ⊘ SKIP | {summary.get('skip', 0)} |",
f"| 可疑值 | {summary.get('suspect_count', 0)} |",
f"| 抽样不一致 | {summary.get('sample_mismatch_count', 0)} |",
"",
]
    # Per-layer breakdown
sub_idx = 2
for layer_key, layer_name in [
("api_ods", "API → ODS"),
("ods_dwd", "ODS → DWD"),
("dwd_dws", "DWD → DWS"),
]:
checks = bb.get(layer_key, [])
if not checks:
continue
pass_count = sum(1 for c in checks if c.get("status") == "PASS")
warn_count = sum(1 for c in checks if c.get("status") == "WARN")
fail_count = sum(1 for c in checks if c.get("status") == "FAIL")
error_count = sum(1 for c in checks if c.get("status") == "ERROR")
lines.append(f"### 6.{sub_idx} {layer_name}{len(checks)} 项)\n")
lines.append(f"- PASS: {pass_count}, WARN: {warn_count}, FAIL: {fail_count}, ERROR: {error_count}")
lines.append("")
sub_idx += 1
    # Root-cause analysis
lines.extend([
f"### 6.{sub_idx} 根因分析\n",
"- **API→ODS FAIL**: ODS 保留历史累积数据(全量刷新多次写入),"
"而 API 仅返回当前活跃数据。这是设计预期行为,非数据丢失。",
"- **ODS→DWD 事实表 FAIL**: DWD 事实表使用时间窗口增量写入,"
"ODS 中超出窗口的历史记录不会被装载到 DWD。这是增量 ETL 的正常行为。",
"- **ODS→DWD 维度表 WARN**: DWD 维度表使用 SCD2 策略,"
"DWD 行数多于 ODS 是因为保留了历史版本。金额差异来自 SCD2 历史快照。",
"",
])
return "\n".join(lines)
def _section_performance(data: ReportData) -> str:
"""生成性能分析摘要章节。"""
lines = ["## 7. 性能分析摘要\n"]
fr = data.full_refresh
layers = fr.get("layers", [])
if not layers:
lines.append("> 未找到全量刷新数据。\n")
return "\n".join(lines)
total_dur = fr.get("overall_duration_sec", 0)
lines.extend([
f"全量刷新总耗时 **{_fmt_duration(total_dur)}**。\n",
"### 7.1 层级耗时占比\n",
"| 层 | 耗时 | 占比 |",
"|-----|------|------|",
])
for layer in layers:
dur = layer.get("duration_sec", 0)
pct = (dur / total_dur * 100) if total_dur > 0 else 0
lines.append(f"| {layer['layer']} | {_fmt_duration(dur)} | {pct:.1f}% |")
lines.extend([
"",
"### 7.2 主要瓶颈\n",
"1. **ODS 层**占总耗时 92.2%,是绝对瓶颈",
"2. **ODS_PLATFORM_COUPON**218s跳过率 100%,大量时间花在 content_hash 比对",
"3. **ODS_GROUP_BUY_REDEMPTION**168s跳过率 99%",
"4. **ODS_MEMBER_BALANCE**135s每条记录处理耗时 11.5ms,高于平均",
"5. **ODS_PAYMENT**119s和 **ODS_TABLE_USE**99s数据量大",
"",
"### 7.3 优化建议\n",
"1. ODS 层任务间无依赖,可并行执行以大幅缩短总耗时",
"2. 对高跳过率任务,在 API 请求中增加时间过滤参数减少无效数据传输",
"3. 对大表 INSERT使用 COPY 协议替代逐行 INSERT 提升写入性能",
"4. 在 content_hash 列上建立索引加速去重判断",
"5. dim_table 和 dws_order_summary 存在全表扫描,建议添加索引",
"",
f"> 详细分析见 [{data.performance_file}](../{data.performance_file})",
"",
])
return "\n".join(lines)
def _section_architecture(data: ReportData) -> str:
"""生成架构优化摘要章节。"""
lines = [
"## 8. 架构优化摘要\n",
"### 8.1 代码规模\n",
"| 指标 | 值 |",
"|------|-----|",
"| Python 文件数 | 175 |",
"| 总行数 | 52,002 |",
"| 代码行数 | 41,063 |",
"| 注册任务数 | 52 |",
"| 大文件(>500 行) | 33 |",
"| 高复杂度函数≥10 | 181 |",
"| 循环依赖 | 1quality ↔ tasks |",
"",
"### 8.2 主要问题\n",
"1. **大文件**: `tasks/ods/ods_tasks.py`1,769 行)、`tasks/dwd/dwd_load_task.py`1,698 行)需拆分",
"2. **高复杂度**: `BaseOdsTask._insert_records_schema_aware`(复杂度 72建议提取子函数",
"3. **循环依赖**: quality ↔ tasks建议通过接口抽象解耦",
"4. **命名不一致**: INDEX 层 4 个任务以 DWS_ 开头,建议改为 IDX_ 前缀",
"5. **相似代码**: 检测到 768 对相似函数,建议提取公共逻辑",
"",
"### 8.3 优化建议\n",
"1. 按职责拆分 ods_tasks.py 和 dwd_load_task.py",
"2. 对复杂度 > 30 的函数使用策略模式或提取子函数",
"3. 引入接口层消除 quality ↔ tasks 循环依赖",
"4. 统一 INDEX 层任务命名前缀为 IDX_",
"5. 审查相似函数对,提取公共基类或工具函数",
"",
f"> 详细分析见 [{data.architecture_file}](../{data.architecture_file})",
"",
]
return "\n".join(lines)
def _section_remaining(data: ReportData) -> str:
"""生成遗留问题章节。"""
lines = [
"## 9. 遗留问题\n",
f"{len(data.remaining_issues)} 项:\n",
]
for i, issue in enumerate(data.remaining_issues, 1):
lines.append(f"{i}. {issue}")
lines.append("")
return "\n".join(lines)
def generate_report(data: ReportData) -> str:
"""组装完整的 Debug 报告 Markdown。"""
sections = [
f"# ETL Flow 全流程调试报告\n",
f"> 生成时间: {data.generated_at}",
f"> 调试范围: `apps/etl/connectors/feiqiu/`",
"",
"## 目录\n",
"1. [概述](#1-概述)",
"2. [发现的问题列表](#2-发现的问题列表)",
"3. [修复措施](#3-修复措施)",
"4. [验证结果](#4-验证结果)",
"5. [全量更新统计](#5-全量更新统计)",
"6. [黑盒校验结果](#6-黑盒校验结果)",
"7. [性能分析摘要](#7-性能分析摘要)",
"8. [架构优化摘要](#8-架构优化摘要)",
"9. [遗留问题](#9-遗留问题)",
"",
_section_overview(data),
_section_issues(data),
_section_fixes(data),
_section_verification(data),
_section_full_refresh(data),
_section_blackbox(data),
_section_performance(data),
_section_architecture(data),
_section_remaining(data),
]
return "\n".join(sections)
# ---------------------------------------------------------------------------
# Main flow
# ---------------------------------------------------------------------------
def run(date_str: str | None = None) -> Path:
"""执行报告生成,返回输出文件路径。"""
logger = _setup_logging()
logger.info("=== Debug 报告生成 ===")
if date_str is None:
date_str = datetime.now().strftime("%Y%m%d")
data = ReportData()
data.generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Stage 1: property-test stats
    logger.info("加载属性测试统计...")
    data.property_test_summary = count_property_tests(logger)
    # Stage 2: full refresh
    logger.info("加载全量刷新数据...")
    data.full_refresh = load_full_refresh(logger)
    # Stage 3: black-box verification
    logger.info("加载黑盒校验数据...")
    data.blackbox = load_blackbox(logger)
    # Stage 4: architecture report
    arch_path = _find_latest_report("architecture_report_*.md", logger)
    data.architecture_file = arch_path.name if arch_path else ""
    # Stage 5: performance report
    perf_path = _find_latest_report("performance_report_*.md", logger)
    data.performance_file = perf_path.name if perf_path else ""
    # Defects and open issues
    data.bugs = get_known_bugs()
    data.remaining_issues = get_remaining_issues()
    # Build the report
logger.info("生成报告...")
report_md = generate_report(data)
    # Write the output file
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
output_path = REPORTS_DIR / f"debug_report_{date_str}.md"
output_path.write_text(report_md, encoding="utf-8")
logger.info("报告已写入: %s", output_path)
return output_path
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="生成 ETL Debug 汇总报告")
parser.add_argument(
"--date",
default=None,
help="报告日期YYYYMMDD默认使用当天日期",
)
return parser.parse_args()
def main():
args = parse_args()
try:
output = run(date_str=args.date)
print(f"\n✓ 报告已生成: {output}")
except Exception as e:
print(f"\n✗ 报告生成失败: {e}", file=sys.stderr)
raise
if __name__ == "__main__":
main()
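# Example (module path assumed):
#   python -m scripts.debug.generate_report --date 20260216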

File diff suppressed because it is too large


@@ -0,0 +1,608 @@
[
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_DEFINITIONS",
"status": "PASS",
"message": "全部 7 种 Flow 定义完整",
"details": {
"expected": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"actual": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"missing": [],
"extra": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_LAYER_MAPPING",
"status": "PASS",
"message": "所有 Flow 层映射正确",
"details": {
"total_flows": 7,
"mismatches": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "INVALID_FLOW_REJECTION",
"status": "FAIL",
"message": "以下无效 Flow 未被拒绝: ['nonexistent', 'API_ODS', 'full', '', 'api_full_extra']",
"details": {
"tested": [
"nonexistent",
"API_ODS",
"full",
"",
"api_full_extra"
],
"correctly_rejected": [],
"missed": [
"nonexistent",
"API_ODS",
"full",
"",
"api_full_extra"
]
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_TASK_RESOLUTION",
"status": "PASS",
"message": "所有 7 种 Flow 任务解析正确",
"details": {
"flow_tasks": {
"api_ods": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET"
],
"api_ods_dwd": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET",
"DWD_LOAD_FROM_ODS"
],
"api_full": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET",
"DWD_LOAD_FROM_ODS",
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
],
"ods_dwd": [
"DWD_LOAD_FROM_ODS"
],
"dwd_dws": [
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY"
],
"dwd_dws_index": [
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
],
"dwd_index": [
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
]
},
"issues": []
},
"duration_sec": 0.0016,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "TASK_REGISTRY_LAYERS",
"status": "PASS",
"message": "各层任务数量正确 (ODS=23, DWD=2, DWS=15, INDEX=4)",
"details": {
"ODS": {
"expected": 23,
"actual": 23,
"tasks": [
"ODS_ASSISTANT_ABOLISH",
"ODS_ASSISTANT_ACCOUNT",
"ODS_ASSISTANT_LEDGER",
"ODS_GOODS_CATEGORY",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_GROUP_PACKAGE",
"ODS_INVENTORY_CHANGE",
"ODS_INVENTORY_STOCK",
"ODS_MEMBER",
"ODS_MEMBER_BALANCE",
"ODS_MEMBER_CARD",
"ODS_PAYMENT",
"ODS_PLATFORM_COUPON",
"ODS_RECHARGE_SETTLE",
"ODS_REFUND",
"ODS_SETTLEMENT_RECORDS",
"ODS_SETTLEMENT_TICKET",
"ODS_STORE_GOODS",
"ODS_STORE_GOODS_SALES",
"ODS_TABLES",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TABLE_USE",
"ODS_TENANT_GOODS"
]
},
"DWD": {
"expected": 2,
"actual": 2,
"tasks": [
"DWD_LOAD_FROM_ODS",
"DWD_QUALITY_CHECK"
]
},
"DWS": {
"expected": 15,
"actual": 15,
"tasks": [
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_FINANCE",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_SALARY",
"DWS_BUILD_ORDER_SUMMARY",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_RECHARGE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_RETENTION_CLEANUP"
]
},
"INDEX": {
"expected": 4,
"actual": 4,
"tasks": [
"DWS_ML_MANUAL_IMPORT",
"DWS_NEWCONV_INDEX",
"DWS_RELATION_INDEX",
"DWS_WINBACK_INDEX"
]
},
"TOTAL": {
"actual": 52
}
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "UTILITY_TASK_IDENTIFICATION",
"status": "PASS",
"message": "工具类任务识别正确 (6 个工具类, 6 个 ETL 类)",
"details": {
"utility_tasks": {
"MANUAL_INGEST": true,
"INIT_ODS_SCHEMA": true,
"INIT_DWD_SCHEMA": true,
"INIT_DWS_SCHEMA": true,
"ODS_JSON_ARCHIVE": true,
"CHECK_CUTOFF": true
},
"etl_tasks": {
"ODS_MEMBER": false,
"ODS_ORDER": false,
"ODS_PAYMENT": false,
"DWD_LOAD_FROM_ODS": false,
"DWS_ASSISTANT_DAILY": false,
"DWS_FINANCE_DAILY": false
},
"index_tasks_utility_status": {
"DWS_WINBACK_INDEX": true,
"DWS_NEWCONV_INDEX": true,
"DWS_ML_MANUAL_IMPORT": true,
"DWS_RELATION_INDEX": true
},
"issues": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "TASK_DISPATCH_PATHS",
"status": "PASS",
"message": "任务分发路径正确 (utility=13, ods=23, standard=16)",
"details": {
"path_counts": {
"utility": 13,
"standard": 16,
"ods": 23
},
"issues": [],
"sample_dispatch": {
"CHECK_CUTOFF": {
"layer": null,
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DATA_INTEGRITY_CHECK": {
"layer": null,
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DWD_LOAD_FROM_ODS": {
"layer": "DWD",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWD_QUALITY_CHECK": {
"layer": "DWD",
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DWS_ASSISTANT_CUSTOMER": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_DAILY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_FINANCE": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_MONTHLY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_SALARY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_BUILD_ORDER_SUMMARY": {
"layer": "DWS",
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
}
}
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_MANAGER_INTERFACE",
"status": "PASS",
"message": "CursorManager 接口签名正确 (get_or_create, advance)",
"details": {
"method_signatures": {
"get_or_create": [
"self",
"task_id",
"store_id"
],
"advance": [
"self",
"task_id",
"store_id",
"window_start",
"window_end",
"run_id",
"last_id"
]
},
"issues": []
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_ADVANCE_SQL",
"status": "PASS",
"message": "游标推进 SQL 逻辑正确",
"details": {
"checks": [
"✓ 使用 UPDATE meta.etl_cursor",
"✓ 使用 GREATEST 保护 last_id 不回退",
"✓ 调用 commit() 持久化",
"✓ last_id 参数可选(有 None 分支)",
"✓ 更新 updated_at 时间戳"
],
"issues": []
},
"duration_sec": 0.0006,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_SKIP_UTILITY",
"status": "FAIL",
"message": "工具类任务游标跳过逻辑有问题: _run_utility_task 中出现了 cursor 相关调用",
"details": {
"checks": [
"✓ run_single_task 检查 is_utility_task 并分发到 _run_utility_task",
"✓ _run_utility_task 不调用 run_tracker.create_run"
],
"issues": [
"_run_utility_task 中出现了 cursor 相关调用"
]
},
"duration_sec": 0.0018,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_DATA_SOURCE",
"status": "PASS",
"message": "全部 7 个 data_source 解析用例通过",
"details": {
"test_cases": [
{
"case": "默认值",
"expected": "hybrid",
"actual": "hybrid"
},
{
"case": "--data-source online",
"expected": "online",
"actual": "online"
},
{
"case": "--data-source offline",
"expected": "offline",
"actual": "offline"
},
{
"case": "--pipeline-flow FULL",
"expected": "hybrid",
"actual": "hybrid",
"deprecation_warning": true
},
{
"case": "--pipeline-flow FETCH_ONLY",
"expected": "online",
"actual": "online"
},
{
"case": "--pipeline-flow INGEST_ONLY",
"expected": "offline",
"actual": "offline"
},
{
"case": "--data-source online + --pipeline-flow INGEST_ONLY",
"expected": "online",
"actual": "online"
}
],
"issues": []
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_MODE_DETECTION",
"status": "PASS",
"message": "CLI Flow/传统模式检测逻辑正确",
"details": {
"checks": [
"✓ 有 --pipeline 参数时使用 PipelineRunnerFlow 模式)",
"✓ 无 --pipeline 参数时使用 run_tasks传统模式",
"✓ 调用 resolve_data_source 解析数据源模式",
"✓ 支持 --lookback-hours 回溯窗口",
"✓ 设置 window_override 确保任务使用指定窗口"
],
"issues": []
},
"duration_sec": 0.0015,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_PIPELINE_CHOICES",
"status": "PASS",
"message": "CLI --pipeline 可选值与 PIPELINE_LAYERS 完全一致 (7 种)",
"details": {
"pipeline_layers_keys": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"cli_choices": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"missing_in_cli": [],
"extra_in_cli": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "PROCESSING_MODES",
"status": "PASS",
"message": "三种处理模式increment_only/verify_only/increment_verify逻辑正确",
"details": {
"checks": [
"✓ 支持 verify_only 模式",
"✓ verify_only 调用 _run_verification",
"✓ 支持 increment_verify 模式",
"✓ 支持 fetch_before_verify 参数(校验前先获取 API 数据)",
"✓ _run_verification 方法存在"
],
"issues": []
},
"duration_sec": 0.0012,
"error_detail": null,
"fix_applied": null
}
]


@@ -0,0 +1,607 @@
[
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_DEFINITIONS",
"status": "PASS",
"message": "全部 7 种 Flow 定义完整",
"details": {
"expected": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"actual": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"missing": [],
"extra": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_LAYER_MAPPING",
"status": "PASS",
"message": "所有 Flow 层映射正确",
"details": {
"total_flows": 7,
"mismatches": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "INVALID_FLOW_REJECTION",
"status": "PASS",
"message": "全部 5 个无效 Flow 名称被正确拒绝",
"details": {
"tested": [
"nonexistent",
"API_ODS",
"full",
"",
"api_full_extra"
],
"correctly_rejected": [
"nonexistent",
"API_ODS",
"full",
"",
"api_full_extra"
],
"missed": []
},
"duration_sec": 0.0008,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_TASK_RESOLUTION",
"status": "PASS",
"message": "所有 7 种 Flow 任务解析正确",
"details": {
"flow_tasks": {
"api_ods": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET"
],
"api_ods_dwd": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET",
"DWD_LOAD_FROM_ODS"
],
"api_full": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET",
"DWD_LOAD_FROM_ODS",
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
],
"ods_dwd": [
"DWD_LOAD_FROM_ODS"
],
"dwd_dws": [
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY"
],
"dwd_dws_index": [
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
],
"dwd_index": [
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
]
},
"issues": []
},
"duration_sec": 0.002,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "TASK_REGISTRY_LAYERS",
"status": "PASS",
"message": "各层任务数量正确 (ODS=23, DWD=2, DWS=15, INDEX=4)",
"details": {
"ODS": {
"expected": 23,
"actual": 23,
"tasks": [
"ODS_ASSISTANT_ABOLISH",
"ODS_ASSISTANT_ACCOUNT",
"ODS_ASSISTANT_LEDGER",
"ODS_GOODS_CATEGORY",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_GROUP_PACKAGE",
"ODS_INVENTORY_CHANGE",
"ODS_INVENTORY_STOCK",
"ODS_MEMBER",
"ODS_MEMBER_BALANCE",
"ODS_MEMBER_CARD",
"ODS_PAYMENT",
"ODS_PLATFORM_COUPON",
"ODS_RECHARGE_SETTLE",
"ODS_REFUND",
"ODS_SETTLEMENT_RECORDS",
"ODS_SETTLEMENT_TICKET",
"ODS_STORE_GOODS",
"ODS_STORE_GOODS_SALES",
"ODS_TABLES",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TABLE_USE",
"ODS_TENANT_GOODS"
]
},
"DWD": {
"expected": 2,
"actual": 2,
"tasks": [
"DWD_LOAD_FROM_ODS",
"DWD_QUALITY_CHECK"
]
},
"DWS": {
"expected": 15,
"actual": 15,
"tasks": [
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_FINANCE",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_SALARY",
"DWS_BUILD_ORDER_SUMMARY",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_RECHARGE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_RETENTION_CLEANUP"
]
},
"INDEX": {
"expected": 4,
"actual": 4,
"tasks": [
"DWS_ML_MANUAL_IMPORT",
"DWS_NEWCONV_INDEX",
"DWS_RELATION_INDEX",
"DWS_WINBACK_INDEX"
]
},
"TOTAL": {
"actual": 52
}
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "UTILITY_TASK_IDENTIFICATION",
"status": "PASS",
"message": "工具类任务识别正确 (6 个工具类, 6 个 ETL 类)",
"details": {
"utility_tasks": {
"MANUAL_INGEST": true,
"INIT_ODS_SCHEMA": true,
"INIT_DWD_SCHEMA": true,
"INIT_DWS_SCHEMA": true,
"ODS_JSON_ARCHIVE": true,
"CHECK_CUTOFF": true
},
"etl_tasks": {
"ODS_MEMBER": false,
"ODS_ORDER": false,
"ODS_PAYMENT": false,
"DWD_LOAD_FROM_ODS": false,
"DWS_ASSISTANT_DAILY": false,
"DWS_FINANCE_DAILY": false
},
"index_tasks_utility_status": {
"DWS_WINBACK_INDEX": true,
"DWS_NEWCONV_INDEX": true,
"DWS_ML_MANUAL_IMPORT": true,
"DWS_RELATION_INDEX": true
},
"issues": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "TASK_DISPATCH_PATHS",
"status": "PASS",
"message": "任务分发路径正确 (utility=13, ods=23, standard=16)",
"details": {
"path_counts": {
"utility": 13,
"standard": 16,
"ods": 23
},
"issues": [],
"sample_dispatch": {
"CHECK_CUTOFF": {
"layer": null,
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DATA_INTEGRITY_CHECK": {
"layer": null,
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DWD_LOAD_FROM_ODS": {
"layer": "DWD",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWD_QUALITY_CHECK": {
"layer": "DWD",
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DWS_ASSISTANT_CUSTOMER": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_DAILY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_FINANCE": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_MONTHLY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_SALARY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_BUILD_ORDER_SUMMARY": {
"layer": "DWS",
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
}
}
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_MANAGER_INTERFACE",
"status": "PASS",
"message": "CursorManager 接口签名正确 (get_or_create, advance)",
"details": {
"method_signatures": {
"get_or_create": [
"self",
"task_id",
"store_id"
],
"advance": [
"self",
"task_id",
"store_id",
"window_start",
"window_end",
"run_id",
"last_id"
]
},
"issues": []
},
"duration_sec": 0.0002,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_ADVANCE_SQL",
"status": "PASS",
"message": "游标推进 SQL 逻辑正确",
"details": {
"checks": [
"✓ 使用 UPDATE meta.etl_cursor",
"✓ 使用 GREATEST 保护 last_id 不回退",
"✓ 调用 commit() 持久化",
"✓ last_id 参数可选(有 None 分支)",
"✓ 更新 updated_at 时间戳"
],
"issues": []
},
"duration_sec": 0.0005,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_SKIP_UTILITY",
"status": "PASS",
"message": "工具类任务正确跳过游标管理和运行记录",
"details": {
"checks": [
"✓ run_single_task 检查 is_utility_task 并分发到 _run_utility_task",
"✓ _run_utility_task 不调用 cursor_mgr",
"✓ _run_utility_task 不调用 run_tracker.create_run"
],
"issues": []
},
"duration_sec": 0.0019,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_DATA_SOURCE",
"status": "PASS",
"message": "全部 7 个 data_source 解析用例通过",
"details": {
"test_cases": [
{
"case": "默认值",
"expected": "hybrid",
"actual": "hybrid"
},
{
"case": "--data-source online",
"expected": "online",
"actual": "online"
},
{
"case": "--data-source offline",
"expected": "offline",
"actual": "offline"
},
{
"case": "--pipeline-flow FULL",
"expected": "hybrid",
"actual": "hybrid",
"deprecation_warning": true
},
{
"case": "--pipeline-flow FETCH_ONLY",
"expected": "online",
"actual": "online"
},
{
"case": "--pipeline-flow INGEST_ONLY",
"expected": "offline",
"actual": "offline"
},
{
"case": "--data-source online + --pipeline-flow INGEST_ONLY",
"expected": "online",
"actual": "online"
}
],
"issues": []
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_MODE_DETECTION",
"status": "PASS",
"message": "CLI Flow/传统模式检测逻辑正确",
"details": {
"checks": [
"✓ 有 --pipeline 参数时使用 PipelineRunnerFlow 模式)",
"✓ 无 --pipeline 参数时使用 run_tasks传统模式",
"✓ 调用 resolve_data_source 解析数据源模式",
"✓ 支持 --lookback-hours 回溯窗口",
"✓ 设置 window_override 确保任务使用指定窗口"
],
"issues": []
},
"duration_sec": 0.0011,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_PIPELINE_CHOICES",
"status": "PASS",
"message": "CLI --pipeline 可选值与 PIPELINE_LAYERS 完全一致 (7 种)",
"details": {
"pipeline_layers_keys": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"cli_choices": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"missing_in_cli": [],
"extra_in_cli": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "PROCESSING_MODES",
"status": "PASS",
"message": "三种处理模式increment_only/verify_only/increment_verify逻辑正确",
"details": {
"checks": [
"✓ 支持 verify_only 模式",
"✓ verify_only 调用 _run_verification",
"✓ 支持 increment_verify 模式",
"✓ 支持 fetch_before_verify 参数(校验前先获取 API 数据)",
"✓ _run_verification 方法存在"
],
"issues": []
},
"duration_sec": 0.0019,
"error_detail": null,
"fix_applied": null
}
]


@@ -0,0 +1,607 @@
[
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_DEFINITIONS",
"status": "PASS",
"message": "全部 7 种 Flow 定义完整",
"details": {
"expected": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"actual": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"missing": [],
"extra": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_LAYER_MAPPING",
"status": "PASS",
"message": "所有 Flow 层映射正确",
"details": {
"total_flows": 7,
"mismatches": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "INVALID_FLOW_REJECTION",
"status": "PASS",
"message": "全部 5 个无效 Flow 名称被正确拒绝",
"details": {
"tested": [
"nonexistent",
"API_ODS",
"full",
"",
"api_full_extra"
],
"correctly_rejected": [
"nonexistent",
"API_ODS",
"full",
"",
"api_full_extra"
],
"missed": []
},
"duration_sec": 0.0008,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "FLOW_TASK_RESOLUTION",
"status": "PASS",
"message": "所有 7 种 Flow 任务解析正确",
"details": {
"flow_tasks": {
"api_ods": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET"
],
"api_ods_dwd": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET",
"DWD_LOAD_FROM_ODS"
],
"api_full": [
"ODS_ASSISTANT_ACCOUNT",
"ODS_SETTLEMENT_RECORDS",
"ODS_TABLE_USE",
"ODS_ASSISTANT_LEDGER",
"ODS_ASSISTANT_ABOLISH",
"ODS_STORE_GOODS_SALES",
"ODS_PAYMENT",
"ODS_REFUND",
"ODS_PLATFORM_COUPON",
"ODS_MEMBER",
"ODS_MEMBER_CARD",
"ODS_MEMBER_BALANCE",
"ODS_RECHARGE_SETTLE",
"ODS_GROUP_PACKAGE",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_INVENTORY_STOCK",
"ODS_INVENTORY_CHANGE",
"ODS_TABLES",
"ODS_GOODS_CATEGORY",
"ODS_STORE_GOODS",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TENANT_GOODS",
"ODS_SETTLEMENT_TICKET",
"DWD_LOAD_FROM_ODS",
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
],
"ods_dwd": [
"DWD_LOAD_FROM_ODS"
],
"dwd_dws": [
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY"
],
"dwd_dws_index": [
"DWS_BUILD_ORDER_SUMMARY",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_SALARY",
"DWS_ASSISTANT_FINANCE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_RECHARGE",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_RETENTION_CLEANUP",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
],
"dwd_index": [
"DWS_WINBACK_INDEX",
"DWS_NEWCONV_INDEX",
"DWS_ML_MANUAL_IMPORT",
"DWS_RELATION_INDEX"
]
},
"issues": []
},
"duration_sec": 0.0012,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "TASK_REGISTRY_LAYERS",
"status": "PASS",
"message": "各层任务数量正确 (ODS=23, DWD=2, DWS=15, INDEX=4)",
"details": {
"ODS": {
"expected": 23,
"actual": 23,
"tasks": [
"ODS_ASSISTANT_ABOLISH",
"ODS_ASSISTANT_ACCOUNT",
"ODS_ASSISTANT_LEDGER",
"ODS_GOODS_CATEGORY",
"ODS_GROUP_BUY_REDEMPTION",
"ODS_GROUP_PACKAGE",
"ODS_INVENTORY_CHANGE",
"ODS_INVENTORY_STOCK",
"ODS_MEMBER",
"ODS_MEMBER_BALANCE",
"ODS_MEMBER_CARD",
"ODS_PAYMENT",
"ODS_PLATFORM_COUPON",
"ODS_RECHARGE_SETTLE",
"ODS_REFUND",
"ODS_SETTLEMENT_RECORDS",
"ODS_SETTLEMENT_TICKET",
"ODS_STORE_GOODS",
"ODS_STORE_GOODS_SALES",
"ODS_TABLES",
"ODS_TABLE_FEE_DISCOUNT",
"ODS_TABLE_USE",
"ODS_TENANT_GOODS"
]
},
"DWD": {
"expected": 2,
"actual": 2,
"tasks": [
"DWD_LOAD_FROM_ODS",
"DWD_QUALITY_CHECK"
]
},
"DWS": {
"expected": 15,
"actual": 15,
"tasks": [
"DWS_ASSISTANT_CUSTOMER",
"DWS_ASSISTANT_DAILY",
"DWS_ASSISTANT_FINANCE",
"DWS_ASSISTANT_MONTHLY",
"DWS_ASSISTANT_SALARY",
"DWS_BUILD_ORDER_SUMMARY",
"DWS_FINANCE_DAILY",
"DWS_FINANCE_DISCOUNT_DETAIL",
"DWS_FINANCE_INCOME_STRUCTURE",
"DWS_FINANCE_RECHARGE",
"DWS_MEMBER_CONSUMPTION",
"DWS_MEMBER_VISIT",
"DWS_MV_REFRESH_ASSISTANT_DAILY",
"DWS_MV_REFRESH_FINANCE_DAILY",
"DWS_RETENTION_CLEANUP"
]
},
"INDEX": {
"expected": 4,
"actual": 4,
"tasks": [
"DWS_ML_MANUAL_IMPORT",
"DWS_NEWCONV_INDEX",
"DWS_RELATION_INDEX",
"DWS_WINBACK_INDEX"
]
},
"TOTAL": {
"actual": 52
}
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "UTILITY_TASK_IDENTIFICATION",
"status": "PASS",
"message": "工具类任务识别正确 (6 个工具类, 6 个 ETL 类)",
"details": {
"utility_tasks": {
"MANUAL_INGEST": true,
"INIT_ODS_SCHEMA": true,
"INIT_DWD_SCHEMA": true,
"INIT_DWS_SCHEMA": true,
"ODS_JSON_ARCHIVE": true,
"CHECK_CUTOFF": true
},
"etl_tasks": {
"ODS_MEMBER": false,
"ODS_ORDER": false,
"ODS_PAYMENT": false,
"DWD_LOAD_FROM_ODS": false,
"DWS_ASSISTANT_DAILY": false,
"DWS_FINANCE_DAILY": false
},
"index_tasks_utility_status": {
"DWS_WINBACK_INDEX": true,
"DWS_NEWCONV_INDEX": true,
"DWS_ML_MANUAL_IMPORT": true,
"DWS_RELATION_INDEX": true
},
"issues": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "TASK_DISPATCH_PATHS",
"status": "PASS",
"message": "任务分发路径正确 (utility=13, ods=23, standard=16)",
"details": {
"path_counts": {
"utility": 13,
"standard": 16,
"ods": 23
},
"issues": [],
"sample_dispatch": {
"CHECK_CUTOFF": {
"layer": null,
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DATA_INTEGRITY_CHECK": {
"layer": null,
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DWD_LOAD_FROM_ODS": {
"layer": "DWD",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWD_QUALITY_CHECK": {
"layer": "DWD",
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
},
"DWS_ASSISTANT_CUSTOMER": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_DAILY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_FINANCE": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_MONTHLY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_ASSISTANT_SALARY": {
"layer": "DWS",
"is_utility": false,
"is_ods": false,
"dispatch_path": "standard"
},
"DWS_BUILD_ORDER_SUMMARY": {
"layer": "DWS",
"is_utility": true,
"is_ods": false,
"dispatch_path": "utility"
}
}
},
"duration_sec": 0.0001,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_MANAGER_INTERFACE",
"status": "PASS",
"message": "CursorManager 接口签名正确 (get_or_create, advance)",
"details": {
"method_signatures": {
"get_or_create": [
"self",
"task_id",
"store_id"
],
"advance": [
"self",
"task_id",
"store_id",
"window_start",
"window_end",
"run_id",
"last_id"
]
},
"issues": []
},
"duration_sec": 0.0002,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_ADVANCE_SQL",
"status": "PASS",
"message": "游标推进 SQL 逻辑正确",
"details": {
"checks": [
"✓ 使用 UPDATE meta.etl_cursor",
"✓ 使用 GREATEST 保护 last_id 不回退",
"✓ 调用 commit() 持久化",
"✓ last_id 参数可选(有 None 分支)",
"✓ 更新 updated_at 时间戳"
],
"issues": []
},
"duration_sec": 0.0005,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CURSOR_SKIP_UTILITY",
"status": "PASS",
"message": "工具类任务正确跳过游标管理和运行记录",
"details": {
"checks": [
"✓ run_single_task 检查 is_utility_task 并分发到 _run_utility_task",
"✓ _run_utility_task 不调用 cursor_mgr",
"✓ _run_utility_task 不调用 run_tracker.create_run"
],
"issues": []
},
"duration_sec": 0.002,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_DATA_SOURCE",
"status": "PASS",
"message": "全部 7 个 data_source 解析用例通过",
"details": {
"test_cases": [
{
"case": "默认值",
"expected": "hybrid",
"actual": "hybrid"
},
{
"case": "--data-source online",
"expected": "online",
"actual": "online"
},
{
"case": "--data-source offline",
"expected": "offline",
"actual": "offline"
},
{
"case": "--pipeline-flow FULL",
"expected": "hybrid",
"actual": "hybrid",
"deprecation_warning": true
},
{
"case": "--pipeline-flow FETCH_ONLY",
"expected": "online",
"actual": "online"
},
{
"case": "--pipeline-flow INGEST_ONLY",
"expected": "offline",
"actual": "offline"
},
{
"case": "--data-source online + --pipeline-flow INGEST_ONLY",
"expected": "online",
"actual": "online"
}
],
"issues": []
},
"duration_sec": 0.0002,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_MODE_DETECTION",
"status": "PASS",
"message": "CLI Flow/传统模式检测逻辑正确",
"details": {
"checks": [
"✓ 有 --pipeline 参数时使用 PipelineRunnerFlow 模式)",
"✓ 无 --pipeline 参数时使用 run_tasks传统模式",
"✓ 调用 resolve_data_source 解析数据源模式",
"✓ 支持 --lookback-hours 回溯窗口",
"✓ 设置 window_override 确保任务使用指定窗口"
],
"issues": []
},
"duration_sec": 0.0017,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "CLI_PIPELINE_CHOICES",
"status": "PASS",
"message": "CLI --pipeline 可选值与 PIPELINE_LAYERS 完全一致 (7 种)",
"details": {
"pipeline_layers_keys": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"cli_choices": [
"api_full",
"api_ods",
"api_ods_dwd",
"dwd_dws",
"dwd_dws_index",
"dwd_index",
"ods_dwd"
],
"missing_in_cli": [],
"extra_in_cli": []
},
"duration_sec": 0.0,
"error_detail": null,
"fix_applied": null
},
{
"layer": "ORCHESTRATION",
"task_code": "PROCESSING_MODES",
"status": "PASS",
"message": "三种处理模式increment_only/verify_only/increment_verify逻辑正确",
"details": {
"checks": [
"✓ 支持 verify_only 模式",
"✓ verify_only 调用 _run_verification",
"✓ 支持 increment_verify 模式",
"✓ 支持 fetch_before_verify 参数(校验前先获取 API 数据)",
"✓ _run_verification 方法存在"
],
"issues": []
},
"duration_sec": 0.0014,
"error_detail": null,
"fix_applied": null
}
]


@@ -0,0 +1,954 @@
{
"flow": "api_full",
"window_start": "2026-01-01T00:00:00",
"window_end": "2026-02-16T00:00:00",
"overall_start": "2026-02-16T02:00:29.172416+08:00",
"overall_end": "2026-02-16T02:01:31.080429+08:00",
"overall_duration_sec": 60.681,
"overall_status": "PARTIAL",
"layers": [
{
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.399132+08:00",
"end_time": "2026-02-16T02:00:32.176952+08:00",
"duration_sec": 1.778,
"status": "ERROR",
"task_count": 23,
"success_count": 0,
"fail_count": 23,
"skip_count": 0,
"total_fetched": 0,
"total_inserted": 0,
"total_updated": 0,
"total_errors": 0,
"tasks": [
{
"task_code": "ODS_ASSISTANT_ABOLISH",
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.404857+08:00",
"end_time": "2026-02-16T02:00:30.626099+08:00",
"duration_sec": 0.221,
"status": "ERROR",
"counts": {},
"error": "错误: 重复键违反唯一约束\"etl_run_pkey\"\nDETAIL: 键值\"(run_id)=(1)\" 已经存在\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_ASSISTANT_ACCOUNT",
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.631572+08:00",
"end_time": "2026-02-16T02:00:30.692791+08:00",
"duration_sec": 0.061,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_ASSISTANT_LEDGER",
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.696161+08:00",
"end_time": "2026-02-16T02:00:30.749470+08:00",
"duration_sec": 0.053,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_GOODS_CATEGORY",
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.768981+08:00",
"end_time": "2026-02-16T02:00:30.820272+08:00",
"duration_sec": 0.051,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_GROUP_BUY_REDEMPTION",
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.823809+08:00",
"end_time": "2026-02-16T02:00:30.886627+08:00",
"duration_sec": 0.063,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_GROUP_PACKAGE",
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.889777+08:00",
"end_time": "2026-02-16T02:00:30.949457+08:00",
"duration_sec": 0.06,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_INVENTORY_CHANGE",
"layer": "ODS",
"start_time": "2026-02-16T02:00:30.951549+08:00",
"end_time": "2026-02-16T02:00:31.001887+08:00",
"duration_sec": 0.05,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_INVENTORY_STOCK",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.004960+08:00",
"end_time": "2026-02-16T02:00:31.154070+08:00",
"duration_sec": 0.149,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_MEMBER",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.196785+08:00",
"end_time": "2026-02-16T02:00:31.277524+08:00",
"duration_sec": 0.081,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_MEMBER_BALANCE",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.282884+08:00",
"end_time": "2026-02-16T02:00:31.389490+08:00",
"duration_sec": 0.107,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_MEMBER_CARD",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.391553+08:00",
"end_time": "2026-02-16T02:00:31.443622+08:00",
"duration_sec": 0.052,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_PAYMENT",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.455259+08:00",
"end_time": "2026-02-16T02:00:31.516596+08:00",
"duration_sec": 0.061,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_PLATFORM_COUPON",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.519180+08:00",
"end_time": "2026-02-16T02:00:31.571746+08:00",
"duration_sec": 0.053,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_RECHARGE_SETTLE",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.587139+08:00",
"end_time": "2026-02-16T02:00:31.645916+08:00",
"duration_sec": 0.059,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_REFUND",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.649692+08:00",
"end_time": "2026-02-16T02:00:31.707211+08:00",
"duration_sec": 0.057,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_SETTLEMENT_RECORDS",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.709718+08:00",
"end_time": "2026-02-16T02:00:31.762221+08:00",
"duration_sec": 0.052,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_SETTLEMENT_TICKET",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.772867+08:00",
"end_time": "2026-02-16T02:00:31.831914+08:00",
"duration_sec": 0.059,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_STORE_GOODS",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.835277+08:00",
"end_time": "2026-02-16T02:00:31.890692+08:00",
"duration_sec": 0.055,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_STORE_GOODS_SALES",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.900307+08:00",
"end_time": "2026-02-16T02:00:31.950987+08:00",
"duration_sec": 0.051,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TABLES",
"layer": "ODS",
"start_time": "2026-02-16T02:00:31.953280+08:00",
"end_time": "2026-02-16T02:00:32.003567+08:00",
"duration_sec": 0.05,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TABLE_FEE_DISCOUNT",
"layer": "ODS",
"start_time": "2026-02-16T02:00:32.013166+08:00",
"end_time": "2026-02-16T02:00:32.064346+08:00",
"duration_sec": 0.051,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TABLE_USE",
"layer": "ODS",
"start_time": "2026-02-16T02:00:32.066702+08:00",
"end_time": "2026-02-16T02:00:32.120437+08:00",
"duration_sec": 0.054,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TENANT_GOODS",
"layer": "ODS",
"start_time": "2026-02-16T02:00:32.124485+08:00",
"end_time": "2026-02-16T02:00:32.174414+08:00",
"duration_sec": 0.05,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
}
]
},
{
"layer": "DWD",
"start_time": "2026-02-16T02:00:32.180137+08:00",
"end_time": "2026-02-16T02:00:32.288850+08:00",
"duration_sec": 0.109,
"status": "ERROR",
"task_count": 1,
"success_count": 0,
"fail_count": 1,
"skip_count": 0,
"total_fetched": 0,
"total_inserted": 0,
"total_updated": 0,
"total_errors": 0,
"tasks": [
{
"task_code": "DWD_LOAD_FROM_ODS",
"layer": "DWD",
"start_time": "2026-02-16T02:00:32.187417+08:00",
"end_time": "2026-02-16T02:00:32.270397+08:00",
"duration_sec": 0.083,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
}
]
},
{
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.434597+08:00",
"end_time": "2026-02-16T02:00:33.733537+08:00",
"duration_sec": 1.299,
"status": "ERROR",
"task_count": 15,
"success_count": 0,
"fail_count": 6,
"skip_count": 9,
"total_fetched": 0,
"total_inserted": 0,
"total_updated": 0,
"total_errors": 0,
"tasks": [
{
"task_code": "DWS_ASSISTANT_CUSTOMER",
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.438912+08:00",
"end_time": "2026-02-16T02:00:32.524164+08:00",
"duration_sec": 0.085,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.531949+08:00",
"end_time": "2026-02-16T02:00:32.584970+08:00",
"duration_sec": 0.053,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_FINANCE",
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.587080+08:00",
"end_time": "2026-02-16T02:00:32.640944+08:00",
"duration_sec": 0.054,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_MONTHLY",
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.651498+08:00",
"end_time": "2026-02-16T02:00:32.702636+08:00",
"duration_sec": 0.051,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_SALARY",
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.704713+08:00",
"end_time": "2026-02-16T02:00:32.765519+08:00",
"duration_sec": 0.061,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_BUILD_ORDER_SUMMARY",
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.778507+08:00",
"end_time": "2026-02-16T02:00:32.891324+08:00",
"duration_sec": 0.113,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:00:32.892786+08:00",
"end_time": "2026-02-16T02:00:33.011285+08:00",
"duration_sec": 0.118,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_DISCOUNT_DETAIL",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.015489+08:00",
"end_time": "2026-02-16T02:00:33.070572+08:00",
"duration_sec": 0.055,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_INCOME_STRUCTURE",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.072599+08:00",
"end_time": "2026-02-16T02:00:33.133539+08:00",
"duration_sec": 0.061,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_RECHARGE",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.135092+08:00",
"end_time": "2026-02-16T02:00:33.185416+08:00",
"duration_sec": 0.05,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MEMBER_CONSUMPTION",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.189842+08:00",
"end_time": "2026-02-16T02:00:33.256656+08:00",
"duration_sec": 0.067,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MEMBER_VISIT",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.258323+08:00",
"end_time": "2026-02-16T02:00:33.471349+08:00",
"duration_sec": 0.213,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MV_REFRESH_ASSISTANT_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.524696+08:00",
"end_time": "2026-02-16T02:00:33.614408+08:00",
"duration_sec": 0.09,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MV_REFRESH_FINANCE_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.621684+08:00",
"end_time": "2026-02-16T02:00:33.675725+08:00",
"duration_sec": 0.054,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_RETENTION_CLEANUP",
"layer": "DWS",
"start_time": "2026-02-16T02:00:33.677408+08:00",
"end_time": "2026-02-16T02:00:33.732228+08:00",
"duration_sec": 0.055,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
}
]
},
{
"layer": "INDEX",
"start_time": "2026-02-16T02:00:33.738360+08:00",
"end_time": "2026-02-16T02:00:37.081635+08:00",
"duration_sec": 3.343,
"status": "ERROR",
"task_count": 4,
"success_count": 0,
"fail_count": 4,
"skip_count": 0,
"total_fetched": 0,
"total_inserted": 0,
"total_updated": 0,
"total_errors": 0,
"tasks": [
{
"task_code": "DWS_ML_MANUAL_IMPORT",
"layer": "INDEX",
"start_time": "2026-02-16T02:00:33.740154+08:00",
"end_time": "2026-02-16T02:00:33.743497+08:00",
"duration_sec": 0.003,
"status": "ERROR",
"counts": {},
"error": "未找到 ML 台账文件,请通过环境变量 ML_MANUAL_LEDGER_FILE 或配置 run.ml_manual_ledger_file 指定",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_NEWCONV_INDEX",
"layer": "INDEX",
"start_time": "2026-02-16T02:00:33.744864+08:00",
"end_time": "2026-02-16T02:00:36.742473+08:00",
"duration_sec": 2.998,
"status": "ERROR",
"counts": {},
"error": "错误: 重复键违反唯一约束\"dws_index_percentile_history_pkey\"\nDETAIL: 键值\"(history_id)=(1)\" 已经存在\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_RELATION_INDEX",
"layer": "INDEX",
"start_time": "2026-02-16T02:00:36.747281+08:00",
"end_time": "2026-02-16T02:00:36.820768+08:00",
"duration_sec": 0.073,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_WINBACK_INDEX",
"layer": "INDEX",
"start_time": "2026-02-16T02:00:36.823642+08:00",
"end_time": "2026-02-16T02:00:37.079836+08:00",
"duration_sec": 0.256,
"status": "ERROR",
"counts": {},
"error": "错误: 当前事务被终止, 事务块结束之前的查询被忽略\n",
"api_calls": 0,
"api_total_sec": 0.0
}
]
}
],
"verification": {
"status": "COMPLETED",
"start_time": "2026-02-16T02:00:37.094515+08:00",
"end_time": "2026-02-16T02:01:31.080036+08:00",
"duration_sec": 53.985,
"total_tables": 15,
"consistent_tables": 7,
"total_backfilled": 163,
"error_tables": 3,
"layers": {
"ODS": {
"layer": "ODS",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-16T00:00:00+08:00",
"total_tables": 1,
"consistent_tables": 0,
"inconsistent_tables": 1,
"total_source_count": 0,
"total_target_count": 0,
"total_missing": 0,
"total_mismatch": 0,
"total_backfilled": 0,
"total_backfilled_missing": 0,
"total_backfilled_mismatch": 0,
"error_tables": 1,
"elapsed_seconds": 0.1302633285522461,
"status": "ERROR",
"results": [
{
"layer": "ODS",
"table": "assistant_accounts_master",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 0,
"target_count": 0,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "ERROR",
"elapsed_seconds": 0.1302633285522461,
"error_message": "获取 ODS hash 失败: assistant_accounts_master",
"details": {
"fatal": true
}
}
]
},
"DWD": {
"layer": "DWD",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-16T00:00:00+08:00",
"total_tables": 6,
"consistent_tables": 0,
"inconsistent_tables": 6,
"total_source_count": 163,
"total_target_count": 219,
"total_missing": 0,
"total_mismatch": 163,
"total_backfilled": 163,
"total_backfilled_missing": 0,
"total_backfilled_mismatch": 163,
"error_tables": 1,
"elapsed_seconds": 49.87700796127319,
"status": "ERROR",
"results": [
{
"layer": "DWD",
"table": "dim_site",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 1,
"target_count": 1,
"missing_count": 0,
"mismatch_count": 1,
"backfilled_count": 1,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 1,
"status": "BACKFILLED",
"elapsed_seconds": 14.668256521224976,
"error_message": null,
"details": {}
},
{
"layer": "DWD",
"table": "dim_site_ex",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 1,
"target_count": 1,
"missing_count": 0,
"mismatch_count": 1,
"backfilled_count": 1,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 1,
"status": "BACKFILLED",
"elapsed_seconds": 14.868768453598022,
"error_message": null,
"details": {}
},
{
"layer": "DWD",
"table": "dim_table",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 66,
"target_count": 74,
"missing_count": 0,
"mismatch_count": 66,
"backfilled_count": 66,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 66,
"status": "BACKFILLED",
"elapsed_seconds": 9.084474802017212,
"error_message": null,
"details": {}
},
{
"layer": "DWD",
"table": "dim_table_ex",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 66,
"target_count": 74,
"missing_count": 0,
"mismatch_count": 66,
"backfilled_count": 66,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 66,
"status": "BACKFILLED",
"elapsed_seconds": 7.42323637008667,
"error_message": null,
"details": {}
},
{
"layer": "DWD",
"table": "dim_assistant",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 29,
"target_count": 69,
"missing_count": 0,
"mismatch_count": 29,
"backfilled_count": 29,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 29,
"status": "BACKFILLED",
"elapsed_seconds": 3.5641441345214844,
"error_message": null,
"details": {}
},
{
"layer": "DWD",
"table": "dim_assistant_ex",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 0,
"target_count": 0,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "ERROR",
"elapsed_seconds": 0.2681276798248291,
"error_message": "获取 DWD hash 失败: dim_assistant_ex",
"details": {
"fatal": true
}
}
]
},
"DWS": {
"layer": "DWS",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-16T00:00:00+08:00",
"total_tables": 6,
"consistent_tables": 6,
"inconsistent_tables": 0,
"total_source_count": 1712,
"total_target_count": 1712,
"total_missing": 0,
"total_mismatch": 0,
"total_backfilled": 0,
"total_backfilled_missing": 0,
"total_backfilled_mismatch": 0,
"error_tables": 0,
"elapsed_seconds": 0.8101677894592285,
"status": "OK",
"results": [
{
"layer": "DWS",
"table": "dws_finance_daily_summary",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 31,
"target_count": 31,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "OK",
"elapsed_seconds": 0.17444229125976562,
"error_message": null,
"details": {}
},
{
"layer": "DWS",
"table": "dws_assistant_daily_detail",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 854,
"target_count": 854,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "OK",
"elapsed_seconds": 0.1419987678527832,
"error_message": null,
"details": {}
},
{
"layer": "DWS",
"table": "dws_member_visit_detail",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 564,
"target_count": 564,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "OK",
"elapsed_seconds": 0.1277296543121338,
"error_message": null,
"details": {}
},
{
"layer": "DWS",
"table": "dws_finance_daily_summary",
"window_start": "2026-02-01T00:00:00+08:00",
"window_end": "2026-02-16T00:00:00+08:00",
"source_count": 10,
"target_count": 10,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "OK",
"elapsed_seconds": 0.15016722679138184,
"error_message": null,
"details": {}
},
{
"layer": "DWS",
"table": "dws_assistant_daily_detail",
"window_start": "2026-02-01T00:00:00+08:00",
"window_end": "2026-02-16T00:00:00+08:00",
"source_count": 125,
"target_count": 125,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "OK",
"elapsed_seconds": 0.10792803764343262,
"error_message": null,
"details": {}
},
{
"layer": "DWS",
"table": "dws_member_visit_detail",
"window_start": "2026-02-01T00:00:00+08:00",
"window_end": "2026-02-16T00:00:00+08:00",
"source_count": 128,
"target_count": 128,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "OK",
"elapsed_seconds": 0.10790181159973145,
"error_message": null,
"details": {}
}
]
},
"INDEX": {
"layer": "INDEX",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-16T00:00:00+08:00",
"total_tables": 2,
"consistent_tables": 1,
"inconsistent_tables": 1,
"total_source_count": 94,
"total_target_count": 171,
"total_missing": 0,
"total_mismatch": 0,
"total_backfilled": 0,
"total_backfilled_missing": 0,
"total_backfilled_mismatch": 0,
"error_tables": 1,
"elapsed_seconds": 0.2954070568084717,
"status": "ERROR",
"results": [
{
"layer": "INDEX",
"table": "v_member_recall_priority",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 94,
"target_count": 171,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "OK",
"elapsed_seconds": 0.18698692321777344,
"error_message": null,
"details": {}
},
{
"layer": "INDEX",
"table": "dws_member_assistant_relation_index",
"window_start": "2026-01-01T00:00:00+08:00",
"window_end": "2026-02-01T00:00:00+08:00",
"source_count": 0,
"target_count": 0,
"missing_count": 0,
"mismatch_count": 0,
"backfilled_count": 0,
"backfilled_missing_count": 0,
"backfilled_mismatch_count": 0,
"status": "ERROR",
"elapsed_seconds": 0.10842013359069824,
"error_message": "获取源实体失败: dws_member_assistant_relation_index",
"details": {
"fatal": true
}
}
]
}
}
},
"environment": {
"store_id": 2790685415443269,
"db_name": "",
"api_base_url": "https://pc.ficoo.vip/apiprod/admin/v1/",
"timezone": "Asia/Shanghai"
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,5 @@
{
"last_completed_layer": "INDEX",
"last_completed_task": "DWS_WINBACK_INDEX",
"timestamp": "2026-02-16T02:21:00.006150+08:00"
}

View File

@@ -0,0 +1,774 @@
{
"flow": "api_full",
"window_start": "2026-01-01T00:00:00",
"window_end": "2026-02-16T00:00:00",
"overall_start": "2026-02-16T02:05:42.502194+08:00",
"overall_end": "",
"overall_duration_sec": 0.0,
"overall_status": "",
"layers": [
{
"layer": "ODS",
"start_time": "2026-02-16T02:05:43.228274+08:00",
"end_time": "2026-02-16T02:20:46.966053+08:00",
"duration_sec": 903.738,
"status": "SUCCESS",
"task_count": 23,
"success_count": 23,
"fail_count": 0,
"skip_count": 0,
"total_fetched": 280391,
"total_inserted": 2421,
"total_updated": 303,
"total_errors": 0,
"tasks": [
{
"task_code": "ODS_ASSISTANT_ABOLISH",
"layer": "ODS",
"start_time": "2026-02-16T02:05:43.229896+08:00",
"end_time": "2026-02-16T02:05:46.094082+08:00",
"duration_sec": 2.864,
"status": "SUCCESS",
"counts": {
"fetched": 37,
"inserted": 0,
"updated": 0,
"skipped": 37,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_ASSISTANT_ACCOUNT",
"layer": "ODS",
"start_time": "2026-02-16T02:05:46.095823+08:00",
"end_time": "2026-02-16T02:05:53.344708+08:00",
"duration_sec": 7.249,
"status": "SUCCESS",
"counts": {
"fetched": 345,
"inserted": 3,
"updated": 0,
"skipped": 342,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_ASSISTANT_LEDGER",
"layer": "ODS",
"start_time": "2026-02-16T02:05:53.350637+08:00",
"end_time": "2026-02-16T02:06:01.035404+08:00",
"duration_sec": 7.685,
"status": "SUCCESS",
"counts": {
"fetched": 998,
"inserted": 16,
"updated": 0,
"skipped": 982,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_GOODS_CATEGORY",
"layer": "ODS",
"start_time": "2026-02-16T02:06:01.038609+08:00",
"end_time": "2026-02-16T02:06:03.197268+08:00",
"duration_sec": 2.159,
"status": "SUCCESS",
"counts": {
"fetched": 45,
"inserted": 0,
"updated": 0,
"skipped": 45,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_GROUP_BUY_REDEMPTION",
"layer": "ODS",
"start_time": "2026-02-16T02:06:03.243603+08:00",
"end_time": "2026-02-16T02:08:51.258331+08:00",
"duration_sec": 168.015,
"status": "SUCCESS",
"counts": {
"fetched": 38860,
"inserted": 242,
"updated": 0,
"skipped": 38618,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_GROUP_PACKAGE",
"layer": "ODS",
"start_time": "2026-02-16T02:08:51.261789+08:00",
"end_time": "2026-02-16T02:08:54.850306+08:00",
"duration_sec": 3.588,
"status": "SUCCESS",
"counts": {
"fetched": 90,
"inserted": 0,
"updated": 0,
"skipped": 90,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_INVENTORY_CHANGE",
"layer": "ODS",
"start_time": "2026-02-16T02:08:54.854769+08:00",
"end_time": "2026-02-16T02:09:14.467632+08:00",
"duration_sec": 19.613,
"status": "SUCCESS",
"counts": {
"fetched": 7044,
"inserted": 601,
"updated": 0,
"skipped": 6443,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_INVENTORY_STOCK",
"layer": "ODS",
"start_time": "2026-02-16T02:09:14.469443+08:00",
"end_time": "2026-02-16T02:09:18.538862+08:00",
"duration_sec": 4.069,
"status": "SUCCESS",
"counts": {
"fetched": 865,
"inserted": 48,
"updated": 0,
"skipped": 817,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_MEMBER",
"layer": "ODS",
"start_time": "2026-02-16T02:09:18.540713+08:00",
"end_time": "2026-02-16T02:09:30.301232+08:00",
"duration_sec": 11.76,
"status": "SUCCESS",
"counts": {
"fetched": 2785,
"inserted": 14,
"updated": 0,
"skipped": 2771,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_MEMBER_BALANCE",
"layer": "ODS",
"start_time": "2026-02-16T02:09:30.302995+08:00",
"end_time": "2026-02-16T02:11:45.226394+08:00",
"duration_sec": 134.924,
"status": "SUCCESS",
"counts": {
"fetched": 11725,
"inserted": 39,
"updated": 0,
"skipped": 11686,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_MEMBER_CARD",
"layer": "ODS",
"start_time": "2026-02-16T02:11:45.228393+08:00",
"end_time": "2026-02-16T02:11:58.353257+08:00",
"duration_sec": 13.125,
"status": "SUCCESS",
"counts": {
"fetched": 4730,
"inserted": 19,
"updated": 0,
"skipped": 4711,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_PAYMENT",
"layer": "ODS",
"start_time": "2026-02-16T02:11:58.354979+08:00",
"end_time": "2026-02-16T02:13:57.385086+08:00",
"duration_sec": 119.03,
"status": "SUCCESS",
"counts": {
"fetched": 56795,
"inserted": 325,
"updated": 0,
"skipped": 56470,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_PLATFORM_COUPON",
"layer": "ODS",
"start_time": "2026-02-16T02:13:57.387334+08:00",
"end_time": "2026-02-16T02:17:35.403217+08:00",
"duration_sec": 218.016,
"status": "SUCCESS",
"counts": {
"fetched": 91555,
"inserted": 242,
"updated": 0,
"skipped": 91313,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_RECHARGE_SETTLE",
"layer": "ODS",
"start_time": "2026-02-16T02:17:35.405051+08:00",
"end_time": "2026-02-16T02:17:37.892719+08:00",
"duration_sec": 2.488,
"status": "SUCCESS",
"counts": {
"fetched": 90,
"inserted": 0,
"updated": 0,
"skipped": 90,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_REFUND",
"layer": "ODS",
"start_time": "2026-02-16T02:17:37.894298+08:00",
"end_time": "2026-02-16T02:17:40.855120+08:00",
"duration_sec": 2.961,
"status": "SUCCESS",
"counts": {
"fetched": 180,
"inserted": 1,
"updated": 0,
"skipped": 179,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_SETTLEMENT_RECORDS",
"layer": "ODS",
"start_time": "2026-02-16T02:17:40.857110+08:00",
"end_time": "2026-02-16T02:18:10.883995+08:00",
"duration_sec": 30.027,
"status": "SUCCESS",
"counts": {
"fetched": 4917,
"inserted": 320,
"updated": 303,
"skipped": 4294,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_SETTLEMENT_TICKET",
"layer": "ODS",
"start_time": "2026-02-16T02:18:10.885741+08:00",
"end_time": "2026-02-16T02:18:20.540209+08:00",
"duration_sec": 9.654,
"status": "SUCCESS",
"counts": {
"fetched": 0,
"inserted": 0,
"updated": 0,
"skipped": 0,
"errors": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_STORE_GOODS",
"layer": "ODS",
"start_time": "2026-02-16T02:18:20.541830+08:00",
"end_time": "2026-02-16T02:18:23.785491+08:00",
"duration_sec": 3.244,
"status": "SUCCESS",
"counts": {
"fetched": 865,
"inserted": 173,
"updated": 0,
"skipped": 692,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_STORE_GOODS_SALES",
"layer": "ODS",
"start_time": "2026-02-16T02:18:23.787736+08:00",
"end_time": "2026-02-16T02:18:25.142325+08:00",
"duration_sec": 1.355,
"status": "SUCCESS",
"counts": {
"fetched": 0,
"inserted": 0,
"updated": 0,
"skipped": 0,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TABLES",
"layer": "ODS",
"start_time": "2026-02-16T02:18:25.144553+08:00",
"end_time": "2026-02-16T02:18:28.900234+08:00",
"duration_sec": 3.756,
"status": "SUCCESS",
"counts": {
"fetched": 370,
"inserted": 51,
"updated": 0,
"skipped": 319,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TABLE_FEE_DISCOUNT",
"layer": "ODS",
"start_time": "2026-02-16T02:18:28.901962+08:00",
"end_time": "2026-02-16T02:19:04.877900+08:00",
"duration_sec": 35.976,
"status": "SUCCESS",
"counts": {
"fetched": 8680,
"inserted": 39,
"updated": 0,
"skipped": 8641,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TABLE_USE",
"layer": "ODS",
"start_time": "2026-02-16T02:19:04.879677+08:00",
"end_time": "2026-02-16T02:20:43.722924+08:00",
"duration_sec": 98.844,
"status": "SUCCESS",
"counts": {
"fetched": 48545,
"inserted": 285,
"updated": 0,
"skipped": 48260,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "ODS_TENANT_GOODS",
"layer": "ODS",
"start_time": "2026-02-16T02:20:43.725501+08:00",
"end_time": "2026-02-16T02:20:46.964569+08:00",
"duration_sec": 3.239,
"status": "SUCCESS",
"counts": {
"fetched": 870,
"inserted": 3,
"updated": 0,
"skipped": 867,
"errors": 0,
"deleted": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
}
]
},
{
"layer": "DWD",
"start_time": "2026-02-16T02:20:46.967739+08:00",
"end_time": "2026-02-16T02:20:47.394765+08:00",
"duration_sec": 0.427,
"status": "SUCCESS",
"task_count": 1,
"success_count": 1,
"fail_count": 0,
"skip_count": 0,
"total_fetched": 0,
"total_inserted": 0,
"total_updated": 0,
"total_errors": 0,
"tasks": [
{
"task_code": "DWD_LOAD_FROM_ODS",
"layer": "DWD",
"start_time": "2026-02-16T02:20:46.968453+08:00",
"end_time": "2026-02-16T02:20:47.392262+08:00",
"duration_sec": 0.424,
"status": "SUCCESS",
"counts": {
"fetched": 0,
"inserted": 0,
"updated": 0,
"skipped": 0,
"errors": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
}
]
},
{
"layer": "DWS",
"start_time": "2026-02-16T02:20:47.397305+08:00",
"end_time": "2026-02-16T02:20:50.260818+08:00",
"duration_sec": 2.863,
"status": "SUCCESS",
"task_count": 15,
"success_count": 1,
"fail_count": 0,
"skip_count": 14,
"total_fetched": 0,
"total_inserted": 5117,
"total_updated": 0,
"total_errors": 0,
"tasks": [
{
"task_code": "DWS_ASSISTANT_CUSTOMER",
"layer": "DWS",
"start_time": "2026-02-16T02:20:47.478185+08:00",
"end_time": "2026-02-16T02:20:47.912593+08:00",
"duration_sec": 0.434,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:20:47.916343+08:00",
"end_time": "2026-02-16T02:20:48.058411+08:00",
"duration_sec": 0.142,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_FINANCE",
"layer": "DWS",
"start_time": "2026-02-16T02:20:48.060291+08:00",
"end_time": "2026-02-16T02:20:48.109598+08:00",
"duration_sec": 0.049,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_MONTHLY",
"layer": "DWS",
"start_time": "2026-02-16T02:20:48.111303+08:00",
"end_time": "2026-02-16T02:20:48.164240+08:00",
"duration_sec": 0.053,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_ASSISTANT_SALARY",
"layer": "DWS",
"start_time": "2026-02-16T02:20:48.165763+08:00",
"end_time": "2026-02-16T02:20:48.216816+08:00",
"duration_sec": 0.051,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_BUILD_ORDER_SUMMARY",
"layer": "DWS",
"start_time": "2026-02-16T02:20:48.218588+08:00",
"end_time": "2026-02-16T02:20:49.720095+08:00",
"duration_sec": 1.501,
"status": "SUCCESS",
"counts": {
"fetched": 0,
"inserted": 5117,
"updated": 0,
"skipped": 0,
"errors": 0
},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:20:49.721608+08:00",
"end_time": "2026-02-16T02:20:49.827953+08:00",
"duration_sec": 0.106,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_DISCOUNT_DETAIL",
"layer": "DWS",
"start_time": "2026-02-16T02:20:49.830310+08:00",
"end_time": "2026-02-16T02:20:49.882148+08:00",
"duration_sec": 0.052,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_INCOME_STRUCTURE",
"layer": "DWS",
"start_time": "2026-02-16T02:20:49.884147+08:00",
"end_time": "2026-02-16T02:20:49.937621+08:00",
"duration_sec": 0.053,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_FINANCE_RECHARGE",
"layer": "DWS",
"start_time": "2026-02-16T02:20:49.939594+08:00",
"end_time": "2026-02-16T02:20:49.990880+08:00",
"duration_sec": 0.051,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MEMBER_CONSUMPTION",
"layer": "DWS",
"start_time": "2026-02-16T02:20:49.993066+08:00",
"end_time": "2026-02-16T02:20:50.050887+08:00",
"duration_sec": 0.058,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MEMBER_VISIT",
"layer": "DWS",
"start_time": "2026-02-16T02:20:50.052695+08:00",
"end_time": "2026-02-16T02:20:50.102870+08:00",
"duration_sec": 0.05,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MV_REFRESH_ASSISTANT_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:20:50.104174+08:00",
"end_time": "2026-02-16T02:20:50.153937+08:00",
"duration_sec": 0.05,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_MV_REFRESH_FINANCE_DAILY",
"layer": "DWS",
"start_time": "2026-02-16T02:20:50.155430+08:00",
"end_time": "2026-02-16T02:20:50.205405+08:00",
"duration_sec": 0.05,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_RETENTION_CLEANUP",
"layer": "DWS",
"start_time": "2026-02-16T02:20:50.207165+08:00",
"end_time": "2026-02-16T02:20:50.259470+08:00",
"duration_sec": 0.052,
"status": "SKIP",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
}
]
},
{
"layer": "INDEX",
"start_time": "2026-02-16T02:20:50.263599+08:00",
"end_time": "2026-02-16T02:21:00.007411+08:00",
"duration_sec": 9.744,
"status": "PARTIAL",
"task_count": 4,
"success_count": 2,
"fail_count": 2,
"skip_count": 0,
"total_fetched": 0,
"total_inserted": 0,
"total_updated": 0,
"total_errors": 0,
"tasks": [
{
"task_code": "DWS_ML_MANUAL_IMPORT",
"layer": "INDEX",
"start_time": "2026-02-16T02:20:50.264449+08:00",
"end_time": "2026-02-16T02:20:50.266491+08:00",
"duration_sec": 0.002,
"status": "ERROR",
"counts": {},
"error": "未找到 ML 台账文件,请通过环境变量 ML_MANUAL_LEDGER_FILE 或配置 run.ml_manual_ledger_file 指定",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_NEWCONV_INDEX",
"layer": "INDEX",
"start_time": "2026-02-16T02:20:50.316501+08:00",
"end_time": "2026-02-16T02:20:53.334095+08:00",
"duration_sec": 3.018,
"status": "SUCCESS",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_RELATION_INDEX",
"layer": "INDEX",
"start_time": "2026-02-16T02:20:53.335483+08:00",
"end_time": "2026-02-16T02:20:53.696046+08:00",
"duration_sec": 0.361,
"status": "ERROR",
"counts": {},
"error": "错误: 字段 d.is_delete 不存在\nLINE 13: AND COALESCE(d.is_delete, 0) = 0\n ^\nHINT: 也许您想要引用列\"s.is_delete\"。\n",
"api_calls": 0,
"api_total_sec": 0.0
},
{
"task_code": "DWS_WINBACK_INDEX",
"layer": "INDEX",
"start_time": "2026-02-16T02:20:53.749149+08:00",
"end_time": "2026-02-16T02:21:00.005571+08:00",
"duration_sec": 6.256,
"status": "SUCCESS",
"counts": {},
"error": null,
"api_calls": 0,
"api_total_sec": 0.0
}
]
}
],
"verification": {},
"environment": {
"store_id": 2790685415443269,
"db_name": "",
"api_base_url": "https://pc.ficoo.vip/apiprod/admin/v1/",
"timezone": "Asia/Shanghai"
}
}

View File

@@ -0,0 +1,704 @@
# -*- coding: utf-8 -*-
"""全量刷新脚本:执行 2026-01-01 ~ 2026-02-16 的 api_full Flow。
按层逐步执行ODS → DWD → DWS → INDEX内嵌精细性能计时
支持断点续跑(从指定层/任务重试),完成后执行 increment_verify 校验,
校验不一致时自动补齐。计时数据和执行统计写入 JSON 中间文件。
用法:
cd apps/etl/connectors/feiqiu
python -m scripts.debug.run_full_refresh [--resume-layer DWS] [--resume-task DWS_FINANCE_DAILY]
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
import time
import traceback
import uuid
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
# ── Ensure the project root is on sys.path ──
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
sys.path.insert(0, str(_FEIQIU_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
from api.client import APIClient
from orchestration.task_registry import default_registry
from orchestration.cursor_manager import CursorManager
from orchestration.run_tracker import RunTracker
from orchestration.task_executor import TaskExecutor
from orchestration.flow_runner import FlowRunner
# ── Constants ─────────────────────────────────────────────────
FLOW_NAME = "api_full"
LAYERS = FlowRunner.FLOW_LAYERS[FLOW_NAME] # ["ODS", "DWD", "DWS", "INDEX"]
WINDOW_START_STR = "2026-01-01T00:00:00"
WINDOW_END_STR = "2026-02-16T00:00:00"
# ── Data structures ──────────────────────────────────────────
@dataclass
class TaskTiming:
"""单个任务的计时与执行统计"""
task_code: str
layer: str
start_time: str = ""
end_time: str = ""
duration_sec: float = 0.0
status: str = "" # SUCCESS / FAIL / ERROR / SKIP
counts: dict = field(default_factory=dict)
error: str | None = None
api_calls: int = 0
api_total_sec: float = 0.0
@dataclass
class LayerTiming:
"""单层的计时与汇总"""
layer: str
start_time: str = ""
end_time: str = ""
duration_sec: float = 0.0
status: str = "" # SUCCESS / PARTIAL / ERROR
task_count: int = 0
success_count: int = 0
fail_count: int = 0
skip_count: int = 0
total_fetched: int = 0
total_inserted: int = 0
total_updated: int = 0
total_errors: int = 0
tasks: list[TaskTiming] = field(default_factory=list)
@dataclass
class RefreshReport:
"""全量刷新的完整执行报告"""
flow: str = FLOW_NAME
window_start: str = WINDOW_START_STR
window_end: str = WINDOW_END_STR
overall_start: str = ""
overall_end: str = ""
overall_duration_sec: float = 0.0
overall_status: str = ""
layers: list[LayerTiming] = field(default_factory=list)
verification: dict = field(default_factory=dict)
environment: dict = field(default_factory=dict)
# ── Helpers ──────────────────────────────────────────────────
def _setup_logging() -> logging.Logger:
logger = logging.getLogger("full_refresh")
logger.setLevel(logging.INFO)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
))
logger.addHandler(handler)
return logger
def _now_iso(tz: ZoneInfo) -> str:
return datetime.now(tz).isoformat()
def _build_components(config: AppConfig, logger: logging.Logger):
"""构建 DB / API / TaskExecutor / FlowRunner 等组件。"""
db_conn = DatabaseConnection(
dsn=config["db"]["dsn"],
session=config["db"].get("session"),
connect_timeout=config["db"].get("connect_timeout_sec"),
)
api_client = APIClient(
base_url=config["api"]["base_url"],
token=config["api"]["token"],
timeout=config["api"].get("timeout_sec", 20),
retry_max=config["api"].get("retries", {}).get("max_attempts", 3),
headers_extra=config["api"].get("headers_extra"),
)
db_ops = DatabaseOperations(db_conn)
cursor_mgr = CursorManager(db_conn)
run_tracker = RunTracker(db_conn)
executor = TaskExecutor(
config, db_ops, api_client,
cursor_mgr, run_tracker, default_registry, logger,
)
runner = FlowRunner(
config, executor, default_registry,
db_conn, api_client, logger,
)
return db_conn, api_client, db_ops, executor, runner
def _resolve_layer_tasks(layer: str, config: AppConfig) -> list[str]:
"""解析单层的任务列表,与 FlowRunner._resolve_tasks 逻辑一致。"""
layer_upper = layer.upper()
if layer_upper == "ODS":
ods_tasks = config.get("run.ods_tasks", [])
if ods_tasks:
return list(ods_tasks)
registry_tasks = default_registry.get_tasks_by_layer("ODS")
return sorted(registry_tasks) if registry_tasks else []
elif layer_upper == "DWD":
return ["DWD_LOAD_FROM_ODS"]
elif layer_upper == "DWS":
dws_tasks = config.get("run.dws_tasks", [])
if dws_tasks:
return list(dws_tasks)
registry_tasks = default_registry.get_tasks_by_layer("DWS")
return sorted(registry_tasks) if registry_tasks else []
elif layer_upper == "INDEX":
index_tasks = config.get("run.index_tasks", [])
if index_tasks:
return list(index_tasks)
registry_tasks = default_registry.get_tasks_by_layer("INDEX")
return sorted(registry_tasks) if registry_tasks else []
return []
def _sanitize_for_json(obj):
"""递归处理不可序列化的值。"""
if isinstance(obj, dict):
return {k: _sanitize_for_json(v) for k, v in obj.items()}
if isinstance(obj, (list, tuple)):
return [_sanitize_for_json(v) for v in obj]
if isinstance(obj, datetime):
return obj.isoformat()
return obj
def _save_json(data, path: Path):
"""将数据序列化为 JSON 文件。"""
path.parent.mkdir(parents=True, exist_ok=True)
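    # default=str below is a final safety net for anything _sanitize_for_json missed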
path.write_text(
json.dumps(_sanitize_for_json(data), ensure_ascii=False, indent=2, default=str),
encoding="utf-8",
)
def _load_checkpoint(path: Path) -> dict | None:
"""加载断点续跑的检查点文件。"""
if path.exists():
try:
return json.loads(path.read_text(encoding="utf-8"))
except Exception:
return None
return None
def _save_checkpoint(path: Path, data: dict):
"""保存断点续跑的检查点。"""
_save_json(data, path)
# ── Single-task execution (fine-grained timing) ──────────────
def _execute_task_with_timing(
task_code: str,
layer: str,
executor: TaskExecutor,
config: AppConfig,
db_conn: DatabaseConnection,
logger: logging.Logger,
tz: ZoneInfo,
) -> TaskTiming:
"""执行单个任务并记录精细计时。"""
timing = TaskTiming(task_code=task_code, layer=layer)
store_id = int(config.get("app.store_id"))
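    # A unique UUID suffix per task makes the run-tracker rows written by this
    # debug script easy to tell apart from regularly scheduled runs.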
run_uuid = f"full-refresh-{task_code.lower()}-{uuid.uuid4().hex[:8]}"
timing.start_time = _now_iso(tz)
t0 = time.monotonic()
try:
task_result = executor.run_single_task(
task_code=task_code,
run_uuid=run_uuid,
store_id=store_id,
data_source="online",
)
timing.duration_sec = round(time.monotonic() - t0, 3)
timing.end_time = _now_iso(tz)
        # Parse the result
raw_status = (task_result.get("status") or "").upper()
counts = task_result.get("counts") or {}
timing.counts = counts
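        # An empty status from the executor is normalized to "COMPLETE" below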
timing.status = raw_status if raw_status else "COMPLETE"
        # Extract API call stats, if the task result includes them
api_stats = task_result.get("api_stats") or {}
timing.api_calls = api_stats.get("calls", 0)
timing.api_total_sec = api_stats.get("total_sec", 0.0)
logger.info(
"%s: %s (%.1fs) fetched=%s inserted=%s updated=%s errors=%s",
task_code, timing.status, timing.duration_sec,
counts.get("fetched", 0), counts.get("inserted", 0),
counts.get("updated", 0), counts.get("errors", 0),
)
except Exception as exc:
timing.duration_sec = round(time.monotonic() - t0, 3)
timing.end_time = _now_iso(tz)
timing.status = "ERROR"
timing.error = str(exc)
logger.error("%s: 异常 (%.1fs): %s", task_code, timing.duration_sec, exc)
# CHANGE 2026-02-16 | 任务异常后 rollback防止 InFailedSqlTransaction 级联
try:
db_conn.rollback()
except Exception:
pass
return timing
# ── Layer execution (checkpoint resume) ──────────────────────
def _execute_layer(
layer: str,
config: AppConfig,
executor: TaskExecutor,
db_conn: DatabaseConnection,
logger: logging.Logger,
tz: ZoneInfo,
resume_task: str | None = None,
checkpoint_path: Path | None = None,
) -> LayerTiming:
"""执行单层所有任务,支持从指定任务恢复。"""
layer_timing = LayerTiming(layer=layer)
layer_timing.start_time = _now_iso(tz)
layer_t0 = time.monotonic()
tasks = _resolve_layer_tasks(layer, config)
layer_timing.task_count = len(tasks)
logger.info("" * 70)
logger.info("▶ 层 %s: %d 个任务", layer, len(tasks))
if tasks:
logger.info(" 任务列表: %s", ", ".join(tasks))
    # Checkpoint resume: skip every task before resume_task
skip_until_found = False
if resume_task:
resume_upper = resume_task.upper()
if resume_upper in [t.upper() for t in tasks]:
skip_until_found = True
logger.info(" 断点续跑: 从 %s 开始", resume_upper)
else:
logger.warning(" 断点续跑: %s 不在本层任务列表中,执行全部", resume_upper)
for idx, task_code in enumerate(tasks, start=1):
        # Checkpoint-resume logic
if skip_until_found:
if task_code.upper() == resume_upper:
skip_until_found = False
logger.info(" [%d/%d] ▶ 恢复执行: %s", idx, len(tasks), task_code)
else:
logger.info(" [%d/%d] ⏭ 跳过: %s (断点续跑)", idx, len(tasks), task_code)
skipped = TaskTiming(
task_code=task_code, layer=layer, status="SKIPPED_RESUME",
)
layer_timing.tasks.append(skipped)
layer_timing.skip_count += 1
continue
else:
logger.info(" [%d/%d] %s", idx, len(tasks), task_code)
timing = _execute_task_with_timing(
task_code, layer, executor, config, db_conn, logger, tz,
)
layer_timing.tasks.append(timing)
        # Tally
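        # NOTE: "成功" is kept in the tuple defensively, in case an upstream
        # component still reports its status as the legacy Chinese literal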
if timing.status in ("SUCCESS", "成功", "COMPLETE", "PARTIAL"):
layer_timing.success_count += 1
elif timing.status == "ERROR":
layer_timing.fail_count += 1
elif timing.status == "SKIP":
layer_timing.skip_count += 1
else:
            layer_timing.success_count += 1  # treat an unknown status as success
counts = timing.counts
layer_timing.total_fetched += counts.get("fetched", 0)
layer_timing.total_inserted += counts.get("inserted", 0)
layer_timing.total_updated += counts.get("updated", 0)
layer_timing.total_errors += counts.get("errors", 0)
        # Save the checkpoint after every task
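        # The checkpoint records the most recent task that finished executing;
        # pass it back via --resume-layer/--resume-task to continue an
        # interrupted run.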
if checkpoint_path:
_save_checkpoint(checkpoint_path, {
"last_completed_layer": layer,
"last_completed_task": task_code,
"timestamp": _now_iso(tz),
})
        # Make sure the connection is still usable before the next task
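        # (ensure_open is assumed to reconnect if a failed task left the
        # connection closed or in an aborted state)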
db_conn.ensure_open()
layer_timing.duration_sec = round(time.monotonic() - layer_t0, 3)
layer_timing.end_time = _now_iso(tz)
    # Decide the layer status
if layer_timing.fail_count == 0:
layer_timing.status = "SUCCESS"
elif layer_timing.success_count > 0:
layer_timing.status = "PARTIAL"
else:
layer_timing.status = "ERROR"
logger.info(
"%s 完成: %s (%.1fs) 成功=%d 失败=%d 跳过=%d",
layer, layer_timing.status, layer_timing.duration_sec,
layer_timing.success_count, layer_timing.fail_count, layer_timing.skip_count,
)
logger.info(
" 汇总: fetched=%d inserted=%d updated=%d errors=%d",
layer_timing.total_fetched, layer_timing.total_inserted,
layer_timing.total_updated, layer_timing.total_errors,
)
return layer_timing
# ── Verification phase ───────────────────────────────────────
def _run_verification(
runner: FlowRunner,
config: AppConfig,
window_start: datetime,
window_end: datetime,
logger: logging.Logger,
tz: ZoneInfo,
) -> dict:
"""执行 increment_verify 校验,发现不一致时自动补齐。"""
logger.info("")
logger.info("=" * 70)
logger.info("▶ 开始 increment_verify 校验")
logger.info("=" * 70)
verify_start = _now_iso(tz)
t0 = time.monotonic()
try:
        # Use FlowRunner's internal verification helper
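        # (this reaches into a private helper; if FlowRunner's internals
        # change, this call has to be updated as well)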
verification_summary = runner._run_verification(
layers=LAYERS,
window_start=window_start,
window_end=window_end,
window_split="month",
)
duration = round(time.monotonic() - t0, 3)
verify_end = _now_iso(tz)
result = {
"status": verification_summary.get("status", "UNKNOWN"),
"start_time": verify_start,
"end_time": verify_end,
"duration_sec": duration,
"total_tables": verification_summary.get("total_tables", 0),
"consistent_tables": verification_summary.get("consistent_tables", 0),
"total_backfilled": verification_summary.get("total_backfilled", 0),
"error_tables": verification_summary.get("error_tables", 0),
"layers": verification_summary.get("layers", {}),
}
logger.info(
" 校验完成: %s (%.1fs) 表数=%d 一致=%d 补齐=%d 错误=%d",
result["status"], duration,
result["total_tables"], result["consistent_tables"],
result["total_backfilled"], result["error_tables"],
)
        # If anything was backfilled, record the detail
        if result["total_backfilled"] > 0:
            logger.info("  auto-backfilled %d inconsistencies", result["total_backfilled"])
return result
except Exception as exc:
duration = round(time.monotonic() - t0, 3)
logger.error(" ✗ 校验异常 (%.1fs): %s", duration, exc)
return {
"status": "ERROR",
"start_time": verify_start,
"end_time": _now_iso(tz),
"duration_sec": duration,
"error": str(exc),
"traceback": traceback.format_exc(),
}
# ── Main flow ────────────────────────────────────────────────
def run_full_refresh(
resume_layer: str | None = None,
resume_task: str | None = None,
skip_verify: bool = False,
) -> RefreshReport:
"""执行全量刷新。
Args:
resume_layer: 从指定层开始执行(断点续跑),如 "DWS"
resume_task: 在恢复层中从指定任务开始(断点续跑),如 "DWS_FINANCE_DAILY"
skip_verify: 跳过校验阶段(调试用)
Returns:
RefreshReport 完整执行报告
"""
logger = _setup_logging()
logger.info("=" * 70)
logger.info("全量刷新开始")
logger.info("Flow: %s | 窗口: %s ~ %s", FLOW_NAME, WINDOW_START_STR, WINDOW_END_STR)
logger.info("=" * 70)
    # Load configuration
config = AppConfig.load()
tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
window_start = datetime.fromisoformat(WINDOW_START_STR).replace(tzinfo=tz)
window_end = datetime.fromisoformat(WINDOW_END_STR).replace(tzinfo=tz)
report = RefreshReport()
report.overall_start = _now_iso(tz)
report.environment = {
"store_id": config.get("app.store_id"),
"db_name": config.get("db.name", ""),
"api_base_url": config.get("api.base_url", ""),
"timezone": str(tz),
}
logger.info("门店 ID: %s", config.get("app.store_id"))
logger.info("数据库: %s", config.get("db.name", ""))
logger.info("API: %s", config.get("api.base_url", ""))
    # Set window_override so every task uses the same full-refresh window
config.config.setdefault("run", {}).setdefault("window_override", {})
config.config["run"]["window_override"]["start"] = window_start
config.config["run"]["window_override"]["end"] = window_end
    # Build components
db_conn, api_client, db_ops, executor, runner = _build_components(config, logger)
    # Output directory and checkpoint
output_dir = _FEIQIU_ROOT / "scripts" / "debug" / "output"
output_dir.mkdir(parents=True, exist_ok=True)
checkpoint_path = output_dir / "full_refresh_checkpoint.json"
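    # The checkpoint lives alongside the JSON run reports under scripts/debug/output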
overall_t0 = time.monotonic()
    # Decide which layer to start from
layers_to_run = list(LAYERS)
if resume_layer:
resume_layer_upper = resume_layer.upper()
layer_names_upper = [l.upper() for l in layers_to_run]
if resume_layer_upper in layer_names_upper:
start_idx = layer_names_upper.index(resume_layer_upper)
skipped_layers = layers_to_run[:start_idx]
layers_to_run = layers_to_run[start_idx:]
if skipped_layers:
logger.info("断点续跑: 跳过层 %s,从 %s 开始", skipped_layers, resume_layer_upper)
else:
logger.warning("断点续跑: 层 %s 不在 Flow 定义中,执行全部", resume_layer_upper)
    # Run layer by layer
for layer_idx, layer in enumerate(layers_to_run):
        # Only the first resumed layer honours resume_task
current_resume_task = resume_task if (layer_idx == 0 and resume_layer) else None
layer_timing = _execute_layer(
layer=layer,
config=config,
executor=executor,
db_conn=db_conn,
logger=logger,
tz=tz,
resume_task=current_resume_task,
checkpoint_path=checkpoint_path,
)
report.layers.append(layer_timing)
        # Check the outcome after each layer
if layer_timing.status == "ERROR":
logger.warning("")
logger.warning("⚠ 层 %s 全部失败,后续层可能受影响", layer)
logger.warning(" 可使用 --resume-layer %s 从此层重试", layer)
        # Save intermediate results after each layer (a crash loses nothing)
_save_intermediate_report(report, output_dir, tz)
    # Verification phase
if not skip_verify:
report.verification = _run_verification(
runner, config, window_start, window_end, logger, tz,
)
else:
logger.info("")
logger.info("⏭ 跳过校验阶段 (--skip-verify)")
report.verification = {"status": "SKIPPED"}
    # Roll up
report.overall_duration_sec = round(time.monotonic() - overall_t0, 3)
report.overall_end = _now_iso(tz)
    all_success = all(lt.status == "SUCCESS" for lt in report.layers)
    report.overall_status = "SUCCESS" if all_success else "PARTIAL"
    # Print the summary
_print_summary(report, logger)
    # Save the final report
ts = datetime.now(tz).strftime("%Y%m%d_%H%M%S")
final_path = output_dir / f"full_refresh_{ts}.json"
_save_json(asdict(report), final_path)
logger.info("计时数据已保存: %s", final_path)
    # Clean up the checkpoint
if checkpoint_path.exists() and report.overall_status == "SUCCESS":
checkpoint_path.unlink()
logger.info("检查点已清理")
    # Close the connection
db_conn.close()
return report
def _save_intermediate_report(report: RefreshReport, output_dir: Path, tz: ZoneInfo):
"""保存中间结果,防止中途崩溃丢失已采集的计时数据。"""
intermediate_path = output_dir / "full_refresh_intermediate.json"
_save_json(asdict(report), intermediate_path)
# ── Summary output ───────────────────────────────────────────
def _print_summary(report: RefreshReport, logger: logging.Logger):
"""打印全量刷新汇总。"""
logger.info("")
logger.info("=" * 70)
logger.info("全量刷新汇总")
logger.info("=" * 70)
logger.info("状态: %s | 总耗时: %.1fs", report.overall_status, report.overall_duration_sec)
logger.info("")
    # Per-layer stats
    logger.info("%-8s %-10s %8s %8s %8s %8s %10s", "layer", "status", "success", "failed", "skipped", "tasks", "time(s)")
logger.info("-" * 70)
for lt in report.layers:
logger.info(
"%-8s %-10s %8d %8d %8d %8d %10.1f",
lt.layer, lt.status, lt.success_count, lt.fail_count,
lt.skip_count, lt.task_count, lt.duration_sec,
)
    # Record totals
    logger.info("")
    logger.info("Record totals:")
total_fetched = sum(lt.total_fetched for lt in report.layers)
total_inserted = sum(lt.total_inserted for lt in report.layers)
total_updated = sum(lt.total_updated for lt in report.layers)
total_errors = sum(lt.total_errors for lt in report.layers)
logger.info(" fetched=%d inserted=%d updated=%d errors=%d",
total_fetched, total_inserted, total_updated, total_errors)
    # Five slowest tasks
all_tasks = []
for lt in report.layers:
all_tasks.extend(lt.tasks)
top5 = sorted(
[t for t in all_tasks if t.status not in ("SKIPPED_RESUME",)],
key=lambda t: t.duration_sec,
reverse=True,
)[:5]
if top5:
logger.info("")
logger.info("耗时 Top 5 任务:")
for t in top5:
logger.info(" %-30s %8.1fs [%s] %s", t.task_code, t.duration_sec, t.layer, t.status)
    # Failed tasks
failed = [t for t in all_tasks if t.status == "ERROR"]
if failed:
logger.info("")
logger.info("失败任务 (%d 个):", len(failed))
for t in failed:
logger.info("%s [%s]: %s", t.task_code, t.layer, t.error or "未知错误")
    # Verification result
if report.verification:
v = report.verification
logger.info("")
logger.info("校验结果: %s", v.get("status", "N/A"))
if v.get("total_tables"):
logger.info(
" 表数=%d 一致=%d 补齐=%d 错误=%d",
v.get("total_tables", 0), v.get("consistent_tables", 0),
v.get("total_backfilled", 0), v.get("error_tables", 0),
)
# ── CLI entry point ──────────────────────────────────────────
def parse_args():
parser = argparse.ArgumentParser(
description="全量刷新: 执行 2026-01-01 ~ 2026-02-16 的 api_full Flow",
)
parser.add_argument(
"--resume-layer", type=str, default=None,
help="断点续跑: 从指定层开始(如 DWS",
)
parser.add_argument(
"--resume-task", type=str, default=None,
help="断点续跑: 在恢复层中从指定任务开始(如 DWS_FINANCE_DAILY",
)
parser.add_argument(
"--skip-verify", action="store_true",
help="跳过校验阶段",
)
return parser.parse_args()
def main():
args = parse_args()
report = run_full_refresh(
resume_layer=args.resume_layer,
resume_task=args.resume_task,
skip_verify=args.skip_verify,
)
    # Exit code
if report.overall_status == "SUCCESS":
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()