# -*- coding: utf-8 -*-
"""ETL 架构分析脚本。

通过静态分析(AST 解析、import 扫描、文件统计)评估 ETL 代码结构,
生成架构优化报告(Markdown)。

分析维度:
1. 模块依赖关系 — 扫描 import,构建依赖图,识别循环依赖
2. 文件大小分析 — 统计行数,识别过大文件(>500 行)
3. 函数复杂度 — AST 分析圈复杂度(分支/嵌套深度)
4. 重复代码检测 — 比较函数签名和结构相似度
5. 耦合度评估 — 模块间导入关系密度
6. 任务分类分析 — 从 TaskRegistry 读取元数据,评估分类合理性

用法:
    cd apps/etl/connectors/feiqiu
    python -m scripts.debug.analyze_architecture
"""
from __future__ import annotations
|
||
|
||
import ast
|
||
import argparse
|
||
import logging
|
||
import os
|
||
import sys
|
||
from collections import Counter, defaultdict
|
||
from dataclasses import dataclass, field
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from typing import Iterator
|
||
|
||
# ── 确保项目根目录在 sys.path ──
|
||
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
|
||
if str(_FEIQIU_ROOT) not in sys.path:
|
||
sys.path.insert(0, str(_FEIQIU_ROOT))
|
||
|
||
# ── 分析范围:ETL 核心模块 ──
|
||
_CORE_MODULES = [
|
||
"api", "cli", "config", "database", "loaders", "models",
|
||
"orchestration", "quality", "scd", "tasks", "utils",
|
||
]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════
# Data structures
# ═══════════════════════════════════════════════════════════════

@dataclass
class FileInfo:
    """Per-file line statistics for a single .py source file."""

    path: Path              # absolute path on disk
    rel_path: str           # path relative to the analysis root, "/"-separated
    lines: int = 0          # total physical lines
    code_lines: int = 0     # non-blank, non-comment lines
    blank_lines: int = 0    # whitespace-only lines
    comment_lines: int = 0  # lines whose first non-space char is '#'
    module: str = ""        # owning top-level module (api/cli/...)
@dataclass
class FunctionInfo:
    """Static-analysis facts about one function or method."""

    name: str               # function name as written in source
    file: str               # rel_path of the defining file
    line: int               # 1-based line of the def statement
    complexity: int = 1     # McCabe cyclomatic complexity
    max_nesting: int = 0    # deepest nesting of control structures
    param_count: int = 0    # parameters, excluding self/cls for methods
    lines: int = 0          # lines spanned by the function body
    is_method: bool = False  # True when defined directly inside a class
    class_name: str = ""    # enclosing class name when is_method
@dataclass
class ImportEdge:
    """A module-level import relationship (importer -> imported)."""

    source_module: str  # module doing the importing
    target_module: str  # module being imported
    source_file: str    # rel_path of the importing file
    import_name: str    # imported dotted name(s)
@dataclass
class ArchitectureReport:
    """Aggregated results of the full architecture analysis."""

    generated_at: datetime = field(default_factory=datetime.now)
    # file statistics
    files: list[FileInfo] = field(default_factory=list)
    # function-level analysis
    functions: list[FunctionInfo] = field(default_factory=list)
    # dependency graph
    import_edges: list[ImportEdge] = field(default_factory=list)
    circular_deps: list[tuple[str, str]] = field(default_factory=list)
    # task-classification summary (empty when TaskRegistry is unavailable)
    task_classification: dict = field(default_factory=dict)
    # (label_a, label_b, similarity) pairs of likely-duplicated functions
    similar_functions: list[tuple[str, str, float]] = field(default_factory=list)
# ═══════════════════════════════════════════════════════════════
|
||
# 日志
|
||
# ═══════════════════════════════════════════════════════════════
|
||
|
||
def _setup_logging() -> logging.Logger:
|
||
logger = logging.getLogger("analyze_architecture")
|
||
logger.setLevel(logging.INFO)
|
||
if not logger.handlers:
|
||
handler = logging.StreamHandler(sys.stdout)
|
||
handler.setFormatter(logging.Formatter(
|
||
"%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
|
||
))
|
||
logger.addHandler(handler)
|
||
return logger
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════
|
||
# 1. 文件扫描与行数统计
|
||
# ═══════════════════════════════════════════════════════════════
|
||
|
||
def _iter_py_files(root: Path) -> Iterator[Path]:
|
||
"""递归遍历核心模块下的 .py 文件,跳过 __pycache__ / .hypothesis 等。"""
|
||
skip_dirs = {"__pycache__", ".hypothesis", ".pytest_cache", "export", "Asia"}
|
||
for dirpath, dirnames, filenames in os.walk(root):
|
||
dirnames[:] = [d for d in dirnames if d not in skip_dirs]
|
||
for fn in filenames:
|
||
if fn.endswith(".py"):
|
||
yield Path(dirpath) / fn
|
||
|
||
|
||
def _classify_module(rel_path: str) -> str:
    """Map a root-relative path to its owning module bucket.

    Returns the top-level package name for core modules, "scripts"/"tests"
    for those trees, and "root" for everything else (including bare files).
    """
    parts = Path(rel_path).parts
    if not parts:
        return "root"
    top = parts[0]
    if top in _CORE_MODULES:
        return top
    if top in ("scripts", "tests"):
        return top
    return "root"
def _count_lines(filepath: Path) -> FileInfo:
    """Count total / blank / comment / code lines for one file.

    Returns a zeroed FileInfo (rel_path left unset) when the file cannot
    be read at all; decoding errors are replaced, not raised.
    """
    info = FileInfo(path=filepath, rel_path="")
    try:
        text = filepath.read_text(encoding="utf-8", errors="replace")
    except Exception:  # best-effort: unreadable files keep zero counts
        return info
    raw = text.splitlines()
    info.lines = len(raw)
    for raw_line in raw:
        body = raw_line.strip()
        if not body:
            info.blank_lines += 1
        elif body.startswith("#"):
            info.comment_lines += 1
        else:
            info.code_lines += 1
    return info
def scan_files(root: Path, logger: logging.Logger) -> list[FileInfo]:
    """Scan every .py file under *root* and collect per-file line statistics."""
    collected: list[FileInfo] = []
    for path in _iter_py_files(root):
        info = _count_lines(path)
        info.path = path
        # Normalise Windows separators so rel_path is stable across OSes.
        info.rel_path = str(path.relative_to(root)).replace("\\", "/")
        info.module = _classify_module(info.rel_path)
        collected.append(info)
    logger.info("扫描完成:共 %d 个 .py 文件", len(collected))
    return collected
# ═══════════════════════════════════════════════════════════════
|
||
# 2. AST 分析:函数复杂度
|
||
# ═══════════════════════════════════════════════════════════════
|
||
|
||
# 增加圈复杂度的 AST 节点类型
|
||
_COMPLEXITY_NODES = (
|
||
ast.If, ast.For, ast.While, ast.ExceptHandler,
|
||
ast.With, ast.Assert, ast.BoolOp,
|
||
)
|
||
# 仅 comprehension 内的 if 子句
|
||
_COMP_NODES = (ast.ListComp, ast.SetComp, ast.DictComp, ast.GeneratorExp)
|
||
|
||
|
||
def _calc_complexity(node: ast.AST) -> int:
    """Return the McCabe cyclomatic complexity of *node*'s subtree."""
    score = 1  # one path through the straight-line code
    for sub in ast.walk(node):
        if isinstance(sub, _COMPLEXITY_NODES):
            score += 1
            # A BoolOp with n operands contributes n-1 decisions in total:
            # one was added above, so add the remaining n-2 when n > 2.
            if isinstance(sub, ast.BoolOp) and len(sub.values) > 2:
                score += len(sub.values) - 2
        elif isinstance(sub, _COMP_NODES):
            # Each `if` clause inside a comprehension is a decision point.
            for comp in sub.generators:
                score += len(comp.ifs)
    return score
def _calc_max_nesting(node: ast.AST, depth: int = 0) -> int:
|
||
"""计算最大嵌套深度。"""
|
||
nesting_types = (ast.If, ast.For, ast.While, ast.With, ast.Try, ast.ExceptHandler)
|
||
max_depth = depth
|
||
for child in ast.iter_child_nodes(node):
|
||
if isinstance(child, nesting_types):
|
||
child_depth = _calc_max_nesting(child, depth + 1)
|
||
max_depth = max(max_depth, child_depth)
|
||
else:
|
||
child_depth = _calc_max_nesting(child, depth)
|
||
max_depth = max(max_depth, child_depth)
|
||
return max_depth
|
||
|
||
|
||
def _func_body_lines(node: ast.FunctionDef | ast.AsyncFunctionDef) -> int:
|
||
"""计算函数体行数。"""
|
||
if not node.body:
|
||
return 0
|
||
first_line = node.body[0].lineno
|
||
last_line = node.body[-1].end_lineno or node.body[-1].lineno
|
||
return last_line - first_line + 1
|
||
|
||
|
||
def _walk_with_parent(tree: ast.AST):
|
||
"""遍历 AST 并记录每个节点的父节点(避免 O(n²) 嵌套 walk)。"""
|
||
# 先给所有节点标记 parent
|
||
for node in ast.walk(tree):
|
||
for child in ast.iter_child_nodes(node):
|
||
child._parent = node # type: ignore[attr-defined]
|
||
|
||
|
||
def analyze_functions(files: list[FileInfo], logger: logging.Logger) -> list[FunctionInfo]:
    """Parse every file and extract complexity metrics per function/method.

    Files that fail to parse (syntax/decoding errors) are skipped silently.
    """
    found: list[FunctionInfo] = []
    for fi in files:
        try:
            source = fi.path.read_text(encoding="utf-8", errors="replace")
            tree = ast.parse(source, filename=fi.rel_path)
        except (SyntaxError, UnicodeDecodeError):
            continue  # unparsable file: nothing to analyse

        _walk_with_parent(tree)

        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            # A function whose direct parent is a ClassDef is a method.
            parent = getattr(node, "_parent", None)
            in_class = isinstance(parent, ast.ClassDef)
            owner = parent.name if in_class else ""

            arg_total = len(node.args.args)
            if in_class and arg_total > 0:
                arg_total -= 1  # drop the implicit self/cls slot

            found.append(FunctionInfo(
                name=node.name,
                file=fi.rel_path,
                line=node.lineno,
                complexity=_calc_complexity(node),
                max_nesting=_calc_max_nesting(node),
                param_count=arg_total,
                lines=_func_body_lines(node),
                is_method=in_class,
                class_name=owner,
            ))

    logger.info("函数分析完成:共 %d 个函数/方法", len(found))
    return found
# ═══════════════════════════════════════════════════════════════
# 3. Dependency analysis
# ═══════════════════════════════════════════════════════════════

def _extract_imports(filepath: Path, rel_path: str) -> list[ImportEdge]:
    """Collect module-level import edges declared in one file.

    Only imports that resolve to a core module different from the file's
    own module are recorded; stdlib/third-party imports are ignored.
    Unparsable files yield no edges.
    """
    edges: list[ImportEdge] = []
    try:
        source = filepath.read_text(encoding="utf-8", errors="replace")
        tree = ast.parse(source, filename=rel_path)
    except (SyntaxError, UnicodeDecodeError):
        return edges

    source_module = _classify_module(rel_path)

    def _add(target: str, label: str) -> None:
        # Shared constructor for both import statement forms.
        edges.append(ImportEdge(
            source_module=source_module,
            target_module=target,
            source_file=rel_path,
            import_name=label,
        ))

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                target = _resolve_import_module(alias.name)
                if target and target != source_module:
                    _add(target, alias.name)
        elif isinstance(node, ast.ImportFrom) and node.module:
            target = _resolve_import_module(node.module)
            if target and target != source_module:
                names = ", ".join(a.name for a in (node.names or []))
                _add(target, f"{node.module}.{{{names}}}")
    return edges
def _resolve_import_module(import_path: str) -> str | None:
    """Return the core module owning *import_path*, or None when external."""
    top = import_path.split(".", 1)[0]
    return top if top in _CORE_MODULES else None
def analyze_dependencies(files: list[FileInfo], logger: logging.Logger) -> tuple[list[ImportEdge], list[tuple[str, str]]]:
    """Build the cross-module import edge list and find mutual dependencies."""
    all_edges: list[ImportEdge] = []
    for fi in files:
        all_edges.extend(_extract_imports(fi.path, fi.rel_path))

    # Directed adjacency: source module -> set of imported modules.
    graph: dict[str, set[str]] = defaultdict(set)
    for edge in all_edges:
        graph[edge.source_module].add(edge.target_module)

    # Two modules importing each other form a (sorted, deduplicated) pair.
    circular: list[tuple[str, str]] = []
    for src, targets in graph.items():
        for tgt in targets:
            if src in graph.get(tgt, set()):
                pair = tuple(sorted([src, tgt]))
                if pair not in circular:
                    circular.append(pair)

    logger.info("依赖分析完成:%d 条导入边,%d 对循环依赖", len(all_edges), len(circular))
    return all_edges, circular
# ═══════════════════════════════════════════════════════════════
|
||
# 4. 重复代码检测(基于函数签名相似度)
|
||
# ═══════════════════════════════════════════════════════════════
|
||
|
||
def _func_signature_key(fn: FunctionInfo) -> str:
|
||
"""生成函数签名指纹:参数数量 + 行数范围 + 复杂度。"""
|
||
line_bucket = fn.lines // 10 * 10 # 按 10 行分桶
|
||
return f"p{fn.param_count}_l{line_bucket}_c{fn.complexity}"
|
||
|
||
|
||
def detect_similar_functions(
    functions: list[FunctionInfo],
    logger: logging.Logger,
    min_lines: int = 15,
) -> list[tuple[str, str, float]]:
    """Find cross-file function pairs with matching signature fingerprints.

    Functions shorter than *min_lines* are ignored to reduce noise.
    Returns (label_a, label_b, similarity) tuples; similarity is derived
    from the relative difference in body length.
    """
    # Group candidate functions by fingerprint.
    buckets: dict[str, list[FunctionInfo]] = defaultdict(list)
    for fn in functions:
        if fn.lines >= min_lines:
            buckets[_func_signature_key(fn)].append(fn)

    def _label(fn: FunctionInfo) -> str:
        qualname = f"{fn.class_name}.{fn.name}" if fn.class_name else fn.name
        return f"{fn.file}:{qualname}"

    similar: list[tuple[str, str, float]] = []
    for group in buckets.values():
        if len(group) < 2:
            continue
        # Pair up every two functions sharing a bucket.
        for i, a in enumerate(group):
            for b in group[i + 1:]:
                if a.file == b.file:
                    continue  # same-file overloads/variants are expected
                # Similarity: the closer the body lengths, the higher the score.
                ratio = 1 - abs(a.lines - b.lines) / max(a.lines, b.lines)
                if ratio >= 0.7:
                    similar.append((_label(a), _label(b), round(ratio, 2)))

    logger.info("重复检测完成:%d 对相似函数", len(similar))
    return similar
# ═══════════════════════════════════════════════════════════════
# 5. Task classification analysis
# ═══════════════════════════════════════════════════════════════

def analyze_task_classification(logger: logging.Logger) -> dict:
    """Summarise TaskRegistry metadata and flag naming/classification issues.

    Returns an empty dict when the registry cannot be imported (e.g. when
    running outside the project environment).
    """
    try:
        from orchestration.task_registry import default_registry, TaskMeta
    except ImportError:
        logger.warning("无法导入 TaskRegistry,跳过任务分类分析")
        return {}

    all_codes = default_registry.get_all_task_codes()
    by_layer: dict[str, list[str]] = defaultdict(list)
    by_type: dict[str, list[str]] = defaultdict(list)
    anomalies: list[str] = []

    for code in all_codes:
        meta: TaskMeta | None = default_registry.get_metadata(code)
        if not meta:
            continue
        layer = meta.layer or "NONE"
        by_layer[layer].append(code)
        by_type[meta.task_type].append(code)

        # Flag prefix/classification mismatches.
        if code.startswith("DWS_") and layer not in ("DWS", "INDEX"):
            anomalies.append(f"{code}: 前缀 DWS_ 但分类为 {layer}")
        if code.startswith("ODS_") and layer != "ODS":
            anomalies.append(f"{code}: 前缀 ODS_ 但分类为 {layer}")
        if code.startswith("DWD_") and layer != "DWD":
            anomalies.append(f"{code}: 前缀 DWD_ 但分类为 {layer}")

        # INDEX-layer tasks not using the DWS_ prefix are also suspect.
        if layer == "INDEX" and not code.startswith("DWS_"):
            anomalies.append(f"{code}: INDEX 层但不以 DWS_ 开头,可能造成混淆")

    # Flag the systematic DWS_-prefix naming of the whole INDEX layer.
    index_tasks = by_layer.get("INDEX", [])
    if index_tasks and all(c.startswith("DWS_") for c in index_tasks):
        anomalies.append(
            f"INDEX 层全部 {len(index_tasks)} 个任务以 DWS_ 开头,"
            "建议改为 IDX_ 前缀以区分 DWS 汇总任务"
        )

    summary = {
        "total": len(all_codes),
        "by_layer": {k: {"count": len(v), "tasks": sorted(v)} for k, v in sorted(by_layer.items())},
        "by_type": {k: {"count": len(v), "tasks": sorted(v)} for k, v in sorted(by_type.items())},
        "anomalies": anomalies,
    }
    logger.info("任务分类分析完成:共 %d 个任务,%d 个异常", len(all_codes), len(anomalies))
    return summary
# ═══════════════════════════════════════════════════════════════
# 6. Coupling evaluation
# ═══════════════════════════════════════════════════════════════

def evaluate_coupling(
    edges: list[ImportEdge],
    files: list[FileInfo],
) -> dict:
    """Compute afferent/efferent coupling and instability per core module.

    Metrics (Martin-style):
    - Ca (afferent): how many modules depend on this one.
    - Ce (efferent): how many modules this one depends on.
    - Instability I = Ce / (Ca + Ce); closer to 1 means more unstable.
    """
    # Restrict to core modules that actually contain scanned files.
    present = {m for m in _CORE_MODULES if any(f.module == m for f in files)}

    ca: Counter = Counter()  # incoming dependencies
    ce: Counter = Counter()  # outgoing dependencies

    # Each distinct (source, target) module pair counts exactly once.
    counted = set()
    for edge in edges:
        pair = (edge.source_module, edge.target_module)
        if pair in counted:
            continue
        counted.add(pair)
        if edge.source_module in present:
            ce[edge.source_module] += 1
        if edge.target_module in present:
            ca[edge.target_module] += 1

    coupling: dict[str, dict] = {}
    for mod in sorted(present):
        ca_val = ca.get(mod, 0)
        ce_val = ce.get(mod, 0)
        total = ca_val + ce_val
        coupling[mod] = {
            "afferent_coupling": ca_val,
            "efferent_coupling": ce_val,
            "instability": round(ce_val / total, 2) if total > 0 else 0.0,
        }
    return coupling
# ═══════════════════════════════════════════════════════════════
# 7. Markdown report generation
# ═══════════════════════════════════════════════════════════════

def generate_report(report: ArchitectureReport, coupling: dict) -> str:
    """Render the full architecture report as a Markdown string.

    Sections: overview, per-module size, large files, function complexity,
    dependency matrix + cycles, coupling metrics, duplicate candidates,
    task classification, and generated suggestions.
    """
    lines: list[str] = []
    _a = lines.append  # shorthand: every _a(...) emits one Markdown line

    _a(f"# ETL 架构分析报告")
    _a(f"")
    _a(f"> 生成时间:{report.generated_at.strftime('%Y-%m-%d %H:%M:%S')}")
    _a(f"> 分析范围:`apps/etl/connectors/feiqiu/` 核心模块")
    _a("")

    # ── Overview ──
    total_files = len(report.files)
    total_lines = sum(f.lines for f in report.files)
    total_code = sum(f.code_lines for f in report.files)
    _a("## 1. 概览")
    _a("")
    _a(f"| 指标 | 值 |")
    _a(f"|------|-----|")
    _a(f"| Python 文件数 | {total_files} |")
    _a(f"| 总行数 | {total_lines:,} |")
    _a(f"| 代码行数 | {total_code:,} |")
    _a(f"| 函数/方法数 | {len(report.functions):,} |")
    _a(f"| 注册任务数 | {report.task_classification.get('total', 'N/A')} |")
    _a(f"| 循环依赖数 | {len(report.circular_deps)} |")
    _a(f"| 相似函数对数 | {len(report.similar_functions)} |")
    _a("")

    # ── Module size ──
    _a("## 2. 模块规模分析")
    _a("")
    module_stats: dict[str, dict] = defaultdict(lambda: {"files": 0, "lines": 0, "code_lines": 0})
    for f in report.files:
        ms = module_stats[f.module]
        ms["files"] += 1
        ms["lines"] += f.lines
        ms["code_lines"] += f.code_lines

    _a("| 模块 | 文件数 | 总行数 | 代码行数 |")
    _a("|------|--------|--------|----------|")
    # Largest modules first.
    for mod in sorted(module_stats, key=lambda m: module_stats[m]["lines"], reverse=True):
        s = module_stats[mod]
        _a(f"| `{mod}` | {s['files']} | {s['lines']:,} | {s['code_lines']:,} |")
    _a("")

    # ── Large files (>500 lines) ──
    large_files = [f for f in report.files if f.lines > 500]
    large_files.sort(key=lambda f: f.lines, reverse=True)
    _a("## 3. 大文件识别(>500 行)")
    _a("")
    if large_files:
        _a("| 文件 | 行数 | 代码行 | 模块 |")
        _a("|------|------|--------|------|")
        for f in large_files:
            _a(f"| `{f.rel_path}` | {f.lines:,} | {f.code_lines:,} | {f.module} |")
        _a("")
        _a(f"> ⚠️ 共 {len(large_files)} 个文件超过 500 行,建议拆分以降低维护成本。")
    else:
        _a("所有文件均在 500 行以内。✅")
    _a("")

    # ── Function complexity ──
    _a("## 4. 函数复杂度分析")
    _a("")
    high_complexity = [fn for fn in report.functions if fn.complexity >= 10]
    high_complexity.sort(key=lambda fn: fn.complexity, reverse=True)
    _a(f"### 4.1 高复杂度函数(圈复杂度 ≥ 10)")
    _a("")
    if high_complexity:
        _a("| 函数 | 文件 | 行号 | 复杂度 | 嵌套深度 | 函数行数 |")
        _a("|------|------|------|--------|----------|----------|")
        # Cap the table at 20 rows to keep the report readable.
        for fn in high_complexity[:20]:
            name = f"{fn.class_name}.{fn.name}" if fn.class_name else fn.name
            _a(f"| `{name}` | `{fn.file}` | {fn.line} | {fn.complexity} | {fn.max_nesting} | {fn.lines} |")
        if len(high_complexity) > 20:
            _a(f"| ... | 共 {len(high_complexity)} 个 | | | | |")
    else:
        _a("所有函数复杂度均在合理范围内。✅")
    _a("")

    # Long functions (>= 80 body lines).
    long_funcs = [fn for fn in report.functions if fn.lines >= 80]
    long_funcs.sort(key=lambda fn: fn.lines, reverse=True)
    _a("### 4.2 长函数(≥ 80 行)")
    _a("")
    if long_funcs:
        _a("| 函数 | 文件 | 行号 | 函数行数 | 复杂度 |")
        _a("|------|------|------|----------|--------|")
        for fn in long_funcs[:15]:
            name = f"{fn.class_name}.{fn.name}" if fn.class_name else fn.name
            _a(f"| `{name}` | `{fn.file}` | {fn.line} | {fn.lines} | {fn.complexity} |")
        if len(long_funcs) > 15:
            _a(f"| ... | 共 {len(long_funcs)} 个 | | | |")
    else:
        _a("所有函数行数均在合理范围内。✅")
    _a("")

    # ── Module dependencies ──
    _a("## 5. 模块依赖关系")
    _a("")

    # Dependency matrix: source module -> target module -> import count.
    dep_matrix: dict[str, Counter] = defaultdict(Counter)
    for edge in report.import_edges:
        dep_matrix[edge.source_module][edge.target_module] += 1

    all_modules = sorted(set(
        list(dep_matrix.keys()) +
        [t for counts in dep_matrix.values() for t in counts]
    ))
    # Keep only core modules in the matrix.
    all_modules = [m for m in all_modules if m in _CORE_MODULES]

    if all_modules:
        _a("### 5.1 依赖矩阵(行→列 = 导入次数)")
        _a("")
        header = "| 模块 | " + " | ".join(f"`{m}`" for m in all_modules) + " |"
        _a(header)
        _a("|" + "------|" * (len(all_modules) + 1))
        for src in all_modules:
            row = f"| `{src}` |"
            for tgt in all_modules:
                count = dep_matrix.get(src, {}).get(tgt, 0)
                # '·' marks an empty cell so the matrix stays scannable.
                row += f" {count or '·'} |"
            _a(row)
        _a("")

    # Circular dependencies.
    _a("### 5.2 循环依赖")
    _a("")
    if report.circular_deps:
        for a, b in report.circular_deps:
            _a(f"- ⚠️ `{a}` ↔ `{b}`")
        _a("")
        _a("> 循环依赖增加模块间耦合,建议通过接口抽象或依赖注入解耦。")
    else:
        _a("未检测到模块级循环依赖。✅")
    _a("")

    # ── Coupling ──
    _a("## 6. 耦合度评估")
    _a("")
    _a("| 模块 | 传入耦合 Ca | 传出耦合 Ce | 不稳定度 I |")
    _a("|------|-----------|-----------|-----------|")
    # Most unstable modules first.
    for mod, vals in sorted(coupling.items(), key=lambda x: x[1]["instability"], reverse=True):
        flag = " ⚠️" if vals["instability"] > 0.8 else ""
        _a(f"| `{mod}` | {vals['afferent_coupling']} | {vals['efferent_coupling']} | {vals['instability']}{flag} |")
    _a("")
    _a("> 不稳定度 I = Ce/(Ca+Ce)。I 接近 1 表示模块高度依赖外部,变更风险大。")
    _a("> I 接近 0 表示模块被广泛依赖,是稳定基础设施。")
    _a("")

    # ── Duplicate code ──
    _a("## 7. 重复代码检测")
    _a("")
    if report.similar_functions:
        _a("以下函数对具有相似的签名特征(参数数量、行数、复杂度),可能存在重复逻辑:")
        _a("")
        _a("| 函数 A | 函数 B | 相似度 |")
        _a("|--------|--------|--------|")
        for a, b, sim in report.similar_functions[:20]:
            _a(f"| `{a}` | `{b}` | {sim:.0%} |")
        if len(report.similar_functions) > 20:
            _a(f"| ... | 共 {len(report.similar_functions)} 对 | |")
        _a("")
        _a("> 建议人工审查上述函数对,确认是否可提取公共逻辑。")
    else:
        _a("未检测到明显的重复函数。✅")
    _a("")

    # ── Task classification ──
    tc = report.task_classification
    _a("## 8. 任务分类分析")
    _a("")
    if tc:
        _a(f"### 8.1 按层分布(共 {tc['total']} 个任务)")
        _a("")
        _a("| 层 | 数量 | 任务列表 |")
        _a("|-----|------|----------|")
        for layer, info in tc.get("by_layer", {}).items():
            # Show at most 8 task codes per layer.
            tasks_str = ", ".join(f"`{t}`" for t in info["tasks"][:8])
            if info["count"] > 8:
                tasks_str += f" ... 共 {info['count']} 个"
            _a(f"| {layer} | {info['count']} | {tasks_str} |")
        _a("")

        _a("### 8.2 按类型分布")
        _a("")
        _a("| 类型 | 数量 |")
        _a("|------|------|")
        for ttype, info in tc.get("by_type", {}).items():
            _a(f"| {ttype} | {info['count']} |")
        _a("")

        anomalies = tc.get("anomalies", [])
        _a("### 8.3 分类异常")
        _a("")
        if anomalies:
            for a in anomalies:
                _a(f"- ⚠️ {a}")
        else:
            _a("未发现分类异常。✅")
    else:
        _a("任务分类分析未执行(TaskRegistry 导入失败)。")
    _a("")

    # ── Suggestions ──
    _a("## 9. 架构优化建议")
    _a("")
    suggestions = _generate_suggestions(report, coupling)
    for i, s in enumerate(suggestions, 1):
        _a(f"{i}. {s}")
    _a("")

    return "\n".join(lines)
def _generate_suggestions(report: ArchitectureReport, coupling: dict) -> list[str]:
|
||
"""基于分析结果生成具体优化建议。"""
|
||
suggestions: list[str] = []
|
||
|
||
# 大文件建议
|
||
large_files = [f for f in report.files if f.lines > 500]
|
||
if large_files:
|
||
biggest = max(large_files, key=lambda f: f.lines)
|
||
suggestions.append(
|
||
f"**拆分大文件**:`{biggest.rel_path}`({biggest.lines:,} 行)是最大文件,"
|
||
"建议按职责拆分为多个子模块。"
|
||
)
|
||
|
||
# 高复杂度建议
|
||
high_cx = [fn for fn in report.functions if fn.complexity >= 15]
|
||
if high_cx:
|
||
worst = max(high_cx, key=lambda fn: fn.complexity)
|
||
name = f"{worst.class_name}.{worst.name}" if worst.class_name else worst.name
|
||
suggestions.append(
|
||
f"**降低函数复杂度**:`{name}`(复杂度 {worst.complexity})建议提取子函数或使用策略模式。"
|
||
)
|
||
|
||
# 循环依赖建议
|
||
if report.circular_deps:
|
||
pairs = ", ".join(f"`{a}`↔`{b}`" for a, b in report.circular_deps)
|
||
suggestions.append(
|
||
f"**消除循环依赖**:{pairs}。可通过引入接口层或依赖注入解耦。"
|
||
)
|
||
|
||
# 高不稳定模块
|
||
unstable = [m for m, v in coupling.items() if v["instability"] > 0.8]
|
||
if unstable:
|
||
suggestions.append(
|
||
f"**稳定化高不稳定模块**:{', '.join(f'`{m}`' for m in unstable)} "
|
||
"的不稳定度 > 0.8,建议减少对外部模块的依赖。"
|
||
)
|
||
|
||
# 任务命名建议
|
||
tc = report.task_classification
|
||
if tc:
|
||
anomalies = tc.get("anomalies", [])
|
||
if any("INDEX" in a for a in anomalies):
|
||
suggestions.append(
|
||
"**统一 INDEX 层任务命名**:当前 INDEX 层任务以 `DWS_` 开头,"
|
||
"建议改为 `IDX_` 前缀以避免与 DWS 汇总任务混淆。"
|
||
)
|
||
|
||
# 重复代码建议
|
||
if len(report.similar_functions) > 5:
|
||
suggestions.append(
|
||
f"**消除重复代码**:检测到 {len(report.similar_functions)} 对相似函数,"
|
||
"建议提取公共基类或工具函数。"
|
||
)
|
||
|
||
if not suggestions:
|
||
suggestions.append("当前架构整体健康,未发现需要立即优化的问题。")
|
||
|
||
return suggestions
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════
# Main pipeline
# ═══════════════════════════════════════════════════════════════

def run_analysis(root: Path, logger: logging.Logger) -> tuple[ArchitectureReport, dict]:
    """Run all six analysis stages and return (report, coupling metrics)."""
    report = ArchitectureReport()
    banner = "=" * 60

    logger.info(banner)
    logger.info("ETL 架构分析开始")
    logger.info("分析根目录: %s", root)
    logger.info(banner)

    # Stage 1: enumerate files and count lines.
    logger.info("── 阶段 1/6:文件扫描 ──")
    report.files = scan_files(root, logger)

    # Stage 2: AST-based complexity metrics.
    logger.info("── 阶段 2/6:函数复杂度分析 ──")
    report.functions = analyze_functions(report.files, logger)

    # Stage 3: import graph + circular-dependency detection.
    logger.info("── 阶段 3/6:依赖关系分析 ──")
    report.import_edges, report.circular_deps = analyze_dependencies(report.files, logger)

    # Stage 4: near-duplicate function candidates.
    logger.info("── 阶段 4/6:重复代码检测 ──")
    report.similar_functions = detect_similar_functions(report.functions, logger)

    # Stage 5: TaskRegistry classification summary (best effort).
    logger.info("── 阶段 5/6:任务分类分析 ──")
    report.task_classification = analyze_task_classification(logger)

    # Stage 6: coupling metrics derived from the import edges.
    logger.info("── 阶段 6/6:耦合度评估 ──")
    coupling = evaluate_coupling(report.import_edges, report.files)

    logger.info(banner)
    logger.info("分析完成")
    logger.info(banner)

    return report, coupling
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the analyzer."""
    parser = argparse.ArgumentParser(description="ETL 架构分析")
    parser.add_argument(
        "--output", "-o",
        default=None,
        help="报告输出路径(默认自动生成带日期的文件名)",
    )
    return parser.parse_args()
def main():
    """CLI entry point: run the analysis and write the Markdown report.

    The report path is taken from ``--output`` when given; otherwise a
    dated file is created under the directory named by the
    ``ETL_REPORT_ROOT`` environment variable.

    Raises:
        KeyError: when no ``--output`` is given and ``ETL_REPORT_ROOT``
            is unset (the env var is only needed for the default path).
    """
    logger = _setup_logging()
    args = parse_args()

    root = _FEIQIU_ROOT

    report, coupling = run_analysis(root, logger)

    # Render the Markdown report.
    md_content = generate_report(report, coupling)

    # Resolve the output path. Bug fix: ETL_REPORT_ROOT was previously
    # required unconditionally, so an explicit --output still crashed when
    # the env var was missing. It is only needed for the default location.
    if args.output:
        output_path = Path(args.output)
        # Make sure the target directory exists for explicit paths too.
        output_path.parent.mkdir(parents=True, exist_ok=True)
    else:
        _report_root = os.environ.get("ETL_REPORT_ROOT")
        if not _report_root:
            raise KeyError("环境变量 ETL_REPORT_ROOT 未定义。请在根 .env 中配置。")
        reports_dir = Path(_report_root)
        reports_dir.mkdir(parents=True, exist_ok=True)
        date_str = datetime.now().strftime("%Y%m%d")
        output_path = reports_dir / f"architecture_report_{date_str}.md"

    output_path.write_text(md_content, encoding="utf-8")
    logger.info("报告已保存: %s", output_path)

    # Console summary mirroring the report highlights.
    total_files = len(report.files)
    total_lines = sum(f.lines for f in report.files)
    large_count = sum(1 for f in report.files if f.lines > 500)
    high_cx = sum(1 for fn in report.functions if fn.complexity >= 10)

    logger.info("")
    logger.info("═══ 分析摘要 ═══")
    logger.info(" 文件数: %d", total_files)
    logger.info(" 总行数: %s", f"{total_lines:,}")
    logger.info(" 大文件(>500行): %d", large_count)
    logger.info(" 高复杂度函数(≥10): %d", high_cx)
    logger.info(" 循环依赖: %d", len(report.circular_deps))
    logger.info(" 相似函数对: %d", len(report.similar_functions))
    logger.info(" 注册任务: %s", report.task_classification.get("total", "N/A"))
# Script entry point: `python -m scripts.debug.analyze_architecture`.
if __name__ == "__main__":
    main()