Neo-ZQYY/apps/etl/connectors/feiqiu/scripts/debug/analyze_architecture.py

# -*- coding: utf-8 -*-
"""ETL 架构分析脚本。

通过静态分析（AST 解析、import 扫描、文件统计）评估 ETL 代码结构，
生成架构优化报告（Markdown）。

分析维度：
1. 模块依赖关系 — 扫描 import，构建依赖图，识别循环依赖
2. 文件大小分析 — 统计行数，识别过大文件（>500 行）
3. 函数复杂度   — AST 分析圈复杂度（分支/嵌套深度）
4. 重复代码检测 — 比较函数签名和结构相似度
5. 耦合度评估   — 模块间导入关系密度
6. 任务分类分析 — 从 TaskRegistry 读取元数据，评估分类合理性

用法:
    cd apps/etl/connectors/feiqiu
    python -m scripts.debug.analyze_architecture
"""
from __future__ import annotations

import ast
import argparse
import logging
import os
import sys
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Iterator

# ── 确保项目根目录在 sys.path ──
_FEIQIU_ROOT = Path(__file__).resolve().parents[2]
if str(_FEIQIU_ROOT) not in sys.path:
    sys.path.insert(0, str(_FEIQIU_ROOT))

# ── 分析范围：ETL 核心模块 ──
_CORE_MODULES = [
    "api", "cli", "config", "database", "loaders", "models",
    "orchestration", "quality", "scd", "tasks", "utils",
]


# ═══════════════════════════════════════════════════════════════
# 数据结构
# ═══════════════════════════════════════════════════════════════

@dataclass
class FileInfo:
    """单个 .py 文件的统计信息"""
    path: Path
    rel_path: str
    lines: int = 0
    code_lines: int = 0          # 非空非注释行
    blank_lines: int = 0
    comment_lines: int = 0
    module: str = ""             # 所属模块（api/cli/...）


@dataclass
class FunctionInfo:
    """函数/方法的分析信息"""
    name: str
    file: str
    line: int
    complexity: int = 1          # 圈复杂度
    max_nesting: int = 0         # 最大嵌套深度
    param_count: int = 0
    lines: int = 0               # 函数体行数
    is_method: bool = False
    class_name: str = ""


@dataclass
class ImportEdge:
    """模块间的导入关系"""
    source_module: str           # 导入方
    target_module: str           # 被导入方
    source_file: str
    import_name: str             # 具体导入的名称


@dataclass
class ArchitectureReport:
    """架构分析报告的完整数据"""
    generated_at: datetime = field(default_factory=datetime.now)
    # 文件统计
    files: list[FileInfo] = field(default_factory=list)
    # 函数分析
    functions: list[FunctionInfo] = field(default_factory=list)
    # 依赖关系
    import_edges: list[ImportEdge] = field(default_factory=list)
    circular_deps: list[tuple[str, str]] = field(default_factory=list)
    # 任务分类
    task_classification: dict = field(default_factory=dict)
    # 重复代码
    similar_functions: list[tuple[str, str, float]] = field(default_factory=list)


# ═══════════════════════════════════════════════════════════════
# 日志
# ═══════════════════════════════════════════════════════════════

def _setup_logging() -> logging.Logger:
    logger = logging.getLogger("analyze_architecture")
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter(
            "%(asctime)s [%(levelname)s] %(message)s", datefmt="%H:%M:%S"
        ))
        logger.addHandler(handler)
    return logger


# ═══════════════════════════════════════════════════════════════
# 1. 文件扫描与行数统计
# ═══════════════════════════════════════════════════════════════

def _iter_py_files(root: Path) -> Iterator[Path]:
    """递归遍历核心模块下的 .py 文件，跳过 __pycache__ / .hypothesis 等。"""
    skip_dirs = {"__pycache__", ".hypothesis", ".pytest_cache", "export", "Asia"}
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames[:] = [d for d in dirnames if d not in skip_dirs]
        for fn in filenames:
            if fn.endswith(".py"):
                yield Path(dirpath) / fn


def _classify_module(rel_path: str) -> str:
    """从相对路径提取所属模块名。"""
    parts = Path(rel_path).parts
    if parts:
        top = parts[0]
        if top in _CORE_MODULES:
            return top
        if top == "scripts":
            return "scripts"
        if top == "tests":
            return "tests"
    return "root"


def _count_lines(filepath: Path) -> FileInfo:
    """统计单个文件的行数分布。"""
    info = FileInfo(path=filepath, rel_path="")
    try:
        text = filepath.read_text(encoding="utf-8", errors="replace")
    except Exception:
        return info
    raw_lines = text.splitlines()
    info.lines = len(raw_lines)
    for line in raw_lines:
        stripped = line.strip()
        if not stripped:
            info.blank_lines += 1
        elif stripped.startswith("#"):
            info.comment_lines += 1
        else:
            info.code_lines += 1
    return info


def scan_files(root: Path, logger: logging.Logger) -> list[FileInfo]:
    """扫描所有 .py 文件并统计行数。"""
    results: list[FileInfo] = []
    for fp in _iter_py_files(root):
        info = _count_lines(fp)
        info.path = fp
        info.rel_path = str(fp.relative_to(root)).replace("\\", "/")
        info.module = _classify_module(info.rel_path)
        results.append(info)
    logger.info("扫描完成：共 %d 个 .py 文件", len(results))
    return results


# ═══════════════════════════════════════════════════════════════
# 2. AST 分析：函数复杂度
# ═══════════════════════════════════════════════════════════════

# 增加圈复杂度的 AST 节点类型
_COMPLEXITY_NODES = (
    ast.If, ast.For, ast.While, ast.ExceptHandler,
    ast.With, ast.Assert, ast.BoolOp,
)
# 仅 comprehension 内的 if 子句
_COMP_NODES = (ast.ListComp, ast.SetComp, ast.DictComp, ast.GeneratorExp)


def _calc_complexity(node: ast.AST) -> int:
    """计算函数体的圈复杂度（McCabe）。"""
    complexity = 1
    for child in ast.walk(node):
        if isinstance(child, _COMPLEXITY_NODES):
            complexity += 1
            # BoolOp 中每个额外的 and/or 加 1
            if isinstance(child, ast.BoolOp):
                complexity += len(child.values) - 2 if len(child.values) > 2 else 0
        elif isinstance(child, _COMP_NODES):
            for gen in child.generators:
                complexity += len(gen.ifs)
    return complexity


def _calc_max_nesting(node: ast.AST, depth: int = 0) -> int:
    """计算最大嵌套深度。"""
    nesting_types = (ast.If, ast.For, ast.While, ast.With, ast.Try, ast.ExceptHandler)
    max_depth = depth
    for child in ast.iter_child_nodes(node):
        if isinstance(child, nesting_types):
            child_depth = _calc_max_nesting(child, depth + 1)
            max_depth = max(max_depth, child_depth)
        else:
            child_depth = _calc_max_nesting(child, depth)
            max_depth = max(max_depth, child_depth)
    return max_depth


def _func_body_lines(node: ast.FunctionDef | ast.AsyncFunctionDef) -> int:
    """计算函数体行数。"""
    if not node.body:
        return 0
    first_line = node.body[0].lineno
    last_line = node.body[-1].end_lineno or node.body[-1].lineno
    return last_line - first_line + 1


def _walk_with_parent(tree: ast.AST):
    """遍历 AST 并记录每个节点的父节点（避免 O(n²) 嵌套 walk）。"""
    # 先给所有节点标记 parent
    for node in ast.walk(tree):
        for child in ast.iter_child_nodes(node):
            child._parent = node  # type: ignore[attr-defined]


def analyze_functions(files: list[FileInfo], logger: logging.Logger) -> list[FunctionInfo]:
    """对所有文件做 AST 分析，提取函数/方法信息。"""
    results: list[FunctionInfo] = []
    for fi in files:
        try:
            source = fi.path.read_text(encoding="utf-8", errors="replace")
            tree = ast.parse(source, filename=fi.rel_path)
        except (SyntaxError, UnicodeDecodeError):
            continue

        _walk_with_parent(tree)

        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                # 通过 _parent 属性判断是否为方法
                parent = getattr(node, "_parent", None)
                class_name = ""
                is_method = False
                if isinstance(parent, ast.ClassDef):
                    class_name = parent.name
                    is_method = True

                param_count = len(node.args.args)
                if is_method and param_count > 0:
                    param_count -= 1  # 去掉 self/cls

                info = FunctionInfo(
                    name=node.name,
                    file=fi.rel_path,
                    line=node.lineno,
                    complexity=_calc_complexity(node),
                    max_nesting=_calc_max_nesting(node),
                    param_count=param_count,
                    lines=_func_body_lines(node),
                    is_method=is_method,
                    class_name=class_name,
                )
                results.append(info)

    logger.info("函数分析完成：共 %d 个函数/方法", len(results))
    return results


# ═══════════════════════════════════════════════════════════════
# 3. 依赖关系分析
# ═══════════════════════════════════════════════════════════════

def _extract_imports(filepath: Path, rel_path: str) -> list[ImportEdge]:
    """从单个文件提取 import 语句，映射到模块级别。"""
    edges: list[ImportEdge] = []
    try:
        source = filepath.read_text(encoding="utf-8", errors="replace")
        tree = ast.parse(source, filename=rel_path)
    except (SyntaxError, UnicodeDecodeError):
        return edges

    source_module = _classify_module(rel_path)

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                target = _resolve_import_module(alias.name)
                if target and target != source_module:
                    edges.append(ImportEdge(
                        source_module=source_module,
                        target_module=target,
                        source_file=rel_path,
                        import_name=alias.name,
                    ))
        elif isinstance(node, ast.ImportFrom):
            if node.module:
                target = _resolve_import_module(node.module)
                if target and target != source_module:
                    names = ", ".join(a.name for a in (node.names or []))
                    edges.append(ImportEdge(
                        source_module=source_module,
                        target_module=target,
                        source_file=rel_path,
                        import_name=f"{node.module}.{{{names}}}",
                    ))
    return edges


def _resolve_import_module(import_path: str) -> str | None:
    """将 import 路径映射到核心模块名。"""
    parts = import_path.split(".")
    top = parts[0]
    if top in _CORE_MODULES:
        return top
    return None


def analyze_dependencies(files: list[FileInfo], logger: logging.Logger) -> tuple[list[ImportEdge], list[tuple[str, str]]]:
    """分析模块间依赖关系，检测循环依赖。"""
    all_edges: list[ImportEdge] = []
    for fi in files:
        all_edges.extend(_extract_imports(fi.path, fi.rel_path))

    # 构建有向图检测循环
    graph: dict[str, set[str]] = defaultdict(set)
    for edge in all_edges:
        graph[edge.source_module].add(edge.target_module)

    circular: list[tuple[str, str]] = []
    for src, targets in graph.items():
        for tgt in targets:
            if src in graph.get(tgt, set()):
                pair = tuple(sorted([src, tgt]))
                if pair not in circular:
                    circular.append(pair)

    logger.info("依赖分析完成：%d 条导入边，%d 对循环依赖", len(all_edges), len(circular))
    return all_edges, circular


# ═══════════════════════════════════════════════════════════════
# 4. 重复代码检测（基于函数签名相似度）
# ═══════════════════════════════════════════════════════════════

def _func_signature_key(fn: FunctionInfo) -> str:
    """生成函数签名指纹：参数数量 + 行数范围 + 复杂度。"""
    line_bucket = fn.lines // 10 * 10  # 按 10 行分桶
    return f"p{fn.param_count}_l{line_bucket}_c{fn.complexity}"


def detect_similar_functions(
    functions: list[FunctionInfo],
    logger: logging.Logger,
    min_lines: int = 15,
) -> list[tuple[str, str, float]]:
    """检测签名相似的函数对（可能是重复代码）。

    只比较行数 >= min_lines 的函数，避免噪声。
    """
    # 按签名分桶
    buckets: dict[str, list[FunctionInfo]] = defaultdict(list)
    for fn in functions:
        if fn.lines >= min_lines:
            key = _func_signature_key(fn)
            buckets[key].append(fn)

    similar: list[tuple[str, str, float]] = []
    for key, group in buckets.items():
        if len(group) < 2:
            continue
        # 同一桶内两两配对
        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                a, b = group[i], group[j]
                # 跳过同文件内的重载/变体
                if a.file == b.file:
                    continue
                # 简单相似度：行数差异越小越相似
                line_ratio = 1 - abs(a.lines - b.lines) / max(a.lines, b.lines)
                if line_ratio >= 0.7:
                    label_a = f"{a.file}:{a.class_name}.{a.name}" if a.class_name else f"{a.file}:{a.name}"
                    label_b = f"{b.file}:{b.class_name}.{b.name}" if b.class_name else f"{b.file}:{b.name}"
                    similar.append((label_a, label_b, round(line_ratio, 2)))

    logger.info("重复检测完成：%d 对相似函数", len(similar))
    return similar


# ═══════════════════════════════════════════════════════════════
# 5. 任务分类分析
# ═══════════════════════════════════════════════════════════════

def analyze_task_classification(logger: logging.Logger) -> dict:
    """从 TaskRegistry 读取 52 个任务的元数据，分析分类合理性。"""
    try:
        from orchestration.task_registry import default_registry, TaskMeta
    except ImportError:
        logger.warning("无法导入 TaskRegistry，跳过任务分类分析")
        return {}

    all_codes = default_registry.get_all_task_codes()
    by_layer: dict[str, list[str]] = defaultdict(list)
    by_type: dict[str, list[str]] = defaultdict(list)
    anomalies: list[str] = []

    for code in all_codes:
        meta: TaskMeta | None = default_registry.get_metadata(code)
        if not meta:
            continue
        layer = meta.layer or "NONE"
        by_layer[layer].append(code)
        by_type[meta.task_type].append(code)

        # 检测命名与分类不一致
        if code.startswith("DWS_") and layer not in ("DWS", "INDEX"):
            anomalies.append(f"{code}: 前缀 DWS_ 但分类为 {layer}")
        if code.startswith("ODS_") and layer != "ODS":
            anomalies.append(f"{code}: 前缀 ODS_ 但分类为 {layer}")
        if code.startswith("DWD_") and layer != "DWD":
            anomalies.append(f"{code}: 前缀 DWD_ 但分类为 {layer}")

        # 检测 INDEX 层任务命名
        if layer == "INDEX" and not code.startswith("DWS_"):
            anomalies.append(f"{code}: INDEX 层但不以 DWS_ 开头，可能造成混淆")

    # INDEX 层任务以 DWS_ 开头的命名问题
    index_tasks = by_layer.get("INDEX", [])
    if index_tasks and all(c.startswith("DWS_") for c in index_tasks):
        anomalies.append(
            f"INDEX 层全部 {len(index_tasks)} 个任务以 DWS_ 开头，"
            "建议改为 IDX_ 前缀以区分 DWS 汇总任务"
        )

    result = {
        "total": len(all_codes),
        "by_layer": {k: {"count": len(v), "tasks": sorted(v)} for k, v in sorted(by_layer.items())},
        "by_type": {k: {"count": len(v), "tasks": sorted(v)} for k, v in sorted(by_type.items())},
        "anomalies": anomalies,
    }
    logger.info("任务分类分析完成：共 %d 个任务，%d 个异常", len(all_codes), len(anomalies))
    return result


# ═══════════════════════════════════════════════════════════════
# 6. 耦合度评估
# ═══════════════════════════════════════════════════════════════

def evaluate_coupling(
    edges: list[ImportEdge],
    files: list[FileInfo],
) -> dict:
    """评估模块间耦合度。

    指标：
    - 传入耦合（Ca）：有多少模块依赖本模块
    - 传出耦合（Ce）：本模块依赖多少其他模块
    - 不稳定度 I = Ce / (Ca + Ce)，越接近 1 越不稳定
    """
    # 只统计核心模块
    modules = set(m for m in _CORE_MODULES if any(f.module == m for f in files))

    ca: Counter = Counter()  # 传入
    ce: Counter = Counter()  # 传出

    # 去重：同一 source_module → target_module 只计一次
    seen = set()
    for edge in edges:
        pair = (edge.source_module, edge.target_module)
        if pair in seen:
            continue
        seen.add(pair)
        if edge.source_module in modules:
            ce[edge.source_module] += 1
        if edge.target_module in modules:
            ca[edge.target_module] += 1

    coupling: dict[str, dict] = {}
    for m in sorted(modules):
        ca_val = ca.get(m, 0)
        ce_val = ce.get(m, 0)
        total = ca_val + ce_val
        instability = round(ce_val / total, 2) if total > 0 else 0.0
        coupling[m] = {
            "afferent_coupling": ca_val,
            "efferent_coupling": ce_val,
            "instability": instability,
        }
    return coupling


# ═══════════════════════════════════════════════════════════════
# 7. Markdown 报告生成
# ═══════════════════════════════════════════════════════════════

def generate_report(report: ArchitectureReport, coupling: dict) -> str:
    """生成 Markdown 格式的架构优化报告。"""
    lines: list[str] = []
    _a = lines.append

    _a(f"# ETL 架构分析报告")
    _a(f"")
    _a(f"> 生成时间：{report.generated_at.strftime('%Y-%m-%d %H:%M:%S')}")
    _a(f"> 分析范围：`apps/etl/connectors/feiqiu/` 核心模块")
    _a("")

    # ── 概览 ──
    total_files = len(report.files)
    total_lines = sum(f.lines for f in report.files)
    total_code = sum(f.code_lines for f in report.files)
    _a("## 1. 概览")
    _a("")
    _a(f"| 指标 | 值 |")
    _a(f"|------|-----|")
    _a(f"| Python 文件数 | {total_files} |")
    _a(f"| 总行数 | {total_lines:,} |")
    _a(f"| 代码行数 | {total_code:,} |")
    _a(f"| 函数/方法数 | {len(report.functions):,} |")
    _a(f"| 注册任务数 | {report.task_classification.get('total', 'N/A')} |")
    _a(f"| 循环依赖数 | {len(report.circular_deps)} |")
    _a(f"| 相似函数对数 | {len(report.similar_functions)} |")
    _a("")

    # ── 模块规模 ──
    _a("## 2. 模块规模分析")
    _a("")
    module_stats: dict[str, dict] = defaultdict(lambda: {"files": 0, "lines": 0, "code_lines": 0})
    for f in report.files:
        ms = module_stats[f.module]
        ms["files"] += 1
        ms["lines"] += f.lines
        ms["code_lines"] += f.code_lines

    _a("| 模块 | 文件数 | 总行数 | 代码行数 |")
    _a("|------|--------|--------|----------|")
    for mod in sorted(module_stats, key=lambda m: module_stats[m]["lines"], reverse=True):
        s = module_stats[mod]
        _a(f"| `{mod}` | {s['files']} | {s['lines']:,} | {s['code_lines']:,} |")
    _a("")

    # ── 大文件 ──
    large_files = [f for f in report.files if f.lines > 500]
    large_files.sort(key=lambda f: f.lines, reverse=True)
    _a("## 3. 大文件识别（>500 行）")
    _a("")
    if large_files:
        _a("| 文件 | 行数 | 代码行 | 模块 |")
        _a("|------|------|--------|------|")
        for f in large_files:
            _a(f"| `{f.rel_path}` | {f.lines:,} | {f.code_lines:,} | {f.module} |")
        _a("")
        _a(f"> ⚠️ 共 {len(large_files)} 个文件超过 500 行，建议拆分以降低维护成本。")
    else:
        _a("所有文件均在 500 行以内。✅")
    _a("")

    # ── 函数复杂度 ──
    _a("## 4. 函数复杂度分析")
    _a("")
    high_complexity = [fn for fn in report.functions if fn.complexity >= 10]
    high_complexity.sort(key=lambda fn: fn.complexity, reverse=True)
    _a(f"### 4.1 高复杂度函数（圈复杂度 ≥ 10）")
    _a("")
    if high_complexity:
        _a("| 函数 | 文件 | 行号 | 复杂度 | 嵌套深度 | 函数行数 |")
        _a("|------|------|------|--------|----------|----------|")
        for fn in high_complexity[:20]:
            name = f"{fn.class_name}.{fn.name}" if fn.class_name else fn.name
            _a(f"| `{name}` | `{fn.file}` | {fn.line} | {fn.complexity} | {fn.max_nesting} | {fn.lines} |")
        if len(high_complexity) > 20:
            _a(f"| ... | 共 {len(high_complexity)} 个 | | | | |")
    else:
        _a("所有函数复杂度均在合理范围内。✅")
    _a("")

    # 长函数
    long_funcs = [fn for fn in report.functions if fn.lines >= 80]
    long_funcs.sort(key=lambda fn: fn.lines, reverse=True)
    _a("### 4.2 长函数（≥ 80 行）")
    _a("")
    if long_funcs:
        _a("| 函数 | 文件 | 行号 | 函数行数 | 复杂度 |")
        _a("|------|------|------|----------|--------|")
        for fn in long_funcs[:15]:
            name = f"{fn.class_name}.{fn.name}" if fn.class_name else fn.name
            _a(f"| `{name}` | `{fn.file}` | {fn.line} | {fn.lines} | {fn.complexity} |")
        if len(long_funcs) > 15:
            _a(f"| ... | 共 {len(long_funcs)} 个 | | | |")
    else:
        _a("所有函数行数均在合理范围内。✅")
    _a("")

    # ── 依赖关系 ──
    _a("## 5. 模块依赖关系")
    _a("")

    # 依赖矩阵
    dep_matrix: dict[str, Counter] = defaultdict(Counter)
    for edge in report.import_edges:
        dep_matrix[edge.source_module][edge.target_module] += 1

    all_modules = sorted(set(
        list(dep_matrix.keys()) +
        [t for counts in dep_matrix.values() for t in counts]
    ))
    # 只保留核心模块
    all_modules = [m for m in all_modules if m in _CORE_MODULES]

    if all_modules:
        _a("### 5.1 依赖矩阵（行→列 = 导入次数）")
        _a("")
        header = "| 模块 | " + " | ".join(f"`{m}`" for m in all_modules) + " |"
        _a(header)
        _a("|" + "------|" * (len(all_modules) + 1))
        for src in all_modules:
            row = f"| `{src}` |"
            for tgt in all_modules:
                count = dep_matrix.get(src, {}).get(tgt, 0)
                row += f" {count or '·'} |"
            _a(row)
        _a("")

    # 循环依赖
    _a("### 5.2 循环依赖")
    _a("")
    if report.circular_deps:
        for a, b in report.circular_deps:
            _a(f"- ⚠️ `{a}` ↔ `{b}`")
        _a("")
        _a("> 循环依赖增加模块间耦合，建议通过接口抽象或依赖注入解耦。")
    else:
        _a("未检测到模块级循环依赖。✅")
    _a("")

    # ── 耦合度 ──
    _a("## 6. 耦合度评估")
    _a("")
    _a("| 模块 | 传入耦合 Ca | 传出耦合 Ce | 不稳定度 I |")
    _a("|------|-----------|-----------|-----------|")
    for mod, vals in sorted(coupling.items(), key=lambda x: x[1]["instability"], reverse=True):
        flag = " ⚠️" if vals["instability"] > 0.8 else ""
        _a(f"| `{mod}` | {vals['afferent_coupling']} | {vals['efferent_coupling']} | {vals['instability']}{flag} |")
    _a("")
    _a("> 不稳定度 I = Ce/(Ca+Ce)。I 接近 1 表示模块高度依赖外部，变更风险大。")
    _a("> I 接近 0 表示模块被广泛依赖，是稳定基础设施。")
    _a("")

    # ── 重复代码 ──
    _a("## 7. 重复代码检测")
    _a("")
    if report.similar_functions:
        _a("以下函数对具有相似的签名特征（参数数量、行数、复杂度），可能存在重复逻辑：")
        _a("")
        _a("| 函数 A | 函数 B | 相似度 |")
        _a("|--------|--------|--------|")
        for a, b, sim in report.similar_functions[:20]:
            _a(f"| `{a}` | `{b}` | {sim:.0%} |")
        if len(report.similar_functions) > 20:
            _a(f"| ... | 共 {len(report.similar_functions)} 对 | |")
        _a("")
        _a("> 建议人工审查上述函数对，确认是否可提取公共逻辑。")
    else:
        _a("未检测到明显的重复函数。✅")
    _a("")

    # ── 任务分类 ──
    tc = report.task_classification
    _a("## 8. 任务分类分析")
    _a("")
    if tc:
        _a(f"### 8.1 按层分布（共 {tc['total']} 个任务）")
        _a("")
        _a("| 层 | 数量 | 任务列表 |")
        _a("|-----|------|----------|")
        for layer, info in tc.get("by_layer", {}).items():
            tasks_str = ", ".join(f"`{t}`" for t in info["tasks"][:8])
            if info["count"] > 8:
                tasks_str += f" ... 共 {info['count']} 个"
            _a(f"| {layer} | {info['count']} | {tasks_str} |")
        _a("")

        _a("### 8.2 按类型分布")
        _a("")
        _a("| 类型 | 数量 |")
        _a("|------|------|")
        for ttype, info in tc.get("by_type", {}).items():
            _a(f"| {ttype} | {info['count']} |")
        _a("")

        anomalies = tc.get("anomalies", [])
        _a("### 8.3 分类异常")
        _a("")
        if anomalies:
            for a in anomalies:
                _a(f"- ⚠️ {a}")
        else:
            _a("未发现分类异常。✅")
    else:
        _a("任务分类分析未执行（TaskRegistry 导入失败）。")
    _a("")

    # ── 优化建议 ──
    _a("## 9. 架构优化建议")
    _a("")
    suggestions = _generate_suggestions(report, coupling)
    for i, s in enumerate(suggestions, 1):
        _a(f"{i}. {s}")
    _a("")

    return "\n".join(lines)


def _generate_suggestions(report: ArchitectureReport, coupling: dict) -> list[str]:
    """基于分析结果生成具体优化建议。"""
    suggestions: list[str] = []

    # 大文件建议
    large_files = [f for f in report.files if f.lines > 500]
    if large_files:
        biggest = max(large_files, key=lambda f: f.lines)
        suggestions.append(
            f"**拆分大文件**：`{biggest.rel_path}`（{biggest.lines:,} 行）是最大文件，"
            "建议按职责拆分为多个子模块。"
        )

    # 高复杂度建议
    high_cx = [fn for fn in report.functions if fn.complexity >= 15]
    if high_cx:
        worst = max(high_cx, key=lambda fn: fn.complexity)
        name = f"{worst.class_name}.{worst.name}" if worst.class_name else worst.name
        suggestions.append(
            f"**降低函数复杂度**：`{name}`（复杂度 {worst.complexity}）建议提取子函数或使用策略模式。"
        )

    # 循环依赖建议
    if report.circular_deps:
        pairs = ", ".join(f"`{a}`↔`{b}`" for a, b in report.circular_deps)
        suggestions.append(
            f"**消除循环依赖**：{pairs}。可通过引入接口层或依赖注入解耦。"
        )

    # 高不稳定模块
    unstable = [m for m, v in coupling.items() if v["instability"] > 0.8]
    if unstable:
        suggestions.append(
            f"**稳定化高不稳定模块**：{', '.join(f'`{m}`' for m in unstable)} "
            "的不稳定度 > 0.8，建议减少对外部模块的依赖。"
        )

    # 任务命名建议
    tc = report.task_classification
    if tc:
        anomalies = tc.get("anomalies", [])
        if any("INDEX" in a for a in anomalies):
            suggestions.append(
                "**统一 INDEX 层任务命名**：当前 INDEX 层任务以 `DWS_` 开头，"
                "建议改为 `IDX_` 前缀以避免与 DWS 汇总任务混淆。"
            )

    # 重复代码建议
    if len(report.similar_functions) > 5:
        suggestions.append(
            f"**消除重复代码**：检测到 {len(report.similar_functions)} 对相似函数，"
            "建议提取公共基类或工具函数。"
        )

    if not suggestions:
        suggestions.append("当前架构整体健康，未发现需要立即优化的问题。")

    return suggestions


# ═══════════════════════════════════════════════════════════════
# 主流程
# ═══════════════════════════════════════════════════════════════

def run_analysis(root: Path, logger: logging.Logger) -> tuple[ArchitectureReport, dict]:
    """执行完整架构分析，返回报告数据和耦合度评估。"""
    report = ArchitectureReport()

    logger.info("=" * 60)
    logger.info("ETL 架构分析开始")
    logger.info("分析根目录: %s", root)
    logger.info("=" * 60)

    # 1. 文件扫描
    logger.info("── 阶段 1/6：文件扫描 ──")
    report.files = scan_files(root, logger)

    # 2. 函数复杂度
    logger.info("── 阶段 2/6：函数复杂度分析 ──")
    report.functions = analyze_functions(report.files, logger)

    # 3. 依赖关系
    logger.info("── 阶段 3/6：依赖关系分析 ──")
    report.import_edges, report.circular_deps = analyze_dependencies(report.files, logger)

    # 4. 重复代码
    logger.info("── 阶段 4/6：重复代码检测 ──")
    report.similar_functions = detect_similar_functions(report.functions, logger)

    # 5. 任务分类
    logger.info("── 阶段 5/6：任务分类分析 ──")
    report.task_classification = analyze_task_classification(logger)

    # 6. 耦合度
    logger.info("── 阶段 6/6：耦合度评估 ──")
    coupling = evaluate_coupling(report.import_edges, report.files)

    logger.info("=" * 60)
    logger.info("分析完成")
    logger.info("=" * 60)

    return report, coupling


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="ETL 架构分析")
    parser.add_argument(
        "--output", "-o",
        help="报告输出路径（默认自动生成带日期的文件名）",
        default=None,
    )
    return parser.parse_args()


def main():
    logger = _setup_logging()
    args = parse_args()

    root = _FEIQIU_ROOT

    report, coupling = run_analysis(root, logger)

    # 生成 Markdown 报告
    md_content = generate_report(report, coupling)

    # 确定输出路径
    reports_dir = root / "docs" / "reports"
    reports_dir.mkdir(parents=True, exist_ok=True)

    if args.output:
        output_path = Path(args.output)
    else:
        date_str = datetime.now().strftime("%Y%m%d")
        output_path = reports_dir / f"architecture_report_{date_str}.md"

    output_path.write_text(md_content, encoding="utf-8")
    logger.info("报告已保存: %s", output_path)

    # 打印摘要
    total_files = len(report.files)
    total_lines = sum(f.lines for f in report.files)
    large_count = sum(1 for f in report.files if f.lines > 500)
    high_cx = sum(1 for fn in report.functions if fn.complexity >= 10)

    logger.info("")
    logger.info("═══ 分析摘要 ═══")
    logger.info("  文件数: %d", total_files)
    logger.info("  总行数: %s", f"{total_lines:,}")
    logger.info("  大文件(>500行): %d", large_count)
    logger.info("  高复杂度函数(≥10): %d", high_cx)
    logger.info("  循环依赖: %d", len(report.circular_deps))
    logger.info("  相似函数对: %d", len(report.similar_functions))
    logger.info("  注册任务: %s", report.task_classification.get("total", "N/A"))


if __name__ == "__main__":
    main()