Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/audit/flow_analyzer.py

# -*- coding: utf-8 -*-
"""
流程树分析器 — 通过静态分析 Python 源码的 import 语句和类继承关系，
构建从入口到末端模块的调用树。

仅执行只读操作：读取并解析 Python 源文件，不修改任何文件。
"""

from __future__ import annotations

import ast
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

from scripts.audit import FileEntry, FlowNode

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Project-internal package names (top-level directories that hold project code)
# ---------------------------------------------------------------------------

# A module counts as "project code" when the first component of its dotted
# name (or the first directory of its relative path) is in this set.
_PROJECT_PACKAGES: set[str] = {
    "cli", "config", "api", "database", "tasks", "loaders",
    "scd", "orchestration", "quality", "models", "utils",
    "gui", "scripts",
}

# ---------------------------------------------------------------------------
# Known third-party top-level package names (used to exclude non-project
# imports; standard-library names are checked separately through
# sys.stdlib_module_names)
# ---------------------------------------------------------------------------

_KNOWN_THIRD_PARTY: set[str] = {
    "psycopg2", "requests", "dateutil", "python_dateutil",
    "dotenv", "openpyxl", "PySide6", "flask", "pyinstaller",
    "PyInstaller", "hypothesis", "pytest", "_pytest", "py",
    "pluggy", "pkg_resources", "setuptools", "pip", "wheel",
    "tzdata", "six", "certifi", "urllib3", "charset_normalizer",
    "idna", "shiboken6",
}


def _is_project_module(module_name: str) -> bool:
    """Return True when the top-level package of *module_name* is project code.

    Only the first dotted component is inspected; it must be one of the
    names listed in ``_PROJECT_PACKAGES``.
    """
    top_level, _, _ = module_name.partition(".")
    return top_level in _PROJECT_PACKAGES


def _is_stdlib_or_third_party(module_name: str) -> bool:
    """Return True when *module_name* belongs to the stdlib or a known third-party package.

    Only the first dotted component is inspected. It is looked up first in
    ``_KNOWN_THIRD_PARTY`` and then in ``sys.stdlib_module_names``.
    """
    top_level = module_name.partition(".")[0]
    return top_level in _KNOWN_THIRD_PARTY or top_level in sys.stdlib_module_names


# ---------------------------------------------------------------------------
# 文件读取（多编码回退）
# ---------------------------------------------------------------------------

def _read_source(filepath: Path) -> str | None:
    """读取 Python 源文件内容，尝试 utf-8 → gbk → latin-1 回退。

    返回文件内容字符串，读取失败时返回 None。
    """
    for encoding in ("utf-8", "gbk", "latin-1"):
        try:
            return filepath.read_text(encoding=encoding)
        except (UnicodeDecodeError, UnicodeError):
            continue
        except (OSError, PermissionError) as exc:
            logger.warning("无法读取文件 %s: %s", filepath, exc)
            return None
    logger.warning("无法以任何编码读取文件 %s", filepath)
    return None


# ---------------------------------------------------------------------------
# 路径 ↔ 模块名转换
# ---------------------------------------------------------------------------

def _path_to_module_name(rel_path: str) -> str:
    """将相对路径转换为 Python 模块名。

    例如：
    - "cli/main.py" → "cli.main"
    - "cli/__init__.py" → "cli"
    - "tasks/dws/assistant.py" → "tasks.dws.assistant"
    """
    p = rel_path.replace("\\", "/")
    if p.endswith("/__init__.py"):
        p = p[: -len("/__init__.py")]
    elif p.endswith(".py"):
        p = p[:-3]
    return p.replace("/", ".")


def _module_to_path(module_name: str) -> str:
    """将模块名转换为相对文件路径（优先 .py 文件）。

    例如：
    - "cli.main" → "cli/main.py"
    - "cli" → "cli/__init__.py"
    """
    return module_name.replace(".", "/") + ".py"


# ---------------------------------------------------------------------------
# parse_imports — 解析 Python 文件的 import 语句
# ---------------------------------------------------------------------------

def parse_imports(filepath: Path) -> list[str]:
    """Parse a Python file with ``ast`` and return the project modules it imports.

    - Only project-internal modules are returned (standard library and
      known third-party packages are excluded).
    - ``import a.b`` records ``a.b``; ``from a.b import c`` records ``a.b``.
    - Relative imports (``from . import x``) are skipped.
    - The result is de-duplicated, keeping first-seen order.

    Parameters
    ----------
    filepath : Path
        Python source file to analyse.

    Returns
    -------
    list[str]
        Dotted module names. Empty when the file does not exist, cannot be
        read, or cannot be parsed.
    """
    if not filepath.exists():
        return []

    source = _read_source(filepath)
    if source is None:
        return []

    try:
        tree = ast.parse(source, filename=str(filepath))
    except (SyntaxError, ValueError):
        # ValueError covers sources containing null bytes, which some Python
        # versions reject with ValueError rather than SyntaxError. Such
        # content can get here through the latin-1 decoding fallback.
        logger.warning("语法错误，无法解析 %s", filepath)
        return []

    candidates: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            candidates.extend(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            # level > 0 means a relative import; those are not resolved here.
            if node.module and node.level == 0:
                candidates.append(node.module)

    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(
        name
        for name in candidates
        if _is_project_module(name) and not _is_stdlib_or_third_party(name)
    ))


# ---------------------------------------------------------------------------
# build_flow_tree — 从入口递归追踪 import 链，构建流程树
# ---------------------------------------------------------------------------

def build_flow_tree(
    repo_root: Path,
    entry_file: str,
    _visited: set[str] | None = None,
) -> FlowNode:
    """Recursively follow project imports from *entry_file* and build a flow tree.

    Each imported module is mapped to ``<module>.py``; when that file does
    not exist but ``<module>/__init__.py`` does, the package file is used
    instead. A module whose file cannot be found at all still gets a node,
    just without children.

    Parameters
    ----------
    repo_root : Path
        Repository root directory.
    entry_file : str
        Relative path of the file to start from (e.g. "cli/main.py").
    _visited : set[str] | None
        Internal. Relative paths already placed in the tree; shared across
        the whole recursion so that circular imports terminate. A file that
        was already visited anywhere earlier in the traversal is not added
        again as a child.

    Returns
    -------
    FlowNode
        Tree rooted at *entry_file*. The root has node type "entry", every
        other node has node type "module".
    """
    top_level_call = _visited is None
    visited = set() if _visited is None else _visited
    visited.add(entry_file)

    children: list[FlowNode] = []
    source_path = repo_root / entry_file

    if source_path.exists():
        for imported in parse_imports(source_path):
            candidate = _module_to_path(imported)
            if not (repo_root / candidate).exists():
                # Fall back to the package form when the plain module is absent.
                package_init = imported.replace(".", "/") + "/__init__.py"
                if (repo_root / package_init).exists():
                    candidate = package_init

            if candidate in visited:
                continue
            children.append(build_flow_tree(repo_root, candidate, visited))

    return FlowNode(
        name=_path_to_module_name(entry_file),
        source_file=entry_file,
        node_type="entry" if top_level_call else "module",
        children=children,
    )


# ---------------------------------------------------------------------------
# 批处理文件解析
# ---------------------------------------------------------------------------

def _parse_bat_python_target(bat_path: Path) -> str | None:
    """从批处理文件中解析 python -m 命令的目标模块名。

    返回模块名（如 "cli.main"），未找到时返回 None。
    """
    if not bat_path.exists():
        return None

    content = _read_source(bat_path)
    if content is None:
        return None

    # 匹配 python -m module.name 或 python3 -m module.name
    pattern = re.compile(r"python[3]?\s+-m\s+([\w.]+)", re.IGNORECASE)
    for line in content.splitlines():
        m = pattern.search(line)
        if m:
            return m.group(1)
    return None


# ---------------------------------------------------------------------------
# 入口点识别
# ---------------------------------------------------------------------------

def discover_entry_points(repo_root: Path) -> list[dict[str, str]]:
    """Identify every entry point of the project.

    Detection rules, applied in this order:
    - ``cli/main.py`` → CLI entry
    - ``gui/main.py`` → GUI entry
    - ``*.bat`` files in the repository root → batch entries; the
      description mentions the ``python -m`` target when one is found
    - ``scripts/*.py`` (direct children only, ``__init__.py`` excluded) that
      contain an ``__name__ == "__main__"`` comparison → ops scripts

    Parameters
    ----------
    repo_root : Path
        Repository root directory.

    Returns
    -------
    list[dict[str, str]]
        One dict per entry point with the keys ``type`` (entry kind),
        ``file`` (path relative to *repo_root*) and ``description``.
    """
    entries: list[dict[str, str]] = []

    # Fixed CLI / GUI entry points.
    for kind, package in (("CLI", "cli"), ("GUI", "gui")):
        if (repo_root / package / "main.py").exists():
            entries.append({
                "type": kind,
                "file": f"{package}/main.py",
                "description": f"{kind} 主入口 (`python -m {package}.main`)",
            })

    # Batch files in the repository root.
    for bat in sorted(repo_root.glob("*.bat")):
        target = _parse_bat_python_target(bat)
        desc = "批处理脚本"
        if target:
            desc += f"，调用 `{target}`"
        entries.append({
            "type": "批处理",
            "file": bat.name,
            "description": desc,
        })

    # Ops scripts: only files that really compare __name__ with "__main__".
    # Two independent substring tests would also flag any script that merely
    # uses logging.getLogger(__name__) and mentions "__main__" somewhere else.
    main_guard = re.compile(
        r"""__name__\s*==\s*['"]__main__['"]|['"]__main__['"]\s*==\s*__name__"""
    )
    scripts_dir = repo_root / "scripts"
    if scripts_dir.is_dir():
        # glob("*.py") is non-recursive, so sub-directories such as audit/
        # are never visited.
        for py_file in sorted(scripts_dir.glob("*.py")):
            if py_file.name == "__init__.py":
                continue
            source = _read_source(py_file)
            if source and main_guard.search(source):
                rel = py_file.relative_to(repo_root).as_posix()
                entries.append({
                    "type": "运维脚本",
                    "file": rel,
                    "description": f"运维脚本 `{py_file.name}`",
                })

    return entries


# ---------------------------------------------------------------------------
# 任务类型和加载器类型区分
# ---------------------------------------------------------------------------

def classify_task_type(rel_path: str) -> str:
    """Classify a task file by its path.

    The path is lower-cased and backslashes are treated as separators.
    Directory rules are checked first and match whole path segments only,
    so a directory whose name merely ends in ``dws`` or ``verification``
    is not misclassified. File-name prefixes are checked afterwards.

    Possible return values:
    - "校验任务" (a ``verification`` directory is on the path)
    - "DWS 汇总任务" (a ``dws`` directory is on the path, or the file name
      starts with ``dws_``)
    - "ODS 抓取任务" (file name starts with ``ods_`` or ``ods.``)
    - "DWD 加载任务" (file name starts with ``dwd_`` or ``dwd.``)
    - "Schema 初始化任务" (file name contains both ``init`` and ``schema``)
    - "任务" (default when nothing more specific applies)
    """
    segments = rel_path.replace("\\", "/").lower().split("/")
    directories, basename = segments[:-1], segments[-1]

    # Directory-level rules.
    if "verification" in directories:
        return "校验任务"
    if "dws" in directories:
        return "DWS 汇总任务"

    # File-name-level rules.
    if basename.startswith(("ods_", "ods.")):
        return "ODS 抓取任务"
    if basename.startswith(("dwd_", "dwd.")):
        return "DWD 加载任务"
    if basename.startswith("dws_"):
        return "DWS 汇总任务"
    if "init" in basename and "schema" in basename:
        return "Schema 初始化任务"
    return "任务"


def classify_loader_type(rel_path: str) -> str:
    """Classify a loader file by the directories on its path.

    The path is lower-cased and backslashes are treated as separators.
    Only whole directory segments are matched, so directories such as
    ``goods/`` or ``methods/`` (which merely end in ``ods``) or
    ``artifacts/`` (which ends in ``facts``) are not misclassified.

    Possible return values:
    - "维度加载器 (SCD2)" (a ``dimensions`` directory is on the path)
    - "事实表加载器" (a ``facts`` directory is on the path)
    - "ODS 通用加载器" (an ``ods`` directory is on the path)
    - "加载器" (default when nothing more specific applies)
    """
    # Everything but the last segment is a directory component.
    directories = rel_path.replace("\\", "/").lower().split("/")[:-1]

    if "dimensions" in directories:
        return "维度加载器 (SCD2)"
    if "facts" in directories:
        return "事实表加载器"
    if "ods" in directories:
        return "ODS 通用加载器"
    return "加载器"


# ---------------------------------------------------------------------------
# find_orphan_modules — 找出未被任何入口直接或间接引用的 Python 模块
# ---------------------------------------------------------------------------

def find_orphan_modules(
    repo_root: Path,
    all_entries: list[FileEntry],
    reachable: set[str],
) -> list[str]:
    """Return the Python modules that no entry point reaches, directly or indirectly.

    An entry is never reported as an orphan when it is:
    - a directory
    - not a ``.py`` file
    - an ``__init__.py`` file
    - located under ``tests/``
    - located under ``scripts/audit/`` (the audit scripts themselves)
    - outside the project packages (top-level directory not in
      ``_PROJECT_PACKAGES``, or a file sitting directly in the root)

    Parameters
    ----------
    repo_root : Path
        Repository root directory (not used by the current implementation).
    all_entries : list[FileEntry]
        Scanned file entries; ``is_dir``, ``extension`` and ``rel_path``
        are read from each one.
    reachable : set[str]
        Relative paths (forward-slash form) reachable from the entry points.

    Returns
    -------
    list[str]
        Orphan module paths in forward-slash form, sorted.
    """
    skipped_prefixes = ("tests/", "scripts/audit/")

    def _candidates():
        # Yield the normalised path of every entry that is eligible at all.
        for item in all_entries:
            if item.is_dir or item.extension != ".py":
                continue
            path = item.rel_path.replace("\\", "/")
            if path == "__init__.py" or path.endswith("/__init__.py"):
                continue
            if path.startswith(skipped_prefixes):
                continue
            package, separator, _ = path.partition("/")
            if not separator or package not in _PROJECT_PACKAGES:
                continue
            yield path

    return sorted(path for path in _candidates() if path not in reachable)


# ---------------------------------------------------------------------------
# 统计辅助
# ---------------------------------------------------------------------------

def _count_nodes_by_type(trees: list[FlowNode]) -> dict[str, int]:
    """递归统计流程树中各类型节点的数量。"""
    counts: dict[str, int] = {"entry": 0, "module": 0, "class": 0, "function": 0}

    def _walk(node: FlowNode) -> None:
        t = node.node_type
        counts[t] = counts.get(t, 0) + 1
        for child in node.children:
            _walk(child)

    for tree in trees:
        _walk(tree)
    return counts


def _count_tasks_and_loaders(trees: list[FlowNode]) -> tuple[int, int]:
    """统计流程树中任务模块和加载器模块的数量。"""
    tasks = 0
    loaders = 0
    seen: set[str] = set()

    def _walk(node: FlowNode) -> None:
        nonlocal tasks, loaders
        if node.source_file in seen:
            return
        seen.add(node.source_file)
        sf = node.source_file.replace("\\", "/")
        if sf.startswith("tasks/") and not sf.endswith("__init__.py"):
            base = sf.rsplit("/", 1)[-1]
            if not base.startswith("base_"):
                tasks += 1
        if sf.startswith("loaders/") and not sf.endswith("__init__.py"):
            base = sf.rsplit("/", 1)[-1]
            if not base.startswith("base_"):
                loaders += 1
        for child in node.children:
            _walk(child)

    for tree in trees:
        _walk(tree)
    return tasks, loaders


# ---------------------------------------------------------------------------
# 类型标注辅助
# ---------------------------------------------------------------------------

def _get_type_annotation(source_file: str) -> str:
    """根据源文件路径返回类型标注字符串（用于报告中的节点标注）。"""
    sf = source_file.replace("\\", "/")
    if sf.startswith("tasks/"):
        return f" [{classify_task_type(sf)}]"
    if sf.startswith("loaders/"):
        return f" [{classify_loader_type(sf)}]"
    return ""


# ---------------------------------------------------------------------------
# Mermaid 图生成
# ---------------------------------------------------------------------------

def _render_mermaid(trees: list[FlowNode]) -> str:
    """生成 Mermaid 流程图代码。"""
    lines: list[str] = ["```mermaid", "graph TD"]
    seen_edges: set[tuple[str, str]] = set()
    node_ids: dict[str, str] = {}
    counter = [0]

    def _node_id(name: str) -> str:
        if name not in node_ids:
            node_ids[name] = f"N{counter[0]}"
            counter[0] += 1
        return node_ids[name]

    def _walk(node: FlowNode) -> None:
        nid = _node_id(node.name)
        annotation = _get_type_annotation(node.source_file)
        label = f"{node.name}{annotation}"
        # 声明节点
        lines.append(f"    {nid}[\"`{label}`\"]")
        for child in node.children:
            cid = _node_id(child.name)
            edge = (nid, cid)
            if edge not in seen_edges:
                seen_edges.add(edge)
                lines.append(f"    {nid} --> {cid}")
            _walk(child)

    for tree in trees:
        _walk(tree)

    lines.append("```")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# 缩进文本树生成
# ---------------------------------------------------------------------------

def _render_text_tree(trees: list[FlowNode]) -> str:
    """生成缩进文本形式的流程树。"""
    lines: list[str] = []
    seen: set[str] = set()

    def _walk(node: FlowNode, depth: int) -> None:
        indent = "  " * depth
        annotation = _get_type_annotation(node.source_file)
        line = f"{indent}- `{node.name}` (`{node.source_file}`){annotation}"
        lines.append(line)

        key = node.source_file
        if key in seen:
            # 已展开过，不再递归（避免循环）
            if node.children:
                lines.append(f"{indent}  - *(已展开)*")
            return
        seen.add(key)

        for child in node.children:
            _walk(child, depth + 1)

    for tree in trees:
        _walk(tree, 0)

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# render_flow_report — 生成 Markdown 格式的流程树报告
# ---------------------------------------------------------------------------

def render_flow_report(
    trees: list[FlowNode],
    orphans: list[str],
    repo_root: str,
) -> str:
    """Build the Markdown flow-tree report (Mermaid graph plus indented text).

    Report layout:
    1. Header (UTC generation timestamp, repository path)
    2. Mermaid flow graph
    3. Indented text tree
    4. Orphan module list
    5. Summary table (entry points, tasks, loaders, orphan modules)

    Parameters
    ----------
    trees : list[FlowNode]
        Flow trees, one per entry point.
    orphans : list[str]
        Paths of the orphan modules to list.
    repo_root : str
        Repository path shown in the header.

    Returns
    -------
    str
        The complete report as a single newline-joined string.
    """
    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    report: list[str] = [
        # --- Header ---
        "# 项目流程树报告\n",
        f"- 生成时间: {generated_at}",
        f"- 仓库路径: `{repo_root}`\n",
        # --- Mermaid graph ---
        "## 流程图（Mermaid）\n",
        _render_mermaid(trees),
        "",
        # --- Indented text tree ---
        "## 流程树（缩进文本）\n",
        _render_text_tree(trees),
        "",
        # --- Orphan modules ---
        "## 孤立模块\n",
    ]

    if orphans:
        report.extend(f"- `{path}`" for path in orphans)
    else:
        report.append("未发现孤立模块。")

    # --- Summary table ---
    entry_total = len([tree for tree in trees if tree.node_type == "entry"])
    task_total, loader_total = _count_tasks_and_loaders(trees)
    report += [
        "",
        "## 统计摘要\n",
        "| 指标 | 数量 |",
        "|------|------|",
        f"| 入口点 | {entry_total} |",
        f"| 任务 | {task_total} |",
        f"| 加载器 | {loader_total} |",
        f"| 孤立模块 | {len(orphans)} |",
        "",
    ]

    return "\n".join(report)