# -*- coding: utf-8 -*-
"""
Flow-tree analyzer.

Builds a call tree from an entry point down to its leaf modules by statically
analysing the ``import`` statements of Python source files.

Read-only: source files are read and parsed, never modified.

NOTE(review): the original header also mentioned class-inheritance analysis;
the functions in this part of the module only follow imports.
"""

from __future__ import annotations

import ast
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

from scripts.audit import FileEntry, FlowNode

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Project-internal package names (top-level directories holding project code)
# ---------------------------------------------------------------------------
_PROJECT_PACKAGES: set[str] = {
    "cli", "config", "api", "database", "tasks", "loaders",
    "scd", "orchestration", "quality", "models", "utils",
    "gui", "scripts",
}

# ---------------------------------------------------------------------------
# Known third-party top-level modules (used to exclude non-project imports;
# the standard library is checked separately via sys.stdlib_module_names)
# ---------------------------------------------------------------------------
_KNOWN_THIRD_PARTY: set[str] = {
    "psycopg2", "requests", "dateutil", "python_dateutil",
    "dotenv", "openpyxl", "PySide6", "flask", "pyinstaller",
    "PyInstaller", "hypothesis", "pytest", "_pytest", "py",
    "pluggy", "pkg_resources", "setuptools", "pip", "wheel",
    "tzdata", "six", "certifi", "urllib3", "charset_normalizer",
    "idna", "shiboken6",
}

# Matches "python -m module.name" / "python3 -m module.name" (case-insensitive).
# Compiled once at import time instead of on every batch-file parse.
_BAT_PYTHON_M_RE = re.compile(r"python[3]?\s+-m\s+([\w.]+)", re.IGNORECASE)


def _is_project_module(module_name: str) -> bool:
    """Return True when the top-level package of *module_name* is a project package."""
    return module_name.split(".")[0] in _PROJECT_PACKAGES


def _is_stdlib_or_third_party(module_name: str) -> bool:
    """Return True when *module_name* belongs to the stdlib or a known third-party package."""
    top = module_name.split(".")[0]
    return top in _KNOWN_THIRD_PARTY or top in sys.stdlib_module_names


# ---------------------------------------------------------------------------
# File reading (multi-encoding fallback)
# ---------------------------------------------------------------------------

def _read_source(filepath: Path) -> str | None:
    """Read a text file, trying utf-8 (BOM-tolerant), then gbk, then latin-1.

    Returns the decoded content, or None when the file cannot be read.
    """
    # "utf-8-sig" decodes plain UTF-8 identically to "utf-8" but also strips a
    # leading BOM. With plain "utf-8" the BOM stays in the string as U+FEFF
    # and ast.parse() rejects the source.
    for encoding in ("utf-8-sig", "gbk", "latin-1"):
        try:
            return filepath.read_text(encoding=encoding)
        except UnicodeError:
            # Wrong encoding guess: fall through to the next candidate.
            continue
        except OSError as exc:
            # Covers missing files and permission errors alike.
            logger.warning("无法读取文件 %s: %s", filepath, exc)
            return None

    # Defensive only: latin-1 maps every byte, so decoding should not get here.
    logger.warning("无法以任何编码读取文件 %s", filepath)
    return None


# ---------------------------------------------------------------------------
# Path <-> module-name conversion
# ---------------------------------------------------------------------------

def _path_to_module_name(rel_path: str) -> str:
    """Convert a relative file path into a dotted Python module name.

    Examples:
    - "cli/main.py" -> "cli.main"
    - "cli/__init__.py" -> "cli"
    - "tasks/dws/assistant.py" -> "tasks.dws.assistant"
    """
    normalized = rel_path.replace("\\", "/")
    if normalized.endswith("/__init__.py"):
        normalized = normalized[: -len("/__init__.py")]
    elif normalized.endswith(".py"):
        normalized = normalized[:-3]
    return normalized.replace("/", ".")


def _module_to_path(module_name: str) -> str:
    """Convert a dotted module name into the relative path of its ``.py`` file.

    Examples:
    - "cli.main" -> "cli/main.py"
    - "cli" -> "cli.py"

    The package form ("<pkg>/__init__.py") is never produced here; callers
    that need it must check for it themselves, as build_flow_tree does.
    """
    return module_name.replace(".", "/") + ".py"


# ---------------------------------------------------------------------------
# parse_imports - extract project-local imports from a Python file
# ---------------------------------------------------------------------------

def parse_imports(filepath: Path) -> list[str]:
    """Parse *filepath* with ``ast`` and return the project modules it imports.

    - Only project-internal modules are returned (stdlib and known
      third-party packages are excluded).
    - Relative imports (``from . import x``) are ignored.
    - The result is de-duplicated, preserving first-seen order.
    - A missing, unreadable or unparsable file yields an empty list.
    """
    if not filepath.exists():
        return []

    source = _read_source(filepath)
    if source is None:
        return []

    try:
        tree = ast.parse(source, filename=str(filepath))
    except (SyntaxError, ValueError):
        # ValueError: raised for source containing null bytes on Python < 3.12,
        # which can happen when a binary-ish file is decoded through latin-1.
        logger.warning("语法错误,无法解析 %s", filepath)
        return []

    candidates: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            candidates.extend(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module and node.level == 0:
            candidates.append(node.module)

    project_modules = (
        name
        for name in candidates
        if _is_project_module(name) and not _is_stdlib_or_third_party(name)
    )
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(project_modules))


# ---------------------------------------------------------------------------
# build_flow_tree - follow the import chain recursively from an entry file
# ---------------------------------------------------------------------------

def build_flow_tree(
    repo_root: Path,
    entry_file: str,
    _visited: set[str] | None = None,
) -> FlowNode:
    """Build the flow tree rooted at *entry_file* by following imports recursively.

    Parameters
    ----------
    repo_root : Path
        Repository root directory.
    entry_file : str
        Entry file path relative to *repo_root* (e.g. "cli/main.py").
    _visited : set[str] | None
        Internal. Relative paths already placed in the tree; shared across the
        recursion so circular imports terminate and each file appears once.

    Returns
    -------
    FlowNode
        Tree whose root is the entry file. The root gets node type "entry",
        every other node "module".
    """
    is_root = _visited is None
    if _visited is None:
        _visited = set()
    _visited.add(entry_file)

    children: list[FlowNode] = []
    source_path = repo_root / entry_file
    if source_path.exists():
        for module in parse_imports(source_path):
            child_path = _module_to_path(module)
            # Fall back to the package's __init__.py when there is no <module>.py.
            if not (repo_root / child_path).exists():
                package_init = module.replace(".", "/") + "/__init__.py"
                if (repo_root / package_init).exists():
                    child_path = package_init

            if child_path in _visited:
                continue
            children.append(build_flow_tree(repo_root, child_path, _visited))

    return FlowNode(
        name=_path_to_module_name(entry_file),
        source_file=entry_file,
        node_type="entry" if is_root else "module",
        children=children,
    )


# ---------------------------------------------------------------------------
# Batch-file parsing
# ---------------------------------------------------------------------------

def _parse_bat_python_target(bat_path: Path) -> str | None:
    """Return the module named by the first ``python -m`` command in a batch file.

    Returns the module name (e.g. "cli.main"), or None when the file is
    missing, unreadable, or contains no such command.
    """
    if not bat_path.exists():
        return None

    content = _read_source(bat_path)
    if content is None:
        return None

    for line in content.splitlines():
        match = _BAT_PYTHON_M_RE.search(line)
        if match:
            return match.group(1)
    return None
# ---------------------------------------------------------------------------
# Entry-point discovery
# ---------------------------------------------------------------------------

def discover_entry_points(repo_root: Path) -> list[dict[str, str]]:
    """Identify the project's entry points under *repo_root*.

    Returns a list of dicts, each with:
    - type: entry kind (CLI / GUI / batch / ops script; the values are the
      literal strings used below)
    - file: path relative to *repo_root*
    - description: short description

    Rules:
    - cli/main.py -> CLI entry
    - gui/main.py -> GUI entry
    - *.bat in the repo root -> batch entry; the ``python -m`` target is
      appended to the description when one is found
    - scripts/*.py (non-recursive, so sub-directories such as audit/ are
      skipped; __init__.py is excluded) whose source mentions both
      ``__name__`` and ``__main__``. This is a loose substring check, not a
      parse of the ``if __name__ == "__main__"`` guard.
    """
    entries: list[dict[str, str]] = []

    # CLI entry
    if (repo_root / "cli" / "main.py").exists():
        entries.append({
            "type": "CLI",
            "file": "cli/main.py",
            "description": "CLI 主入口 (`python -m cli.main`)",
        })

    # GUI entry
    if (repo_root / "gui" / "main.py").exists():
        entries.append({
            "type": "GUI",
            "file": "gui/main.py",
            "description": "GUI 主入口 (`python -m gui.main`)",
        })

    # Batch files in the repository root
    for bat in sorted(repo_root.glob("*.bat")):
        description = "批处理脚本"
        target = _parse_bat_python_target(bat)
        if target:
            description += f",调用 `{target}`"
        entries.append({
            "type": "批处理",
            "file": bat.name,
            "description": description,
        })

    # Ops scripts: direct children of scripts/ that look runnable
    scripts_dir = repo_root / "scripts"
    if scripts_dir.is_dir():
        for py_file in sorted(scripts_dir.glob("*.py")):
            if py_file.name == "__init__.py":
                continue
            source = _read_source(py_file)
            if source and "__name__" in source and "__main__" in source:
                entries.append({
                    "type": "运维脚本",
                    "file": py_file.relative_to(repo_root).as_posix(),
                    "description": f"运维脚本 `{py_file.name}`",
                })

    return entries


# ---------------------------------------------------------------------------
# Task-type and loader-type classification
# ---------------------------------------------------------------------------

def classify_task_type(rel_path: str) -> str:
    """Classify a task file by its path and return the category label.

    Directory rules are checked first and match whole path segments
    (case-insensitive), so a directory such as "preverification" is not
    mistaken for "verification". File-name prefixes are checked afterwards.

    Possible return values are the literal labels below; "任务" is the
    default when nothing more specific matches.
    """
    normalized = rel_path.replace("\\", "/").lower()
    *directories, basename = normalized.split("/")

    if "verification" in directories:
        return "校验任务"
    if "dws" in directories:
        return "DWS 汇总任务"

    # File-name level rules
    if basename.startswith(("ods_", "ods.")):
        return "ODS 抓取任务"
    if basename.startswith(("dwd_", "dwd.")):
        return "DWD 加载任务"
    if basename.startswith("dws_"):
        return "DWS 汇总任务"
    if "init" in basename and "schema" in basename:
        return "Schema 初始化任务"

    return "任务"


def classify_loader_type(rel_path: str) -> str:
    """Classify a loader file by the directories in its path.

    Matching is done on whole path segments (case-insensitive). A plain
    substring test would classify "methods/" as ODS and "artifacts/" as
    facts, because those names end in "ods/" and "facts/".

    Possible return values are the literal labels below; "加载器" is the
    default when no directory matches.
    """
    normalized = rel_path.replace("\\", "/").lower()
    directories = normalized.split("/")[:-1]

    if "dimensions" in directories:
        return "维度加载器 (SCD2)"
    if "facts" in directories:
        return "事实表加载器"
    if "ods" in directories:
        return "ODS 通用加载器"
    return "加载器"


# ---------------------------------------------------------------------------
# find_orphan_modules - Python modules not reachable from any entry point
# ---------------------------------------------------------------------------

def find_orphan_modules(
    repo_root: Path,
    all_entries: list[FileEntry],
    reachable: set[str],
) -> list[str]:
    """Return project ``.py`` files that are absent from *reachable*.

    Never reported as orphans:
    - directory entries
    - non-.py files
    - __init__.py files
    - files under tests/
    - files under scripts/audit/ (the audit scripts themselves)
    - files whose top-level directory is not a project package (this
      includes files sitting directly in the repository root)

    Paths are compared with forward slashes. *repo_root* is accepted for
    interface compatibility but is not used. The result is sorted by path.
    """
    orphans: list[str] = []

    for entry in all_entries:
        if entry.is_dir or entry.extension != ".py":
            continue

        rel = entry.rel_path.replace("\\", "/")

        if rel == "__init__.py" or rel.endswith("/__init__.py"):
            continue
        if rel.startswith(("tests/", "scripts/audit/")):
            continue

        # Only files inside a project package are candidates.
        top_dir, separator, _ = rel.partition("/")
        if not separator or top_dir not in _PROJECT_PACKAGES:
            continue

        if rel not in reachable:
            orphans.append(rel)

    orphans.sort()
    return orphans


# ---------------------------------------------------------------------------
# Statistics helpers
# ---------------------------------------------------------------------------
def _count_nodes_by_type(trees: list[FlowNode]) -> dict[str, int]:
    """Count the nodes of every tree in *trees*, grouped by ``node_type``.

    The result always contains the keys "entry", "module", "class" and
    "function"; any other node type encountered is added as it is seen.
    """
    counts: dict[str, int] = {"entry": 0, "module": 0, "class": 0, "function": 0}

    def _walk(node: FlowNode) -> None:
        counts[node.node_type] = counts.get(node.node_type, 0) + 1
        for child in node.children:
            _walk(child)

    for tree in trees:
        _walk(tree)
    return counts


def _is_concrete_module(source_file: str, package_prefix: str) -> bool:
    """Return True for a file under *package_prefix* that is neither ``__init__.py`` nor ``base_*``."""
    if not source_file.startswith(package_prefix):
        return False
    if source_file.endswith("__init__.py"):
        return False
    return not source_file.rsplit("/", 1)[-1].startswith("base_")


def _count_tasks_and_loaders(trees: list[FlowNode]) -> tuple[int, int]:
    """Return ``(task_count, loader_count)`` over all trees.

    Each source file is counted once across all trees. A source file that
    has already been seen is not descended into again. ``__init__.py`` files
    and files whose name starts with ``base_`` are not counted.
    """
    tasks = 0
    loaders = 0
    seen: set[str] = set()

    def _walk(node: FlowNode) -> None:
        nonlocal tasks, loaders
        if node.source_file in seen:
            return
        seen.add(node.source_file)

        normalized = node.source_file.replace("\\", "/")
        if _is_concrete_module(normalized, "tasks/"):
            tasks += 1
        if _is_concrete_module(normalized, "loaders/"):
            loaders += 1

        for child in node.children:
            _walk(child)

    for tree in trees:
        _walk(tree)
    return tasks, loaders


# ---------------------------------------------------------------------------
# Type-annotation helper
# ---------------------------------------------------------------------------

def _get_type_annotation(source_file: str) -> str:
    """Return the " [label]" suffix used in reports for task and loader files.

    Files outside tasks/ and loaders/ get an empty string.
    """
    normalized = source_file.replace("\\", "/")
    if normalized.startswith("tasks/"):
        return f" [{classify_task_type(normalized)}]"
    if normalized.startswith("loaders/"):
        return f" [{classify_loader_type(normalized)}]"
    return ""


# ---------------------------------------------------------------------------
# Mermaid diagram rendering
# ---------------------------------------------------------------------------

def _render_mermaid(trees: list[FlowNode]) -> str:
    """Render *trees* as a fenced Mermaid ``graph TD`` block.

    Node ids (N0, N1, ...) are assigned per node name in first-seen order.
    Each node is declared once and each edge emitted once, even when the
    same module is reachable from several entry trees.
    """
    lines: list[str] = ["```mermaid", "graph TD"]
    node_ids: dict[str, str] = {}
    declared: set[str] = set()
    seen_edges: set[tuple[str, str]] = set()

    def _node_id(name: str) -> str:
        # The f-string is evaluated before insertion, so ids are sequential.
        return node_ids.setdefault(name, f"N{len(node_ids)}")

    def _walk(node: FlowNode) -> None:
        nid = _node_id(node.name)
        if nid not in declared:
            declared.add(nid)
            label = f"{node.name}{_get_type_annotation(node.source_file)}"
            lines.append(f" {nid}[\"`{label}`\"]")

        for child in node.children:
            edge = (nid, _node_id(child.name))
            if edge not in seen_edges:
                seen_edges.add(edge)
                lines.append(f" {edge[0]} --> {edge[1]}")
            # Always descend: the same module can carry different children in
            # different trees, so its subtree may still add new edges.
            _walk(child)

    for tree in trees:
        _walk(tree)

    lines.append("```")
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Indented text-tree rendering
# ---------------------------------------------------------------------------

def _render_text_tree(trees: list[FlowNode]) -> str:
    """Render *trees* as a nested Markdown bullet list.

    Each level is indented by two spaces so that nested items actually nest
    under their parent in Markdown. A source file already expanded earlier is
    listed again but not expanded; if it has children, a marker line is added
    instead.
    """
    indent_unit = "  "
    lines: list[str] = []
    expanded: set[str] = set()

    def _walk(node: FlowNode, depth: int) -> None:
        indent = indent_unit * depth
        annotation = _get_type_annotation(node.source_file)
        lines.append(f"{indent}- `{node.name}` (`{node.source_file}`){annotation}")

        if node.source_file in expanded:
            # Already expanded elsewhere: do not recurse again.
            if node.children:
                lines.append(f"{indent}{indent_unit}- *(已展开)*")
            return
        expanded.add(node.source_file)

        for child in node.children:
            _walk(child, depth + 1)

    for tree in trees:
        _walk(tree, 0)

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# render_flow_report - Markdown flow-tree report
# ---------------------------------------------------------------------------

def render_flow_report(
    trees: list[FlowNode],
    orphans: list[str],
    repo_root: str,
) -> str:
    """Render the Markdown flow-tree report (Mermaid diagram plus text tree).

    Report layout:
    1. Header (UTC timestamp, repository path)
    2. Mermaid flow diagram
    3. Indented text tree
    4. Orphan-module list
    5. Summary table (entry points, tasks, loaders, orphans)
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    entry_count = sum(1 for tree in trees if tree.node_type == "entry")
    task_count, loader_count = _count_tasks_and_loaders(trees)

    sections: list[str] = [
        # --- header ---
        "# 项目流程树报告\n",
        f"- 生成时间: {timestamp}",
        f"- 仓库路径: `{repo_root}`\n",
        # --- Mermaid diagram ---
        "## 流程图(Mermaid)\n",
        _render_mermaid(trees),
        "",
        # --- indented text tree ---
        "## 流程树(缩进文本)\n",
        _render_text_tree(trees),
        "",
        # --- orphan modules ---
        "## 孤立模块\n",
    ]

    if orphans:
        sections.extend(f"- `{orphan}`" for orphan in orphans)
    else:
        sections.append("未发现孤立模块。")
    sections.append("")

    # --- summary table ---
    sections.extend([
        "## 统计摘要\n",
        "| 指标 | 数量 |",
        "|------|------|",
        f"| 入口点 | {entry_count} |",
        f"| 任务 | {task_count} |",
        f"| 加载器 | {loader_count} |",
        f"| 孤立模块 | {len(orphans)} |",
        "",
    ])

    return "\n".join(sections)