619 lines
20 KiB
Python
619 lines
20 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""
|
||
流程树分析器 — 通过静态分析 Python 源码的 import 语句和类继承关系,
|
||
构建从入口到末端模块的调用树。
|
||
|
||
仅执行只读操作:读取并解析 Python 源文件,不修改任何文件。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import ast
|
||
import logging
|
||
import re
|
||
import sys
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
|
||
from scripts.audit import FileEntry, FlowNode
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 项目内部包名列表(顶层目录中属于项目代码的包)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Top-level directory names that count as project-internal packages.
_PROJECT_PACKAGES: set[str] = {
    "api",
    "cli",
    "config",
    "database",
    "gui",
    "loaders",
    "models",
    "orchestration",
    "quality",
    "scd",
    "scripts",
    "tasks",
    "utils",
}
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 已知的第三方包和标准库顶层模块(用于排除非项目导入)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Top-level names of known third-party distributions; imports of these are
# never treated as project modules.
_KNOWN_THIRD_PARTY: set[str] = {
    "PyInstaller",
    "PySide6",
    "_pytest",
    "certifi",
    "charset_normalizer",
    "dateutil",
    "dotenv",
    "flask",
    "hypothesis",
    "idna",
    "openpyxl",
    "pip",
    "pkg_resources",
    "pluggy",
    "psycopg2",
    "py",
    "pyinstaller",
    "pytest",
    "python_dateutil",
    "requests",
    "setuptools",
    "shiboken6",
    "six",
    "tzdata",
    "urllib3",
    "wheel",
}
|
||
|
||
|
||
def _is_project_module(module_name: str) -> bool:
    """Return True when the module's top-level package is a project package.

    Only the segment before the first dot is looked up in
    ``_PROJECT_PACKAGES``.
    """
    top_level = module_name.partition(".")[0]
    return top_level in _PROJECT_PACKAGES
|
||
|
||
|
||
def _is_stdlib_or_third_party(module_name: str) -> bool:
    """Return True when the module belongs to the stdlib or a known third party.

    The decision is made on the top-level segment of ``module_name`` only:
    it is checked against ``_KNOWN_THIRD_PARTY`` first and then against
    ``sys.stdlib_module_names``.
    """
    top_level = module_name.partition(".")[0]
    return (
        top_level in _KNOWN_THIRD_PARTY
        or top_level in sys.stdlib_module_names
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 文件读取(多编码回退)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _read_source(filepath: Path) -> str | None:
    """Read a text file, falling back through utf-8-sig → gbk → latin-1.

    Parameters
    ----------
    filepath : Path
        File to read.

    Returns
    -------
    str | None
        The decoded file content, or None when the file cannot be read
        (an I/O error is logged as a warning).
    """
    # "utf-8-sig" decodes plain UTF-8 too, and additionally strips a leading
    # byte-order mark. With plain "utf-8" the BOM would stay in the returned
    # text as U+FEFF, which the Python parser rejects.
    for encoding in ("utf-8-sig", "gbk", "latin-1"):
        try:
            return filepath.read_text(encoding=encoding)
        except UnicodeError:
            # Wrong guess; try the next candidate encoding.
            continue
        except OSError as exc:
            # Retrying with another encoding cannot fix an I/O failure.
            logger.warning("无法读取文件 %s: %s", filepath, exc)
            return None
    # Defensive: latin-1 maps every byte, so this is not expected to run.
    logger.warning("无法以任何编码读取文件 %s", filepath)
    return None
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 路径 ↔ 模块名转换
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _path_to_module_name(rel_path: str) -> str:
    """Convert a relative file path into a dotted Python module name.

    Backslashes are treated as path separators. A trailing
    ``/__init__.py`` is dropped entirely (the package itself is named);
    otherwise a trailing ``.py`` is removed.

    Examples:
    - "cli/main.py"            → "cli.main"
    - "cli/__init__.py"        → "cli"
    - "tasks/dws/assistant.py" → "tasks.dws.assistant"
    """
    normalized = rel_path.replace("\\", "/")
    # The package suffix must be tried first: it also ends with ".py".
    for suffix in ("/__init__.py", ".py"):
        if normalized.endswith(suffix):
            normalized = normalized[: len(normalized) - len(suffix)]
            break
    return ".".join(normalized.split("/"))
|
||
|
||
|
||
def _module_to_path(module_name: str) -> str:
    """Convert a dotted module name into the relative path of its ``.py`` file.

    The result always names a plain module file; a package is NOT mapped to
    its ``__init__.py`` here.

    Examples:
    - "cli.main" → "cli/main.py"
    - "cli"      → "cli.py"
    """
    return "/".join(module_name.split(".")) + ".py"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# parse_imports — 解析 Python 文件的 import 语句
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def parse_imports(filepath: Path) -> list[str]:
    """Parse a Python file with ``ast`` and list the project modules it imports.

    - Both ``import x`` and absolute ``from x import y`` statements are
      collected, wherever they appear in the file; relative imports
      (``from . import y``) are ignored.
    - Only project-internal modules are returned (stdlib and known
      third-party packages are filtered out).
    - The result is de-duplicated, keeping first-seen order.

    Parameters
    ----------
    filepath : Path
        Python source file to analyze.

    Returns
    -------
    list[str]
        Dotted module names. Empty when the file does not exist, cannot be
        read, or cannot be parsed.
    """
    if not filepath.exists():
        return []

    source = _read_source(filepath)
    if source is None:
        return []

    try:
        tree = ast.parse(source, filename=str(filepath))
    except (SyntaxError, ValueError):
        # ValueError: the parser rejects source containing null bytes on some
        # Python versions; treat it like any other unparsable file.
        logger.warning("语法错误,无法解析 %s", filepath)
        return []

    imported: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            imported.extend(alias.name for alias in node.names)
        elif isinstance(node, ast.ImportFrom) and node.module and node.level == 0:
            imported.append(node.module)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [
        name
        for name in dict.fromkeys(imported)
        if _is_project_module(name) and not _is_stdlib_or_third_party(name)
    ]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# build_flow_tree — 从入口递归追踪 import 链,构建流程树
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def build_flow_tree(
    repo_root: Path,
    entry_file: str,
    _visited: set[str] | None = None,
) -> FlowNode:
    """Recursively follow project imports from an entry file and build a tree.

    Parameters
    ----------
    repo_root : Path
        Repository root directory.
    entry_file : str
        Path of the file to start from, relative to ``repo_root``
        (e.g. "cli/main.py").
    _visited : set[str] | None
        Internal. Relative paths already placed in the tree; shared across
        the recursion so circular imports cannot recurse forever.

    Returns
    -------
    FlowNode
        Tree rooted at ``entry_file``. The outermost call yields a node of
        type "entry"; every recursive call yields type "module".
    """
    at_root = _visited is None
    visited: set[str] = set() if at_root else _visited
    visited.add(entry_file)

    source_path = repo_root / entry_file
    imported = parse_imports(source_path) if source_path.exists() else []

    child_nodes: list[FlowNode] = []
    for dotted in imported:
        candidate = _module_to_path(dotted)
        if not (repo_root / candidate).exists():
            # No plain module file: fall back to the package's __init__.py
            # when that one exists.
            package_init = dotted.replace(".", "/") + "/__init__.py"
            if (repo_root / package_init).exists():
                candidate = package_init

        if candidate in visited:
            continue
        child_nodes.append(build_flow_tree(repo_root, candidate, visited))

    return FlowNode(
        name=_path_to_module_name(entry_file),
        source_file=entry_file,
        node_type="entry" if at_root else "module",
        children=child_nodes,
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 批处理文件解析
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _parse_bat_python_target(bat_path: Path) -> str | None:
    """Extract the target module of the first ``python -m`` call in a batch file.

    Returns the module name (e.g. "cli.main"), or None when the file is
    missing, unreadable, or contains no such command.
    """
    if not bat_path.exists():
        return None

    text = _read_source(bat_path)
    if text is None:
        return None

    # Matches "python -m module.name" and "python3 -m module.name",
    # case-insensitively.
    finder = re.compile(r"python[3]?\s+-m\s+([\w.]+)", re.IGNORECASE)
    hits = (finder.search(row) for row in text.splitlines())
    return next((hit.group(1) for hit in hits if hit), None)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 入口点识别
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def discover_entry_points(repo_root: Path) -> list[dict[str, str]]:
    """Identify the entry points of the project.

    Each result is a dict with the keys:
    - type: kind of entry point
    - file: relative path
    - description: short description

    Detection rules, in output order:
    - cli/main.py, when present            → CLI entry
    - gui/main.py, when present            → GUI entry
    - every ``*.bat`` directly in the root → batch entry; the description
      names the ``python -m`` target when one is found
    - every ``*.py`` directly under scripts/ (``__init__.py`` excluded)
      whose text contains both ``__name__`` and ``__main__`` → ops script
    """
    found: list[dict[str, str]] = []

    # Fixed, well-known entry files.
    fixed_entries = (
        ("CLI", "cli/main.py", "CLI 主入口 (`python -m cli.main`)"),
        ("GUI", "gui/main.py", "GUI 主入口 (`python -m gui.main`)"),
    )
    for kind, rel_file, text in fixed_entries:
        if (repo_root / rel_file).exists():
            found.append({"type": kind, "file": rel_file, "description": text})

    # Batch files in the repository root.
    for bat_file in sorted(repo_root.glob("*.bat")):
        module = _parse_bat_python_target(bat_file)
        text = "批处理脚本" + (f",调用 `{module}`" if module else "")
        found.append({
            "type": "批处理",
            "file": bat_file.name,
            "description": text,
        })

    # Ops scripts: the glob is non-recursive, so sub-directories of scripts/
    # are never inspected.
    scripts_dir = repo_root / "scripts"
    candidates = sorted(scripts_dir.glob("*.py")) if scripts_dir.is_dir() else []
    for script in candidates:
        if script.name == "__init__.py":
            continue
        content = _read_source(script)
        # Plain substring heuristic for an `if __name__ == "__main__"` guard.
        if not content or "__name__" not in content or "__main__" not in content:
            continue
        found.append({
            "type": "运维脚本",
            "file": script.relative_to(repo_root).as_posix(),
            "description": f"运维脚本 `{script.name}`",
        })

    return found
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 任务类型和加载器类型区分
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def classify_task_type(rel_path: str) -> str:
    """Classify a task file by its path.

    The path is lower-cased and backslashes are converted to forward slashes
    before matching. Rules are applied in this order:

    1. path contains "verification/"               → "校验任务"
    2. path contains "dws/"                        → "DWS 汇总任务"
    3. file name starts with "ods_" or "ods."      → "ODS 抓取任务"
    4. file name starts with "dwd_" or "dwd."      → "DWD 加载任务"
    5. file name starts with "dws_"                → "DWS 汇总任务"
    6. file name contains both "init" and "schema" → "Schema 初始化任务"
    7. otherwise                                   → "任务"
    """
    path = rel_path.replace("\\", "/").lower()

    # Separators are already normalized, so only "/" needs to be checked.
    if "verification/" in path:
        return "校验任务"
    if "dws/" in path:
        return "DWS 汇总任务"

    # File-name level rules; rsplit yields the whole string when no "/" exists.
    basename = path.rsplit("/", 1)[-1]
    if basename.startswith(("ods_", "ods.")):
        return "ODS 抓取任务"
    if basename.startswith(("dwd_", "dwd.")):
        return "DWD 加载任务"
    if basename.startswith("dws_"):
        return "DWS 汇总任务"
    if "init" in basename and "schema" in basename:
        return "Schema 初始化任务"
    return "任务"
|
||
|
||
|
||
def classify_loader_type(rel_path: str) -> str:
    """Classify a loader file by its path.

    The path is lower-cased and backslashes are converted to forward slashes
    before matching. The first matching rule wins:

    1. path contains "dimensions/" → "维度加载器 (SCD2)"
    2. path contains "facts/"      → "事实表加载器"
    3. path contains "ods/"        → "ODS 通用加载器"
    4. otherwise                   → "加载器"
    """
    path = rel_path.replace("\\", "/").lower()

    # Separators are already normalized, so only "/" needs to be checked.
    rules = (
        ("dimensions/", "维度加载器 (SCD2)"),
        ("facts/", "事实表加载器"),
        ("ods/", "ODS 通用加载器"),
    )
    for marker, label in rules:
        if marker in path:
            return label
    return "加载器"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# find_orphan_modules — 找出未被任何入口直接或间接引用的 Python 模块
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def find_orphan_modules(
    repo_root: Path,
    all_entries: list[FileEntry],
    reachable: set[str],
) -> list[str]:
    """List Python modules that no entry point references, directly or indirectly.

    An entry is skipped (never reported as an orphan) when it is:
    - a directory
    - not a ``.py`` file
    - an ``__init__.py`` file
    - located under ``tests/``
    - located under ``scripts/audit/`` (the audit scripts themselves)
    - not inside one of the project packages (including root-level files)

    Parameters
    ----------
    repo_root : Path
        Repository root. Accepted for interface compatibility; this function
        does not read it.
    all_entries : list[FileEntry]
        Entries to examine; ``is_dir``, ``extension`` and ``rel_path`` are read.
    reachable : set[str]
        Relative paths (forward-slash form) reachable from some entry point.

    Returns
    -------
    list[str]
        Orphan module paths in forward-slash form, sorted.
    """
    orphans: list[str] = []

    for entry in all_entries:
        if entry.is_dir or entry.extension != ".py":
            continue

        # Normalize once; every check below only needs to handle "/".
        rel = entry.rel_path.replace("\\", "/")

        if rel == "__init__.py" or rel.endswith("/__init__.py"):
            continue
        if rel.startswith(("tests/", "scripts/audit/")):
            continue

        # Root-level files have no package directory and are ignored.
        top_dir, separator, _ = rel.partition("/")
        if not separator or top_dir not in _PROJECT_PACKAGES:
            continue

        if rel not in reachable:
            orphans.append(rel)

    return sorted(orphans)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 统计辅助
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _count_nodes_by_type(trees: list[FlowNode]) -> dict[str, int]:
    """Count the nodes of every ``node_type`` across all given trees.

    The keys "entry", "module", "class" and "function" are always present
    (possibly 0); any other node type encountered gets its own key.
    """
    tally: dict[str, int] = dict.fromkeys(
        ("entry", "module", "class", "function"), 0
    )

    # Explicit stack; pushing children reversed keeps a pre-order visit.
    pending = list(trees)[::-1]
    while pending:
        current = pending.pop()
        kind = current.node_type
        tally[kind] = tally.get(kind, 0) + 1
        pending.extend(list(current.children)[::-1])

    return tally
|
||
|
||
|
||
def _count_tasks_and_loaders(trees: list[FlowNode]) -> tuple[int, int]:
    """Count distinct task modules and loader modules in the flow trees.

    A node counts as a task (loader) when its source path starts with
    ``tasks/`` (``loaders/``), does not end with ``__init__.py`` and its file
    name does not start with ``base_``. Each ``source_file`` is considered
    once; when it shows up again, that node and its subtree are skipped.

    Returns ``(task_count, loader_count)``.
    """
    totals = {"tasks/": 0, "loaders/": 0}
    visited: set[str] = set()

    # Explicit stack; pushing children reversed keeps a pre-order visit.
    stack = list(trees)[::-1]
    while stack:
        node = stack.pop()
        if node.source_file in visited:
            continue
        visited.add(node.source_file)

        path = node.source_file.replace("\\", "/")
        filename = path.rsplit("/", 1)[-1]
        countable = not path.endswith("__init__.py") and not filename.startswith("base_")
        if countable:
            for prefix in totals:
                if path.startswith(prefix):
                    totals[prefix] += 1

        stack.extend(list(node.children)[::-1])

    return totals["tasks/"], totals["loaders/"]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 类型标注辅助
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _get_type_annotation(source_file: str) -> str:
    """Return the bracketed type label used to annotate a node in the report.

    Files under ``tasks/`` get their task type, files under ``loaders/``
    their loader type, formatted as `` [label]`` with a leading space.
    Every other path yields an empty string.
    """
    path = source_file.replace("\\", "/")

    label = None
    if path.startswith("tasks/"):
        label = classify_task_type(path)
    elif path.startswith("loaders/"):
        label = classify_loader_type(path)

    return "" if label is None else f" [{label}]"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Mermaid 图生成
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _render_mermaid(trees: list[FlowNode]) -> str:
    """Render the flow trees as a fenced Mermaid ``graph TD`` block.

    Node ids are ``N0``, ``N1``, … assigned per distinct node name in
    first-seen order. Each parent → child edge is written once; a node
    declaration line is written every time the node is visited.
    """
    out: list[str] = ["```mermaid", "graph TD"]
    drawn: set[tuple[str, str]] = set()
    ids: dict[str, str] = {}

    def ident(name: str) -> str:
        # len(ids) is evaluated before insertion, so it is the next free index.
        return ids.setdefault(name, f"N{len(ids)}")

    def emit(node: FlowNode) -> None:
        parent_id = ident(node.name)
        label = f"{node.name}{_get_type_annotation(node.source_file)}"
        # Node declaration.
        out.append(f' {parent_id}["`{label}`"]')
        for child in node.children:
            link = (parent_id, ident(child.name))
            if link not in drawn:
                drawn.add(link)
                out.append(f" {link[0]} --> {link[1]}")
            emit(child)

    for tree in trees:
        emit(tree)

    out.append("```")
    return "\n".join(out)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 缩进文本树生成
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _render_text_tree(trees: list[FlowNode]) -> str:
    """Render the flow trees as an indented bullet list.

    Every node produces one line. A ``source_file`` is expanded (children
    listed) only the first time it is met; later occurrences that have
    children get a single placeholder child line instead.
    """
    out: list[str] = []
    expanded: set[str] = set()

    # Explicit stack of (node, depth); pushing reversed keeps a pre-order visit.
    stack = [(tree, 0) for tree in trees][::-1]
    while stack:
        node, depth = stack.pop()
        # NOTE(review): one space per nesting level as written here — confirm
        # whether a wider indent was intended for Markdown list nesting.
        pad = " " * depth
        annotation = _get_type_annotation(node.source_file)
        out.append(f"{pad}- `{node.name}` (`{node.source_file}`){annotation}")

        if node.source_file in expanded:
            # Already expanded elsewhere: do not descend again.
            if node.children:
                out.append(f"{pad} - *(已展开)*")
            continue
        expanded.add(node.source_file)

        stack.extend([(child, depth + 1) for child in node.children][::-1])

    return "\n".join(out)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# render_flow_report — 生成 Markdown 格式的流程树报告
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def render_flow_report(
    trees: list[FlowNode],
    orphans: list[str],
    repo_root: str,
) -> str:
    """Build the Markdown flow-tree report (Mermaid graph plus indented text).

    Report layout:
    1. Header (UTC generation timestamp, repository path)
    2. Mermaid flow chart
    3. Indented text tree
    4. Orphan module list
    5. Summary table (entry points, tasks, loaders, orphan modules)

    Parameters
    ----------
    trees : list[FlowNode]
        Flow trees to render, one per entry point.
    orphans : list[str]
        Paths of orphan modules, listed in the given order.
    repo_root : str
        Repository path shown in the header.

    Returns
    -------
    str
        The complete report as one newline-joined string.
    """
    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Figures for the summary table.
    n_entries = len([tree for tree in trees if tree.node_type == "entry"])
    n_tasks, n_loaders = _count_tasks_and_loaders(trees)
    n_orphans = len(orphans)

    if orphans:
        orphan_lines = [f"- `{path}`" for path in orphans]
    else:
        orphan_lines = ["未发现孤立模块。"]

    parts: list[str] = [
        # --- Header ---
        "# 项目流程树报告\n",
        f"- 生成时间: {generated_at}",
        f"- 仓库路径: `{repo_root}`\n",
        # --- Mermaid graph ---
        "## 流程图(Mermaid)\n",
        _render_mermaid(trees),
        "",
        # --- Indented text tree ---
        "## 流程树(缩进文本)\n",
        _render_text_tree(trees),
        "",
        # --- Orphan modules ---
        "## 孤立模块\n",
        *orphan_lines,
        "",
        # --- Summary ---
        "## 统计摘要\n",
        "| 指标 | 数量 |",
        "|------|------|",
        f"| 入口点 | {n_entries} |",
        f"| 任务 | {n_tasks} |",
        f"| 加载器 | {n_loaders} |",
        f"| 孤立模块 | {n_orphans} |",
        "",
    ]
    return "\n".join(parts)
|