在准备环境前提交一次全部更改。

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

View File

@@ -0,0 +1,618 @@
# -*- coding: utf-8 -*-
"""
流程树分析器 — 通过静态分析 Python 源码的 import 语句和类继承关系,
构建从入口到末端模块的调用树。
仅执行只读操作:读取并解析 Python 源文件,不修改任何文件。
"""
from __future__ import annotations
import ast
import logging
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from scripts.audit import FileEntry, FlowNode
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Project-internal package names (top-level directories that hold project
# code). A module is treated as project code when the first component of its
# dotted name is a member of this set.
# ---------------------------------------------------------------------------
_PROJECT_PACKAGES: set[str] = {
    "cli", "config", "api", "database", "tasks", "loaders",
    "scd", "orchestration", "quality", "models", "utils",
    "gui", "scripts",
}
# ---------------------------------------------------------------------------
# Known third-party top-level module names, used to exclude non-project
# imports. Standard-library names are not listed here; they are checked
# separately through sys.stdlib_module_names.
# ---------------------------------------------------------------------------
_KNOWN_THIRD_PARTY: set[str] = {
    "psycopg2", "requests", "dateutil", "python_dateutil",
    "dotenv", "openpyxl", "PySide6", "flask", "pyinstaller",
    "PyInstaller", "hypothesis", "pytest", "_pytest", "py",
    "pluggy", "pkg_resources", "setuptools", "pip", "wheel",
    "tzdata", "six", "certifi", "urllib3", "charset_normalizer",
    "idna", "shiboken6",
}
def _is_project_module(module_name: str) -> bool:
    """Return True when *module_name* belongs to a project package.

    Only the first dotted component is inspected: ``"tasks.dws.x"`` is looked
    up as ``"tasks"`` in ``_PROJECT_PACKAGES``.
    """
    top_level, _, _ = module_name.partition(".")
    return top_level in _PROJECT_PACKAGES
def _is_stdlib_or_third_party(module_name: str) -> bool:
    """Return True when *module_name* is a stdlib or known third-party module.

    The first dotted component is checked against ``_KNOWN_THIRD_PARTY`` first
    and then against ``sys.stdlib_module_names``.
    """
    top_level = module_name.partition(".")[0]
    return top_level in _KNOWN_THIRD_PARTY or top_level in sys.stdlib_module_names
# ---------------------------------------------------------------------------
# 文件读取(多编码回退)
# ---------------------------------------------------------------------------
def _read_source(filepath: Path) -> str | None:
    """Read a text file, falling back through several encodings.

    Encodings are tried in the order utf-8 (BOM-tolerant) -> gbk -> latin-1,
    and the first successful decode is returned.

    Parameters
    ----------
    filepath : Path
        File to read.

    Returns
    -------
    str | None
        The decoded file content, or None when the file cannot be read
        (an ``OSError`` is logged as a warning rather than raised).
    """
    # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but drops a
    # leading byte-order mark. With plain "utf-8" the BOM stays in the text as
    # U+FEFF, and ast.parse() then rejects the source as a syntax error.
    for encoding in ("utf-8-sig", "gbk", "latin-1"):
        try:
            return filepath.read_text(encoding=encoding)
        except UnicodeError:
            # Covers UnicodeDecodeError; move on to the next candidate.
            continue
        except OSError as exc:
            # Covers PermissionError, FileNotFoundError, etc.; retrying with
            # another encoding cannot help.
            logger.warning("无法读取文件 %s: %s", filepath, exc)
            return None
    # latin-1 maps every byte value, so this is only reached if the encoding
    # list above is changed; kept as a defensive fallback.
    logger.warning("无法以任何编码读取文件 %s", filepath)
    return None
# ---------------------------------------------------------------------------
# 路径 ↔ 模块名转换
# ---------------------------------------------------------------------------
def _path_to_module_name(rel_path: str) -> str:
    """Convert a relative file path into a dotted Python module name.

    Backslashes are treated as path separators. A trailing ``/__init__.py``
    is dropped so a package maps to its own name; otherwise a trailing
    ``.py`` is dropped.

    Examples:
        - "cli/main.py" -> "cli.main"
        - "cli/__init__.py" -> "cli"
        - "tasks/dws/assistant.py" -> "tasks.dws.assistant"
    """
    normalized = rel_path.replace("\\", "/")
    # The package suffix is tested first so "pkg/__init__.py" does not become
    # "pkg.__init__".
    for suffix in ("/__init__.py", ".py"):
        if normalized.endswith(suffix):
            normalized = normalized[: -len(suffix)]
            break
    return ".".join(normalized.split("/"))
def _module_to_path(module_name: str) -> str:
    """Convert a dotted module name into the relative path of its ``.py`` file.

    The result always ends in ``.py``; this function does not resolve a
    package to its ``__init__.py``.

    Examples:
        - "cli.main" -> "cli/main.py"
        - "cli" -> "cli.py"
    """
    return "/".join(module_name.split(".")) + ".py"
# ---------------------------------------------------------------------------
# parse_imports — 解析 Python 文件的 import 语句
# ---------------------------------------------------------------------------
def parse_imports(filepath: Path) -> list[str]:
    """Return the project-internal modules imported by a Python file.

    The file is parsed with ``ast`` and every ``import x`` and absolute
    ``from x import y`` statement in the tree is inspected, including ones
    nested inside functions or conditionals.

    - Only project modules are returned (stdlib and known third-party
      packages are excluded).
    - Relative imports (``from . import x``) are skipped.
    - For ``from pkg import name`` only ``pkg`` is recorded.
    - The result is de-duplicated, keeping first-seen order.

    Parameters
    ----------
    filepath : Path
        Python source file to analyse.

    Returns
    -------
    list[str]
        Dotted module names. Empty when the file does not exist, cannot be
        read, or cannot be parsed.
    """
    if not filepath.exists():
        return []
    source = _read_source(filepath)
    if source is None:
        return []
    try:
        tree = ast.parse(source, filename=str(filepath))
    except (SyntaxError, ValueError):
        # compile() raises ValueError (not SyntaxError) on some Python
        # versions when the source contains null bytes. Because the reader
        # falls back to latin-1, such content can reach this point.
        logger.warning("语法错误,无法解析 %s", filepath)
        return []

    found: list[str] = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            candidates = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom) and node.module and node.level == 0:
            candidates = [node.module]
        else:
            continue
        found.extend(
            name
            for name in candidates
            if _is_project_module(name) and not _is_stdlib_or_third_party(name)
        )
    # dict preserves insertion order, so this de-duplicates while keeping
    # the order in which modules were first seen.
    return list(dict.fromkeys(found))
# ---------------------------------------------------------------------------
# build_flow_tree — 从入口递归追踪 import 链,构建流程树
# ---------------------------------------------------------------------------
def build_flow_tree(
    repo_root: Path,
    entry_file: str,
    _visited: set[str] | None = None,
) -> FlowNode:
    """Build the import tree rooted at *entry_file* by following imports.

    Each imported project module is mapped to ``<module>.py``; if that file
    does not exist but ``<module>/__init__.py`` does, the package file is used
    instead. A file already recorded in the visited set is not added again.

    Parameters
    ----------
    repo_root : Path
        Repository root directory.
    entry_file : str
        Path of the file to start from, relative to *repo_root*
        (e.g. "cli/main.py").
    _visited : set[str] | None
        Internal. Relative paths already placed in the tree; guards against
        infinite recursion on circular imports. When omitted, this call is
        treated as the root and its node gets type "entry".

    Returns
    -------
    FlowNode
        Node for *entry_file* with its imported modules as children.
    """
    at_root = _visited is None
    visited: set[str] = set() if at_root else _visited
    # Mark before descending so that a cycle back to this file is cut off.
    visited.add(entry_file)

    source_path = repo_root / entry_file
    imported = parse_imports(source_path) if source_path.exists() else []

    subtrees: list[FlowNode] = []
    for module in imported:
        candidate = _module_to_path(module)
        if not (repo_root / candidate).exists():
            # No plain module file: fall back to the package's __init__.py.
            package_init = module.replace(".", "/") + "/__init__.py"
            if (repo_root / package_init).exists():
                candidate = package_init
        if candidate in visited:
            continue
        subtrees.append(build_flow_tree(repo_root, candidate, visited))

    return FlowNode(
        name=_path_to_module_name(entry_file),
        source_file=entry_file,
        node_type="entry" if at_root else "module",
        children=subtrees,
    )
# ---------------------------------------------------------------------------
# 批处理文件解析
# ---------------------------------------------------------------------------
def _parse_bat_python_target(bat_path: Path) -> str | None:
    """Extract the module launched with ``python -m`` from a batch file.

    The file is scanned line by line and the first line matching
    ``python -m <module>`` or ``python3 -m <module>`` (case-insensitive)
    decides the result.

    Returns the module name (e.g. "cli.main"), or None when the file does not
    exist, cannot be read, or contains no matching line.
    """
    text = _read_source(bat_path) if bat_path.exists() else None
    if text is None:
        return None
    launcher = re.compile(r"python[3]?\s+-m\s+([\w.]+)", re.IGNORECASE)
    hits = (launcher.search(row) for row in text.splitlines())
    return next((hit.group(1) for hit in hits if hit), None)
# ---------------------------------------------------------------------------
# 入口点识别
# ---------------------------------------------------------------------------
def discover_entry_points(repo_root: Path) -> list[dict[str, str]]:
    """Identify the project's entry points.

    Returns a list of dicts, each with the keys:
        - type: entry kind ("CLI", "GUI", "批处理" or "运维脚本")
        - file: path relative to *repo_root*
        - description: short human-readable description

    Detection rules, applied in this order:
        - cli/main.py -> CLI entry
        - gui/main.py -> GUI entry
        - *.bat in the repository root -> batch entry; the description names
          the ``python -m`` target when one is found
        - scripts/*.py (direct children only, ``__init__.py`` excluded) whose
          text contains both ``__name__`` and ``__main__``
    """
    found: list[dict[str, str]] = []

    # Fixed CLI / GUI entry points, present only if their main.py exists.
    for kind, package in (("CLI", "cli"), ("GUI", "gui")):
        if (repo_root / package / "main.py").exists():
            found.append({
                "type": kind,
                "file": f"{package}/main.py",
                "description": f"{kind} 主入口 (`python -m {package}.main`)",
            })

    # Batch files in the repository root.
    for bat_file in sorted(repo_root.glob("*.bat")):
        module = _parse_bat_python_target(bat_file)
        description = f"批处理脚本,调用 `{module}`" if module else "批处理脚本"
        found.append({
            "type": "批处理",
            "file": bat_file.name,
            "description": description,
        })

    # Maintenance scripts. The glob is non-recursive, so subdirectories of
    # scripts/ (such as audit/) are never scanned.
    scripts_root = repo_root / "scripts"
    if scripts_root.is_dir():
        for script in sorted(scripts_root.glob("*.py")):
            if script.name == "__init__.py":
                continue
            text = _read_source(script)
            # Substring heuristic for an `if __name__ == "__main__"` guard.
            if not (text and "__name__" in text and "__main__" in text):
                continue
            found.append({
                "type": "运维脚本",
                "file": script.relative_to(repo_root).as_posix(),
                "description": f"运维脚本 `{script.name}`",
            })

    return found
# ---------------------------------------------------------------------------
# 任务类型和加载器类型区分
# ---------------------------------------------------------------------------
def classify_task_type(rel_path: str) -> str:
    """Classify a task module by its file path.

    The path is normalised (backslashes to ``/``, lower-cased) and checked in
    this order:

    1. a directory named ``verification`` -> "校验任务"
    2. a directory named ``dws`` -> "DWS 汇总任务"
    3. file name starting with ``ods_`` / ``ods.`` -> "ODS 抓取任务"
    4. file name starting with ``dwd_`` / ``dwd.`` -> "DWD 加载任务"
    5. file name starting with ``dws_`` -> "DWS 汇总任务"
    6. file name containing both ``init`` and ``schema`` -> "Schema 初始化任务"
    7. otherwise -> "任务"

    Directory checks compare whole path segments, so a directory such as
    ``crowds/`` is not mistaken for ``dws/``.
    """
    normalized = rel_path.replace("\\", "/").lower()
    # Everything before the last "/" is directory segments; the rest is the
    # file name (empty when the path ends with "/").
    *directories, basename = normalized.split("/")

    if "verification" in directories:
        return "校验任务"
    if "dws" in directories:
        return "DWS 汇总任务"

    # File-name level rules.
    if basename.startswith(("ods_", "ods.")):
        return "ODS 抓取任务"
    if basename.startswith(("dwd_", "dwd.")):
        return "DWD 加载任务"
    if basename.startswith("dws_"):
        return "DWS 汇总任务"
    if "init" in basename and "schema" in basename:
        return "Schema 初始化任务"
    return "任务"
def classify_loader_type(rel_path: str) -> str:
    """Classify a loader module by the directories in its path.

    The path is normalised (backslashes to ``/``, lower-cased) and its
    directory segments are checked in this order:

    1. ``dimensions`` -> "维度加载器 (SCD2)"
    2. ``facts`` -> "事实表加载器"
    3. ``ods`` -> "ODS 通用加载器"
    4. otherwise -> "加载器"

    Whole segments are compared, so ``artifacts/`` is not mistaken for
    ``facts/`` and ``methods/`` is not mistaken for ``ods/``.
    """
    normalized = rel_path.replace("\\", "/").lower()
    # Drop the last component (the file name) and keep only directories.
    directories = normalized.split("/")[:-1]

    rules = (
        ("dimensions", "维度加载器 (SCD2)"),
        ("facts", "事实表加载器"),
        ("ods", "ODS 通用加载器"),
    )
    for directory, label in rules:
        if directory in directories:
            return label
    return "加载器"
# ---------------------------------------------------------------------------
# find_orphan_modules — 找出未被任何入口直接或间接引用的 Python 模块
# ---------------------------------------------------------------------------
def find_orphan_modules(
    repo_root: Path,
    all_entries: list[FileEntry],
    reachable: set[str],
) -> list[str]:
    """List project Python modules that are not in the reachable set.

    An entry is never reported as an orphan when it is:
        - a directory
        - not a ``.py`` file
        - an ``__init__.py``
        - under ``tests/``
        - under ``scripts/audit/`` (the audit scripts themselves)
        - outside the project packages (including files in the root)

    Parameters
    ----------
    repo_root : Path
        Repository root. Not used by the current implementation.
    all_entries : list[FileEntry]
        Entries to examine.
    reachable : set[str]
        Relative paths (``/``-separated) reachable from some entry point.

    Returns
    -------
    list[str]
        Sorted relative paths of the orphan modules.
    """
    excluded_prefixes = ("tests/", "scripts/audit/")
    unreferenced: list[str] = []
    for entry in all_entries:
        if entry.is_dir or entry.extension != ".py":
            continue
        rel = entry.rel_path.replace("\\", "/")
        if rel == "__init__.py" or rel.endswith("/__init__.py"):
            continue
        if rel.startswith(excluded_prefixes):
            continue
        # A path without "/" has no top-level directory and therefore cannot
        # belong to a project package.
        package, slash, _ = rel.partition("/")
        if not slash or package not in _PROJECT_PACKAGES:
            continue
        if rel not in reachable:
            unreferenced.append(rel)
    return sorted(unreferenced)
# ---------------------------------------------------------------------------
# 统计辅助
# ---------------------------------------------------------------------------
def _count_nodes_by_type(trees: list[FlowNode]) -> dict[str, int]:
    """Count the nodes of every ``node_type`` across all trees.

    The result always contains the keys "entry", "module", "class" and
    "function" (zero when absent); any other node type encountered is added
    as an extra key.
    """
    def _preorder(node):
        # Yield the node itself, then each child subtree in order.
        yield node
        for child in node.children:
            yield from _preorder(child)

    tally: dict[str, int] = dict.fromkeys(
        ("entry", "module", "class", "function"), 0
    )
    for root in trees:
        for node in _preorder(root):
            tally[node.node_type] = tally.get(node.node_type, 0) + 1
    return tally
def _count_tasks_and_loaders(trees: list[FlowNode]) -> tuple[int, int]:
    """Count distinct task and loader modules found in the trees.

    A node counts as a task when its source file is under ``tasks/`` and as a
    loader when it is under ``loaders/``. Files ending in ``__init__.py`` or
    whose file name starts with ``base_`` are not counted. Each source file
    is processed once; when it is seen again, that node and its subtree are
    skipped.

    Returns ``(task_count, loader_count)``.
    """
    visited: set[str] = set()
    task_total = 0
    loader_total = 0

    # Explicit stack; children are pushed reversed so nodes are popped in the
    # same pre-order as a recursive walk.
    stack = list(trees)[::-1]
    while stack:
        node = stack.pop()
        if node.source_file in visited:
            continue
        visited.add(node.source_file)

        path = node.source_file.replace("\\", "/")
        file_name = path.rsplit("/", 1)[-1]
        countable = not path.endswith("__init__.py") and not file_name.startswith("base_")
        if countable:
            if path.startswith("tasks/"):
                task_total += 1
            if path.startswith("loaders/"):
                loader_total += 1

        stack.extend(list(node.children)[::-1])

    return task_total, loader_total
# ---------------------------------------------------------------------------
# 类型标注辅助
# ---------------------------------------------------------------------------
def _get_type_annotation(source_file: str) -> str:
    """Return the type label appended to a node in the report.

    Files under ``tasks/`` get `` [<task type>]`` and files under ``loaders/``
    get `` [<loader type>]``; every other file gets an empty string.
    """
    path = source_file.replace("\\", "/")
    classifiers = (
        ("tasks/", classify_task_type),
        ("loaders/", classify_loader_type),
    )
    for prefix, classify in classifiers:
        if path.startswith(prefix):
            return f" [{classify(path)}]"
    return ""
# ---------------------------------------------------------------------------
# Mermaid 图生成
# ---------------------------------------------------------------------------
def _render_mermaid(trees: list[FlowNode]) -> str:
    """Render the trees as a fenced Mermaid ``graph TD`` block.

    Node ids (``N0``, ``N1``, ...) are assigned per node *name* in first-seen
    order. Every visited node emits a declaration line with its label; each
    distinct parent/child edge is emitted once.
    """
    output: list[str] = ["```mermaid", "graph TD"]
    ids_by_name: dict[str, str] = {}
    drawn_edges: set[tuple[str, str]] = set()

    def _ident(name: str) -> str:
        # The next id number equals the number of ids handed out so far.
        return ids_by_name.setdefault(name, f"N{len(ids_by_name)}")

    def _emit(node: FlowNode) -> None:
        parent_id = _ident(node.name)
        label = f"{node.name}{_get_type_annotation(node.source_file)}"
        # Node declaration.
        output.append(f' {parent_id}["`{label}`"]')
        for child in node.children:
            child_id = _ident(child.name)
            if (parent_id, child_id) not in drawn_edges:
                drawn_edges.add((parent_id, child_id))
                output.append(f" {parent_id} --> {child_id}")
            # NOTE(review): the child is assumed to be walked even when its
            # edge was already drawn — confirm against the original layout.
            _emit(child)

    for root in trees:
        _emit(root)
    output.append("```")
    return "\n".join(output)
# ---------------------------------------------------------------------------
# 缩进文本树生成
# ---------------------------------------------------------------------------
def _render_text_tree(trees: list[FlowNode]) -> str:
    """Render the trees as an indented Markdown bullet list.

    Each node produces one bullet holding its name, source file and type
    label. A source file is expanded only the first time it is met; a later
    occurrence that has children gets a single "already expanded" marker
    bullet instead of its subtree.
    """
    output: list[str] = []
    expanded: set[str] = set()

    def _emit(node: FlowNode, depth: int) -> None:
        pad = " " * depth
        suffix = _get_type_annotation(node.source_file)
        output.append(f"{pad}- `{node.name}` (`{node.source_file}`){suffix}")
        if node.source_file not in expanded:
            expanded.add(node.source_file)
            for child in node.children:
                _emit(child, depth + 1)
        elif node.children:
            # Already expanded elsewhere: do not recurse again.
            output.append(f"{pad} - *(已展开)*")

    for root in trees:
        _emit(root, 0)
    return "\n".join(output)
# ---------------------------------------------------------------------------
# render_flow_report — 生成 Markdown 格式的流程树报告
# ---------------------------------------------------------------------------
def render_flow_report(
    trees: list[FlowNode],
    orphans: list[str],
    repo_root: str,
) -> str:
    """Build the Markdown flow-tree report.

    Report layout:
        1. Header (UTC generation timestamp, repository path)
        2. Mermaid flow chart
        3. Indented text tree
        4. Orphan module list (or a "none found" line)
        5. Summary table (entry points, tasks, loaders, orphans)

    Parameters
    ----------
    trees : list[FlowNode]
        One flow tree per entry point.
    orphans : list[str]
        Relative paths of the orphan modules.
    repo_root : str
        Repository path shown in the header.

    Returns
    -------
    str
        The complete report, with sections joined by newlines.
    """
    generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if orphans:
        orphan_lines = [f"- `{path}`" for path in orphans]
    else:
        orphan_lines = ["未发现孤立模块。"]

    # Only top-level trees are inspected for the entry-point count.
    entry_total = len([tree for tree in trees if tree.node_type == "entry"])
    task_total, loader_total = _count_tasks_and_loaders(trees)

    parts: list[str] = [
        # --- header ---
        "# 项目流程树报告\n",
        f"- 生成时间: {generated_at}",
        f"- 仓库路径: `{repo_root}`\n",
        # --- Mermaid chart ---
        "## 流程图Mermaid\n",
        _render_mermaid(trees),
        "",
        # --- indented text tree ---
        "## 流程树(缩进文本)\n",
        _render_text_tree(trees),
        "",
        # --- orphan modules ---
        "## 孤立模块\n",
        *orphan_lines,
        "",
        # --- summary ---
        "## 统计摘要\n",
        "| 指标 | 数量 |",
        "|------|------|",
        f"| 入口点 | {entry_total} |",
        f"| 任务 | {task_total} |",
        f"| 加载器 | {loader_total} |",
        f"| 孤立模块 | {len(orphans)} |",
        "",
    ]
    return "\n".join(parts)