在准备环境前提交一次全部更改。

2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions
--- a/apps/etl/connectors/feiqiu/scripts/audit/flow_analyzer.py
+++ b/apps/etl/connectors/feiqiu/scripts/audit/flow_analyzer.py
@@ -0,0 +1,618 @@
+# -*- coding: utf-8 -*-
+"""
+流程树分析器 — 通过静态分析 Python 源码的 import 语句和类继承关系，
+构建从入口到末端模块的调用树。
+
+仅执行只读操作：读取并解析 Python 源文件，不修改任何文件。
+"""
+
+from __future__ import annotations
+
+import ast
+import logging
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+from scripts.audit import FileEntry, FlowNode
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Top-level package names that count as project code. Only the first dotted
+# component of a module name (or the first directory of a relative path) is
+# compared against this set.
+# ---------------------------------------------------------------------------
+
+_PROJECT_PACKAGES: set[str] = {
+    "cli", "config", "api", "database", "tasks", "loaders",
+    "scd", "orchestration", "quality", "models", "utils",
+    "gui", "scripts",
+}
+
+# ---------------------------------------------------------------------------
+# Known third-party top-level modules, used to exclude non-project imports.
+# Standard-library modules are not listed here; they are looked up in
+# sys.stdlib_module_names at call time.
+# ---------------------------------------------------------------------------
+
+_KNOWN_THIRD_PARTY: set[str] = {
+    "psycopg2", "requests", "dateutil", "python_dateutil",
+    "dotenv", "openpyxl", "PySide6", "flask", "pyinstaller",
+    "PyInstaller", "hypothesis", "pytest", "_pytest", "py",
+    "pluggy", "pkg_resources", "setuptools", "pip", "wheel",
+    "tzdata", "six", "certifi", "urllib3", "charset_normalizer",
+    "idna", "shiboken6",
+}
+
+
+def _is_project_module(module_name: str) -> bool:
+    """Return True when the module's top-level package is a project package."""
+    top_level, _, _ = module_name.partition(".")
+    return top_level in _PROJECT_PACKAGES
+
+
+def _is_stdlib_or_third_party(module_name: str) -> bool:
+    """Return True when the module belongs to the stdlib or a known third-party package.
+
+    Only the first dotted component of ``module_name`` is inspected.
+    """
+    top_level = module_name.partition(".")[0]
+    return (
+        top_level in _KNOWN_THIRD_PARTY
+        or top_level in sys.stdlib_module_names
+    )
+
+
+# ---------------------------------------------------------------------------
+# 文件读取（多编码回退）
+# ---------------------------------------------------------------------------
+
+def _read_source(filepath: Path) -> str | None:
+    """Read a text file, trying utf-8, then gbk, then latin-1.
+
+    The file is read from disk once and decoded in memory, so a failed
+    decoding attempt does not trigger another read.
+
+    Returns the decoded content with line endings normalised to ``\\n``,
+    or ``None`` (after logging a warning) when the file cannot be read.
+    """
+    try:
+        raw = filepath.read_bytes()
+    except OSError as exc:  # PermissionError is a subclass of OSError
+        logger.warning("无法读取文件 %s: %s", filepath, exc)
+        return None
+
+    for encoding in ("utf-8", "gbk", "latin-1"):
+        try:
+            text = raw.decode(encoding)
+        except UnicodeError:  # covers UnicodeDecodeError
+            continue
+        # Mirror the universal-newline translation of text-mode reads.
+        return text.replace("\r\n", "\n").replace("\r", "\n")
+
+    # latin-1 maps every byte, so this is only a defensive fallback.
+    logger.warning("无法以任何编码读取文件 %s", filepath)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# 路径 ↔ 模块名转换
+# ---------------------------------------------------------------------------
+
+def _path_to_module_name(rel_path: str) -> str:
+    """Convert a relative file path into a dotted Python module name.
+
+    Backslashes are treated as path separators. Examples:
+    - "cli/main.py" → "cli.main"
+    - "cli/__init__.py" → "cli"
+    - "tasks/dws/assistant.py" → "tasks.dws.assistant"
+    """
+    normalized = rel_path.replace("\\", "/")
+    init_suffix = "/__init__.py"
+    if normalized.endswith(init_suffix):
+        # A package's __init__ file stands for the package itself.
+        stem = normalized[: -len(init_suffix)]
+    elif normalized.endswith(".py"):
+        stem = normalized[: -len(".py")]
+    else:
+        stem = normalized
+    return ".".join(stem.split("/"))
+
+
+def _module_to_path(module_name: str) -> str:
+    """Convert a dotted module name into the relative path of its ``.py`` file.
+
+    This always yields the plain-module form; it never returns a package's
+    ``__init__.py`` path. Examples:
+    - "cli.main" → "cli/main.py"
+    - "cli" → "cli.py"
+    """
+    return "/".join(module_name.split(".")) + ".py"
+
+
+# ---------------------------------------------------------------------------
+# parse_imports — 解析 Python 文件的 import 语句
+# ---------------------------------------------------------------------------
+
+def parse_imports(filepath: Path) -> list[str]:
+    """Parse a Python file with ``ast`` and list the project modules it imports.
+
+    - Only project-internal modules are returned (stdlib and known
+      third-party packages are excluded).
+    - ``import a.b`` contributes ``a.b``; ``from a.b import c`` contributes
+      ``a.b``. Relative imports (``from . import x``) are ignored.
+    - The result is de-duplicated and keeps first-seen order.
+    - Returns an empty list when the file is missing, unreadable, or cannot
+      be parsed.
+    """
+    if not filepath.exists():
+        return []
+
+    source = _read_source(filepath)
+    if source is None:
+        return []
+
+    try:
+        tree = ast.parse(source, filename=str(filepath))
+    except (SyntaxError, ValueError):
+        # Before Python 3.12, a source string containing NUL bytes raises
+        # ValueError instead of SyntaxError; treat both as "unparsable".
+        logger.warning("语法错误，无法解析 %s", filepath)
+        return []
+
+    candidates: list[str] = []
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            candidates.extend(alias.name for alias in node.names)
+        elif isinstance(node, ast.ImportFrom) and node.module and node.level == 0:
+            candidates.append(node.module)
+
+    local_modules = (
+        name
+        for name in candidates
+        if _is_project_module(name) and not _is_stdlib_or_third_party(name)
+    )
+    # dict.fromkeys de-duplicates while preserving insertion order.
+    return list(dict.fromkeys(local_modules))
+
+
+# ---------------------------------------------------------------------------
+# build_flow_tree — 从入口递归追踪 import 链，构建流程树
+# ---------------------------------------------------------------------------
+
+def build_flow_tree(
+    repo_root: Path,
+    entry_file: str,
+    _visited: set[str] | None = None,
+) -> FlowNode:
+    """Build a flow tree by recursively following imports from an entry file.
+
+    Parameters
+    ----------
+    repo_root : Path
+        Repository root directory.
+    entry_file : str
+        Path of the entry file relative to ``repo_root`` (e.g. "cli/main.py").
+    _visited : set[str] | None
+        Internal use only. Relative paths already placed in the tree; shared
+        across the whole recursion so circular imports terminate and each
+        file appears at most once.
+
+    Returns
+    -------
+    FlowNode
+        Tree rooted at the entry file. The root has node type "entry";
+        every other node has node type "module".
+    """
+    is_root = _visited is None
+    visited: set[str] = set() if is_root else _visited
+    visited.add(entry_file)
+
+    source_path = repo_root / entry_file
+    imported = parse_imports(source_path) if source_path.exists() else []
+
+    children: list[FlowNode] = []
+    for mod in imported:
+        module_file = _module_to_path(mod)
+        if not (repo_root / module_file).exists():
+            # No plain module file: fall back to the package's __init__.py
+            # when that exists; otherwise keep the plain ".py" path.
+            package_init = mod.replace(".", "/") + "/__init__.py"
+            if (repo_root / package_init).exists():
+                module_file = package_init
+
+        if module_file in visited:
+            continue
+        children.append(build_flow_tree(repo_root, module_file, visited))
+
+    return FlowNode(
+        name=_path_to_module_name(entry_file),
+        source_file=entry_file,
+        node_type="entry" if is_root else "module",
+        children=children,
+    )
+
+
+# ---------------------------------------------------------------------------
+# 批处理文件解析
+# ---------------------------------------------------------------------------
+
+def _parse_bat_python_target(bat_path: Path) -> str | None:
+    """Extract the target module of the first ``python -m`` command in a batch file.
+
+    Lines that are batch comments (``REM ...``, ``@REM ...`` or ``:: ...``)
+    are skipped so that a commented-out command is not reported as the
+    script's target.
+
+    Returns the module name (e.g. "cli.main"), or ``None`` when the file is
+    missing, unreadable, or contains no such command.
+    """
+    if not bat_path.exists():
+        return None
+
+    content = _read_source(bat_path)
+    if content is None:
+        return None
+
+    # Matches "python -m module.name" or "python3 -m module.name".
+    command = re.compile(r"python[3]?\s+-m\s+([\w.]+)", re.IGNORECASE)
+    # Matches a line that is entirely a batch comment.
+    comment = re.compile(r"^\s*@?\s*(?:rem(?:\s|$)|::)", re.IGNORECASE)
+
+    for line in content.splitlines():
+        if comment.match(line):
+            continue
+        found = command.search(line)
+        if found:
+            return found.group(1)
+    return None
+
+
+# ---------------------------------------------------------------------------
+# 入口点识别
+# ---------------------------------------------------------------------------
+
+def discover_entry_points(repo_root: Path) -> list[dict[str, str]]:
+    """Identify the project's entry points.
+
+    Returns a list of dicts, each with:
+    - type: entry kind (CLI / GUI / 批处理 / 运维脚本)
+    - file: path relative to ``repo_root``
+    - description: short description
+
+    Detection rules, in output order:
+    - cli/main.py → CLI entry (if the file exists)
+    - gui/main.py → GUI entry (if the file exists)
+    - every *.bat directly under the root, annotated with its
+      ``python -m`` target when one is found
+    - every scripts/*.py (non-recursive, excluding __init__.py) whose text
+      contains both "__name__" and "__main__"
+    """
+    found: list[dict[str, str]] = []
+
+    # Fixed CLI / GUI entry files.
+    fixed_entries = (
+        ("CLI", "cli/main.py", "CLI 主入口 (`python -m cli.main`)"),
+        ("GUI", "gui/main.py", "GUI 主入口 (`python -m gui.main`)"),
+    )
+    for kind, rel_file, summary in fixed_entries:
+        if (repo_root / rel_file).exists():
+            found.append({"type": kind, "file": rel_file, "description": summary})
+
+    # Batch files in the repository root.
+    for bat in sorted(repo_root.glob("*.bat")):
+        target = _parse_bat_python_target(bat)
+        summary = f"批处理脚本，调用 `{target}`" if target else "批处理脚本"
+        found.append({"type": "批处理", "file": bat.name, "description": summary})
+
+    # Maintenance scripts: the glob is non-recursive, so sub-directories
+    # such as scripts/audit/ are never visited.
+    scripts_dir = repo_root / "scripts"
+    script_files = sorted(scripts_dir.glob("*.py")) if scripts_dir.is_dir() else []
+    for py_file in script_files:
+        if py_file.name == "__init__.py":
+            continue
+        source = _read_source(py_file)
+        # Loose textual check for an ``if __name__ == "__main__"`` guard.
+        if not source or "__name__" not in source or "__main__" not in source:
+            continue
+        found.append({
+            "type": "运维脚本",
+            "file": py_file.relative_to(repo_root).as_posix(),
+            "description": f"运维脚本 `{py_file.name}`",
+        })
+
+    return found
+
+
+# ---------------------------------------------------------------------------
+# 任务类型和加载器类型区分
+# ---------------------------------------------------------------------------
+
+def classify_task_type(rel_path: str) -> str:
+    """Classify a task file by its path.
+
+    Directory rules are matched against whole path segments
+    (case-insensitive), so a directory such as ``preverification/`` is not
+    mistaken for ``verification/``. Rules are applied in this order:
+
+    - a ``verification`` directory → "校验任务"
+    - a ``dws`` directory → "DWS 汇总任务"
+    - file name starts with ``ods_`` / ``ods.`` → "ODS 抓取任务"
+    - file name starts with ``dwd_`` / ``dwd.`` → "DWD 加载任务"
+    - file name starts with ``dws_`` → "DWS 汇总任务"
+    - file name contains both ``init`` and ``schema`` → "Schema 初始化任务"
+    - otherwise → "任务"
+    """
+    segments = rel_path.replace("\\", "/").lower().split("/")
+    directories, basename = segments[:-1], segments[-1]
+
+    if "verification" in directories:
+        return "校验任务"
+    if "dws" in directories:
+        return "DWS 汇总任务"
+
+    # File-name level rules.
+    if basename.startswith(("ods_", "ods.")):
+        return "ODS 抓取任务"
+    if basename.startswith(("dwd_", "dwd.")):
+        return "DWD 加载任务"
+    if basename.startswith("dws_"):
+        return "DWS 汇总任务"
+    if "init" in basename and "schema" in basename:
+        return "Schema 初始化任务"
+    return "任务"
+
+
+def classify_loader_type(rel_path: str) -> str:
+    """Classify a loader file by the directories in its path.
+
+    Directory names are matched as whole path segments (case-insensitive),
+    so e.g. ``methods/`` is not mistaken for ``ods/`` and ``artifacts/`` is
+    not mistaken for ``facts/``. Rules are applied in this order:
+
+    - a ``dimensions`` directory → "维度加载器 (SCD2)"
+    - a ``facts`` directory → "事实表加载器"
+    - an ``ods`` directory → "ODS 通用加载器"
+    - otherwise → "加载器"
+    """
+    # Everything before the last "/" is a directory segment.
+    directories = rel_path.replace("\\", "/").lower().split("/")[:-1]
+
+    if "dimensions" in directories:
+        return "维度加载器 (SCD2)"
+    if "facts" in directories:
+        return "事实表加载器"
+    if "ods" in directories:
+        return "ODS 通用加载器"
+    return "加载器"
+
+
+# ---------------------------------------------------------------------------
+# find_orphan_modules — 找出未被任何入口直接或间接引用的 Python 模块
+# ---------------------------------------------------------------------------
+
+def find_orphan_modules(
+    repo_root: Path,
+    all_entries: list[FileEntry],
+    reachable: set[str],
+) -> list[str]:
+    """Return project modules that are absent from the ``reachable`` set.
+
+    ``repo_root`` is accepted but not used by this function.
+
+    Entries that are never reported as orphans:
+    - directory entries
+    - non-``.py`` files
+    - ``__init__.py`` files
+    - files under ``tests/``
+    - files under ``scripts/audit/`` (the audit scripts themselves)
+    - files whose top-level directory is not a project package
+      (this includes files located directly in the root)
+
+    The result holds "/"-separated relative paths, sorted.
+    """
+    orphans: list[str] = []
+
+    for entry in all_entries:
+        if entry.is_dir or entry.extension != ".py":
+            continue
+
+        rel = entry.rel_path.replace("\\", "/")
+
+        if rel == "__init__.py" or rel.endswith("/__init__.py"):
+            continue
+        if rel.startswith(("tests/", "tests\\", "scripts/audit/", "scripts\\audit\\")):
+            continue
+
+        # A path without "/" has no top-level directory and is skipped.
+        top_dir, slash, _ = rel.partition("/")
+        if not slash or top_dir not in _PROJECT_PACKAGES:
+            continue
+
+        if rel not in reachable:
+            orphans.append(rel)
+
+    return sorted(orphans)
+
+
+# ---------------------------------------------------------------------------
+# 统计辅助
+# ---------------------------------------------------------------------------
+
+def _count_nodes_by_type(trees: list[FlowNode]) -> dict[str, int]:
+    """Count the nodes of each ``node_type`` across all flow trees.
+
+    The result always contains the keys "entry", "module", "class" and
+    "function"; any other node type encountered is added as it is seen.
+    """
+    tally: dict[str, int] = dict.fromkeys(("entry", "module", "class", "function"), 0)
+
+    # Explicit-stack pre-order walk; children are pushed reversed so nodes
+    # are visited in the same order as a recursive descent.
+    pending = list(trees)[::-1]
+    while pending:
+        node = pending.pop()
+        kind = node.node_type
+        tally[kind] = tally.get(kind, 0) + 1
+        pending.extend(list(node.children)[::-1])
+
+    return tally
+
+
+def _count_tasks_and_loaders(trees: list[FlowNode]) -> tuple[int, int]:
+    """Count distinct task modules and loader modules in the flow trees.
+
+    A file counts as a task (loader) when its path starts with ``tasks/``
+    (``loaders/``), does not end with ``__init__.py`` and its file name does
+    not start with ``base_``. Each source file is considered once; the
+    subtree below a repeated file is not walked again.
+
+    Returns ``(task_count, loader_count)``.
+    """
+    def _qualifies(path: str, prefix: str) -> bool:
+        if not path.startswith(prefix) or path.endswith("__init__.py"):
+            return False
+        return not path.rsplit("/", 1)[-1].startswith("base_")
+
+    task_total = 0
+    loader_total = 0
+    visited: set[str] = set()
+
+    # Explicit-stack pre-order walk (children pushed reversed to keep the
+    # same visiting order as a recursive descent).
+    pending = list(trees)[::-1]
+    while pending:
+        node = pending.pop()
+        if node.source_file in visited:
+            continue
+        visited.add(node.source_file)
+
+        path = node.source_file.replace("\\", "/")
+        if _qualifies(path, "tasks/"):
+            task_total += 1
+        if _qualifies(path, "loaders/"):
+            loader_total += 1
+
+        pending.extend(list(node.children)[::-1])
+
+    return task_total, loader_total
+
+
+# ---------------------------------------------------------------------------
+# 类型标注辅助
+# ---------------------------------------------------------------------------
+
+def _get_type_annotation(source_file: str) -> str:
+    """Return the bracketed type label used to annotate a node in the report.
+
+    Files under ``tasks/`` get their task type, files under ``loaders/`` get
+    their loader type, and every other file gets an empty string. A
+    non-empty result has the form " [label]" (with a leading space).
+    """
+    path = source_file.replace("\\", "/")
+    if path.startswith("tasks/"):
+        label = classify_task_type(path)
+    elif path.startswith("loaders/"):
+        label = classify_loader_type(path)
+    else:
+        return ""
+    return f" [{label}]"
+
+
+# ---------------------------------------------------------------------------
+# Mermaid 图生成
+# ---------------------------------------------------------------------------
+
+def _render_mermaid(trees: list[FlowNode]) -> str:
+    """Render the flow trees as a fenced Mermaid ``graph TD`` block.
+
+    Node ids (``N0``, ``N1``, ...) are assigned per node name in first-seen
+    order. Each edge is emitted once; a node declaration line is emitted
+    every time the node is visited.
+    """
+    out: list[str] = ["```mermaid", "graph TD"]
+    emitted_edges: set[tuple[str, str]] = set()
+    ids_by_name: dict[str, str] = {}
+
+    def _id_for(name: str) -> str:
+        # The next id number equals the number of names registered so far.
+        return ids_by_name.setdefault(name, f"N{len(ids_by_name)}")
+
+    def _emit(node: FlowNode) -> None:
+        parent_id = _id_for(node.name)
+        label = node.name + _get_type_annotation(node.source_file)
+        # Node declaration, with the label as a Mermaid markdown string.
+        out.append(f'    {parent_id}["`{label}`"]')
+        for child in node.children:
+            edge = (parent_id, _id_for(child.name))
+            if edge not in emitted_edges:
+                emitted_edges.add(edge)
+                out.append(f"    {edge[0]} --> {edge[1]}")
+            _emit(child)
+
+    for root in trees:
+        _emit(root)
+
+    out.append("```")
+    return "\n".join(out)
+
+
+# ---------------------------------------------------------------------------
+# 缩进文本树生成
+# ---------------------------------------------------------------------------
+
+def _render_text_tree(trees: list[FlowNode]) -> str:
+    """Render the flow trees as an indented Markdown bullet list.
+
+    Each node becomes one bullet, indented two spaces per depth level. A
+    source file's children are expanded only the first time the file is
+    seen; later occurrences that have children get an "*(已展开)*" marker
+    instead.
+    """
+    out: list[str] = []
+    expanded: set[str] = set()
+
+    def _emit(node: FlowNode, depth: int) -> None:
+        pad = "  " * depth
+        suffix = _get_type_annotation(node.source_file)
+        out.append(f"{pad}- `{node.name}` (`{node.source_file}`){suffix}")
+
+        if node.source_file not in expanded:
+            expanded.add(node.source_file)
+            for child in node.children:
+                _emit(child, depth + 1)
+        elif node.children:
+            # Already expanded elsewhere: do not recurse again.
+            out.append(f"{pad}  - *(已展开)*")
+
+    for root in trees:
+        _emit(root, 0)
+
+    return "\n".join(out)
+
+
+# ---------------------------------------------------------------------------
+# render_flow_report — 生成 Markdown 格式的流程树报告
+# ---------------------------------------------------------------------------
+
+def render_flow_report(
+    trees: list[FlowNode],
+    orphans: list[str],
+    repo_root: str,
+) -> str:
+    """Render the Markdown flow-tree report (Mermaid graph plus indented text).
+
+    Report layout:
+    1. Header (UTC generation timestamp, repository path)
+    2. Mermaid flow chart
+    3. Indented text tree
+    4. Orphan module list
+    5. Summary table (entry points, tasks, loaders, orphans)
+    """
+    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # --- Header, Mermaid graph, text tree ---
+    sections: list[str] = [
+        "# 项目流程树报告\n",
+        f"- 生成时间: {timestamp}",
+        f"- 仓库路径: `{repo_root}`\n",
+        "## 流程图（Mermaid）\n",
+        _render_mermaid(trees),
+        "",
+        "## 流程树（缩进文本）\n",
+        _render_text_tree(trees),
+        "",
+        "## 孤立模块\n",
+    ]
+
+    # --- Orphan modules ---
+    if orphans:
+        sections.extend(f"- `{path}`" for path in orphans)
+    else:
+        sections.append("未发现孤立模块。")
+    sections.append("")
+
+    # --- Summary table ---
+    entry_count = sum(1 for tree in trees if tree.node_type == "entry")
+    task_count, loader_count = _count_tasks_and_loaders(trees)
+    summary_rows = (
+        ("入口点", entry_count),
+        ("任务", task_count),
+        ("加载器", loader_count),
+        ("孤立模块", len(orphans)),
+    )
+
+    sections.append("## 统计摘要\n")
+    sections.append("| 指标 | 数量 |")
+    sections.append("|------|------|")
+    sections.extend(f"| {label} | {value} |" for label, value in summary_rows)
+    sections.append("")
+
+    return "\n".join(sections)