Files
Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/audit/doc_alignment_analyzer.py

609 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
文档对齐分析器 — 检查文档与代码之间的映射关系、过期点、冲突点和缺失点。
文档来源:
- docs/ 目录(.md, .txt, .csv, .json)
- 根目录 README.md
- 各模块内的 README.md
- .kiro/steering/ 引导文件
- docs/test-json-doc/ API 响应样本
"""
from __future__ import annotations
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from scripts.audit import AlignmentIssue, DocMapping
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Documentation file extensions (.json is handled separately, only for the
# docs/test-json-doc API sample directory).
_DOC_EXTENSIONS = {".md", ".txt", ".csv"}
# Core code directories -- their modules should be reported when undocumented.
_CORE_CODE_DIRS = {
    "tasks",
    "loaders",
    "orchestration",
    "quality",
    "models",
    "utils",
    "api",
    "scd",
    "config",
    "database",
}
# Common metadata columns of ODS tables, meant to be ignored when comparing.
# NOTE(review): this set is not referenced anywhere in the visible portion of
# this file -- confirm whether the API-sample comparison should consult it.
_ODS_META_COLUMNS = {"content_hash", "payload", "created_at", "updated_at", "id"}
# SQL keywords excluded when parsing column names out of DDL bodies.
_SQL_KEYWORDS = {
    "primary", "key", "not", "null", "default", "unique", "check",
    "references", "foreign", "constraint", "index", "create", "table",
    "if", "exists", "serial", "bigserial", "true", "false",
}
# ---------------------------------------------------------------------------
# 安全读取文件(编码回退)
# ---------------------------------------------------------------------------
def _safe_read(path: Path) -> str:
"""尝试以 utf-8 → gbk → latin-1 回退读取文件内容。"""
for enc in ("utf-8", "gbk", "latin-1"):
try:
return path.read_text(encoding=enc)
except (UnicodeDecodeError, UnicodeError):
continue
return ""
# ---------------------------------------------------------------------------
# scan_docs -- scan all documentation sources
# ---------------------------------------------------------------------------
def scan_docs(repo_root: Path) -> list[str]:
    """Collect every documentation file, returned as a sorted list of
    repo-relative forward-slash paths (duplicates removed).

    Sources scanned:
      1. docs/ recursively (.md, .txt, .csv, plus .json API samples)
      2. the repository-level README.md
      3. README.md inside each first-level module directory
      4. files directly under .kiro/steering/
    """
    found: set[str] = set()

    def relative(path: Path) -> str:
        # Normalise to forward slashes so reports are OS-independent.
        return str(path.relative_to(repo_root)).replace("\\", "/")

    # 1. docs/ tree (recursive; .json included for test-json-doc samples).
    docs_root = repo_root / "docs"
    if docs_root.is_dir():
        wanted = _DOC_EXTENSIONS | {".json"}
        found.update(
            relative(f)
            for f in docs_root.rglob("*")
            if f.is_file() and f.suffix.lower() in wanted
        )

    # 2. Repository-level README.
    if (repo_root / "README.md").is_file():
        found.add("README.md")

    # 3. Per-module READMEs (docs/ and .kiro are covered elsewhere).
    for entry in repo_root.iterdir():
        if not entry.is_dir() or entry.name in ("docs", ".kiro"):
            continue
        module_readme = entry / "README.md"
        if module_readme.is_file():
            found.add(relative(module_readme))

    # 4. Steering documents.
    steering = repo_root / ".kiro" / "steering"
    if steering.is_dir():
        found.update(relative(f) for f in steering.iterdir() if f.is_file())

    return sorted(found)
# ---------------------------------------------------------------------------
# extract_code_references -- pull code references out of a document
# ---------------------------------------------------------------------------
def extract_code_references(doc_path: Path) -> list[str]:
    """Extract code references (backtick-quoted file paths, class names,
    function names, ...) from a document.

    Rules:
      - only backtick-delimited spans are considered
      - single-character spans are dropped
      - purely numeric / version-like spans (digits and dots) are dropped
      - backslashes are normalised to forward slashes
      - duplicates are removed, first occurrence order preserved
    """
    if not doc_path.is_file():
        return []
    text = _safe_read(doc_path)
    if not text:
        return []

    # dict preserves insertion order, giving cheap ordered de-duplication.
    ordered: dict[str, None] = {}
    for span in re.findall(r"`([^`]+)`", text):
        candidate = span.strip().replace("\\", "/")
        if len(candidate) <= 1:
            continue  # single characters are noise, not references
        if re.fullmatch(r"[\d.]+", candidate):
            continue  # bare numbers / version strings
        ordered.setdefault(candidate, None)
    return list(ordered)
# ---------------------------------------------------------------------------
# check_reference_validity -- does a documented reference still resolve?
# ---------------------------------------------------------------------------
def check_reference_validity(ref: str, repo_root: Path) -> bool:
    """Return True when a code reference from the docs still resolves.

    Resolution order:
      1. the reference taken literally as a path under the repo root
      2. the reference with a legacy package prefix stripped
      3. a dotted module path (no slashes) converted to a .py file or a
         package directory, e.g. config.settings -> config/settings.py
    """
    # 1. Literal file or directory path.
    if (repo_root / ref).exists():
        return True

    # 2. Historical package prefixes kept for backwards compatibility.
    for legacy in ("FQ-ETL/", "etl_billiards/"):
        if ref.startswith(legacy) and (repo_root / ref[len(legacy):]).exists():
            return True

    # 3. Dotted module notation.
    if "." in ref and "/" not in ref:
        dotted = ref.replace(".", "/")
        if (repo_root / (dotted + ".py")).exists():
            return True
        if (repo_root / dotted).is_dir():  # might be a package
            return True

    return False
# ---------------------------------------------------------------------------
# find_undocumented_modules -- core code files no document mentions
# ---------------------------------------------------------------------------
def find_undocumented_modules(
    repo_root: Path,
    documented: set[str],
) -> list[str]:
    """List core-module .py files missing from *documented*.

    Only files under _CORE_CODE_DIRS are inspected and __init__.py is
    skipped.  Returns sorted repo-relative forward-slash paths.
    """
    missing: list[str] = []
    for dir_name in sorted(_CORE_CODE_DIRS):
        base = repo_root / dir_name
        if not base.is_dir():
            continue
        for source in base.rglob("*.py"):
            if source.name == "__init__.py":
                continue
            # Normalise separators so comparisons work cross-platform.
            rel_path = str(source.relative_to(repo_root)).replace("\\", "/")
            if rel_path not in documented:
                missing.append(rel_path)
    return sorted(missing)
# ---------------------------------------------------------------------------
# DDL / data-dictionary parsing helpers
# ---------------------------------------------------------------------------
def _parse_ddl_tables(sql: str) -> dict[str, set[str]]:
    """Extract {table_name: {column names}} from CREATE TABLE DDL.

    Schema prefixes are dropped (billiards_dwd.dim_member -> dim_member).
    A column name is the first identifier of each line in the table body,
    lowercased, with SQL keywords filtered out.
    """
    create_pattern = re.compile(
        r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?"
        r"(?:\w+\.)?(\w+)\s*\(",
        re.IGNORECASE,
    )
    result: dict[str, set[str]] = {}
    for hit in create_pattern.finditer(sql):
        # Walk forward to the parenthesis closing the column list,
        # tracking nesting so types like numeric(10, 2) don't end it early.
        cursor = hit.end()
        nesting = 1
        while cursor < len(sql) and nesting > 0:
            ch = sql[cursor]
            if ch == "(":
                nesting += 1
            elif ch == ")":
                nesting -= 1
            cursor += 1
        body = sql[hit.end():cursor - 1]

        cols: set[str] = set()
        for raw_line in body.split("\n"):
            stripped = raw_line.strip().rstrip(",")
            if not stripped:
                continue
            first_word = re.match(r"(\w+)", stripped)
            if first_word is None:
                continue
            ident = first_word.group(1).lower()
            # Constraint lines start with keywords (PRIMARY, FOREIGN, ...).
            if ident not in _SQL_KEYWORDS:
                cols.add(ident)
        result[hit.group(1)] = cols
    return result
def _parse_dictionary_tables(md: str) -> dict[str, set[str]]:
"""从数据字典 Markdown 中提取表名和字段名。
约定:
- 表名出现在 ## 标题中(可能带反引号)
- 字段名出现在 Markdown 表格的第一列
- 跳过表头行(含"字段"字样)和分隔行(含 ---
"""
tables: dict[str, set[str]] = {}
current_table: str | None = None
for line in md.split("\n"):
# 匹配 ## 标题中的表名
heading = re.match(r"^##\s+`?(\w+)`?", line)
if heading:
current_table = heading.group(1)
tables[current_table] = set()
continue
if current_table is None:
continue
# 跳过分隔行
if re.match(r"^\s*\|[-\s|]+\|\s*$", line):
continue
# 解析表格行
row_match = re.match(r"^\s*\|\s*(\S+)", line)
if row_match:
field = row_match.group(1)
# 跳过表头(含"字段"字样)
if field in ("字段",):
continue
tables[current_table].add(field)
return tables
# ---------------------------------------------------------------------------
# check_ddl_vs_dictionary -- compare DDL against the data dictionary
# ---------------------------------------------------------------------------
def check_ddl_vs_dictionary(repo_root: Path) -> list[AlignmentIssue]:
    """Cross-check DDL files against the data-dictionary documents.

    Produces:
      1. a "missing" issue for each table the DDL defines but the
         dictionary never mentions
      2. a "conflict" issue for each column present in the DDL but absent
         from the dictionary entry of the same table
    """
    def _merge(target: dict[str, set[str]], source: dict[str, set[str]]) -> None:
        # Union column/field sets when the same table appears in several files.
        for name, members in source.items():
            target.setdefault(name, set()).update(members)

    # All table definitions found in DDL files.
    ddl: dict[str, set[str]] = {}
    database_dir = repo_root / "database"
    if database_dir.is_dir():
        for ddl_file in sorted(database_dir.glob("schema_*.sql")):
            _merge(ddl, _parse_ddl_tables(_safe_read(ddl_file)))

    # All table definitions found in dictionary documents.
    dictionary: dict[str, set[str]] = {}
    docs_dir = repo_root / "docs"
    if docs_dir.is_dir():
        for md_file in sorted(docs_dir.glob("*dictionary*.md")):
            _merge(dictionary, _parse_dictionary_tables(_safe_read(md_file)))

    # Compare.
    findings: list[AlignmentIssue] = []
    for table, ddl_columns in sorted(ddl.items()):
        if table not in dictionary:
            findings.append(AlignmentIssue(
                doc_path="docs/*dictionary*.md",
                issue_type="missing",
                description=f"DDL 定义了表 `{table}`,但数据字典中未收录",
                related_code=f"database/schema_*.sql ({table})",
            ))
            continue
        for column in sorted(ddl_columns - dictionary[table]):
            findings.append(AlignmentIssue(
                doc_path="docs/*dictionary*.md",
                issue_type="conflict",
                description=f"表 `{table}` 的列 `{column}` 在 DDL 中存在但数据字典中缺失",
                related_code=f"database/schema_*.sql ({table}.{column})",
            ))
    return findings
# ---------------------------------------------------------------------------
# check_api_samples_vs_parsers -- compare API samples with ODS schemas
# ---------------------------------------------------------------------------
def check_api_samples_vs_parsers(repo_root: Path) -> list[AlignmentIssue]:
    """Check API response samples against the ODS table schemas.

    Strategy:
      1. walk docs/test-json-doc/*.json
      2. take each sample's top-level key names (first element when the
         sample is a JSON array)
      3. find an ODS table whose name contains the sample file's stem
      4. report sample fields the matched table does not define

    NOTE(review): the module docstring mentions ignoring ODS metadata
    columns, but this comparison only flags sample-side extras, so
    _ODS_META_COLUMNS is never consulted in this function -- confirm
    that is intentional.
    """
    findings: list[AlignmentIssue] = []
    sample_dir = repo_root / "docs" / "test-json-doc"
    if not sample_dir.is_dir():
        return findings

    # ODS table schemas, full column sets (later files overwrite earlier).
    ods_schema: dict[str, set[str]] = {}
    database_dir = repo_root / "database"
    if database_dir.is_dir():
        for ddl_file in sorted(database_dir.glob("schema_*ODS*.sql")):
            ods_schema.update(_parse_ddl_tables(_safe_read(ddl_file)))

    for sample_file in sorted(sample_dir.glob("*.json")):
        entity = sample_file.stem  # file stem doubles as the entity name
        try:
            payload = json.loads(_safe_read(sample_file))
        except (json.JSONDecodeError, ValueError):
            continue  # unparseable sample -- skip silently

        # Top-level field names of the sample.
        fields: set[str] = set()
        if isinstance(payload, list) and payload:
            # Array-shaped sample: use the keys of the first element.
            if isinstance(payload[0], dict):
                fields = set(payload[0].keys())
        elif isinstance(payload, dict):
            fields = set(payload.keys())
        if not fields:
            continue

        # First ODS table whose name contains the entity name wins
        # (e.g. "test_entity" matches "billiards_ods.test_entity").
        entity_lower = entity.lower()
        table: str | None = None
        table_cols: set[str] = set()
        for name, cols in ods_schema.items():
            if entity_lower in name.lower():
                table = name
                table_cols = cols
                break
        if table is None:
            continue

        # Sample fields the matched ODS table does not define.
        for field in sorted(fields - table_cols):
            findings.append(AlignmentIssue(
                doc_path=f"docs/test-json-doc/{sample_file.name}",
                issue_type="conflict",
                description=(
                    f"API 样本字段 `{field}` 在 ODS 表 `{table}` 中未定义"
                ),
                related_code=f"database/schema_*ODS*.sql ({table})",
            ))
    return findings
# ---------------------------------------------------------------------------
# build_mappings -- relate each document to the code it references
# ---------------------------------------------------------------------------
def build_mappings(
    doc_paths: list[str],
    repo_root: Path,
) -> list[DocMapping]:
    """Build a DocMapping for every document path.

    Status semantics:
      - "orphan"  -- the document contains no code references at all
      - "stale"   -- at least one reference no longer resolves
      - "aligned" -- every reference resolves
    Only valid references are recorded in related_code.
    """
    result: list[DocMapping] = []
    for rel in doc_paths:
        full_path = repo_root / rel
        references = extract_code_references(full_path)
        resolved = [
            r for r in references if check_reference_validity(r, repo_root)
        ]

        if not references:
            state = "orphan"
        elif len(resolved) < len(references):
            state = "stale"  # something documented no longer exists
        else:
            state = "aligned"

        result.append(DocMapping(
            doc_path=rel,
            doc_topic=_infer_topic(full_path, rel),
            related_code=resolved,
            status=state,
        ))
    return result
def _infer_topic(doc_path: Path, doc_rel: str) -> str:
    """Best-effort document topic: the first Markdown H1 when present,
    otherwise the relative path itself."""
    if doc_path.is_file() and doc_path.suffix.lower() in (".md", ".txt"):
        try:
            for raw in _safe_read(doc_path).split("\n"):
                candidate = raw.strip()
                if candidate.startswith("# "):
                    return candidate[2:].strip()
        except Exception:
            # _safe_read already swallows decode errors; this guard only
            # covers unexpected failures so topic inference never aborts
            # the audit.
            pass
    return doc_rel
# ---------------------------------------------------------------------------
# render_alignment_report -- Markdown report of documentation alignment
# ---------------------------------------------------------------------------
def render_alignment_report(
    mappings: list[DocMapping],
    issues: list[AlignmentIssue],
    repo_root: str,
) -> str:
    """Render the documentation-alignment report as Markdown.

    Sections: mapping table, stale items, conflicts, missing items, and a
    summary of counts.
    """
    out: list[str] = []

    # --- Header ---
    generated = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    out += [
        "# 文档对齐报告",
        "",
        f"- 生成时间:{generated}",
        f"- 仓库路径:`{repo_root}`",
        "",
        "## 映射关系",
        "",
    ]

    # --- Mapping table ---
    if mappings:
        out.append("| 文档路径 | 主题 | 关联代码 | 状态 |")
        out.append("|---|---|---|---|")
        for entry in mappings:
            codes = (
                ", ".join(f"`{c}`" for c in entry.related_code)
                if entry.related_code
                else ""
            )
            out.append(
                f"| `{entry.doc_path}` | {entry.doc_topic} | {codes} | {entry.status} |"
            )
    else:
        out.append("未发现文档映射关系。")
    out.append("")

    # --- Group issues by type (unknown types are dropped, as before) ---
    grouped = {"stale": [], "conflict": [], "missing": []}
    for issue in issues:
        if issue.issue_type in grouped:
            grouped[issue.issue_type].append(issue)

    def section(title, rows, empty_message):
        # One Markdown section per issue type; table when non-empty.
        out.append(title)
        out.append("")
        if rows:
            out.append("| 文档路径 | 描述 | 关联代码 |")
            out.append("|---|---|---|")
            for row in rows:
                out.append(
                    f"| `{row.doc_path}` | {row.description} | `{row.related_code}` |"
                )
        else:
            out.append(empty_message)
        out.append("")

    section("## 过期点", grouped["stale"], "未发现过期点。")
    section("## 冲突点", grouped["conflict"], "未发现冲突点。")
    section("## 缺失点", grouped["missing"], "未发现缺失点。")

    # --- Summary counts ---
    out += [
        "## 统计摘要",
        "",
        f"- 文档总数:{len(mappings)}",
        f"- 过期点数量:{len(grouped['stale'])}",
        f"- 冲突点数量:{len(grouped['conflict'])}",
        f"- 缺失点数量:{len(grouped['missing'])}",
        "",
    ]
    return "\n".join(out)