# --- Web-viewer chrome accidentally captured above the real file header;
# --- commented out so the module remains importable:
# Files
# Neo-ZQYY/scripts/ops/gen_full_dataflow_doc.py
# 1131 lines
# 43 KiB
# Python
# Raw Permalink Blame History
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other
# characters. If you think that this is intentional, you can safely ignore
# this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
全链路数据流文档生成器API JSON → ODS → DWD。
从真实 API 获取 JSON 样本,结合 DDL 和 ETL 源码,生成带跨层跳转链接的 Markdown 文档。
用法: python scripts/ops/gen_full_dataflow_doc.py
输出: $FULL_DATAFLOW_DOC_ROOT/dataflow_api_ods_dwd.md由 .env 配置)
$API_SAMPLE_CACHE_ROOT/*.jsonAPI 原始响应缓存)
"""
import json
import os
import re
import sys
import time
from collections import OrderedDict
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any
from zoneinfo import ZoneInfo
import requests
from dotenv import load_dotenv
ROOT = Path(__file__).resolve().parents[2]  # repo root (two levels above scripts/ops/)
ETL = ROOT / "apps" / "etl" / "pipelines" / "feiqiu"  # ETL pipeline package
DB = ROOT / "db" / "etl_feiqiu" / "schemas"  # DDL files (ods.sql / dwd.sql)
# Output paths come from .env (a missing key raises KeyError).
from _env_paths import get_output_path as _get_path
OUT = _get_path("FULL_DATAFLOW_DOC_ROOT") / "dataflow_api_ods_dwd.md"
SAMPLE_DIR = _get_path("API_SAMPLE_CACHE_ROOT")
TZ = ZoneInfo("Asia/Shanghai")  # all timestamps in the document use this zone
# ── Load environment variables ──────────────────────────────────────
# Pipeline-local .env is loaded with override=True first; the repo .env is
# loaded afterwards without override, so pipeline values take precedence.
load_dotenv(ETL / ".env", override=True)
load_dotenv(ROOT / ".env")
API_BASE = os.environ.get("API_BASE", "").rstrip("/")
API_TOKEN = os.environ.get("API_TOKEN", "")
STORE_ID = os.environ.get("STORE_ID", "2790685415443269")
API_TIMEOUT = int(os.environ.get("API_TIMEOUT", "20"))  # seconds per request
# ETL metadata columns (not sourced from the API payload).
ETL_META_COLS = {
    "content_hash", "source_file", "source_endpoint",
    "fetched_at", "payload", "record_index",
}
# ── ODS task specs (key information extracted from ODS_TASK_SPECS) ─
# Per entry: code, table_name, endpoint, data_path, list_key, time_fields,
# requires_window, extra_params, description.
# - data_path: keys walked into the response before list extraction.
# - list_key: explicit list key; None lets extract_list fall back to
#   DEFAULT_LIST_KEYS and then to the first list value found.
# - time_fields: (start, end) request-parameter names; None = no window.
ODS_SPECS = [
    {
        "code": "ODS_ASSISTANT_ACCOUNT",
        "table": "assistant_accounts_master",
        "endpoint": "/PersonnelManagement/SearchAssistantInfo",
        "data_path": ("data",),
        "list_key": "assistantInfos",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "助教账号档案",
    },
    {
        "code": "ODS_SETTLEMENT_RECORDS",
        "table": "settlement_records",
        "endpoint": "/Site/GetAllOrderSettleList",
        "data_path": ("data",),
        "list_key": "settleList",
        "time_fields": ("rangeStartTime", "rangeEndTime"),
        "requires_window": True,
        "extra_params": {},
        "description": "结账记录",
    },
    {
        "code": "ODS_TABLE_USE",
        "table": "table_fee_transactions",
        "endpoint": "/Site/GetSiteTableOrderDetails",
        "data_path": ("data",),
        "list_key": "siteTableUseDetailsList",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "台费计费流水",
    },
    {
        "code": "ODS_ASSISTANT_LEDGER",
        "table": "assistant_service_records",
        "endpoint": "/AssistantPerformance/GetOrderAssistantDetails",
        "data_path": ("data",),
        "list_key": "orderAssistantDetails",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "助教服务流水",
    },
    {
        "code": "ODS_STORE_GOODS_SALES",
        "table": "store_goods_sales_records",
        "endpoint": "/TenantGoods/GetGoodsSalesList",
        "data_path": ("data",),
        "list_key": "orderGoodsLedgers",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "门店商品销售流水",
    },
    {
        "code": "ODS_PAYMENT",
        "table": "payment_transactions",
        "endpoint": "/PayLog/GetPayLogListPage",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": ("StartPayTime", "EndPayTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "支付流水",
    },
    {
        "code": "ODS_REFUND",
        "table": "refund_transactions",
        "endpoint": "/Order/GetRefundPayLogList",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "退款流水",
    },
    {
        "code": "ODS_PLATFORM_COUPON",
        "table": "platform_coupon_redemption_records",
        "endpoint": "/Promotion/GetOfflineCouponConsumePageList",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "平台/团购券核销",
    },
    {
        "code": "ODS_MEMBER",
        "table": "member_profiles",
        "endpoint": "/MemberProfile/GetTenantMemberList",
        "data_path": ("data",),
        "list_key": "tenantMemberInfos",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "会员档案",
    },
    {
        "code": "ODS_MEMBER_CARD",
        "table": "member_stored_value_cards",
        "endpoint": "/MemberProfile/GetTenantMemberCardList",
        "data_path": ("data",),
        "list_key": "tenantMemberCards",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "会员储值卡",
    },
    {
        "code": "ODS_MEMBER_BALANCE",
        "table": "member_balance_changes",
        "endpoint": "/MemberProfile/GetMemberCardBalanceChange",
        "data_path": ("data",),
        "list_key": "tenantMemberCardLogs",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "会员余额变动",
    },
    {
        "code": "ODS_RECHARGE_SETTLE",
        "table": "recharge_settlements",
        "endpoint": "/Site/GetRechargeSettleList",
        "data_path": ("data",),
        "list_key": "settleList",
        "time_fields": ("rangeStartTime", "rangeEndTime"),
        "requires_window": True,
        "extra_params": {},
        "description": "充值结算",
    },
    {
        "code": "ODS_GROUP_PACKAGE",
        "table": "group_buy_packages",
        "endpoint": "/PackageCoupon/QueryPackageCouponList",
        "data_path": ("data",),
        "list_key": "packageCouponList",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "团购套餐定义",
    },
    {
        "code": "ODS_GROUP_BUY_REDEMPTION",
        "table": "group_buy_redemption_records",
        "endpoint": "/Site/GetSiteTableUseDetails",
        "data_path": ("data",),
        "list_key": "siteTableUseDetailsList",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "团购套餐核销",
    },
    {
        "code": "ODS_INVENTORY_STOCK",
        "table": "goods_stock_summary",
        "endpoint": "/TenantGoods/GetGoodsStockReport",
        "data_path": ("data",),
        "list_key": None,
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "库存汇总",
    },
    {
        "code": "ODS_INVENTORY_CHANGE",
        "table": "goods_stock_movements",
        "endpoint": "/GoodsStockManage/QueryGoodsOutboundReceipt",
        "data_path": ("data",),
        "list_key": "queryDeliveryRecordsList",
        "time_fields": ("startTime", "endTime"),
        "requires_window": True,
        "extra_params": {},
        "description": "库存变化记录",
    },
    {
        "code": "ODS_TABLES",
        "table": "site_tables_master",
        "endpoint": "/Table/GetSiteTables",
        "data_path": ("data",),
        "list_key": "siteTables",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "台桌维表",
    },
    {
        "code": "ODS_GOODS_CATEGORY",
        "table": "stock_goods_category_tree",
        "endpoint": "/TenantGoodsCategory/QueryPrimarySecondaryCategory",
        "data_path": ("data",),
        "list_key": "goodsCategoryList",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "库存商品分类树",
    },
    {
        "code": "ODS_STORE_GOODS",
        "table": "store_goods_master",
        "endpoint": "/TenantGoods/GetGoodsInventoryList",
        "data_path": ("data",),
        "list_key": "orderGoodsList",
        "time_fields": None,
        "requires_window": False,
        # NOTE: this endpoint expects siteId as a list, unlike the scalar
        # siteId that fetch_records puts in the base params.
        "extra_params": {"siteId": [STORE_ID]},
        "description": "门店商品档案",
    },
    {
        "code": "ODS_TABLE_FEE_DISCOUNT",
        "table": "table_fee_discount_records",
        "endpoint": "/Site/GetTaiFeeAdjustList",
        "data_path": ("data",),
        "list_key": "taiFeeAdjustInfos",
        "time_fields": ("startTime", "endTime"),
        "requires_window": False,
        "extra_params": {},
        "description": "台费折扣/调账",
    },
    {
        "code": "ODS_TENANT_GOODS",
        "table": "tenant_goods_master",
        "endpoint": "/TenantGoods/QueryTenantGoods",
        "data_path": ("data",),
        "list_key": "tenantGoodsList",
        "time_fields": None,
        "requires_window": False,
        "extra_params": {},
        "description": "租户商品档案",
    },
]
# ── Browser-style request headers ──────────────────────────────────
HEADERS = {
    "Accept": "application/json, text/plain, */*",
    "Content-Type": "application/json",
    "Origin": "https://pc.ficoo.vip",
    "Referer": "https://pc.ficoo.vip/",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"
    ),
}
# Fix: only attach Authorization when a token is configured. The previous
# version always sent the header, with an empty value when API_TOKEN was
# unset, and an empty "Authorization:" header can be rejected by strict
# gateways. (main() exits early without a token anyway, so behavior of the
# normal path is unchanged.)
if API_TOKEN:
    HEADERS["Authorization"] = f"Bearer {API_TOKEN}"
# Default list_key candidates, tried in order (kept in sync with APIClient).
DEFAULT_LIST_KEYS = (
    "list", "rows", "records", "items", "dataList", "data_list",
    "tenantMemberInfos", "tenantMemberCardLogs", "tenantMemberCards",
    "settleList", "orderAssistantDetails", "assistantInfos", "siteTables",
    "taiFeeAdjustInfos", "siteTableUseDetailsList", "tenantGoodsList",
    "packageCouponList", "queryDeliveryRecordsList", "goodsCategoryList",
    "orderGoodsList", "orderGoodsLedgers",
)
# ══════════════════════════════════════════════════════════════════
# 1. API 请求
# ══════════════════════════════════════════════════════════════════
def api_post(endpoint: str, payload: dict) -> dict:
    """POST *payload* as JSON to *endpoint* under API_BASE and return the body.

    Raises an HTTP error for transport-level failures and ValueError when
    the service reports a business code other than 0, "0", or a missing
    ``code`` field.
    """
    url = "/".join((API_BASE, endpoint.lstrip("/")))
    response = requests.post(url, json=payload, headers=HEADERS, timeout=API_TIMEOUT)
    response.raise_for_status()
    body = response.json()
    code = body.get("code")
    if code in (0, "0", None):
        return body
    msg = body.get("msg") or body.get("message") or ""
    raise ValueError(f"API 错误 code={code} msg={msg} endpoint={endpoint}")
def extract_list(payload: dict, data_path: tuple, list_key: str | None) -> list:
    """Extract the record list from an API response.

    Walks ``data_path`` into the payload, then tries, in order: the node
    itself (if already a list), the explicit ``list_key``, each
    DEFAULT_LIST_KEYS candidate, and finally the first list-valued entry.
    Returns [] when nothing list-like is found.
    """
    node = payload
    for step in data_path:
        node = node.get(step) if isinstance(node, dict) else None
        if node is None:
            break
    if isinstance(node, list):
        return node
    if not isinstance(node, dict):
        return []
    candidates = ([list_key] if list_key else []) + list(DEFAULT_LIST_KEYS)
    for key in candidates:
        value = node.get(key)
        if isinstance(value, list):
            return value
    # Last resort: first list value in insertion order.
    for value in node.values():
        if isinstance(value, list):
            return value
    return []
def fetch_records(spec: dict, target_count: int = 200) -> list[dict]:
    """Fetch sample records for one ODS spec.

    Tables without time fields get a single request of ``target_count``
    rows. Tables with time fields are queried in 10-day windows walking
    backwards from now, up to 10 attempts, until enough rows are gathered.
    Request failures are printed and swallowed; at most ``target_count``
    records are returned.
    """
    endpoint = spec["endpoint"]
    data_path = spec["data_path"]
    list_key = spec["list_key"]
    time_fields = spec["time_fields"]
    extra_params = spec.get("extra_params", {})
    base_params = {"siteId": STORE_ID, "page": 1, "limit": target_count}

    if not time_fields:
        # No window: a single best-effort request.
        try:
            resp = api_post(endpoint, {**base_params, **extra_params})
            return extract_list(resp, data_path, list_key)[:target_count]
        except Exception as e:
            print(f" 警告: API 请求失败 {endpoint}: {e}")
            return []

    # Windowed: walk backwards from now in 10-day batches.
    start_key, end_key = time_fields
    window_end = datetime.now(TZ)
    collected: list[dict] = []
    for attempt in range(10):  # max 10 windows
        window_start = window_end - timedelta(days=10)
        params = {
            **base_params,
            start_key: window_start.strftime("%Y-%m-%d %H:%M:%S"),
            end_key: window_end.strftime("%Y-%m-%d %H:%M:%S"),
            **extra_params,
        }
        try:
            resp = api_post(endpoint, params)
            collected.extend(extract_list(resp, data_path, list_key))
        except Exception as e:
            print(f" 警告: API 请求失败 {endpoint} attempt={attempt+1}: {e}")
        if len(collected) >= target_count:
            break
        window_end = window_start  # extend further into the past
    return collected[:target_count]
# ══════════════════════════════════════════════════════════════════
# 2. JSON 字段分析
# ══════════════════════════════════════════════════════════════════
def merge_record_layers(record: dict) -> dict:
    """Mirror the ETL's merge_record_layers: flatten nested layers.

    Successive dict-valued ``data`` layers and one dict-valued
    ``settleList`` layer are merged upward; on key conflicts the outer
    (already-flattened) value wins. The original nested keys remain.
    """
    flattened = dict(record)
    layer = flattened.get("data")
    while isinstance(layer, dict):
        combined = dict(layer)
        combined.update(flattened)  # outer keys take precedence
        flattened = combined
        layer = layer.get("data")
    settle = flattened.get("settleList")
    if isinstance(settle, dict):
        combined = dict(settle)
        combined.update(flattened)
        flattened = combined
    return flattened
def analyze_json_fields(records: list[dict]) -> OrderedDict[str, dict]:
    """Merge field information across multiple JSON records.

    Returns an OrderedDict mapping field name to
    {"type": str, "sample": str, "nested": bool}, in first-seen order.
    A field first seen as null is upgraded once a non-null value appears.
    """
    fields: OrderedDict[str, dict] = OrderedDict()
    for record in records:
        for key, value in merge_record_layers(record).items():
            known = fields.get(key)
            if known is None:
                fields[key] = {
                    "type": _json_type(value),
                    "sample": _sample_value(value),
                    "nested": isinstance(value, (dict, list)),
                }
            elif known["type"] == "null" and value is not None:
                # Upgrade the placeholder entry with the first real value.
                known["type"] = _json_type(value)
                known["sample"] = _sample_value(value)
                known["nested"] = isinstance(value, (dict, list))
    return fields
def _json_type(v: Any) -> str:
    """Map a Python value to its JSON type name.

    ``bool`` must be tested before ``int`` (bool is an int subclass);
    the check order below preserves that. Unknown types fall back to
    the Python type name.
    """
    if v is None:
        return "null"
    dispatch = (
        (bool, "boolean"),
        (int, "integer"),
        (float, "number"),
        (str, "string"),
        (list, "array"),
        (dict, "object"),
    )
    for py_type, json_name in dispatch:
        if isinstance(v, py_type):
            return json_name
    return type(v).__name__
def _sample_value(v: Any, max_len: int = 40) -> str:
    """Render a short preview of *v*, truncated to *max_len* chars + '...'.

    Dicts/lists are JSON-encoded (non-ASCII kept); everything else uses str().
    """
    if v is None:
        return "null"
    if isinstance(v, (dict, list)):
        text = json.dumps(v, ensure_ascii=False)
    else:
        text = str(v)
    if len(text) > max_len:
        return text[:max_len] + "..."
    return text
# ══════════════════════════════════════════════════════════════════
# 3. DDL 解析
# ══════════════════════════════════════════════════════════════════
def parse_ddl_tables(sql_path: Path, schema: str) -> dict[str, list[dict]]:
    """Parse CREATE TABLE statements from *sql_path*.

    Returns {table_name: [{"col": name, "type": type}, ...]} with table
    names stripped of any schema prefix. ``schema`` is accepted for call
    compatibility but is not used for filtering (the regex matches any
    schema prefix). Constraint lines, comments, and PRIMARY KEY clauses
    are skipped; a type's parenthesized argument (e.g. NUMERIC (10,2)
    split across tokens) is re-attached.
    """
    ddl_text = sql_path.read_text(encoding="utf-8")
    create_re = re.compile(
        r"CREATE\s+TABLE\s+IF\s+NOT\s+EXISTS\s+"
        r"(?:(\w+)\.)?(\w+)\s*\((.*?)\)\s*;",
        re.DOTALL | re.IGNORECASE,
    )
    constraint_re = re.compile(r"^(CONSTRAINT|UNIQUE|CHECK|FOREIGN|EXCLUDE)\b", re.I)
    tables: dict[str, list[dict]] = {}
    for match in create_re.finditer(ddl_text):
        columns = []
        for raw in match.group(3).split("\n"):
            entry = raw.strip().rstrip(",")
            if not entry or entry.upper().startswith("PRIMARY") or entry.startswith("--"):
                continue
            if constraint_re.match(entry):
                continue
            tokens = entry.split()
            if len(tokens) < 2:
                continue
            col_type = tokens[1]
            if len(tokens) > 2 and tokens[2].startswith("("):
                col_type += tokens[2]
            columns.append({"col": tokens[0].strip('"'), "type": col_type})
        tables[match.group(2)] = columns
    return tables
# ══════════════════════════════════════════════════════════════════
# 4. TABLE_MAP / FACT_MAPPINGS 解析(复用 gen_dataflow_doc.py 逻辑)
# ══════════════════════════════════════════════════════════════════
def parse_table_map(py_path: Path) -> dict[str, str]:
    """Extract TABLE_MAP ({dwd_table: ods_table}) from a Python source file.

    Matches the first ``TABLE_MAP = { ... }`` literal (optionally annotated
    with ``: dict[...]``) and pulls out every ``"key": "value"`` string pair.
    Returns {} when no TABLE_MAP assignment is found.
    """
    source = py_path.read_text(encoding="utf-8")
    found = re.search(
        r"TABLE_MAP\s*(?::\s*dict\[.*?\])?\s*=\s*\{(.*?)\}",
        source, re.DOTALL,
    )
    if found is None:
        return {}
    return {
        pair.group(1): pair.group(2)
        for pair in re.finditer(r'"([^"]+)"\s*:\s*"([^"]+)"', found.group(1))
    }
def parse_fact_mappings(py_path: Path) -> dict[str, list[tuple]]:
    """Parse FACT_MAPPINGS from a Python source file.

    Returns {dwd_table: [(dwd_col, ods_expr, cast), ...]}; ``cast`` is
    None when the source tuple had a literal ``None`` in that position.
    Uses manual brace/bracket depth counting (not a single regex) because
    the FACT_MAPPINGS literal nests dicts, lists, and tuples.
    """
    text = py_path.read_text(encoding="utf-8")
    start = text.find("FACT_MAPPINGS")
    if start < 0:
        return {}
    brace_start = text.find("{", start)
    if brace_start < 0:
        return {}
    # Scan forward to the matching closing brace of the dict literal.
    depth = 0
    end = brace_start
    for i in range(brace_start, len(text)):
        if text[i] == "{":
            depth += 1
        elif text[i] == "}":
            depth -= 1
            if depth == 0:
                end = i + 1
                break
    block = text[brace_start:end]
    result = {}
    # Top-level entries look like `"dwd.table": [ ... ]`.
    table_pattern = re.compile(r'"([^"]+)"\s*:\s*\[', re.DOTALL)
    for tm in table_pattern.finditer(block):
        table_name = tm.group(1)
        list_start = tm.end()
        # Find the matching `]` for this table's mapping list.
        bracket_depth = 1
        list_end = list_start
        for i in range(list_start, len(block)):
            if block[i] == "[":
                bracket_depth += 1
            elif block[i] == "]":
                bracket_depth -= 1
                if bracket_depth == 0:
                    list_end = i
                    break
        list_body = block[list_start:list_end]
        tuples = []
        # Tuples are ("dwd_col", "ods_expr", "cast") or (..., None);
        # group(3) is None when the bare token None matched.
        tuple_pattern = re.compile(
            r'\(\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*(?:"([^"]+)"|None)\s*\)'
        )
        for tp in tuple_pattern.finditer(list_body):
            tuples.append((tp.group(1), tp.group(2), tp.group(3)))
        result[table_name] = tuples
    return result
# ══════════════════════════════════════════════════════════════════
# 5. 映射计算
# ══════════════════════════════════════════════════════════════════
def make_anchor(prefix: str, table: str) -> str:
    """Build a Markdown anchor id: prefix + table with '.'/'_' turned into '-'."""
    normalized = table.replace(".", "-").replace("_", "-")
    return "-".join((prefix, normalized))
def compute_api_to_ods_mapping(
    api_fields: OrderedDict[str, dict],
    ods_cols: list[dict],
) -> list[dict]:
    """Compute the API JSON field → ODS column mapping.

    Matching is case-insensitive (as in the ETL). Flattened-source objects
    get an explanatory note instead of a target column; unmatched API
    fields are marked as payload-only; ODS columns with no API counterpart
    are appended at the end as ETL-derived.
    """
    # Case-insensitive lookup of ODS business columns (meta columns excluded).
    lower_to_ods = {
        c["col"].lower(): c["col"]
        for c in ods_cols
        if c["col"].lower() not in ETL_META_COLS
    }
    flatten_sources = ("data", "settlelist", "siteprofile", "tableprofile")
    rows = []
    matched = set()
    for api_key, info in api_fields.items():
        key_lower = api_key.lower()
        api_type = info["type"]
        if api_type == "object" and key_lower in flatten_sources:
            # Objects that the ETL flattens (or ignores) map no column directly.
            note = ("嵌套对象,展平后各字段独立映射"
                    if key_lower in ("data", "settlelist")
                    else "嵌套对象,不直接映射")
            rows.append({"api_field": api_key, "api_type": api_type,
                         "ods_col": "", "note": note})
        elif key_lower in lower_to_ods:
            ods_col = lower_to_ods[key_lower]
            matched.add(key_lower)
            rows.append({
                "api_field": api_key,
                "api_type": api_type,
                "ods_col": ods_col,
                "note": "同名映射" if api_key == ods_col else "大小写不敏感匹配",
            })
        else:
            rows.append({"api_field": api_key, "api_type": api_type,
                         "ods_col": "", "note": "仅存于 payload JSONB"})
    # ODS business columns never seen in the API sample → ETL-derived.
    api_lowers = {k.lower() for k in api_fields}
    for c in ods_cols:
        col_lower = c["col"].lower()
        if col_lower in matched or col_lower in ETL_META_COLS:
            continue
        if col_lower in api_lowers:
            # Already represented in api_fields (e.g. via nested flattening).
            continue
        rows.append({"api_field": "", "api_type": "",
                     "ods_col": c["col"], "note": "ETL 派生/嵌套提取"})
    return rows
def build_ods_to_dwd_info(
    ods_table: str,
    ods_cols: list[dict],
    table_map: dict[str, str],
    fact_mappings: dict[str, list[tuple]],
    dwd_ddl: dict[str, list[dict]],
) -> list[dict]:
    """Describe how one ODS table feeds its downstream DWD tables.

    Returns one entry per mapped DWD table (sorted by name, skipping those
    without DDL): {dwd_table, dwd_short, table_type,
    cols_info: [{col, type, ods_source, cast, note}]}.
    """
    qualified = f"ods.{ods_table}"
    downstream = sorted(t for t, src in table_map.items() if src == qualified)
    if not downstream:
        return []
    ods_lower = {c["col"].lower() for c in ods_cols}
    scd2_cols = {"scd2_start_time", "scd2_end_time", "scd2_is_current", "scd2_version"}
    results = []
    for dwd_full in downstream:
        dwd_short = dwd_full.rsplit(".", 1)[-1]
        dwd_cols = dwd_ddl.get(dwd_short, [])
        if not dwd_cols:
            continue
        # Explicit column mappings from FACT_MAPPINGS, keyed case-insensitively.
        explicit = {
            name.lower(): (expr, cast)
            for name, expr, cast in fact_mappings.get(dwd_full, [])
        }
        cols_info = []
        for c in dwd_cols:
            name, ctype = c["col"], c["type"]
            lower = name.lower()
            if lower in scd2_cols:
                info = {"col": name, "type": ctype,
                        "ods_source": "", "cast": "", "note": "SCD2 元数据"}
            elif lower in explicit:
                expr, cast = explicit[lower]
                if "->>" in expr:
                    note = "JSONB 提取"
                elif "CASE" in expr.upper():
                    note = "派生计算"
                elif expr.lower() != lower:
                    note = "字段重命名"
                else:
                    note = ""
                info = {"col": name, "type": ctype, "ods_source": expr,
                        "cast": f"{cast}" if cast else "直接", "note": note}
            elif lower in ods_lower:
                # Same-named column carried through without an explicit mapping.
                info = {"col": name, "type": ctype,
                        "ods_source": name, "cast": "直接", "note": "同名直传"}
            else:
                info = {"col": name, "type": ctype,
                        "ods_source": "", "cast": "", "note": "未显式映射"}
            cols_info.append(info)
        table_type = "维度" if "dim_" in dwd_short else "事实"
        if dwd_short.endswith("_ex"):
            table_type += "(扩展)"
        results.append({
            "dwd_table": dwd_full,
            "dwd_short": dwd_short,
            "table_type": table_type,
            "cols_info": cols_info,
        })
    return results
# ══════════════════════════════════════════════════════════════════
# 6. Markdown 文档生成
# ══════════════════════════════════════════════════════════════════
def generate_document(
    specs: list[dict],
    api_data: dict[str, list[dict]],        # table -> fetched sample records
    api_fields: dict[str, OrderedDict],     # table -> analyzed field info
    ods_ddl: dict[str, list[dict]],         # ODS table -> column defs
    dwd_ddl: dict[str, list[dict]],         # DWD table -> column defs
    table_map: dict[str, str],              # dwd table -> source ods table
    fact_mappings: dict[str, list[tuple]],  # dwd table -> (col, expr, cast)
) -> str:
    """Render the complete API → ODS → DWD dataflow Markdown document.

    For each spec, emits three linked sections: the API JSON field table,
    the ODS column table, and one table per downstream DWD table; closes
    with a shared appendix. Returns the document as one newline-joined
    string.
    """
    lines: list[str] = []
    # ── Document header ──
    lines.append("# API → ODS → DWD 全链路数据流文档")
    lines.append("")
    lines.append(f"> 自动生成于 `scripts/ops/gen_full_dataflow_doc.py`,基于真实 API 响应 + DDL + ETL 源码。")
    lines.append(f"> 生成时间:{datetime.now(TZ).strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("")
    # ── Overview ──
    api_ok = sum(1 for v in api_data.values() if v)  # specs with ≥1 record
    lines.append("## 概览")
    lines.append("")
    lines.append(f"- API 端点数量: {len(specs)}")
    lines.append(f"- 成功获取数据: {api_ok}/{len(specs)}")
    lines.append(f"- ODS 表数量: {len(ods_ddl)}")
    lines.append(f"- DWD 表数量: {len(dwd_ddl)}")
    lines.append(f"- TABLE_MAP 映射: {len(table_map)}")
    lines.append("")
    # ── Legend ──
    lines.append("## 图例")
    lines.append("")
    lines.append("- 🔗 跳转链接:点击可在 JSON / ODS / DWD 层之间跳转")
    lines.append("- ✅ 已映射到下游")
    lines.append("- ⚠️ 仅存于 payload未入 ODS 列")
    lines.append("- 🔄 大小写不敏感匹配")
    lines.append("- 📦 嵌套对象")
    lines.append("")
    # ── Table of contents ──
    lines.append("## 目录")
    lines.append("")
    for i, spec in enumerate(specs, 1):
        table = spec["table"]
        desc = spec["description"]
        anchor = make_anchor("api", table)
        lines.append(f"{i}. [{desc} (`{table}`)](#{anchor})")
    lines.append("")
    lines.append("---")
    lines.append("")
    # ── Per-table detail ──
    for spec in specs:
        table = spec["table"]
        desc = spec["description"]
        code = spec["code"]
        endpoint = spec["endpoint"]
        records = api_data.get(table, [])
        fields = api_fields.get(table, OrderedDict())
        ods_cols = ods_ddl.get(table, [])
        api_anchor = make_anchor("api", table)
        ods_anchor = make_anchor("ods", table)
        # ════════════════════════════════════════════════════════
        # API JSON layer
        # ════════════════════════════════════════════════════════
        lines.append(f'<a id="{api_anchor}"></a>')
        lines.append("")
        lines.append(f"## {desc} (`{table}`)")
        lines.append("")
        lines.append(f"- 任务编码: `{code}`")
        lines.append(f"- API 端点: `{endpoint}`")
        if spec["time_fields"]:
            lines.append(f"- 时间字段: `{spec['time_fields'][0]}` / `{spec['time_fields'][1]}`")
        lines.append(f"- 获取记录数: {len(records)}")
        lines.append("")
        if not fields:
            lines.append("*未获取到 API 数据,跳过 JSON 字段分析。*")
            lines.append("")
        else:
            # Compute the API→ODS mapping for this table.
            api_ods_map = compute_api_to_ods_mapping(fields, ods_cols)
            lines.append(f"### API 源字段({len(fields)} 个) [🔗 ODS](#{ods_anchor})")
            lines.append("")
            lines.append("| # | JSON 字段 | 类型 | 示例值 | → ODS 列 | 说明 |")
            lines.append("|---|----------|------|--------|----------|------|")
            idx = 0  # row counter; rows without an API field are skipped
            for m in api_ods_map:
                if m["api_field"] == "":
                    continue
                idx += 1
                api_f = m["api_field"]
                api_t = m["api_type"]
                ods_c = m["ods_col"]
                note = m["note"]
                sample = fields.get(api_f, {}).get("sample", "")
                # Escape pipe characters so samples don't break the table.
                sample = sample.replace("|", "\\|")
                if ods_c != "":
                    ods_link = f"[`{ods_c}`](#{ods_anchor})"
                    icon = "🔄" if "不敏感" in note else ""
                else:
                    ods_link = ""
                    icon = "📦" if "嵌套" in note else "⚠️"
                lines.append(f"| {idx} | `{api_f}` | {api_t} | {sample} | {ods_link} | {icon} {note} |")
            # Mapping statistics (objects excluded from the unmapped count).
            mapped = sum(1 for m in api_ods_map if m["ods_col"] != "" and m["api_field"] != "")
            unmapped = sum(1 for m in api_ods_map if m["ods_col"] == "" and m["api_field"] != ""
                           and m["api_type"] not in ("object",))
            lines.append("")
            # NOTE(review): the text between the two counts appears to have
            # lost a full-width separator — confirm the intended wording.
            lines.append(f"> 映射统计:{mapped} 个字段映射到 ODS{unmapped} 个仅存于 payload。")
            lines.append("")
        # ════════════════════════════════════════════════════════
        # ODS layer
        # ════════════════════════════════════════════════════════
        if ods_cols:
            # Build the downstream DWD mapping info for this ODS table.
            dwd_info_list = build_ods_to_dwd_info(table, ods_cols, table_map, fact_mappings, dwd_ddl)
            # Reverse lookup ODS column → DWD columns (for downstream links).
            ods_to_dwd_cols: dict[str, list[tuple[str, str]]] = {}  # ods_col_lower -> [(dwd_table, dwd_col)]
            for dwd_info in dwd_info_list:
                for ci in dwd_info["cols_info"]:
                    src = ci["ods_source"]
                    if src != "":
                        # Reduce a JSONB expression etc. to its bare column name.
                        src_lower = src.split("->")[0].strip().strip('"').lower()
                        key = src_lower
                        if key not in ods_to_dwd_cols:
                            ods_to_dwd_cols[key] = []
                        ods_to_dwd_cols[key].append((dwd_info["dwd_table"], ci["col"]))
            lines.append(f'<a id="{ods_anchor}"></a>')
            lines.append("")
            lines.append(f"### ODS: `ods.{table}` ({len(ods_cols)} 列) [🔗 API](#{api_anchor})")
            lines.append("")
            # Separate business columns from ETL metadata columns.
            biz_cols = [c for c in ods_cols if c["col"].lower() not in ETL_META_COLS]
            meta_cols = [c for c in ods_cols if c["col"].lower() in ETL_META_COLS]
            lines.append("| # | ODS 列名 | 类型 | ← JSON 源 | → DWD 目标 |")
            lines.append("|---|---------|------|-----------|-----------|")
            for idx, c in enumerate(biz_cols, 1):
                col_name = c["col"]
                col_type = c["type"]
                # Upstream JSON link (case-insensitive match on field name).
                json_match = None
                if fields:
                    for api_key in fields:
                        if api_key.lower() == col_name.lower():
                            json_match = api_key
                            break
                if json_match:
                    json_link = f"[`{json_match}`](#{api_anchor})"
                else:
                    json_link = ""
                # Downstream DWD links.
                dwd_targets = ods_to_dwd_cols.get(col_name.lower(), [])
                if dwd_targets:
                    dwd_links = []
                    for dwd_t, dwd_c in dwd_targets[:3]:  # show at most 3
                        dwd_short = dwd_t.split(".")[-1] if "." in dwd_t else dwd_t
                        dwd_a = make_anchor("dwd", dwd_short)
                        dwd_links.append(f"[`{dwd_c}`](#{dwd_a})")
                    dwd_link = ", ".join(dwd_links)
                    if len(dwd_targets) > 3:
                        dwd_link += f" +{len(dwd_targets)-3}"
                else:
                    dwd_link = ""
                lines.append(f"| {idx} | `{col_name}` | {col_type} | {json_link} | {dwd_link} |")
            if meta_cols:
                lines.append("")
                lines.append(f"*ETL 元数据列({len(meta_cols)} 个):" +
                             ", ".join(f"`{c['col']}`" for c in meta_cols) + "*")
            lines.append("")
        # ════════════════════════════════════════════════════════
        # DWD layer
        # ════════════════════════════════════════════════════════
        dwd_info_list = build_ods_to_dwd_info(table, ods_cols, table_map, fact_mappings, dwd_ddl)
        if dwd_info_list:
            for dwd_info in dwd_info_list:
                dwd_short = dwd_info["dwd_short"]
                dwd_full = dwd_info["dwd_table"]
                table_type = dwd_info["table_type"]
                cols_info = dwd_info["cols_info"]
                dwd_anchor = make_anchor("dwd", dwd_short)
                lines.append(f'<a id="{dwd_anchor}"></a>')
                lines.append("")
                lines.append(f"### DWD: `{dwd_full}` — {table_type} ({len(cols_info)} 列) "
                             f"[🔗 ODS](#{ods_anchor})")
                lines.append("")
                lines.append("| # | DWD 列名 | 类型 | ← ODS 来源 | 转换 | 说明 |")
                lines.append("|---|---------|------|-----------|------|------|")
                for idx, ci in enumerate(cols_info, 1):
                    col = ci["col"]
                    ctype = ci["type"]
                    src = ci["ods_source"]
                    cast = ci["cast"]
                    note = ci["note"]
                    if src != "":
                        ods_link = f"[`{src}`](#{ods_anchor})"
                    else:
                        ods_link = ""
                    lines.append(f"| {idx} | `{col}` | {ctype} | {ods_link} | {cast} | {note} |")
                lines.append("")
        else:
            lines.append(f"*该 ODS 表暂无 DWD 映射(仅用于 DWS 或其他下游)*")
            lines.append("")
        lines.append("---")
        lines.append("")
    # ── Appendix ──
    lines.append("## 附录")
    lines.append("")
    _append_appendix(lines)
    return "\n".join(lines)
def _append_appendix(lines: list[str]):
    """Append the static appendix sections (ETL meta columns, SCD2 columns,
    fact-table incremental strategy) to *lines* in place.

    NOTE(review): ETL_META_COLS also contains ``record_index``, which this
    table does not list — confirm whether that omission is intentional.
    """
    lines.extend([
        "### ETL 元数据列",
        "",
        "所有 ODS 表均包含以下 ETL 元数据列,不映射到 DWD",
        "",
        "| 列名 | 类型 | 说明 |",
        "|------|------|------|",
        "| `content_hash` | TEXT | 记录内容哈希,用于去重和变更检测 |",
        "| `source_file` | TEXT | 原始导出文件名,用于数据追溯 |",
        "| `source_endpoint` | TEXT | 采集来源接口/文件路径 |",
        "| `fetched_at` | TIMESTAMPTZ | 采集/入库时间戳 |",
        "| `payload` | JSONB | 完整原始 JSON 记录快照 |",
        "",
        "### DWD 维度表 SCD2 列",
        "",
        "| 列名 | 类型 | 说明 |",
        "|------|------|------|",
        "| `scd2_start_time` | TIMESTAMPTZ | 版本生效起点 |",
        "| `scd2_end_time` | TIMESTAMPTZ | 版本失效时间9999-12-31 = 当前) |",
        "| `scd2_is_current` | INT | 当前版本标记1=当前0=历史) |",
        "| `scd2_version` | INT | 版本号(自增) |",
        "",
        "### DWD 事实表增量策略",
        "",
        "事实表按时间窗口增量写入,优先使用以下业务时间列进行过滤:",
        "",
        "1. `pay_time` — 支付时间",
        "2. `create_time` — 创建时间",
        "3. `update_time` — 更新时间",
        "4. `occur_time` — 发生时间",
        "5. `settle_time` — 结算时间",
        "6. `start_use_time` — 开始使用时间",
        "7. `fetched_at` — 入库时间(兜底)",
        "",
    ])
# ══════════════════════════════════════════════════════════════════
# 7. 主流程
# ══════════════════════════════════════════════════════════════════
def main():
    """Entry point: parse DDL + ETL source, sample the API, write the doc.

    Exits with status 1 when API_BASE/API_TOKEN are not configured. API
    samples are cached on disk for 24 hours per table.
    """
    print("=" * 60)
    print("全链路数据流文档生成器")
    print("=" * 60)
    # Fail fast when API credentials are missing.
    if not API_BASE or not API_TOKEN:
        print("错误: 未配置 API_BASE 或 API_TOKEN请检查 .env 文件", file=sys.stderr)
        sys.exit(1)
    print(f"API_BASE: {API_BASE}")
    print(f"STORE_ID: {STORE_ID}")
    print()
    # ── Parse DDL ──
    print("解析 DDL...")
    ods_ddl = parse_ddl_tables(DB / "ods.sql", "ods")
    dwd_ddl = parse_ddl_tables(DB / "dwd.sql", "dwd")
    print(f" ODS 表: {len(ods_ddl)}, DWD 表: {len(dwd_ddl)}")
    # ── Parse TABLE_MAP / FACT_MAPPINGS from the DWD load task ──
    print("解析 TABLE_MAP / FACT_MAPPINGS...")
    dwd_task_py = ETL / "tasks" / "dwd" / "dwd_load_task.py"
    table_map = parse_table_map(dwd_task_py)
    fact_mappings = parse_fact_mappings(dwd_task_py)
    print(f" TABLE_MAP: {len(table_map)} 条, FACT_MAPPINGS: {len(fact_mappings)}")
    # ── Ensure the sample cache directory exists ──
    SAMPLE_DIR.mkdir(parents=True, exist_ok=True)
    # ── Fetch API data (24-hour on-disk cache per table) ──
    print()
    print("从 API 获取数据...")
    api_data: dict[str, list[dict]] = {}
    api_fields: dict[str, OrderedDict] = {}
    for spec in ODS_SPECS:
        table = spec["table"]
        cache_file = SAMPLE_DIR / f"{table}.json"
        # Reuse cached samples younger than 24 hours.
        if cache_file.exists():
            mtime = datetime.fromtimestamp(cache_file.stat().st_mtime, tz=TZ)
            if (datetime.now(TZ) - mtime).total_seconds() < 86400:
                print(f" [{spec['code']}] {table}: 使用缓存 ({cache_file.name})")
                with open(cache_file, "r", encoding="utf-8") as f:
                    records = json.load(f)
                api_data[table] = records
                api_fields[table] = analyze_json_fields(records)
                continue
        print(f" [{spec['code']}] {table}: 请求 API...", end=" ", flush=True)
        try:
            records = fetch_records(spec, target_count=200)
            api_data[table] = records
            print(f"获取 {len(records)}")
            # Fix: only cache non-empty results. fetch_records swallows
            # request errors and returns [], and caching that empty list
            # would pin a transient API failure for the next 24 hours.
            if records:
                with open(cache_file, "w", encoding="utf-8") as f:
                    json.dump(records, f, ensure_ascii=False, indent=2, default=str)
            # Analyze the fields for the document.
            api_fields[table] = analyze_json_fields(records)
            # Throttle between endpoints to avoid rate limiting.
            time.sleep(0.5)
        except Exception as e:
            print(f"失败: {e}")
            api_data[table] = []
            api_fields[table] = OrderedDict()
    # ── Generate the document ──
    print()
    print("生成文档...")
    doc = generate_document(
        specs=ODS_SPECS,
        api_data=api_data,
        api_fields=api_fields,
        ods_ddl=ods_ddl,
        dwd_ddl=dwd_ddl,
        table_map=table_map,
        fact_mappings=fact_mappings,
    )
    OUT.parent.mkdir(parents=True, exist_ok=True)
    OUT.write_text(doc, encoding="utf-8")
    line_count = doc.count("\n") + 1
    print(f"文档已生成: {OUT}")
    print(f" 总行数: {line_count}")
    print(f" API 样本缓存: {SAMPLE_DIR}")
    print()
    # Summary of how many specs yielded data.
    ok = sum(1 for v in api_data.values() if v)
    fail = len(ODS_SPECS) - ok
    print(f"完成: {ok} 个表成功获取数据, {fail} 个未获取")


if __name__ == "__main__":
    main()