# Source listing header (GitHub web-UI artifacts, preserved as comments):
#   Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/full_api_refresh_v2.py
#   635 lines, 23 KiB, Python (Raw / Blame / History view).
# NOTE: GitHub flagged this file as containing invisible and confusable
# Unicode characters; several punctuation marks appear to have been
# stripped from this copy — review against the original before editing.
# -*- coding: utf-8 -*-
"""
全量 API JSON 刷新 + 字段分析 + MD 文档完善 + 对比报告v2
时间范围2026-01-01 00:00:00 ~ 2026-02-13 00:00:00每接口 100 条
改进点(相比 v1
- siteProfile/tableProfile 等嵌套对象MD 中已记录为 object 则不展开子字段
- 请求参数与响应字段分开对比
- 只对比顶层业务字段
- 真正缺失的新字段才补充到 MD
用法python scripts/full_api_refresh_v2.py
"""
import json
import os
import re
import sys
import time
from datetime import datetime
import requests
# ── Configuration ─────────────────────────────────────────────────────────
API_BASE = "https://pc.ficoo.vip/apiprod/admin/v1/"
API_TOKEN = os.environ.get("API_TOKEN", "")
if not API_TOKEN:
    # Fall back to an API_TOKEN=... line in the .env file one level up.
    env_path = os.path.join(os.path.dirname(__file__), "..", ".env")
    if os.path.exists(env_path):
        with open(env_path, "r", encoding="utf-8") as env_file:
            for raw_line in env_file:
                stripped = raw_line.strip()
                if stripped.startswith("API_TOKEN="):
                    API_TOKEN = stripped.split("=", 1)[1].strip()
                    break
SITE_ID = 2790685415443269
START_TIME = "2026-01-01 00:00:00"
END_TIME = "2026-02-13 00:00:00"
LIMIT = 100
SAMPLES_DIR = os.path.join("docs", "api-reference", "samples")
DOCS_DIR = os.path.join("docs", "api-reference")
REPORT_DIR = os.path.join("docs", "reports")
REGISTRY_PATH = os.path.join("docs", "api-reference", "api_registry.json")
HEADERS = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json",
}
# Known nested-object field names: the MD docs record these as "object"
# and their child fields are never expanded individually.
KNOWN_NESTED_OBJECTS = {
    "siteProfile", "tableProfile", "settleList",
    "goodsStockWarningInfo", "goodsCategoryList",
}
def load_registry():
    """Read and return the endpoint registry (a list of entries) from REGISTRY_PATH."""
    with open(REGISTRY_PATH, "r", encoding="utf-8") as handle:
        return json.load(handle)
def call_api(module, action, body):
    """POST *body* to API_BASE/<module>/<action> and return the parsed JSON.

    Best-effort: any request failure, non-2xx status, or undecodable JSON
    is logged and reported as None so the caller can record the endpoint
    as errored and continue.
    """
    url = f"{API_BASE}{module}/{action}"
    try:
        resp = requests.post(url, json=body, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except (requests.RequestException, ValueError) as e:
        # Narrowed from a bare `except Exception`: transport/HTTP errors and
        # JSON-decode errors (ValueError) are expected and swallowed, while
        # programming errors (TypeError, NameError, ...) now propagate.
        print(f" ❌ 请求失败: {e}")
        return None
def build_body(entry):
    """Build the POST body for one registry entry.

    Starts from the entry's static body, then injects the global time
    range (when the entry declares `time_range` and at least two
    `time_keys`) and pagination parameters (page=1, limit=LIMIT) when the
    entry declares `pagination`.
    """
    body = dict(entry.get("body") or {})
    if entry.get("time_range"):
        time_keys = entry.get("time_keys") or []
        if len(time_keys) >= 2:
            body[time_keys[0]] = START_TIME
            body[time_keys[1]] = END_TIME
    pagination = entry.get("pagination")
    if pagination:
        body[pagination.get("page_key", "page")] = 1
        body[pagination.get("limit_key", "limit")] = LIMIT
    return body
def unwrap_records(raw_json, entry):
    """Extract the list of business records from a raw API response.

    Resolution order: the balance-overview special case (the `data` dict
    itself is the single record), then the entry's configured `data_path`
    (e.g. "data.page.list"), then a fallback scan for the first
    list-valued key of `data` (ignoring "total"), then `data` itself if
    it is already a list. Returns [] when nothing matches.
    """
    if raw_json is None:
        return []
    data = raw_json.get("data")
    if data is None:
        return []

    # tenant_member_balance_overview: `data` is the summary object itself.
    if entry["id"] == "tenant_member_balance_overview":
        return [data] if isinstance(data, dict) else []

    # Walk the dotted data_path below the top-level "data" key.
    data_path = entry.get("data_path", "")
    if data_path and data_path.startswith("data."):
        node = data
        for key in data_path.split(".")[1:]:
            if not isinstance(node, dict):
                node = None
                break
            node = node.get(key)
        if isinstance(node, list):
            return node

    # Fallback heuristics.
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list) and key.lower() != "total":
                return value
    if isinstance(data, list):
        return data
    return []
def get_top_level_fields(record):
    """Return {field_name: type_name} for the top-level keys of *record*.

    Nested dicts and lists are labelled "object"/"array" and are never
    recursed into. Non-dict input yields an empty mapping.
    """
    if not isinstance(record, dict):
        return {}

    def type_name(value):
        # bool is tested before int: bool is an int subclass in Python.
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, dict):
            return "object"
        if isinstance(value, list):
            return "array"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        return "null" if value is None else "string"

    return {key: type_name(value) for key, value in record.items()}
def get_nested_fields(record, parent_key):
    """Return {"<parent_key>.<child>": type_name} for one nested object.

    If record[parent_key] is not a dict the result is empty. Child values
    are typed like the top-level fields (objects/arrays not expanded).
    """
    obj = record.get(parent_key)
    if not isinstance(obj, dict):
        return {}

    def type_name(value):
        # bool before int: bool is an int subclass.
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, dict):
            return "object"
        if isinstance(value, list):
            return "array"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        return "null" if value is None else "string"

    return {f"{parent_key}.{key}": type_name(value) for key, value in obj.items()}
def select_top5_richest(records):
    """Select up to 5 dict records with the most top-level fields.

    Ranked by field count, then serialized JSON length, both descending;
    ties keep the earlier record first (Python's sort is stable).
    Non-dict entries are ignored.
    """
    if not records:
        return []
    ranked = [
        (len(rec), len(json.dumps(rec, ensure_ascii=False)), idx, rec)
        for idx, rec in enumerate(records)
        if isinstance(rec, dict)
    ]
    ranked.sort(key=lambda item: (item[0], item[1]), reverse=True)
    return [rec for _count, _size, _idx, rec in ranked[:5]]
def collect_all_top_fields(records):
    """Aggregate top-level fields across all records.

    Returns {name: {"type", "count", "example"}} where "type" comes from
    the first record containing the field, "count" is how many records
    contain it, and "example" is the first non-empty, non-zero scalar
    value seen (stringified, truncated to 80 chars). Non-dict records are
    skipped.
    """
    summary = {}
    for record in records:
        if not isinstance(record, dict):
            continue
        for name, value in record.items():
            # Coarse JSON type; bool before int (bool is an int subclass).
            if isinstance(value, bool):
                type_name = "boolean"
            elif isinstance(value, dict):
                type_name = "object"
            elif isinstance(value, list):
                type_name = "array"
            elif isinstance(value, int):
                type_name = "integer"
            elif isinstance(value, float):
                type_name = "number"
            elif value is None:
                type_name = "null"
            else:
                type_name = "string"
            entry = summary.setdefault(
                name, {"type": type_name, "count": 0, "example": None}
            )
            entry["count"] += 1
            if entry["example"] is not None:
                continue
            if value is None or value == "" or value == 0 or isinstance(value, (dict, list)):
                continue
            text = str(value)
            entry["example"] = text if len(text) <= 80 else text[:77] + "..."
    return summary
def collect_nested_fields(records, parent_key):
    """Aggregate the child fields of one nested object across all records.

    For every record whose *parent_key* value is a dict, each child key is
    reported as "<parent_key>.<child>" with its JSON type (from the first
    occurrence), an occurrence count, and the first meaningful scalar
    example value (stringified, truncated to 80 chars).

    Fix over v2: the example value is read directly from the child entry.
    The old code re-derived the key via path.split(".")[-1], which fetched
    the wrong (or no) value for child keys that themselves contain a dot.
    """
    all_fields = {}
    for rec in records:
        if not isinstance(rec, dict):
            continue
        obj = rec.get(parent_key)
        if not isinstance(obj, dict):
            continue
        for child, val in obj.items():
            path = f"{parent_key}.{child}"
            # Coarse JSON type; bool before int (bool is an int subclass).
            if isinstance(val, bool):
                typ = "boolean"
            elif isinstance(val, dict):
                typ = "object"
            elif isinstance(val, list):
                typ = "array"
            elif isinstance(val, int):
                typ = "integer"
            elif isinstance(val, float):
                typ = "number"
            elif val is None:
                typ = "null"
            else:
                typ = "string"
            info = all_fields.setdefault(
                path, {"type": typ, "count": 0, "example": None}
            )
            info["count"] += 1
            if info["example"] is None:
                if val is not None and val != "" and val != 0 and not isinstance(val, (dict, list)):
                    ex = str(val)
                    if len(ex) > 80:
                        ex = ex[:77] + "..."
                    info["example"] = ex
    return all_fields
def extract_md_response_fields(table_name):
    """Extract documented field names from the table's MD file.

    Only the "响应字段" (response fields) section is scanned when one is
    present; otherwise the whole document is scanned as a fallback.

    Returns:
        (response_fields, nested_fields, content) where response_fields is
        a set of top-level names, nested_fields a set of dotted paths such
        as "siteProfile.xxx", and content the raw document text. A missing
        file yields (set(), set(), "").
    """
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return set(), set(), ""
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    # First backtick-quoted cell of a Markdown table row.
    field_pattern = re.compile(r'^\|\s*`([^`]+)`\s*\|', re.MULTILINE)
    # Header-cell labels that must not be collected as field names.
    # Lower-cased once, hoisted out of the match loop (was rebuilt per match).
    header_names = {h.lower() for h in (
        "字段名", "类型", "示例值", "说明", "field", "example",
        "description", "type", "路径", "参数", "必填", "属性", "",
    )}

    def harvest(text):
        """Collect (top_level, nested) field-name sets from *text*."""
        top, nested = set(), set()
        for match in field_pattern.finditer(text):
            raw = match.group(1).strip()
            if raw.lower() in header_names:
                continue
            (nested if "." in raw else top).add(raw)
        return top, nested

    # Locate the boundaries of the "四、响应字段" section.
    lines = content.split("\n")
    in_response = False
    response_start = None
    response_end = len(lines)
    for i, line in enumerate(lines):
        s = line.strip()
        if ("## 四" in s or "## 4" in s) and "响应字段" in s:
            in_response = True
            response_start = i
            continue
        if in_response and s.startswith("## ") and "响应字段" not in s:
            response_end = i
            break

    if response_start is None:
        # No explicit response-field section: scan the whole document.
        response_fields, nested_fields = harvest(content)
    else:
        section = "\n".join(lines[response_start:response_end])
        response_fields, nested_fields = harvest(section)
    return response_fields, nested_fields, content
def compare_fields(json_fields, md_fields, md_nested_fields, table_name):
    """Compare observed JSON fields against documented MD fields.

    Args:
        json_fields: {name: info} from collect_all_top_fields.
        md_fields: iterable of documented top-level field names.
        md_nested_fields, table_name: kept for interface compatibility;
            not consulted.

    Returns:
        (missing_in_md, extra_in_md): sorted [(name, info)] pairs present
        in JSON but undocumented, and sorted names documented but absent
        from this JSON sample.
    """
    json_names = set(json_fields)
    md_names = set(md_fields)
    # Dead branch removed: the original skipped names that were both in
    # KNOWN_NESTED_OBJECTS *and* in md_names, but every name iterated here
    # comes from (json_names - md_names), so it can never be in md_names.
    missing_in_md = [
        (name, json_fields[name]) for name in sorted(json_names - md_names)
    ]
    extra_in_md = sorted(md_names - json_names)
    return missing_in_md, extra_in_md
def save_top5_sample(table_name, top5):
    """Persist the selected richest records as a pretty-printed JSON sample.

    Writes to SAMPLES_DIR/<table_name>.json and returns that path.
    """
    target = os.path.join(SAMPLES_DIR, f"{table_name}.json")
    payload = json.dumps(top5, ensure_ascii=False, indent=2)
    with open(target, "w", encoding="utf-8") as fh:
        fh.write(payload)
    return target
def update_md_with_missing_fields(table_name, missing_fields, md_content):
    """Append newly discovered fields to the MD doc's response-field table.

    Args:
        table_name: registry id; the doc lives at DOCS_DIR/<table_name>.md.
        missing_fields: [(name, info)] pairs as produced by compare_fields.
        md_content: the doc's current text (already read by the caller).

    Returns:
        True if rows were inserted and the file rewritten, else False.
    """
    if not missing_fields:
        return False
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return False
    lines = md_content.split("\n")
    # Find the last table row of the response-field section; new rows go
    # right after it.
    insert_idx = None
    in_response = False
    last_table_row = None
    for i, line in enumerate(lines):
        s = line.strip()
        if ("## 四" in s or "## 4" in s) and "响应字段" in s:
            in_response = True
            continue
        if in_response and s.startswith("## ") and "响应字段" not in s:
            # Left the response section: freeze the insertion point.
            insert_idx = last_table_row
            break
        if in_response and s.startswith("|") and "---" not in s:
            # Track the latest data row. Rows containing header labels
            # (字段名/字段/类型/说明) only count when no row was seen yet.
            if not any(h in s for h in ["字段名", "字段", "类型", "说明"]):
                last_table_row = i
            elif last_table_row is None:
                last_table_row = i
    if insert_idx is None and last_table_row is not None:
        # Response section ran to end of file.
        insert_idx = last_table_row
    if insert_idx is None:
        return False
    new_rows = []
    for name, info in missing_fields:
        typ = info["type"]
        example = info["example"] or ""
        count = info["count"]
        new_rows.append(
            f"| `{name}` | {typ} | {example} | "
            f"(新发现字段,{count}/{LIMIT} 条记录中出现) |"
        )
    # Insert in reverse so the final order matches missing_fields.
    for row in reversed(new_rows):
        lines.insert(insert_idx + 1, row)
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return True
def generate_report(results):
    """Render the final JSON-vs-MD comparison report as Markdown text.

    Args:
        results: list of per-endpoint result dicts built in main().

    Returns:
        The complete Markdown document as one string.
    """
    lines = []
    lines.append("# API JSON 字段 vs MD 文档对比报告")
    lines.append("")
    lines.append(f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (Asia/Shanghai)")
    lines.append(f"数据范围:{START_TIME} ~ {END_TIME}")
    lines.append(f"每接口获取:{LIMIT}")
    lines.append("")
    # Summary counts by status.
    ok = sum(1 for r in results if r["status"] == "ok")
    gap = sum(1 for r in results if r["status"] == "gap")
    skip = sum(1 for r in results if r["status"] == "skipped")
    err = sum(1 for r in results if r["status"] == "error")
    lines.append("## 汇总")
    lines.append("")
    lines.append("| 状态 | 数量 |")
    lines.append("|------|------|")
    lines.append(f"| ✅ 完全一致 | {ok} |")
    lines.append(f"| ⚠️ 有新字段(已补充) | {gap} |")
    lines.append(f"| ⏭️ 跳过 | {skip} |")
    lines.append(f"| 💥 错误 | {err} |")
    lines.append(f"| 合计 | {len(results)} |")
    lines.append("")
    # Per-endpoint detail sections.
    lines.append("## 各接口详情")
    lines.append("")
    for r in results:
        # NOTE(review): the "ok" icon is an empty string while the summary
        # table uses ✅ — possibly a character stripped in this copy;
        # confirm against the original source.
        icon = {"ok": "", "gap": "⚠️", "skipped": "⏭️", "error": "💥"}.get(r["status"], "")
        lines.append(f"### {r['table']} ({r.get('name_zh', '')})")
        lines.append("")
        lines.append(f"| 项目 | 值 |")
        lines.append(f"|------|-----|")
        lines.append(f"| 状态 | {icon} {r['status']} |")
        lines.append(f"| 获取记录数 | {r['record_count']} |")
        lines.append(f"| JSON 顶层字段数 | {r['json_field_count']} |")
        lines.append(f"| MD 响应字段数 | {r['md_field_count']} |")
        lines.append(f"| 数据路径 | `{r.get('data_path', 'N/A')}` |")
        if r.get("top5_field_counts"):
            lines.append(f"| 前5条最全记录字段数 | {r['top5_field_counts']} |")
        lines.append("")
        if r.get("missing_in_md"):
            lines.append("新发现字段(已补充到 MD")
            lines.append("")
            lines.append("| 字段名 | 类型 | 示例 | 出现次数 |")
            lines.append("|--------|------|------|----------|")
            for name, info in r["missing_in_md"]:
                lines.append(f"| `{name}` | {info['type']} | {info.get('example', '')} | {info['count']} |")
            lines.append("")
        if r.get("extra_in_md"):
            lines.append(f"MD 中有但本次 JSON 未出现的字段(可能为条件性字段):`{'`, `'.join(r['extra_in_md'])}`")
            lines.append("")
        # Nested-object child-field summary (objects are not expanded).
        if r.get("nested_summary"):
            for parent, count in r["nested_summary"].items():
                lines.append(f"嵌套对象 `{parent}` 含 {count} 个子字段MD 中已记录为 object不逐字段展开")
            lines.append("")
    # Appendix: common siteProfile snapshot fields shared across endpoints.
    lines.append("## 附录siteProfile 通用字段参考")
    lines.append("")
    lines.append("以下字段在大多数接口的 `siteProfile` 嵌套对象中出现,为门店信息快照(冗余),各接口结构一致:")
    lines.append("")
    lines.append("| 字段 | 类型 | 说明 |")
    lines.append("|------|------|------|")
    lines.append("| `id` | integer | 门店 ID |")
    lines.append("| `org_id` | integer | 组织 ID |")
    lines.append("| `shop_name` | string | 门店名称 |")
    lines.append("| `avatar` | string | 门店头像 URL |")
    lines.append("| `business_tel` | string | 门店电话 |")
    lines.append("| `full_address` | string | 完整地址 |")
    lines.append("| `address` | string | 简短地址 |")
    lines.append("| `longitude` | number | 经度 |")
    lines.append("| `latitude` | number | 纬度 |")
    lines.append("| `tenant_site_region_id` | integer | 区域 ID |")
    lines.append("| `tenant_id` | integer | 租户 ID |")
    lines.append("| `auto_light` | integer | 自动开灯 |")
    lines.append("| `attendance_distance` | integer | 考勤距离 |")
    lines.append("| `attendance_enabled` | integer | 考勤启用 |")
    lines.append("| `wifi_name` | string | WiFi 名称 |")
    lines.append("| `wifi_password` | string | WiFi 密码 |")
    lines.append("| `customer_service_qrcode` | string | 客服二维码 |")
    lines.append("| `customer_service_wechat` | string | 客服微信 |")
    lines.append("| `fixed_pay_qrCode` | string | 固定支付二维码 |")
    lines.append("| `prod_env` | integer | 生产环境标识 |")
    lines.append("| `light_status` | integer | 灯光状态 |")
    lines.append("| `light_type` | integer | 灯光类型 |")
    lines.append("| `light_token` | string | 灯光控制 token |")
    lines.append("| `site_type` | integer | 门店类型 |")
    lines.append("| `site_label` | string | 门店标签 |")
    lines.append("| `shop_status` | integer | 门店状态 |")
    lines.append("")
    return "\n".join(lines)
def main():
    """Drive the full refresh.

    For each registry entry: load a cached raw response (or fetch and
    cache one), extract records, pick the 5 richest as the JSON sample,
    compare observed fields against the MD doc, append genuinely new
    fields to the doc, and finally write a Markdown report plus a JSON
    detail file under REPORT_DIR.
    """
    registry = load_registry()
    print(f"加载 API 注册表: {len(registry)} 个端点")
    print(f"时间范围: {START_TIME} ~ {END_TIME}")
    print(f"每接口获取: {LIMIT}")
    print("=" * 80)
    results = []
    for entry in registry:
        table_name = entry["id"]
        name_zh = entry.get("name_zh", "")
        module = entry["module"]
        action = entry["action"]
        skip = entry.get("skip", False)
        # NOTE(review): '' * 60 prints nothing — the separator character
        # was likely stripped from this copy; confirm against the original.
        print(f"\n{'' * 60}")
        print(f"[{table_name}] {name_zh}{module}/{action}")
        if skip:
            print(" ⏭️ 跳过")
            results.append({
                "table": table_name, "name_zh": name_zh,
                "status": "skipped", "record_count": 0,
                "json_field_count": 0, "md_field_count": 0,
                "data_path": entry.get("data_path"),
            })
            continue
        # Reuse the cached raw JSON response if a previous step saved one.
        raw_path = os.path.join(SAMPLES_DIR, f"{table_name}_raw.json")
        if os.path.exists(raw_path):
            with open(raw_path, "r", encoding="utf-8") as f:
                raw = json.load(f)
            print(f" 使用已缓存的原始响应")
        else:
            body = build_body(entry)
            print(f" 请求: POST {module}/{action}")
            raw = call_api(module, action, body)
            if raw:
                with open(raw_path, "w", encoding="utf-8") as f:
                    json.dump(raw, f, ensure_ascii=False, indent=2)
        if raw is None:
            results.append({
                "table": table_name, "name_zh": name_zh,
                "status": "error", "record_count": 0,
                "json_field_count": 0, "md_field_count": 0,
                "data_path": entry.get("data_path"),
            })
            continue
        records = unwrap_records(raw, entry)
        print(f" 记录数: {len(records)}")
        if not records:
            results.append({
                "table": table_name, "name_zh": name_zh,
                "status": "ok", "record_count": 0,
                "json_field_count": 0, "md_field_count": 0,
                "data_path": entry.get("data_path"),
            })
            continue
        # Pick the 5 records with the most top-level fields.
        top5 = select_top5_richest(records)
        top5_counts = [len(r) for r in top5]
        print(f" 前 5 条最全记录顶层字段数: {top5_counts}")
        # Save them as the endpoint's JSON sample.
        save_top5_sample(table_name, top5)
        # Collect every top-level field across all records.
        json_fields = collect_all_top_fields(records)
        print(f" JSON 顶层字段数: {len(json_fields)}")
        # Collect nested-object child fields (report only; not compared).
        nested_summary = {}
        for name, info in json_fields.items():
            if info["type"] == "object" and name in KNOWN_NESTED_OBJECTS:
                nested = collect_nested_fields(records, name)
                nested_summary[name] = len(nested)
        # Extract the documented response fields from the MD doc.
        md_fields, md_nested, md_content = extract_md_response_fields(table_name)
        print(f" MD 响应字段数: {len(md_fields)}")
        # Compare observed JSON fields with the documented ones.
        missing_in_md, extra_in_md = compare_fields(json_fields, md_fields, md_nested, table_name)
        # Intended: drop known nested objects already documented as object.
        # NOTE(review): every n in missing_in_md is absent from md_fields
        # by construction, so `n not in md_fields` is always True and this
        # filter never removes anything — confirm the intended condition.
        real_missing = [(n, i) for n, i in missing_in_md
                        if n not in KNOWN_NESTED_OBJECTS or n not in md_fields]
        status = "ok" if not real_missing else "gap"
        if real_missing:
            print(f" ⚠️ 发现 {len(real_missing)} 个新字段:")
            for name, info in real_missing:
                print(f" + {name} ({info['type']}, {info['count']}次)")
            # Append the new fields to the MD document.
            updated = update_md_with_missing_fields(table_name, real_missing, md_content)
            if updated:
                print(f" 📝 已补充到 MD 文档")
        else:
            print(f" ✅ 字段完全覆盖")
        if extra_in_md:
            print(f" MD 多 {len(extra_in_md)} 个条件性字段")
        results.append({
            "table": table_name, "name_zh": name_zh,
            "status": status,
            "record_count": len(records),
            "json_field_count": len(json_fields),
            "md_field_count": len(md_fields),
            "data_path": entry.get("data_path"),
            "missing_in_md": real_missing,
            "extra_in_md": extra_in_md,
            "top5_field_counts": top5_counts,
            "nested_summary": nested_summary,
        })
    # ── Generate the Markdown report ──
    print(f"\n{'=' * 80}")
    print("生成对比报告...")
    report = generate_report(results)
    os.makedirs(REPORT_DIR, exist_ok=True)
    report_path = os.path.join(REPORT_DIR, "api_json_vs_md_report_20260214.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"报告: {report_path}")
    # Machine-readable detail file (strip example strings to stay compact).
    json_path = os.path.join(REPORT_DIR, "api_refresh_detail_20260214.json")
    serializable = []
    for r in results:
        sr = dict(r)
        if "missing_in_md" in sr and sr["missing_in_md"]:
            sr["missing_in_md"] = [(n, {"type": i["type"], "count": i["count"]})
                                   for n, i in sr["missing_in_md"]]
        serializable.append(sr)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=2)
    # Final console summary by status.
    ok = sum(1 for r in results if r["status"] == "ok")
    gap = sum(1 for r in results if r["status"] == "gap")
    skip = sum(1 for r in results if r["status"] == "skipped")
    err = sum(1 for r in results if r["status"] == "error")
    print(f"\n汇总: ✅ {ok} | ⚠️ {gap} | ⏭️ {skip} | 💥 {err}")
if __name__ == "__main__":
main()