Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/refresh_json_and_audit.py

# -*- coding: utf-8 -*-
"""
重新获取全部 API 接口的 JSON 数据（最多 100 条），
遍历所有记录提取最全字段集合，
与 .md 文档比对并输出差异报告。

时间范围：2026-01-01 00:00:00 ~ 2026-02-13 00:00:00

用法：python scripts/refresh_json_and_audit.py
"""
import json
import os
import re
import sys
import time
import requests

# ── Configuration ─────────────────────────────────────────────────────────
API_BASE = "https://pc.ficoo.vip/apiprod/admin/v1/"
# The token is taken from the environment first; when it is empty, fall back
# to the first "API_TOKEN=" line of the .env file one directory above this
# script.
API_TOKEN = os.environ.get("API_TOKEN", "")
if not API_TOKEN:
    env_path = os.path.join(os.path.dirname(__file__), "..", ".env")
    if os.path.exists(env_path):
        with open(env_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith("API_TOKEN="):
                    API_TOKEN = line.split("=", 1)[1].strip()
                    break

SITE_ID = 2790685415443269
# Time window written into request bodies by build_body().
START_TIME = "2026-01-01 00:00:00"
END_TIME = "2026-02-13 00:00:00"
# Page size requested from every paginated endpoint.
LIMIT = 100

# Paths are relative, so they resolve against the current working directory.
SAMPLES_DIR = os.path.join("docs", "api-reference", "samples")
DOCS_DIR = os.path.join("docs", "api-reference")
REPORT_DIR = os.path.join("docs", "reports")

# Built after the token has been resolved above; sent with every request.
HEADERS = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json",
}

REGISTRY_PATH = os.path.join("docs", "api-reference", "api_registry.json")

# Envelope / container keys that are not counted as business fields when
# comparing JSON with the docs (original and lower-case spellings).
WRAPPER_FIELDS = {"settleList", "siteProfile", "tableProfile",
                  "goodsCategoryList", "data", "code", "msg",
                  "settlelist", "siteprofile", "tableprofile",
                  "goodscategorylist"}

# Table-header cell texts that must not be mistaken for field names when
# parsing the Markdown docs.
CROSS_REF_HEADERS = {"字段名", "类型", "示例值", "说明", "field", "example",
                     "description"}

# Name of the list field each endpoint actually returns (found by debugging)
ACTUAL_LIST_KEY = {
    "assistant_accounts_master": "assistantInfos",
    "assistant_service_records": "orderAssistantDetails",
    "assistant_cancellation_records": "abolitionAssistants",
    "table_fee_transactions": "siteTableUseDetailsList",
    "table_fee_discount_records": "taiFeeAdjustInfos",
    "tenant_goods_master": "tenantGoodsList",
    "store_goods_sales_records": "orderGoodsLedgers",
    "store_goods_master": "orderGoodsList",
    "goods_stock_movements": "queryDeliveryRecordsList",
    "member_profiles": "tenantMemberInfos",
    "member_stored_value_cards": "tenantMemberCards",
    "member_balance_changes": "tenantMemberCardLogs",
    "group_buy_packages": "packageCouponList",
    "group_buy_redemption_records": "siteTableUseDetailsList",
    "site_tables_master": "siteTables",
    # The entries below use "list" or a special path
    "payment_transactions": "list",
    "refund_transactions": "list",
    "platform_coupon_redemption_records": "list",
    "goods_stock_summary": "list",
    "settlement_records": "settleList",
    "recharge_settlements": "settleList",
}


def load_registry():
    """Parse the JSON file at ``REGISTRY_PATH`` and return its content."""
    with open(REGISTRY_PATH, encoding="utf-8") as registry_file:
        registry = json.load(registry_file)
    return registry


def call_api(module, action, body):
    """POST ``body`` as JSON to ``{API_BASE}{module}/{action}``.

    Returns the decoded JSON response, or ``None`` when the request, the
    HTTP status check or the JSON decoding fails (the error is printed).
    """
    endpoint = f"{API_BASE}{module}/{action}"
    try:
        response = requests.post(
            endpoint, json=body, headers=HEADERS, timeout=30
        )
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        # Best effort: one failing endpoint must not abort the whole run.
        print(f"  ❌ 请求失败: {exc}")
        return None
    return payload


def unwrap_records(raw_json, table_name):
    """Extract the list of business records from a raw API response.

    Args:
        raw_json: Decoded JSON response, or ``None``. Anything that is not a
            dict (e.g. a top-level JSON array) yields an empty list.
        table_name: Registry id of the endpoint; selects the special-case
            handling and the expected list key in ``ACTUAL_LIST_KEY``.

    Returns:
        A list of records; empty when nothing usable is found.
    """
    # A non-dict response has no "data" envelope to unwrap.
    if not isinstance(raw_json, dict):
        return []

    data = raw_json.get("data")
    if data is None:
        return []

    # ── Special table: stock_goods_category_tree ──
    if table_name == "stock_goods_category_tree":
        if isinstance(data, dict):
            cats = data.get("goodsCategoryList", [])
            return cats if isinstance(cats, list) else []
        return []

    # ── Special table: role_area_association ──
    if table_name == "role_area_association":
        if isinstance(data, dict):
            rels = data.get("roleAreaRelations", [])
            return rels if isinstance(rels, list) else []
        return []

    # ── Special table: tenant_member_balance_overview ──
    # The response is a summary object plus rechargeCardList/giveCardList.
    if table_name == "tenant_member_balance_overview":
        if isinstance(data, dict):
            # The top-level object counts as one record, followed by the
            # items of both nested lists.
            records = [data]
            for list_key in ("rechargeCardList", "giveCardList"):
                items = data.get(list_key, [])
                if isinstance(items, list):
                    records.extend(items)
            return records
        return []

    # ── settlement_records / recharge_settlements ──
    # data.settleList is the record list; no generic fallback is applied.
    if table_name in ("settlement_records", "recharge_settlements"):
        if isinstance(data, dict):
            settle_list = data.get("settleList", [])
            if isinstance(settle_list, list):
                return settle_list
        return []

    # ── Generic: data is a dict, look for the list field inside it ──
    if isinstance(data, dict):
        list_key = ACTUAL_LIST_KEY.get(table_name, "list")
        # No default here: a missing key must fall through to the fallback
        # below instead of short-circuiting with an empty list.
        items = data.get(list_key)
        if isinstance(items, list):
            return items
        # Fallback: first list-valued field, same rule as
        # discover_actual_data_path (case-insensitive "total" exclusion).
        for k, v in data.items():
            if isinstance(v, list) and k.lower() != "total":
                return v
        return []

    if isinstance(data, list):
        return data

    return []


def extract_all_fields(records, table_name):
    """Return the union of lower-cased field names found across ``records``.

    Non-dict records are ignored. Three key-filtering modes exist:

    * settlement_records / recharge_settlements: keys are read from the
      record's inner ``settleList`` (its first element when it is a list, the
      record itself when the key is absent); wrapper keys are dropped except
      ``siteprofile``.
    * tenant_member_balance_overview: every key is kept except the two
      nested list keys themselves.
    * all other tables: wrapper keys are dropped except ``siteprofile`` and
      ``tableprofile``.
    """
    settlement_tables = ("settlement_records", "recharge_settlements")
    nested_list_keys = ("rechargecardlist", "givecardlist")
    kept_wrappers = ("siteprofile", "tableprofile")
    collected = set()

    for rec in records:
        if not isinstance(rec, dict):
            continue

        if table_name in settlement_tables:
            inner = rec.get("settleList", rec)
            if isinstance(inner, list):
                inner = inner[0] if inner else {}
            if isinstance(inner, dict):
                collected.update(
                    name
                    for name in (key.lower() for key in inner)
                    if name == "siteprofile" or name not in WRAPPER_FIELDS
                )
        elif table_name == "tenant_member_balance_overview":
            collected.update(
                name
                for name in (key.lower() for key in rec)
                if name not in nested_list_keys
            )
        else:
            collected.update(
                name
                for name in (key.lower() for key in rec)
                if name in kept_wrappers or name not in WRAPPER_FIELDS
            )

    return collected


def extract_md_fields(table_name):
    """Collect lower-cased field names from the response-fields section of
    ``<DOCS_DIR>/<table_name>.md``.

    Only table rows whose first cell is a back-quoted name are considered,
    and only between the "## 四、…响应字段…" heading and the next "## "
    heading. Header words and wrapper keys (other than ``siteprofile`` /
    ``tableprofile``) are skipped. For the two settlement tables, rows under
    a "### …siteProfile…" sub-heading and names starting with
    ``siteProfile.`` are skipped as well.

    Returns an empty set when the document does not exist.
    """
    doc_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(doc_path):
        return set()

    with open(doc_path, "r", encoding="utf-8") as handle:
        doc_lines = handle.readlines()

    row_re = re.compile(r'^\|\s*`([^`]+)`\s*\|')
    site_heading_re = re.compile(r'^###.*siteProfile', re.IGNORECASE)
    header_words = {word.lower() for word in CROSS_REF_HEADERS}
    kept_wrappers = ("siteprofile", "tableprofile")
    is_settlement = table_name in ("settlement_records", "recharge_settlements")

    found = set()
    inside_section = False
    inside_site_profile = False

    for raw_line in doc_lines:
        text = raw_line.strip()

        # Entering the response-fields chapter.
        if text.startswith("## 四、") and "响应字段" in text:
            inside_section = True
            inside_site_profile = False
            continue

        if not inside_section:
            continue

        # Any other level-2 heading ends the chapter.
        if text.startswith("## ") and not text.startswith("## 四"):
            break

        if is_settlement:
            if site_heading_re.search(text):
                inside_site_profile = True
                continue
            # A different level-3 heading closes the siteProfile sub-section.
            if inside_site_profile and text.startswith("### "):
                inside_site_profile = False

        hit = row_re.match(text)
        if hit is None:
            continue

        name = hit.group(1).strip()
        lowered = name.lower()
        if lowered in header_words:
            continue
        if is_settlement and (
                inside_site_profile or name.startswith("siteProfile.")):
            continue
        if lowered in WRAPPER_FIELDS and lowered not in kept_wrappers:
            continue
        found.add(lowered)

    return found


def build_body(entry):
    """Build the request body for one registry entry.

    Starts from a shallow copy of ``entry["body"]`` (empty when missing),
    writes ``START_TIME`` / ``END_TIME`` under the first two ``time_keys``
    when ``time_range`` is set, and adds page number 1 and ``LIMIT`` when the
    entry declares pagination (default key names ``page`` / ``limit``).
    """
    payload = dict(entry.get("body") or {})

    time_keys = entry.get("time_keys")
    if entry.get("time_range") and time_keys and len(time_keys) >= 2:
        start_key, end_key = time_keys[0], time_keys[1]
        payload[start_key] = START_TIME
        payload[end_key] = END_TIME

    paging = entry.get("pagination")
    if paging:
        payload[paging.get("page_key", "page")] = 1
        payload[paging.get("limit_key", "limit")] = LIMIT

    return payload


def save_sample(table_name, records):
    """Write the first record to ``<SAMPLES_DIR>/<table_name>.json``.

    Nothing is written when ``records`` is empty or its first element is not
    a dict. The samples directory is created on demand so that a fresh
    checkout does not fail with ``FileNotFoundError``.

    Returns:
        The sample path, whether or not a file was written.
    """
    sample_path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
    if records and isinstance(records[0], dict):
        # "or '.'" keeps makedirs valid if the path has no directory part.
        os.makedirs(os.path.dirname(sample_path) or ".", exist_ok=True)
        with open(sample_path, "w", encoding="utf-8") as f:
            json.dump(records[0], f, ensure_ascii=False, indent=2)
    return sample_path


def discover_actual_data_path(raw_json, table_name):
    """Return the dotted path of the record container in an API response.

    ``None`` is returned when the response is empty, has no ``data`` value,
    or no list-valued field can be found. A handful of tables have a fixed
    path; all others use the key configured in ``ACTUAL_LIST_KEY`` when it is
    present in ``data``, otherwise the first list-valued key that is not
    named "total" (case-insensitive).
    """
    payload = raw_json.get("data") if raw_json else None
    if payload is None:
        return None

    # Tables whose path is fixed regardless of the payload shape.
    fixed_paths = {
        "stock_goods_category_tree": "data.goodsCategoryList",
        "role_area_association": "data.roleAreaRelations",
        "tenant_member_balance_overview": "data",  # top-level summary object
        "settlement_records": "data.settleList",
        "recharge_settlements": "data.settleList",
    }
    if table_name in fixed_paths:
        return fixed_paths[table_name]

    if not isinstance(payload, dict):
        return None

    expected = ACTUAL_LIST_KEY.get(table_name)
    if expected and expected in payload:
        return f"data.{expected}"

    # Fallback: first list-valued field.
    first_list = next(
        (
            key
            for key, value in payload.items()
            if isinstance(value, list) and key.lower() != "total"
        ),
        None,
    )
    return None if first_list is None else f"data.{first_list}"


def update_md_data_path(table_name, actual_path):
    """Update or add the response-data-path row in ``<DOCS_DIR>/<table_name>.md``.

    If the document already has a table row labelled 数据路径 / 响应数据路径 /
    data_path whose value is back-quoted, that value is replaced. Otherwise a
    new "响应数据路径" row is appended after the last table row of the
    "## 一、…接口概述" section.

    Args:
        table_name: Registry id; also the Markdown file stem.
        actual_path: Dotted data path to record, e.g. ``data.list``.

    Returns:
        ``True`` when the file was rewritten, ``False`` when the document is
        missing, already up to date, or has no overview table to extend.
    """
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return False

    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Check whether a data-path row may already exist.
    if "数据路径" in content or "data_path" in content.lower():
        # Try to update the existing row.
        pattern = re.compile(
            r'(\|\s*(?:数据路径|响应数据路径|data_path)\s*\|\s*)`[^`]*`(\s*\|)',
            re.IGNORECASE
        )
        if pattern.search(content):
            # A callable replacement inserts actual_path literally; a string
            # template would interpret backslashes in it as escapes.
            new_content = pattern.sub(
                lambda m: f"{m.group(1)}`{actual_path}`{m.group(2)}", content
            )
            if new_content != content:
                with open(md_path, "w", encoding="utf-8") as f:
                    f.write(new_content)
                return True
            return False  # already up to date

    # No data-path row yet: append one after the last table row (a line
    # starting with "|" that is not the separator) of the overview section.
    lines = content.split("\n")
    in_overview = False
    last_table_row = None

    for i, line in enumerate(lines):
        s = line.strip()
        if "## 一、" in s and "接口概述" in s:
            in_overview = True
            continue
        if in_overview and s.startswith("## "):
            break
        if in_overview and s.startswith("|") and "---" not in s:
            last_table_row = i

    if last_table_row is not None:
        new_line = f"| 响应数据路径 | `{actual_path}` |"
        lines.insert(last_table_row + 1, new_line)
        with open(md_path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        return True

    return False


def main():
    """Run the full refresh-and-audit pass over every registry entry.

    For each entry that is not marked ``skip``: call the endpoint, discover
    the actual data path, unwrap the records, save the first record as a
    sample, compare the JSON field set with the fields listed in the .md
    document, and update the data path in that document. Afterwards, changed
    data paths are written back to api_registry.json, a summary is printed,
    and the per-table results are saved as json_refresh_audit.json under
    ``REPORT_DIR``.
    """
    registry = load_registry()
    print(f"加载 API 注册表: {len(registry)} 个端点")
    print(f"时间范围: {START_TIME} ~ {END_TIME}")
    print(f"每接口获取: {LIMIT} 条")
    print("=" * 80)

    results = []
    all_gaps = []
    registry_updates = {}  # table_name -> actual_data_path

    for entry in registry:
        table_name = entry["id"]
        name_zh = entry.get("name_zh", "")
        module = entry["module"]
        action = entry["action"]
        skip = entry.get("skip", False)

        print(f"\n{'─' * 60}")
        print(f"[{table_name}] {name_zh} — {module}/{action}")

        if skip:
            print("  ⏭️ 跳过（标记为 skip）")
            # Skipped entries still get a placeholder row in the report.
            results.append({
                "table": table_name,
                "status": "skipped",
                "record_count": 0,
                "json_field_count": 0,
                "md_field_count": 0,
                "json_fields": [],
                "md_fields": [],
                "json_only": [],
                "md_only": [],
                "actual_data_path": None,
            })
            continue

        body = build_body(entry)

        print(f"  请求: POST {module}/{action}")
        raw = call_api(module, action, body)

        if raw is None:
            # call_api already printed the failure; record it and move on.
            results.append({
                "table": table_name,
                "status": "error",
                "record_count": 0,
                "json_field_count": 0,
                "md_field_count": 0,
                "json_fields": [],
                "md_fields": [],
                "json_only": [],
                "md_only": [],
                "actual_data_path": None,
            })
            continue

        # Discover the actual data path
        actual_path = discover_actual_data_path(raw, table_name)
        old_path = entry.get("data_path", "")
        if actual_path and actual_path != old_path:
            print(f"  📍 数据路径: {old_path} → {actual_path}")
            registry_updates[table_name] = actual_path
        else:
            print(f"  📍 数据路径: {actual_path or old_path}")

        records = unwrap_records(raw, table_name)
        print(f"  获取记录数: {len(records)}")

        # Save a sample (the first record)
        save_sample(table_name, records)

        # Walk all records to extract the full field set
        json_fields = extract_all_fields(records, table_name)
        md_fields = extract_md_fields(table_name)

        json_only = json_fields - md_fields
        md_only = md_fields - json_fields

        # Only fields missing from the .md count as a gap; fields that exist
        # only in the .md leave the status at "ok" and just print a warning.
        status = "ok"
        if json_only:
            status = "gap"
            print(f"  ❌ JSON 有但 .md 缺失 ({len(json_only)} 个): {sorted(json_only)}")
            all_gaps.append((table_name, name_zh, sorted(json_only)))
        else:
            if md_only:
                print(f"  ⚠️ .md 多 {len(md_only)} 个条件性字段")
            else:
                print(f"  ✅ 完全一致 ({len(json_fields)} 个字段)")

        # Update the data path in the .md document
        if actual_path:
            updated = update_md_data_path(table_name, actual_path)
            if updated:
                print(f"  📝 已更新 .md 文档数据路径")

        results.append({
            "table": table_name,
            "status": status,
            "record_count": len(records),
            "json_field_count": len(json_fields),
            "md_field_count": len(md_fields),
            "json_fields": sorted(json_fields),
            "md_fields": sorted(md_fields),
            "json_only": sorted(json_only),
            "md_only": sorted(md_only),
            "actual_data_path": actual_path,
        })

        # Short pause between successful requests.
        time.sleep(0.3)

    # ── Update data_path in api_registry.json ──
    if registry_updates:
        print(f"\n{'─' * 60}")
        print(f"更新 api_registry.json 中 {len(registry_updates)} 个 data_path...")
        for entry in registry:
            tid = entry["id"]
            if tid in registry_updates:
                entry["data_path"] = registry_updates[tid]
        with open(REGISTRY_PATH, "w", encoding="utf-8") as f:
            json.dump(registry, f, ensure_ascii=False, indent=2)
        print("  ✅ api_registry.json 已更新")

    # ── Summary ──
    print(f"\n{'=' * 80}")
    print("汇总报告")
    print(f"{'=' * 80}")

    gap_count = sum(1 for r in results if r["status"] == "gap")
    ok_count = sum(1 for r in results if r["status"] == "ok")
    skip_count = sum(1 for r in results if r["status"] == "skipped")
    err_count = sum(1 for r in results if r["status"] == "error")

    print(f"  完全一致: {ok_count}")
    print(f"  有缺失:   {gap_count}")
    print(f"  跳过:     {skip_count}")
    print(f"  错误:     {err_count}")

    if all_gaps:
        print(f"\n需要补充到 .md 文档的字段:")
        for table, name_zh, fields in all_gaps:
            print(f"  {table} ({name_zh}): {fields}")

    # Save the detailed results
    out_path = os.path.join(REPORT_DIR, "json_refresh_audit.json")
    os.makedirs(REPORT_DIR, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n详细结果已写入: {out_path}")


if __name__ == "__main__":
    main()

# AI_CHANGELOG:
# - 日期: 2026-02-14
# - Prompt: P20260214-060000 — 全量 JSON 刷新 + MD 文档补全 + 数据路径修正
# - 直接原因: 旧 JSON 样本仅含单条记录，缺少条件性字段；需重新获取 100 条数据并遍历提取最全字段
# - 变更摘要: 新建脚本，实现：(1) 调用全部 24 个 API 端点获取 100 条数据 (2) 遍历所有记录提取字段并集
#   (3) 与 .md 文档比对找出缺失字段 (4) 更新 JSON 样本和 api_registry.json data_path (5) 更新 .md 文档响应数据路径行
# - 风险与验证: 脚本需要有效的 API_TOKEN 和网络连接；验证：运行后检查 json_refresh_audit.json 中 24/24 通过