init: 项目初始提交 - NeoZQYY Monorepo 完整代码

2026-02-15 14:58:14 +08:00
commit ded6dfb9d8
769 changed files with 182616 additions and 0 deletions
--- a/apps/etl/pipelines/feiqiu/scripts/full_api_refresh_v2.py
+++ b/apps/etl/pipelines/feiqiu/scripts/full_api_refresh_v2.py
@@ -0,0 +1,634 @@
+# -*- coding: utf-8 -*-
+"""
+全量 API JSON 刷新 + 字段分析 + MD 文档完善 + 对比报告（v2）
+时间范围：2026-01-01 00:00:00 ~ 2026-02-13 00:00:00，每接口 100 条
+
+改进点（相比 v1）：
+- siteProfile/tableProfile 等嵌套对象：MD 中已记录为 object 则不展开子字段
+- 请求参数与响应字段分开对比
+- 只对比顶层业务字段
+- 真正缺失的新字段才补充到 MD
+
+用法：python scripts/full_api_refresh_v2.py
+"""
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime
+
+import requests
+
+# ── Configuration ─────────────────────────────────────────────────────────
+# Base URL; call_api() appends "<module>/<action>" to it.
+API_BASE = "https://pc.ficoo.vip/apiprod/admin/v1/"
+# Bearer token: the API_TOKEN environment variable wins when it is non-empty.
+API_TOKEN = os.environ.get("API_TOKEN", "")
+if not API_TOKEN:
+    # Fallback: read the first "API_TOKEN=" line from the .env file located one
+    # directory above this script. The value is only whitespace-stripped;
+    # surrounding quotes, if any, are kept as part of the token.
+    env_path = os.path.join(os.path.dirname(__file__), "..", ".env")
+    if os.path.exists(env_path):
+        with open(env_path, "r", encoding="utf-8") as f:
+            for line in f:
+                line = line.strip()
+                if line.startswith("API_TOKEN="):
+                    API_TOKEN = line.split("=", 1)[1].strip()
+                    break
+
+# NOTE(review): SITE_ID is not referenced by any other code in this file.
+SITE_ID = 2790685415443269
+# Time window written into each request body by build_body().
+START_TIME = "2026-01-01 00:00:00"
+END_TIME = "2026-02-13 00:00:00"
+# Page size requested per endpoint.
+LIMIT = 100
+
+# All paths below are relative, so they resolve against the current working
+# directory of the process, not against this script's location.
+SAMPLES_DIR = os.path.join("docs", "api-reference", "samples")
+DOCS_DIR = os.path.join("docs", "api-reference")
+REPORT_DIR = os.path.join("docs", "reports")
+REGISTRY_PATH = os.path.join("docs", "api-reference", "api_registry.json")
+
+# Built once at import time; an empty API_TOKEN yields "Bearer " with no token.
+HEADERS = {
+    "Authorization": f"Bearer {API_TOKEN}",
+    "Content-Type": "application/json",
+}
+
+# Field names known to hold nested objects (recorded as `object` in the MD
+# docs, so their sub-fields are not expanded).
+KNOWN_NESTED_OBJECTS = {
+    "siteProfile", "tableProfile", "settleList",
+    "goodsStockWarningInfo", "goodsCategoryList",
+}
+
+
+def load_registry():
+    with open(REGISTRY_PATH, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def call_api(module, action, body):
+    url = f"{API_BASE}{module}/{action}"
+    try:
+        resp = requests.post(url, json=body, headers=HEADERS, timeout=30)
+        resp.raise_for_status()
+        return resp.json()
+    except Exception as e:
+        print(f"  ❌ 请求失败: {e}")
+        return None
+
+
+def build_body(entry):
+    body = dict(entry.get("body") or {})
+    if entry.get("time_range") and entry.get("time_keys"):
+        keys = entry["time_keys"]
+        if len(keys) >= 2:
+            body[keys[0]] = START_TIME
+            body[keys[1]] = END_TIME
+    if entry.get("pagination"):
+        body[entry["pagination"].get("page_key", "page")] = 1
+        body[entry["pagination"].get("limit_key", "limit")] = LIMIT
+    return body
+
+
+def unwrap_records(raw_json, entry):
+    """从原始 API 响应中提取业务记录列表"""
+    if raw_json is None:
+        return []
+    data = raw_json.get("data")
+    if data is None:
+        return []
+
+    table_name = entry["id"]
+    data_path = entry.get("data_path", "")
+
+    # tenant_member_balance_overview: data 本身就是汇总对象
+    if table_name == "tenant_member_balance_overview":
+        if isinstance(data, dict):
+            return [data]
+        return []
+
+    # 按 data_path 解析
+    if data_path and data_path.startswith("data."):
+        path_parts = data_path.split(".")[1:]
+        current = data
+        for part in path_parts:
+            if isinstance(current, dict):
+                current = current.get(part)
+            else:
+                current = None
+                break
+        if isinstance(current, list):
+            return current
+
+    # fallback
+    if isinstance(data, dict):
+        for k, v in data.items():
+            if isinstance(v, list) and k.lower() not in ("total",):
+                return v
+    if isinstance(data, list):
+        return data
+    return []
+
+
+
+def get_top_level_fields(record):
+    """只提取顶层字段名和类型（不递归展开嵌套对象）"""
+    fields = {}
+    if not isinstance(record, dict):
+        return fields
+    for k, v in record.items():
+        if isinstance(v, dict):
+            fields[k] = "object"
+        elif isinstance(v, list):
+            fields[k] = "array"
+        elif isinstance(v, bool):
+            fields[k] = "boolean"
+        elif isinstance(v, int):
+            fields[k] = "integer"
+        elif isinstance(v, float):
+            fields[k] = "number"
+        elif v is None:
+            fields[k] = "null"
+        else:
+            fields[k] = "string"
+    return fields
+
+
+def get_nested_fields(record, parent_key):
+    """提取指定嵌套对象的子字段"""
+    obj = record.get(parent_key)
+    if not isinstance(obj, dict):
+        return {}
+    fields = {}
+    for k, v in obj.items():
+        path = f"{parent_key}.{k}"
+        if isinstance(v, dict):
+            fields[path] = "object"
+        elif isinstance(v, list):
+            fields[path] = "array"
+        elif isinstance(v, bool):
+            fields[path] = "boolean"
+        elif isinstance(v, int):
+            fields[path] = "integer"
+        elif isinstance(v, float):
+            fields[path] = "number"
+        elif v is None:
+            fields[path] = "null"
+        else:
+            fields[path] = "string"
+    return fields
+
+
+def select_top5_richest(records):
+    """从所有记录中选出字段数最多的前 5 条"""
+    if not records:
+        return []
+    scored = []
+    for i, rec in enumerate(records):
+        if not isinstance(rec, dict):
+            continue
+        field_count = len(rec)
+        json_len = len(json.dumps(rec, ensure_ascii=False))
+        scored.append((field_count, json_len, i, rec))
+    scored.sort(key=lambda x: (x[0], x[1]), reverse=True)
+    return [item[3] for item in scored[:5]]
+
+
+def collect_all_top_fields(records):
+    """遍历所有记录，收集所有顶层字段（含类型、出现次数、示例值）"""
+    all_fields = {}
+    for rec in records:
+        if not isinstance(rec, dict):
+            continue
+        fields = get_top_level_fields(rec)
+        for name, typ in fields.items():
+            if name not in all_fields:
+                all_fields[name] = {"type": typ, "count": 0, "example": None}
+            all_fields[name]["count"] += 1
+            if all_fields[name]["example"] is None:
+                val = rec.get(name)
+                if val is not None and val != "" and val != 0 and not isinstance(val, (dict, list)):
+                    ex = str(val)
+                    if len(ex) > 80:
+                        ex = ex[:77] + "..."
+                    all_fields[name]["example"] = ex
+    return all_fields
+
+
+def collect_nested_fields(records, parent_key):
+    """遍历所有记录，收集指定嵌套对象的子字段"""
+    all_fields = {}
+    for rec in records:
+        if not isinstance(rec, dict):
+            continue
+        fields = get_nested_fields(rec, parent_key)
+        for path, typ in fields.items():
+            if path not in all_fields:
+                all_fields[path] = {"type": typ, "count": 0, "example": None}
+            all_fields[path]["count"] += 1
+            if all_fields[path]["example"] is None:
+                obj = rec.get(parent_key, {})
+                k = path.split(".")[-1]
+                val = obj.get(k) if isinstance(obj, dict) else None
+                if val is not None and val != "" and val != 0 and not isinstance(val, (dict, list)):
+                    ex = str(val)
+                    if len(ex) > 80:
+                        ex = ex[:77] + "..."
+                    all_fields[path]["example"] = ex
+    return all_fields
+
+
+def extract_md_response_fields(table_name):
+    """从 MD 文档的响应字段章节提取字段名（排除请求参数）"""
+    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
+    if not os.path.exists(md_path):
+        return set(), set(), ""
+
+    with open(md_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    response_fields = set()
+    nested_fields = set()  # siteProfile.xxx 等嵌套字段
+    field_pattern = re.compile(r'^\|\s*`([^`]+)`\s*\|', re.MULTILINE)
+    header_fields = {"字段名", "类型", "示例值", "说明", "field", "example",
+                     "description", "type", "路径", "参数", "必填", "属性", "值"}
+
+    # 找到"四、响应字段"章节的范围
+    in_response = False
+    lines = content.split("\n")
+    response_start = None
+    response_end = len(lines)
+
+    for i, line in enumerate(lines):
+        s = line.strip()
+        if ("## 四" in s or "## 4" in s) and "响应字段" in s:
+            in_response = True
+            response_start = i
+            continue
+        if in_response and s.startswith("## ") and "响应字段" not in s:
+            response_end = i
+            break
+
+    if response_start is None:
+        # 没有明确的响应字段章节，尝试从整个文档提取
+        for m in field_pattern.finditer(content):
+            raw = m.group(1).strip()
+            if raw.lower() in {h.lower() for h in header_fields}:
+                continue
+            if "." in raw:
+                nested_fields.add(raw)
+            else:
+                response_fields.add(raw)
+        return response_fields, nested_fields, content
+
+    # 只从响应字段章节提取
+    response_section = "\n".join(lines[response_start:response_end])
+    for m in field_pattern.finditer(response_section):
+        raw = m.group(1).strip()
+        if raw.lower() in {h.lower() for h in header_fields}:
+            continue
+        if "." in raw:
+            nested_fields.add(raw)
+        else:
+            response_fields.add(raw)
+
+    return response_fields, nested_fields, content
+
+
+def compare_fields(json_fields, md_fields, md_nested_fields, table_name):
+    """对比 JSON 字段与 MD 字段，返回缺失和多余"""
+    json_names = set(json_fields.keys())
+    md_names = set(md_fields) if isinstance(md_fields, set) else set(md_fields)
+
+    # JSON 有但 MD 没有的顶层字段
+    missing_in_md = []
+    for name in sorted(json_names - md_names):
+        # 跳过已知嵌套对象（如果 MD 中已记录为 object）
+        if name in KNOWN_NESTED_OBJECTS and name in md_names:
+            continue
+        info = json_fields[name]
+        missing_in_md.append((name, info))
+
+    # MD 有但 JSON 没有的字段
+    extra_in_md = sorted(md_names - json_names)
+
+    return missing_in_md, extra_in_md
+
+
+def save_top5_sample(table_name, top5):
+    """保存前 5 条最全记录作为 JSON 样本"""
+    sample_path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
+    with open(sample_path, "w", encoding="utf-8") as f:
+        json.dump(top5, f, ensure_ascii=False, indent=2)
+    return sample_path
+
+
+
+def update_md_with_missing_fields(table_name, missing_fields, md_content):
+    """将真正缺失的字段补充到 MD 文档的响应字段章节末尾"""
+    if not missing_fields:
+        return False
+
+    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
+    if not os.path.exists(md_path):
+        return False
+
+    lines = md_content.split("\n")
+
+    # 找到响应字段章节的最后一个表格行
+    insert_idx = None
+    in_response = False
+    last_table_row = None
+
+    for i, line in enumerate(lines):
+        s = line.strip()
+        if ("## 四" in s or "## 4" in s) and "响应字段" in s:
+            in_response = True
+            continue
+        if in_response and s.startswith("## ") and "响应字段" not in s:
+            insert_idx = last_table_row
+            break
+        if in_response and s.startswith("|") and "---" not in s:
+            # 检查是否是表头行
+            if not any(h in s for h in ["字段名", "字段", "类型", "说明"]):
+                last_table_row = i
+            elif last_table_row is None:
+                last_table_row = i
+
+    if insert_idx is None and last_table_row is not None:
+        insert_idx = last_table_row
+
+    if insert_idx is None:
+        return False
+
+    new_rows = []
+    for name, info in missing_fields:
+        typ = info["type"]
+        example = info["example"] or ""
+        count = info["count"]
+        new_rows.append(
+            f"| `{name}` | {typ} | {example} | "
+            f"（新发现字段，{count}/{LIMIT} 条记录中出现） |"
+        )
+
+    for row in reversed(new_rows):
+        lines.insert(insert_idx + 1, row)
+
+    with open(md_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+    return True
+
+
+def generate_report(results):
+    """生成最终的 JSON vs MD 对比报告"""
+    lines = []
+    lines.append("# API JSON 字段 vs MD 文档对比报告")
+    lines.append("")
+    lines.append(f"生成时间：{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (Asia/Shanghai)")
+    lines.append(f"数据范围：{START_TIME} ~ {END_TIME}")
+    lines.append(f"每接口获取：{LIMIT} 条")
+    lines.append("")
+
+    # 汇总
+    ok = sum(1 for r in results if r["status"] == "ok")
+    gap = sum(1 for r in results if r["status"] == "gap")
+    skip = sum(1 for r in results if r["status"] == "skipped")
+    err = sum(1 for r in results if r["status"] == "error")
+
+    lines.append("## 汇总")
+    lines.append("")
+    lines.append("| 状态 | 数量 |")
+    lines.append("|------|------|")
+    lines.append(f"| ✅ 完全一致 | {ok} |")
+    lines.append(f"| ⚠️ 有新字段（已补充） | {gap} |")
+    lines.append(f"| ⏭️ 跳过 | {skip} |")
+    lines.append(f"| 💥 错误 | {err} |")
+    lines.append(f"| 合计 | {len(results)} |")
+    lines.append("")
+
+    # 各接口详情
+    lines.append("## 各接口详情")
+    lines.append("")
+
+    for r in results:
+        icon = {"ok": "✅", "gap": "⚠️", "skipped": "⏭️", "error": "💥"}.get(r["status"], "❓")
+        lines.append(f"### {r['table']} ({r.get('name_zh', '')})")
+        lines.append("")
+        lines.append(f"| 项目 | 值 |")
+        lines.append(f"|------|-----|")
+        lines.append(f"| 状态 | {icon} {r['status']} |")
+        lines.append(f"| 获取记录数 | {r['record_count']} |")
+        lines.append(f"| JSON 顶层字段数 | {r['json_field_count']} |")
+        lines.append(f"| MD 响应字段数 | {r['md_field_count']} |")
+        lines.append(f"| 数据路径 | `{r.get('data_path', 'N/A')}` |")
+        if r.get("top5_field_counts"):
+            lines.append(f"| 前5条最全记录字段数 | {r['top5_field_counts']} |")
+        lines.append("")
+
+        if r.get("missing_in_md"):
+            lines.append("新发现字段（已补充到 MD）：")
+            lines.append("")
+            lines.append("| 字段名 | 类型 | 示例 | 出现次数 |")
+            lines.append("|--------|------|------|----------|")
+            for name, info in r["missing_in_md"]:
+                lines.append(f"| `{name}` | {info['type']} | {info.get('example', '')} | {info['count']} |")
+            lines.append("")
+
+        if r.get("extra_in_md"):
+            lines.append(f"MD 中有但本次 JSON 未出现的字段（可能为条件性字段）：`{'`, `'.join(r['extra_in_md'])}`")
+            lines.append("")
+
+        # 嵌套对象子字段汇总
+        if r.get("nested_summary"):
+            for parent, count in r["nested_summary"].items():
+                lines.append(f"嵌套对象 `{parent}` 含 {count} 个子字段（MD 中已记录为 object，不逐字段展开）")
+            lines.append("")
+
+    # 附录：siteProfile 通用字段参考
+    lines.append("## 附录：siteProfile 通用字段参考")
+    lines.append("")
+    lines.append("以下字段在大多数接口的 `siteProfile` 嵌套对象中出现，为门店信息快照（冗余），各接口结构一致：")
+    lines.append("")
+    lines.append("| 字段 | 类型 | 说明 |")
+    lines.append("|------|------|------|")
+    lines.append("| `id` | integer | 门店 ID |")
+    lines.append("| `org_id` | integer | 组织 ID |")
+    lines.append("| `shop_name` | string | 门店名称 |")
+    lines.append("| `avatar` | string | 门店头像 URL |")
+    lines.append("| `business_tel` | string | 门店电话 |")
+    lines.append("| `full_address` | string | 完整地址 |")
+    lines.append("| `address` | string | 简短地址 |")
+    lines.append("| `longitude` | number | 经度 |")
+    lines.append("| `latitude` | number | 纬度 |")
+    lines.append("| `tenant_site_region_id` | integer | 区域 ID |")
+    lines.append("| `tenant_id` | integer | 租户 ID |")
+    lines.append("| `auto_light` | integer | 自动开灯 |")
+    lines.append("| `attendance_distance` | integer | 考勤距离 |")
+    lines.append("| `attendance_enabled` | integer | 考勤启用 |")
+    lines.append("| `wifi_name` | string | WiFi 名称 |")
+    lines.append("| `wifi_password` | string | WiFi 密码 |")
+    lines.append("| `customer_service_qrcode` | string | 客服二维码 |")
+    lines.append("| `customer_service_wechat` | string | 客服微信 |")
+    lines.append("| `fixed_pay_qrCode` | string | 固定支付二维码 |")
+    lines.append("| `prod_env` | integer | 生产环境标识 |")
+    lines.append("| `light_status` | integer | 灯光状态 |")
+    lines.append("| `light_type` | integer | 灯光类型 |")
+    lines.append("| `light_token` | string | 灯光控制 token |")
+    lines.append("| `site_type` | integer | 门店类型 |")
+    lines.append("| `site_label` | string | 门店标签 |")
+    lines.append("| `shop_status` | integer | 门店状态 |")
+    lines.append("")
+
+    return "\n".join(lines)
+
+
+def main():
+    registry = load_registry()
+    print(f"加载 API 注册表: {len(registry)} 个端点")
+    print(f"时间范围: {START_TIME} ~ {END_TIME}")
+    print(f"每接口获取: {LIMIT} 条")
+    print("=" * 80)
+
+    results = []
+
+    for entry in registry:
+        table_name = entry["id"]
+        name_zh = entry.get("name_zh", "")
+        module = entry["module"]
+        action = entry["action"]
+        skip = entry.get("skip", False)
+
+        print(f"\n{'─' * 60}")
+        print(f"[{table_name}] {name_zh} — {module}/{action}")
+
+        if skip:
+            print("  ⏭️ 跳过")
+            results.append({
+                "table": table_name, "name_zh": name_zh,
+                "status": "skipped", "record_count": 0,
+                "json_field_count": 0, "md_field_count": 0,
+                "data_path": entry.get("data_path"),
+            })
+            continue
+
+        # 使用已有的 raw JSON（上一步已获取）
+        raw_path = os.path.join(SAMPLES_DIR, f"{table_name}_raw.json")
+        if os.path.exists(raw_path):
+            with open(raw_path, "r", encoding="utf-8") as f:
+                raw = json.load(f)
+            print(f"  使用已缓存的原始响应")
+        else:
+            body = build_body(entry)
+            print(f"  请求: POST {module}/{action}")
+            raw = call_api(module, action, body)
+            if raw:
+                with open(raw_path, "w", encoding="utf-8") as f:
+                    json.dump(raw, f, ensure_ascii=False, indent=2)
+
+        if raw is None:
+            results.append({
+                "table": table_name, "name_zh": name_zh,
+                "status": "error", "record_count": 0,
+                "json_field_count": 0, "md_field_count": 0,
+                "data_path": entry.get("data_path"),
+            })
+            continue
+
+        records = unwrap_records(raw, entry)
+        print(f"  记录数: {len(records)}")
+
+        if not records:
+            results.append({
+                "table": table_name, "name_zh": name_zh,
+                "status": "ok", "record_count": 0,
+                "json_field_count": 0, "md_field_count": 0,
+                "data_path": entry.get("data_path"),
+            })
+            continue
+
+        # 选出字段最全的前 5 条
+        top5 = select_top5_richest(records)
+        top5_counts = [len(r) for r in top5]
+        print(f"  前 5 条最全记录顶层字段数: {top5_counts}")
+
+        # 保存前 5 条样本
+        save_top5_sample(table_name, top5)
+
+        # 收集所有顶层字段
+        json_fields = collect_all_top_fields(records)
+        print(f"  JSON 顶层字段数: {len(json_fields)}")
+
+        # 收集嵌套对象子字段（仅用于报告，不用于对比）
+        nested_summary = {}
+        for name, info in json_fields.items():
+            if info["type"] == "object" and name in KNOWN_NESTED_OBJECTS:
+                nested = collect_nested_fields(records, name)
+                nested_summary[name] = len(nested)
+
+        # 提取 MD 响应字段
+        md_fields, md_nested, md_content = extract_md_response_fields(table_name)
+        print(f"  MD 响应字段数: {len(md_fields)}")
+
+        # 对比
+        missing_in_md, extra_in_md = compare_fields(json_fields, md_fields, md_nested, table_name)
+
+        # 过滤掉已知嵌套对象（MD 中已记录为 object）
+        real_missing = [(n, i) for n, i in missing_in_md
+                        if n not in KNOWN_NESTED_OBJECTS or n not in md_fields]
+
+        status = "ok" if not real_missing else "gap"
+
+        if real_missing:
+            print(f"  ⚠️ 发现 {len(real_missing)} 个新字段:")
+            for name, info in real_missing:
+                print(f"     + {name} ({info['type']}, {info['count']}次)")
+            # 补充到 MD
+            updated = update_md_with_missing_fields(table_name, real_missing, md_content)
+            if updated:
+                print(f"  📝 已补充到 MD 文档")
+        else:
+            print(f"  ✅ 字段完全覆盖")
+
+        if extra_in_md:
+            print(f"  ℹ️ MD 多 {len(extra_in_md)} 个条件性字段")
+
+        results.append({
+            "table": table_name, "name_zh": name_zh,
+            "status": status,
+            "record_count": len(records),
+            "json_field_count": len(json_fields),
+            "md_field_count": len(md_fields),
+            "data_path": entry.get("data_path"),
+            "missing_in_md": real_missing,
+            "extra_in_md": extra_in_md,
+            "top5_field_counts": top5_counts,
+            "nested_summary": nested_summary,
+        })
+
+    # ── 生成报告 ──
+    print(f"\n{'=' * 80}")
+    print("生成对比报告...")
+
+    report = generate_report(results)
+    os.makedirs(REPORT_DIR, exist_ok=True)
+    report_path = os.path.join(REPORT_DIR, "api_json_vs_md_report_20260214.md")
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write(report)
+    print(f"报告: {report_path}")
+
+    # JSON 详细结果
+    json_path = os.path.join(REPORT_DIR, "api_refresh_detail_20260214.json")
+    serializable = []
+    for r in results:
+        sr = dict(r)
+        if "missing_in_md" in sr and sr["missing_in_md"]:
+            sr["missing_in_md"] = [(n, {"type": i["type"], "count": i["count"]})
+                                    for n, i in sr["missing_in_md"]]
+        serializable.append(sr)
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(serializable, f, ensure_ascii=False, indent=2)
+
+    # 汇总
+    ok = sum(1 for r in results if r["status"] == "ok")
+    gap = sum(1 for r in results if r["status"] == "gap")
+    skip = sum(1 for r in results if r["status"] == "skipped")
+    err = sum(1 for r in results if r["status"] == "error")
+    print(f"\n汇总: ✅ {ok} | ⚠️ {gap} | ⏭️ {skip} | 💥 {err}")
+
+
+if __name__ == "__main__":
+    main()