# Source listing header (GitHub web-UI artifacts, preserved as comments):
#   Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/full_api_refresh_v2.py
#   635 lines, 23 KiB, Python (Raw / Blame / History view).
# NOTE: GitHub flagged this file as containing invisible and confusable
# Unicode characters; several punctuation marks appear to have been
# stripped from this copy — review against the original before editing.
# -*- coding: utf-8 -*-
"""
全量 API JSON 刷新 + 字段分析 + MD 文档完善 + 对比报告v2
时间范围2026-01-01 00:00:00 ~ 2026-02-13 00:00:00每接口 100 条
改进点(相比 v1
- siteProfile/tableProfile 等嵌套对象MD 中已记录为 object 则不展开子字段
- 请求参数与响应字段分开对比
- 只对比顶层业务字段
- 真正缺失的新字段才补充到 MD
用法python scripts/full_api_refresh_v2.py
"""
import json
import os
import re
import sys
import time
from datetime import datetime
import requests
# ── Configuration ─────────────────────────────────────────────────────────
API_BASE = "https://pc.ficoo.vip/apiprod/admin/v1/"
API_TOKEN = os.environ.get("API_TOKEN", "")
if not API_TOKEN:
    # Fall back to an API_TOKEN=... line in the .env file one level up.
    env_path = os.path.join(os.path.dirname(__file__), "..", ".env")
    if os.path.exists(env_path):
        with open(env_path, "r", encoding="utf-8") as env_file:
            for raw_line in env_file:
                stripped = raw_line.strip()
                if stripped.startswith("API_TOKEN="):
                    API_TOKEN = stripped.split("=", 1)[1].strip()
                    break
SITE_ID = 2790685415443269
START_TIME = "2026-01-01 00:00:00"
END_TIME = "2026-02-13 00:00:00"
LIMIT = 100
SAMPLES_DIR = os.path.join("docs", "api-reference", "samples")
DOCS_DIR = os.path.join("docs", "api-reference")
REPORT_DIR = os.path.join("docs", "reports")
REGISTRY_PATH = os.path.join("docs", "api-reference", "api_registry.json")
HEADERS = {
    "Authorization": f"Bearer {API_TOKEN}",
    "Content-Type": "application/json",
}
# Known nested-object field names: the MD docs record these as "object"
# and their child fields are never expanded individually.
KNOWN_NESTED_OBJECTS = {
    "siteProfile", "tableProfile", "settleList",
    "goodsStockWarningInfo", "goodsCategoryList",
}
def load_registry():
    """Read and return the endpoint registry (a list of entries) from REGISTRY_PATH."""
    with open(REGISTRY_PATH, "r", encoding="utf-8") as handle:
        return json.load(handle)
def call_api(module, action, body):
    """POST *body* to API_BASE/<module>/<action> and return the parsed JSON.

    Best-effort: any request failure, non-2xx status, or undecodable JSON
    is logged and reported as None so the caller can record the endpoint
    as errored and continue.
    """
    url = f"{API_BASE}{module}/{action}"
    try:
        resp = requests.post(url, json=body, headers=HEADERS, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except (requests.RequestException, ValueError) as e:
        # Narrowed from a bare `except Exception`: transport/HTTP errors and
        # JSON-decode errors (ValueError) are expected and swallowed, while
        # programming errors (TypeError, NameError, ...) now propagate.
        print(f" ❌ 请求失败: {e}")
        return None
def build_body(entry):
    """Build the POST body for one registry entry.

    Starts from the entry's static body, then injects the global time
    range (when the entry declares `time_range` and at least two
    `time_keys`) and pagination parameters (page=1, limit=LIMIT) when the
    entry declares `pagination`.
    """
    body = dict(entry.get("body") or {})
    if entry.get("time_range"):
        time_keys = entry.get("time_keys") or []
        if len(time_keys) >= 2:
            body[time_keys[0]] = START_TIME
            body[time_keys[1]] = END_TIME
    pagination = entry.get("pagination")
    if pagination:
        body[pagination.get("page_key", "page")] = 1
        body[pagination.get("limit_key", "limit")] = LIMIT
    return body
def unwrap_records(raw_json, entry):
    """Extract the list of business records from a raw API response.

    Resolution order: the balance-overview special case (the `data` dict
    itself is the single record), then the entry's configured `data_path`
    (e.g. "data.page.list"), then a fallback scan for the first
    list-valued key of `data` (ignoring "total"), then `data` itself if
    it is already a list. Returns [] when nothing matches.
    """
    if raw_json is None:
        return []
    data = raw_json.get("data")
    if data is None:
        return []

    # tenant_member_balance_overview: `data` is the summary object itself.
    if entry["id"] == "tenant_member_balance_overview":
        return [data] if isinstance(data, dict) else []

    # Walk the dotted data_path below the top-level "data" key.
    data_path = entry.get("data_path", "")
    if data_path and data_path.startswith("data."):
        node = data
        for key in data_path.split(".")[1:]:
            if not isinstance(node, dict):
                node = None
                break
            node = node.get(key)
        if isinstance(node, list):
            return node

    # Fallback heuristics.
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list) and key.lower() != "total":
                return value
    if isinstance(data, list):
        return data
    return []
def get_top_level_fields(record):
    """Return {field_name: type_name} for the top-level keys of *record*.

    Nested dicts and lists are labelled "object"/"array" and are never
    recursed into. Non-dict input yields an empty mapping.
    """
    if not isinstance(record, dict):
        return {}

    def type_name(value):
        # bool is tested before int: bool is an int subclass in Python.
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, dict):
            return "object"
        if isinstance(value, list):
            return "array"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        return "null" if value is None else "string"

    return {key: type_name(value) for key, value in record.items()}
def get_nested_fields(record, parent_key):
    """Return {"<parent_key>.<child>": type_name} for one nested object.

    If record[parent_key] is not a dict the result is empty. Child values
    are typed like the top-level fields (objects/arrays not expanded).
    """
    obj = record.get(parent_key)
    if not isinstance(obj, dict):
        return {}

    def type_name(value):
        # bool before int: bool is an int subclass.
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, dict):
            return "object"
        if isinstance(value, list):
            return "array"
        if isinstance(value, int):
            return "integer"
        if isinstance(value, float):
            return "number"
        return "null" if value is None else "string"

    return {f"{parent_key}.{key}": type_name(value) for key, value in obj.items()}
def select_top5_richest(records):
    """Select up to 5 dict records with the most top-level fields.

    Ranked by field count, then serialized JSON length, both descending;
    ties keep the earlier record first (Python's sort is stable).
    Non-dict entries are ignored.
    """
    if not records:
        return []
    ranked = [
        (len(rec), len(json.dumps(rec, ensure_ascii=False)), idx, rec)
        for idx, rec in enumerate(records)
        if isinstance(rec, dict)
    ]
    ranked.sort(key=lambda item: (item[0], item[1]), reverse=True)
    return [rec for _count, _size, _idx, rec in ranked[:5]]
def collect_all_top_fields(records):
    """Aggregate top-level fields across all records.

    Returns {name: {"type", "count", "example"}} where "type" comes from
    the first record containing the field, "count" is how many records
    contain it, and "example" is the first non-empty, non-zero scalar
    value seen (stringified, truncated to 80 chars). Non-dict records are
    skipped.
    """
    summary = {}
    for record in records:
        if not isinstance(record, dict):
            continue
        for name, value in record.items():
            # Coarse JSON type; bool before int (bool is an int subclass).
            if isinstance(value, bool):
                type_name = "boolean"
            elif isinstance(value, dict):
                type_name = "object"
            elif isinstance(value, list):
                type_name = "array"
            elif isinstance(value, int):
                type_name = "integer"
            elif isinstance(value, float):
                type_name = "number"
            elif value is None:
                type_name = "null"
            else:
                type_name = "string"
            entry = summary.setdefault(
                name, {"type": type_name, "count": 0, "example": None}
            )
            entry["count"] += 1
            if entry["example"] is not None:
                continue
            if value is None or value == "" or value == 0 or isinstance(value, (dict, list)):
                continue
            text = str(value)
            entry["example"] = text if len(text) <= 80 else text[:77] + "..."
    return summary
def collect_nested_fields(records, parent_key):
    """Aggregate the child fields of one nested object across all records.

    For every record whose *parent_key* value is a dict, each child key is
    reported as "<parent_key>.<child>" with its JSON type (from the first
    occurrence), an occurrence count, and the first meaningful scalar
    example value (stringified, truncated to 80 chars).

    Fix over v2: the example value is read directly from the child entry.
    The old code re-derived the key via path.split(".")[-1], which fetched
    the wrong (or no) value for child keys that themselves contain a dot.
    """
    all_fields = {}
    for rec in records:
        if not isinstance(rec, dict):
            continue
        obj = rec.get(parent_key)
        if not isinstance(obj, dict):
            continue
        for child, val in obj.items():
            path = f"{parent_key}.{child}"
            # Coarse JSON type; bool before int (bool is an int subclass).
            if isinstance(val, bool):
                typ = "boolean"
            elif isinstance(val, dict):
                typ = "object"
            elif isinstance(val, list):
                typ = "array"
            elif isinstance(val, int):
                typ = "integer"
            elif isinstance(val, float):
                typ = "number"
            elif val is None:
                typ = "null"
            else:
                typ = "string"
            info = all_fields.setdefault(
                path, {"type": typ, "count": 0, "example": None}
            )
            info["count"] += 1
            if info["example"] is None:
                if val is not None and val != "" and val != 0 and not isinstance(val, (dict, list)):
                    ex = str(val)
                    if len(ex) > 80:
                        ex = ex[:77] + "..."
                    info["example"] = ex
    return all_fields
def extract_md_response_fields(table_name):
    """Extract documented field names from the table's MD file.

    Only the "响应字段" (response fields) section is scanned when one is
    present; otherwise the whole document is scanned as a fallback.

    Returns:
        (response_fields, nested_fields, content) where response_fields is
        a set of top-level names, nested_fields a set of dotted paths such
        as "siteProfile.xxx", and content the raw document text. A missing
        file yields (set(), set(), "").
    """
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return set(), set(), ""
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    # First backtick-quoted cell of a Markdown table row.
    field_pattern = re.compile(r'^\|\s*`([^`]+)`\s*\|', re.MULTILINE)
    # Header-cell labels that must not be collected as field names.
    # Lower-cased once, hoisted out of the match loop (was rebuilt per match).
    header_names = {h.lower() for h in (
        "字段名", "类型", "示例值", "说明", "field", "example",
        "description", "type", "路径", "参数", "必填", "属性", "",
    )}

    def harvest(text):
        """Collect (top_level, nested) field-name sets from *text*."""
        top, nested = set(), set()
        for match in field_pattern.finditer(text):
            raw = match.group(1).strip()
            if raw.lower() in header_names:
                continue
            (nested if "." in raw else top).add(raw)
        return top, nested

    # Locate the boundaries of the "四、响应字段" section.
    lines = content.split("\n")
    in_response = False
    response_start = None
    response_end = len(lines)
    for i, line in enumerate(lines):
        s = line.strip()
        if ("## 四" in s or "## 4" in s) and "响应字段" in s:
            in_response = True
            response_start = i
            continue
        if in_response and s.startswith("## ") and "响应字段" not in s:
            response_end = i
            break

    if response_start is None:
        # No explicit response-field section: scan the whole document.
        response_fields, nested_fields = harvest(content)
    else:
        section = "\n".join(lines[response_start:response_end])
        response_fields, nested_fields = harvest(section)
    return response_fields, nested_fields, content
def compare_fields(json_fields, md_fields, md_nested_fields, table_name):
    """Compare observed JSON fields against documented MD fields.

    Args:
        json_fields: {name: info} from collect_all_top_fields.
        md_fields: iterable of documented top-level field names.
        md_nested_fields, table_name: kept for interface compatibility;
            not consulted.

    Returns:
        (missing_in_md, extra_in_md): sorted [(name, info)] pairs present
        in JSON but undocumented, and sorted names documented but absent
        from this JSON sample.
    """
    json_names = set(json_fields)
    md_names = set(md_fields)
    # Dead branch removed: the original skipped names that were both in
    # KNOWN_NESTED_OBJECTS *and* in md_names, but every name iterated here
    # comes from (json_names - md_names), so it can never be in md_names.
    missing_in_md = [
        (name, json_fields[name]) for name in sorted(json_names - md_names)
    ]
    extra_in_md = sorted(md_names - json_names)
    return missing_in_md, extra_in_md
def save_top5_sample(table_name, top5):
    """Persist the selected richest records as a pretty-printed JSON sample.

    Writes to SAMPLES_DIR/<table_name>.json and returns that path.
    """
    target = os.path.join(SAMPLES_DIR, f"{table_name}.json")
    payload = json.dumps(top5, ensure_ascii=False, indent=2)
    with open(target, "w", encoding="utf-8") as fh:
        fh.write(payload)
    return target
def update_md_with_missing_fields(table_name, missing_fields, md_content):
    """Append newly discovered fields to the MD doc's response-field table.

    Args:
        table_name: registry id; the doc lives at DOCS_DIR/<table_name>.md.
        missing_fields: [(name, info)] pairs as produced by compare_fields.
        md_content: the doc's current text (already read by the caller).

    Returns:
        True if rows were inserted and the file rewritten, else False.
    """
    if not missing_fields:
        return False
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return False
    lines = md_content.split("\n")
    # Find the last table row of the response-field section; new rows go
    # right after it.
    insert_idx = None
    in_response = False
    last_table_row = None
    for i, line in enumerate(lines):
        s = line.strip()
        if ("## 四" in s or "## 4" in s) and "响应字段" in s:
            in_response = True
            continue
        if in_response and s.startswith("## ") and "响应字段" not in s:
            # Left the response section: freeze the insertion point.
            insert_idx = last_table_row
            break
        if in_response and s.startswith("|") and "---" not in s:
            # Track the latest data row. Rows containing header labels
            # (字段名/字段/类型/说明) only count when no row was seen yet.
            if not any(h in s for h in ["字段名", "字段", "类型", "说明"]):
                last_table_row = i
            elif last_table_row is None:
                last_table_row = i
    if insert_idx is None and last_table_row is not None:
        # Response section ran to end of file.
        insert_idx = last_table_row
    if insert_idx is None:
        return False
    new_rows = []
    for name, info in missing_fields:
        typ = info["type"]
        example = info["example"] or ""
        count = info["count"]
        new_rows.append(
            f"| `{name}` | {typ} | {example} | "
            f"(新发现字段,{count}/{LIMIT} 条记录中出现) |"
        )
    # Insert in reverse so the final order matches missing_fields.
    for row in reversed(new_rows):
        lines.insert(insert_idx + 1, row)
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return True
def generate_report(results):
    """Render the final JSON-vs-MD comparison report as Markdown text.

    Args:
        results: list of per-endpoint result dicts built in main().

    Returns:
        The complete Markdown document as one string.
    """
    lines = []
    lines.append("# API JSON 字段 vs MD 文档对比报告")
    lines.append("")
    lines.append(f"生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} (Asia/Shanghai)")
    lines.append(f"数据范围:{START_TIME} ~ {END_TIME}")
    lines.append(f"每接口获取:{LIMIT}")
    lines.append("")
    # Summary counts by status.
    ok = sum(1 for r in results if r["status"] == "ok")
    gap = sum(1 for r in results if r["status"] == "gap")
    skip = sum(1 for r in results if r["status"] == "skipped")
    err = sum(1 for r in results if r["status"] == "error")
    lines.append("## 汇总")
    lines.append("")
    lines.append("| 状态 | 数量 |")
    lines.append("|------|------|")
    lines.append(f"| ✅ 完全一致 | {ok} |")
    lines.append(f"| ⚠️ 有新字段(已补充) | {gap} |")
    lines.append(f"| ⏭️ 跳过 | {skip} |")
    lines.append(f"| 💥 错误 | {err} |")
    lines.append(f"| 合计 | {len(results)} |")
    lines.append("")
    # Per-endpoint detail sections.
    lines.append("## 各接口详情")
    lines.append("")
    for r in results:
        # NOTE(review): the "ok" icon is an empty string while the summary
        # table uses ✅ — possibly a character stripped in this copy;
        # confirm against the original source.
        icon = {"ok": "", "gap": "⚠️", "skipped": "⏭️", "error": "💥"}.get(r["status"], "")
        lines.append(f"### {r['table']} ({r.get('name_zh', '')})")
        lines.append("")
        lines.append(f"| 项目 | 值 |")
        lines.append(f"|------|-----|")
        lines.append(f"| 状态 | {icon} {r['status']} |")
        lines.append(f"| 获取记录数 | {r['record_count']} |")
        lines.append(f"| JSON 顶层字段数 | {r['json_field_count']} |")
        lines.append(f"| MD 响应字段数 | {r['md_field_count']} |")
        lines.append(f"| 数据路径 | `{r.get('data_path', 'N/A')}` |")
        if r.get("top5_field_counts"):
            lines.append(f"| 前5条最全记录字段数 | {r['top5_field_counts']} |")
        lines.append("")
        if r.get("missing_in_md"):
            lines.append("新发现字段(已补充到 MD")
            lines.append("")
            lines.append("| 字段名 | 类型 | 示例 | 出现次数 |")
            lines.append("|--------|------|------|----------|")
            for name, info in r["missing_in_md"]:
                lines.append(f"| `{name}` | {info['type']} | {info.get('example', '')} | {info['count']} |")
            lines.append("")
        if r.get("extra_in_md"):
            lines.append(f"MD 中有但本次 JSON 未出现的字段(可能为条件性字段):`{'`, `'.join(r['extra_in_md'])}`")
            lines.append("")
        # Nested-object child-field summary (objects are not expanded).
        if r.get("nested_summary"):
            for parent, count in r["nested_summary"].items():
                lines.append(f"嵌套对象 `{parent}` 含 {count} 个子字段MD 中已记录为 object不逐字段展开")
            lines.append("")
    # Appendix: common siteProfile snapshot fields shared across endpoints.
    lines.append("## 附录siteProfile 通用字段参考")
    lines.append("")
    lines.append("以下字段在大多数接口的 `siteProfile` 嵌套对象中出现,为门店信息快照(冗余),各接口结构一致:")
    lines.append("")
    lines.append("| 字段 | 类型 | 说明 |")
    lines.append("|------|------|------|")
    lines.append("| `id` | integer | 门店 ID |")
    lines.append("| `org_id` | integer | 组织 ID |")
    lines.append("| `shop_name` | string | 门店名称 |")
    lines.append("| `avatar` | string | 门店头像 URL |")
    lines.append("| `business_tel` | string | 门店电话 |")
    lines.append("| `full_address` | string | 完整地址 |")
    lines.append("| `address` | string | 简短地址 |")
    lines.append("| `longitude` | number | 经度 |")
    lines.append("| `latitude` | number | 纬度 |")
    lines.append("| `tenant_site_region_id` | integer | 区域 ID |")
    lines.append("| `tenant_id` | integer | 租户 ID |")
    lines.append("| `auto_light` | integer | 自动开灯 |")
    lines.append("| `attendance_distance` | integer | 考勤距离 |")
    lines.append("| `attendance_enabled` | integer | 考勤启用 |")
    lines.append("| `wifi_name` | string | WiFi 名称 |")
    lines.append("| `wifi_password` | string | WiFi 密码 |")
    lines.append("| `customer_service_qrcode` | string | 客服二维码 |")
    lines.append("| `customer_service_wechat` | string | 客服微信 |")
    lines.append("| `fixed_pay_qrCode` | string | 固定支付二维码 |")
    lines.append("| `prod_env` | integer | 生产环境标识 |")
    lines.append("| `light_status` | integer | 灯光状态 |")
    lines.append("| `light_type` | integer | 灯光类型 |")
    lines.append("| `light_token` | string | 灯光控制 token |")
    lines.append("| `site_type` | integer | 门店类型 |")
    lines.append("| `site_label` | string | 门店标签 |")
    lines.append("| `shop_status` | integer | 门店状态 |")
    lines.append("")
    return "\n".join(lines)
def main():
    """Drive the full refresh.

    For each registry entry: load a cached raw response (or fetch and
    cache one), extract records, pick the 5 richest as the JSON sample,
    compare observed fields against the MD doc, append genuinely new
    fields to the doc, and finally write a Markdown report plus a JSON
    detail file under REPORT_DIR.
    """
    registry = load_registry()
    print(f"加载 API 注册表: {len(registry)} 个端点")
    print(f"时间范围: {START_TIME} ~ {END_TIME}")
    print(f"每接口获取: {LIMIT}")
    print("=" * 80)
    results = []
    for entry in registry:
        table_name = entry["id"]
        name_zh = entry.get("name_zh", "")
        module = entry["module"]
        action = entry["action"]
        skip = entry.get("skip", False)
        # NOTE(review): '' * 60 prints nothing — the separator character
        # was likely stripped from this copy; confirm against the original.
        print(f"\n{'' * 60}")
        print(f"[{table_name}] {name_zh}{module}/{action}")
        if skip:
            print(" ⏭️ 跳过")
            results.append({
                "table": table_name, "name_zh": name_zh,
                "status": "skipped", "record_count": 0,
                "json_field_count": 0, "md_field_count": 0,
                "data_path": entry.get("data_path"),
            })
            continue
        # Reuse the cached raw JSON response if a previous step saved one.
        raw_path = os.path.join(SAMPLES_DIR, f"{table_name}_raw.json")
        if os.path.exists(raw_path):
            with open(raw_path, "r", encoding="utf-8") as f:
                raw = json.load(f)
            print(f" 使用已缓存的原始响应")
        else:
            body = build_body(entry)
            print(f" 请求: POST {module}/{action}")
            raw = call_api(module, action, body)
            if raw:
                with open(raw_path, "w", encoding="utf-8") as f:
                    json.dump(raw, f, ensure_ascii=False, indent=2)
        if raw is None:
            results.append({
                "table": table_name, "name_zh": name_zh,
                "status": "error", "record_count": 0,
                "json_field_count": 0, "md_field_count": 0,
                "data_path": entry.get("data_path"),
            })
            continue
        records = unwrap_records(raw, entry)
        print(f" 记录数: {len(records)}")
        if not records:
            results.append({
                "table": table_name, "name_zh": name_zh,
                "status": "ok", "record_count": 0,
                "json_field_count": 0, "md_field_count": 0,
                "data_path": entry.get("data_path"),
            })
            continue
        # Pick the 5 records with the most top-level fields.
        top5 = select_top5_richest(records)
        top5_counts = [len(r) for r in top5]
        print(f" 前 5 条最全记录顶层字段数: {top5_counts}")
        # Save them as the endpoint's JSON sample.
        save_top5_sample(table_name, top5)
        # Collect every top-level field across all records.
        json_fields = collect_all_top_fields(records)
        print(f" JSON 顶层字段数: {len(json_fields)}")
        # Collect nested-object child fields (report only; not compared).
        nested_summary = {}
        for name, info in json_fields.items():
            if info["type"] == "object" and name in KNOWN_NESTED_OBJECTS:
                nested = collect_nested_fields(records, name)
                nested_summary[name] = len(nested)
        # Extract the documented response fields from the MD doc.
        md_fields, md_nested, md_content = extract_md_response_fields(table_name)
        print(f" MD 响应字段数: {len(md_fields)}")
        # Compare observed JSON fields with the documented ones.
        missing_in_md, extra_in_md = compare_fields(json_fields, md_fields, md_nested, table_name)
        # Intended: drop known nested objects already documented as object.
        # NOTE(review): every n in missing_in_md is absent from md_fields
        # by construction, so `n not in md_fields` is always True and this
        # filter never removes anything — confirm the intended condition.
        real_missing = [(n, i) for n, i in missing_in_md
                        if n not in KNOWN_NESTED_OBJECTS or n not in md_fields]
        status = "ok" if not real_missing else "gap"
        if real_missing:
            print(f" ⚠️ 发现 {len(real_missing)} 个新字段:")
            for name, info in real_missing:
                print(f" + {name} ({info['type']}, {info['count']}次)")
            # Append the new fields to the MD document.
            updated = update_md_with_missing_fields(table_name, real_missing, md_content)
            if updated:
                print(f" 📝 已补充到 MD 文档")
        else:
            print(f" ✅ 字段完全覆盖")
        if extra_in_md:
            print(f" MD 多 {len(extra_in_md)} 个条件性字段")
        results.append({
            "table": table_name, "name_zh": name_zh,
            "status": status,
            "record_count": len(records),
            "json_field_count": len(json_fields),
            "md_field_count": len(md_fields),
            "data_path": entry.get("data_path"),
            "missing_in_md": real_missing,
            "extra_in_md": extra_in_md,
            "top5_field_counts": top5_counts,
            "nested_summary": nested_summary,
        })
    # ── Generate the Markdown report ──
    print(f"\n{'=' * 80}")
    print("生成对比报告...")
    report = generate_report(results)
    os.makedirs(REPORT_DIR, exist_ok=True)
    report_path = os.path.join(REPORT_DIR, "api_json_vs_md_report_20260214.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f"报告: {report_path}")
    # Machine-readable detail file (strip example strings to stay compact).
    json_path = os.path.join(REPORT_DIR, "api_refresh_detail_20260214.json")
    serializable = []
    for r in results:
        sr = dict(r)
        if "missing_in_md" in sr and sr["missing_in_md"]:
            sr["missing_in_md"] = [(n, {"type": i["type"], "count": i["count"]})
                                   for n, i in sr["missing_in_md"]]
        serializable.append(sr)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(serializable, f, ensure_ascii=False, indent=2)
    # Final console summary by status.
    ok = sum(1 for r in results if r["status"] == "ok")
    gap = sum(1 for r in results if r["status"] == "gap")
    skip = sum(1 for r in results if r["status"] == "skipped")
    err = sum(1 for r in results if r["status"] == "error")
    print(f"\n汇总: ✅ {ok} | ⚠️ {gap} | ⏭️ {skip} | 💥 {err}")
if __name__ == "__main__":
main()