# -*- coding: utf-8 -*-
"""
Compare the fields of JSON samples against the API reference docs (.md).

Reports fields that exist in a JSON sample but are missing from the
"四、响应字段详解" (response field details) section of the matching .md file.

Special cases:
- settlement_records / recharge_settlements: fields are read from inside
  settleList; siteProfile sub-fields are not expanded (the ODS stores them
  in a single siteprofile jsonb column).
- stock_goods_category_tree: fields are read from inside goodsCategoryList.
- role_area_association: fields are read from inside roleAreaRelations.
- Nested objects (siteProfile, tableProfile) count as one field each.
"""
import json
import os
import re
import sys

SAMPLES_DIR = os.path.join("docs", "api-reference", "samples")
DOCS_DIR = os.path.join("docs", "api-reference")

# Structural wrapper fields (must not take part in the comparison).
WRAPPER_FIELDS = {"settleList", "siteProfile", "tableProfile", "goodsCategoryList",
                  "data", "code", "msg",
                  "settlelist", "siteprofile", "tableprofile", "goodscategorylist"}

# Table-header keywords (skipped). Note: "type" must NOT be listed here,
# because some tables have a real business field called `type`.
CROSS_REF_HEADERS = {"字段名", "类型", "示例值", "说明", "field", "example", "description"}

# Tables whose record lives inside the "settleList" wrapper.
_SETTLE_TABLES = ("settlement_records", "recharge_settlements")

# Tables whose record lives inside some other list-valued wrapper key.
_LIST_WRAPPED_TABLES = {
    "stock_goods_category_tree": "goodsCategoryList",
    "role_area_association": "roleAreaRelations",
}

# Wrapper names that are nevertheless kept, as a single whole-object field.
_NESTED_AS_WHOLE = ("siteprofile", "tableprofile")

# A markdown table row whose first cell is a back-ticked field name.
_FIELD_ROW = re.compile(r'^\|\s*`([^`]+)`\s*\|')
# A "###..." heading that mentions siteProfile (case-insensitive).
_SITEPROFILE_HEADING = re.compile(r'^###.*siteProfile', re.IGNORECASE)


def _first_record(value):
    """Return the record dict held by *value*.

    *value* may be the record itself (a dict) or a list whose first item is
    the record. Anything else (None, an empty list, a list of non-dicts)
    yields an empty dict, so callers can iterate the result unconditionally.
    """
    if isinstance(value, list):
        value = value[0] if value else {}
    return value if isinstance(value, dict) else {}


def extract_json_fields(table_name: str) -> set:
    """Return the lower-cased field names found in the table's JSON sample.

    The sample is read from ``SAMPLES_DIR/<table_name>.json``. A missing
    sample file yields an empty set.
    """
    path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
    if not os.path.exists(path):
        return set()
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # settlement_records / recharge_settlements: the record is inside
    # settleList. Every key is kept, including siteProfile, which stands for
    # the whole jsonb column; its sub-fields are never visited.
    if table_name in _SETTLE_TABLES:
        record = _first_record(data.get("settleList"))
        return {key.lower() for key in record}

    # stock_goods_category_tree / role_area_association: the record is the
    # first item of the wrapper list; wrapper-named keys are dropped.
    wrapper_key = _LIST_WRAPPED_TABLES.get(table_name)
    if wrapper_key is not None:
        record = _first_record(data.get(wrapper_key))
        return {
            key.lower() for key in record
            if key.lower() not in WRAPPER_FIELDS
        }

    # Generic case: top-level keys. Wrapper keys are dropped, except the
    # nested objects that count as one whole field.
    fields = set()
    for key in data:
        lowered = key.lower()
        if lowered in WRAPPER_FIELDS and lowered not in _NESTED_AS_WHOLE:
            continue
        fields.add(lowered)
    return fields


def extract_md_fields(table_name: str) -> set:
    """Return the lower-cased field names documented in the table's .md file.

    Only table rows inside the "## 四、...响应字段..." section are read; the
    scan stops at the next "## " heading. A missing .md file yields an empty
    set.
    """
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return set()
    with open(md_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Built once per call instead of once per table row.
    header_keywords = {h.lower() for h in CROSS_REF_HEADERS}
    is_settle_table = table_name in _SETTLE_TABLES

    fields = set()
    in_section = False
    in_siteprofile = False

    for line in lines:
        s = line.strip()

        if s.startswith("## 四、") and "响应字段" in s:
            in_section = True
            in_siteprofile = False
            continue
        if in_section and s.startswith("## ") and not s.startswith("## 四"):
            break
        if not in_section:
            continue

        # Track the siteProfile sub-section (settle tables only) so that its
        # rows can be skipped below.
        if is_settle_table:
            if _SITEPROFILE_HEADING.search(s):
                in_siteprofile = True
                continue
            # Any other "### " heading closes the siteProfile sub-section.
            if in_siteprofile and s.startswith("### "):
                in_siteprofile = False

        m = _FIELD_ROW.match(s)
        if not m:
            continue
        raw = m.group(1).strip()
        lowered = raw.lower()

        if lowered in header_keywords:
            continue
        # siteProfile sub-fields are stored as one jsonb column, so they are
        # not compared individually.
        if is_settle_table and (in_siteprofile or raw.startswith("siteProfile.")):
            continue
        if lowered in WRAPPER_FIELDS and lowered not in _NESTED_AS_WHOLE:
            continue
        fields.add(lowered)

    return fields


def main():
    """Compare every JSON sample with its .md doc, print a report and save it.

    The per-table results are also written as JSON to
    ``docs/reports/json_vs_md_gaps.json``.
    """
    # splitext removes only the trailing extension; str.replace would also
    # eat a ".json" occurring in the middle of a file name.
    samples = sorted(
        os.path.splitext(name)[0]
        for name in os.listdir(SAMPLES_DIR)
        if name.endswith(".json")
    )

    results = []
    for table in samples:
        json_fields = extract_json_fields(table)
        md_fields = extract_md_fields(table)

        # Present in the JSON sample but missing from the .md file.
        json_only = json_fields - md_fields
        # Present in the .md file but not in the JSON sample (possibly
        # conditional fields; informational only).
        md_only = md_fields - json_fields

        results.append({
            "table": table,
            "json_count": len(json_fields),
            "md_count": len(md_fields),
            "json_only": sorted(json_only),
            "md_only": sorted(md_only),
        })

    # Console report.
    print("=" * 80)
    print("JSON 样本 vs .md 文档 字段比对报告")
    print("=" * 80)

    issues = 0
    for r in results:
        if r["json_only"]:
            issues += 1
            print(f"\n❌ {r['table']} — JSON={r['json_count']}, MD={r['md_count']}")
            print(f" JSON 中有但 .md 缺失 ({len(r['json_only'])} 个):")
            for name in r["json_only"]:
                print(f" - {name}")
            if r["md_only"]:
                print(f" .md 中有但 JSON 无 ({len(r['md_only'])} 个,可能是条件性字段):")
                for name in r["md_only"]:
                    print(f" - {name}")
        else:
            status = "✅" if not r["md_only"] else "⚠️"
            extra = ""
            if r["md_only"]:
                extra = f" (.md 多 {len(r['md_only'])} 个条件性字段)"
            print(f"\n{status} {r['table']} — JSON={r['json_count']}, MD={r['md_count']}{extra}")

    print(f"\n{'=' * 80}")
    print(f"总计: {len(results)} 个表, {issues} 个有 JSON→MD 缺失")

    # Machine-readable copy for later processing. open(..., "w") does not
    # create parent directories, so make sure the report directory exists.
    out_path = os.path.join("docs", "reports", "json_vs_md_gaps.json")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n详细结果已写入: {out_path}")


if __name__ == "__main__":
    main()

# AI_CHANGELOG:
# - Date: 2026-02-14
# - Prompt: P20260214-044500 — "md文档和json数据不对应!全面排查"
#   (the md docs and the JSON data do not match; audit everything)
# - Direct cause: the user asked for a full audit of field consistency
#   between the JSON samples and the .md docs
# - Change summary: new script that extracts fields from the JSON samples and
#   compares them with the "响应字段详解" section of the .md docs;
#   fixed 3 bugs (type filtering, siteProfile/tableProfile exception,
#   roleAreaRelations wrapper)
# - Risk and verification: analysis-only script, no runtime impact; run
#   `python scripts/check_json_vs_md.py` and check the output