Files
Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/check_json_vs_md.py

206 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
"""
比对 JSON 样本字段 vs API 参考文档(.md)字段。
找出 JSON 中存在但 .md 文档"四、响应字段详解"中缺失的字段。
特殊处理:
- settlement_records / recharge_settlements: 从 settleList 内层提取字段
  (siteProfile 子字段不提取,ODS 中存为 siteprofile jsonb 列)
- stock_goods_category_tree: 从 goodsCategoryList 内层提取字段
- role_area_association: 从 roleAreaRelations 内层提取字段
- 嵌套对象(siteProfile, tableProfile)作为整体字段名
"""
import json
import os
import re
import sys
# Markdown API reference documents, one `<table>.md` file per table.
DOCS_DIR = os.path.join("docs", "api-reference")
# JSON response samples sit in a sub-directory of the docs, one
# `<table>.json` file per table.
SAMPLES_DIR = os.path.join(DOCS_DIR, "samples")

# Structural wrapper keys (they should not take part in the comparison).
# Each key is registered both in its API spelling and in lower case, because
# callers test lower-cased names against this set.
WRAPPER_FIELDS = {
    spelling
    for key in (
        "settleList",
        "siteProfile",
        "tableProfile",
        "goodsCategoryList",
        "data",
        "code",
        "msg",
    )
    for spelling in (key, key.lower())
}

# Table-header keywords (skipped when parsing .md tables).
# NOTE: "type" must not be listed here, because some tables have a business
# field that is literally named `type`.
CROSS_REF_HEADERS = {
    "字段名",
    "类型",
    "示例值",
    "说明",
    "field",
    "example",
    "description",
}
def _first_record(value):
    """Normalise a JSON value to a single record (dict).

    A dict is returned unchanged; a non-empty list yields its first element
    when that element is a dict; anything else (``None``, an empty list, a
    scalar) yields an empty dict.
    """
    if isinstance(value, list):
        value = value[0] if value else None
    return value if isinstance(value, dict) else {}


def extract_json_fields(table_name: str) -> set:
    """Extract all field names (lower-cased) from a table's JSON sample.

    The sample is read from ``SAMPLES_DIR/<table_name>.json``.

    Special cases:
    - settlement_records / recharge_settlements: fields come from the record
      inside ``settleList``; every key is kept, so ``siteProfile`` appears as
      one field and its sub-fields are not expanded.
    - stock_goods_category_tree / role_area_association: fields come from the
      first record of ``goodsCategoryList`` / ``roleAreaRelations``, with all
      names in ``WRAPPER_FIELDS`` removed.
    - any other table: top-level keys, with names in ``WRAPPER_FIELDS``
      removed except ``siteprofile`` and ``tableprofile``, which are kept as
      whole fields.

    Wrapper values may be a dict, a list of dicts (first element is used) or
    missing/``null``; a top-level JSON array is treated the same way.

    Args:
        table_name: Sample file name without the ``.json`` extension.

    Returns:
        Set of lower-cased field names; empty when the sample file does not
        exist or contains no usable record.
    """
    path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
    if not os.path.exists(path):
        return set()
    with open(path, "r", encoding="utf-8") as f:
        data = _first_record(json.load(f))

    # Settlement tables: the record lives inside settleList. No wrapper
    # filtering here, so siteProfile is reported as a single field.
    if table_name in ("settlement_records", "recharge_settlements"):
        settle = _first_record(data.get("settleList"))
        return {key.lower() for key in settle}

    # Tables whose records are nested in a list under a single wrapper key.
    inner_list_key = {
        "stock_goods_category_tree": "goodsCategoryList",
        "role_area_association": "roleAreaRelations",
    }.get(table_name)
    if inner_list_key is not None:
        record = _first_record(data.get(inner_list_key))
        return {
            key.lower()
            for key in record
            if key.lower() not in WRAPPER_FIELDS
        }

    # Generic tables: top-level keys; nested profile objects count as one
    # field each, every other wrapper key is dropped.
    kept_wrappers = ("siteprofile", "tableprofile")
    return {
        key.lower()
        for key in data
        if key.lower() not in WRAPPER_FIELDS or key.lower() in kept_wrappers
    }
def extract_md_fields(table_name: str) -> set:
    """Extract field names (lower-cased) from a table's .md reference.

    Reads ``DOCS_DIR/<table_name>.md`` and scans only the chapter whose
    heading starts with ``## 四、`` and contains ``响应字段``; scanning stops at
    the next ``## `` heading. A field is any table row whose first cell is a
    back-quoted name.

    Skipped rows:
    - names listed in ``CROSS_REF_HEADERS`` (compared case-insensitively);
    - names in ``WRAPPER_FIELDS`` other than ``siteprofile``/``tableprofile``;
    - for settlement_records / recharge_settlements only: every row inside a
      ``###...siteProfile`` sub-section and every ``siteProfile.<x>`` row
      (matched case-insensitively, like the other comparisons here).

    Args:
        table_name: Document file name without the ``.md`` extension.

    Returns:
        Set of lower-cased field names; empty when the document does not
        exist or has no matching chapter.
    """
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return set()

    field_pattern = re.compile(r'^\|\s*`([^`]+)`\s*\|')
    siteprofile_header = re.compile(r'^###.*siteProfile', re.IGNORECASE)
    # Loop invariants, computed once instead of once per table row.
    header_words = {h.lower() for h in CROSS_REF_HEADERS}
    kept_wrappers = ("siteprofile", "tableprofile")
    is_settlement = table_name in ("settlement_records", "recharge_settlements")

    fields = set()
    in_section = False
    in_siteprofile = False
    with open(md_path, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()
            if s.startswith("## 四、") and "响应字段" in s:
                in_section = True
                in_siteprofile = False
                continue
            # The next level-2 heading ends the chapter.
            if in_section and s.startswith("## ") and not s.startswith("## 四"):
                break
            if not in_section:
                continue

            # Track the siteProfile sub-section (settlement tables only).
            if is_settlement:
                if siteprofile_header.search(s):
                    in_siteprofile = True
                    continue
                # Any other level-3 heading closes the sub-section.
                if in_siteprofile and s.startswith("### "):
                    in_siteprofile = False

            m = field_pattern.match(s)
            if not m:
                continue
            name = m.group(1).strip().lower()
            if name in header_words:
                continue
            # siteProfile sub-fields are not compared for settlement tables.
            if is_settlement and (in_siteprofile or name.startswith("siteprofile.")):
                continue
            if name in WRAPPER_FIELDS and name not in kept_wrappers:
                continue
            fields.add(name)
    return fields
def main():
    """Compare every JSON sample with its .md document and report the gaps.

    For each ``*.json`` file in ``SAMPLES_DIR`` the field sets returned by
    ``extract_json_fields`` and ``extract_md_fields`` are diffed in both
    directions. A human-readable report is printed to stdout and the full
    result list is written to ``docs/reports/json_vs_md_gaps.json`` (the
    directory is created when missing).
    """
    suffix = ".json"
    # Strip only the trailing extension; str.replace would also remove any
    # ".json" that occurs earlier in the file name.
    samples = sorted(
        name[: -len(suffix)]
        for name in os.listdir(SAMPLES_DIR)
        if name.endswith(suffix)
    )

    results = []
    for table in samples:
        json_fields = extract_json_fields(table)
        md_fields = extract_md_fields(table)
        results.append({
            "table": table,
            "json_count": len(json_fields),
            "md_count": len(md_fields),
            # Present in the JSON sample but missing from the .md document.
            "json_only": sorted(json_fields - md_fields),
            # Present in the .md document only (possibly conditional fields;
            # informational).
            "md_only": sorted(md_fields - json_fields),
        })

    # Console report.
    print("=" * 80)
    print("JSON 样本 vs .md 文档 字段比对报告")
    print("=" * 80)
    issues = 0
    for r in results:
        if r["json_only"]:
            # Only JSON -> MD gaps count as issues.
            issues += 1
            print(f"\n{r['table']} — JSON={r['json_count']}, MD={r['md_count']}")
            print(f" JSON 中有但 .md 缺失 ({len(r['json_only'])} 个):")
            for field in r["json_only"]:
                print(f" - {field}")
            if r["md_only"]:
                print(f" .md 中有但 JSON 无 ({len(r['md_only'])} 个,可能是条件性字段):")
                for field in r["md_only"]:
                    print(f" - {field}")
        else:
            status = "" if not r["md_only"] else "⚠️"
            extra = ""
            if r["md_only"]:
                extra = f" (.md 多 {len(r['md_only'])} 个条件性字段)"
            print(f"\n{status} {r['table']} — JSON={r['json_count']}, MD={r['md_count']}{extra}")
    print(f"\n{'=' * 80}")
    print(f"总计: {len(results)} 个表, {issues} 个有 JSON→MD 缺失")

    # Machine-readable copy of the results for follow-up processing.
    out_path = os.path.join("docs", "reports", "json_vs_md_gaps.json")
    # The reports directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\n详细结果已写入: {out_path}")


if __name__ == "__main__":
    main()
# AI_CHANGELOG:
# - 日期: 2026-02-14
# - Prompt: P20260214-044500 — "md文档和json数据不对应全面排查"
# - 直接原因: 用户要求全面排查 JSON 样本与 .md 文档的字段一致性
# - 变更摘要: 新建脚本,从 JSON 样本提取字段与 .md 文档"响应字段详解"章节比对;
#   修复 3 个 bug(type 过滤、siteProfile/tableProfile 例外、roleAreaRelations 包装器)
# - 风险与验证: 纯分析脚本,无运行时影响;运行 `python scripts/check_json_vs_md.py` 验证输出