init: 项目初始提交 - NeoZQYY Monorepo 完整代码
This commit is contained in:
205
apps/etl/pipelines/feiqiu/scripts/check_json_vs_md.py
Normal file
205
apps/etl/pipelines/feiqiu/scripts/check_json_vs_md.py
Normal file
@@ -0,0 +1,205 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
比对 JSON 样本字段 vs API 参考文档(.md)字段。
|
||||
找出 JSON 中存在但 .md 文档"四、响应字段详解"中缺失的字段。
|
||||
|
||||
特殊处理:
|
||||
- settlement_records / recharge_settlements: 从 settleList 内层提取字段
|
||||
siteProfile 子字段不提取(ODS 中存为 siteprofile jsonb 列)
|
||||
- stock_goods_category_tree: 从 goodsCategoryList 内层提取字段
|
||||
- 嵌套对象(siteProfile, tableProfile)作为整体字段名
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
# Directories holding the API reference documents and their JSON samples.
# Both paths are relative, so they resolve against the current working directory.
DOCS_DIR = os.path.join("docs", "api-reference")
SAMPLES_DIR = os.path.join(DOCS_DIR, "samples")

# Structural wrapper keys (they must not appear in the comparison).
# Each name is registered both as written and lower-cased.
WRAPPER_FIELDS = {
    variant
    for name in (
        "settleList",
        "siteProfile",
        "tableProfile",
        "goodsCategoryList",
        "data",
        "code",
        "msg",
    )
    for variant in (name, name.lower())
}

# Table-header keywords (skipped). Note: "type" must NOT be listed here,
# because some tables have a real business field named "type".
CROSS_REF_HEADERS = {
    "字段名",
    "类型",
    "示例值",
    "说明",
    "field",
    "example",
    "description",
}
|
||||
|
||||
|
||||
def _first_record(container) -> dict:
    """Return the first element of a list (or the dict itself) as a dict.

    Anything that does not end up being a dict (``None``, an empty list, a
    list whose first element is not an object, a scalar) yields ``{}``.
    """
    if isinstance(container, list):
        container = container[0] if container else {}
    return container if isinstance(container, dict) else {}


def extract_json_fields(table_name: str) -> set:
    """Extract the lower-cased field names from a table's JSON sample.

    The sample is read from ``<SAMPLES_DIR>/<table_name>.json``.

    Where the field names are taken from depends on the table:

    - ``settlement_records`` / ``recharge_settlements``: the keys of the
      ``settleList`` record (its first element when it is a list). No
      wrapper filtering is applied, so ``siteprofile`` is kept as a single
      whole-object field name; its sub-keys are never read.
    - ``stock_goods_category_tree``: the keys of the first
      ``goodsCategoryList`` element, minus ``WRAPPER_FIELDS``.
    - ``role_area_association``: the keys of the first
      ``roleAreaRelations`` element, minus ``WRAPPER_FIELDS``.
    - any other table: the top-level keys, minus ``WRAPPER_FIELDS`` except
      ``siteprofile`` / ``tableprofile``, which are kept as whole-object
      field names.

    Args:
        table_name: Sample base name (file name without ``.json``).

    Returns:
        A set of lower-cased field names. The set is empty when the sample
        file does not exist, when the top-level JSON value is not an
        object, or when the expected wrapper is missing, empty or null.
    """
    path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
    if not os.path.exists(path):
        return set()

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # json.load may return a list or a scalar; only objects carry field names.
    if not isinstance(data, dict):
        return set()

    # settlement_records / recharge_settlements: fields live inside settleList.
    if table_name in ("settlement_records", "recharge_settlements"):
        return {key.lower() for key in _first_record(data.get("settleList"))}

    # Tables whose records are wrapped in a top-level list.
    list_wrappers = {
        "stock_goods_category_tree": "goodsCategoryList",
        "role_area_association": "roleAreaRelations",
    }
    wrapper_key = list_wrappers.get(table_name)
    if wrapper_key is not None:
        return {
            key.lower()
            for key in _first_record(data.get(wrapper_key))
            if key.lower() not in WRAPPER_FIELDS
        }

    # Generic case: top-level keys. Wrapper keys are dropped, except the
    # nested objects that are treated as a single field.
    whole_object_fields = ("siteprofile", "tableprofile")
    return {
        key.lower()
        for key in data
        if key.lower() not in WRAPPER_FIELDS or key.lower() in whole_object_fields
    }
|
||||
|
||||
|
||||
def extract_md_fields(table_name: str) -> set:
    """Extract the lower-cased field names documented in a table's .md file.

    Reads ``<DOCS_DIR>/<table_name>.md`` and scans only the section that
    starts at a ``## 四、`` heading containing ``响应字段`` and ends at the
    next ``## `` heading that does not start with ``## 四``. Within it,
    every table row whose first cell is a back-ticked name contributes
    that name.

    Rows are skipped when the name is:

    - a table-header keyword from ``CROSS_REF_HEADERS`` (case-insensitive);
    - a wrapper key from ``WRAPPER_FIELDS``, except ``siteprofile`` and
      ``tableprofile``, which are kept as whole-object field names;
    - for ``settlement_records`` / ``recharge_settlements`` only: inside a
      ``###`` sub-section whose heading mentions siteProfile, or prefixed
      with ``siteProfile.`` (matched case-insensitively).

    Args:
        table_name: Document base name (file name without ``.md``).

    Returns:
        A set of lower-cased field names; empty when the file does not
        exist or the section is absent.
    """
    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(md_path):
        return set()

    field_pattern = re.compile(r'^\|\s*`([^`]+)`\s*\|')
    siteprofile_header = re.compile(r'^###.*siteProfile', re.IGNORECASE)

    # Loop invariants, computed once instead of per matched row.
    header_keywords = {h.lower() for h in CROSS_REF_HEADERS}
    whole_object_fields = ("siteprofile", "tableprofile")
    skips_siteprofile = table_name in ("settlement_records", "recharge_settlements")

    fields = set()
    in_section = False
    in_siteprofile = False

    with open(md_path, "r", encoding="utf-8") as f:
        for line in f:
            s = line.strip()

            # Entering the response-field section.
            if s.startswith("## 四、") and "响应字段" in s:
                in_section = True
                in_siteprofile = False
                continue

            if not in_section:
                continue

            # Any other level-2 heading ends the section.
            if s.startswith("## ") and not s.startswith("## 四"):
                break

            # Track the siteProfile sub-section for the settlement tables.
            if skips_siteprofile:
                if siteprofile_header.search(s):
                    in_siteprofile = True
                    continue
                if s.startswith("### "):
                    # A non-siteProfile level-3 heading closes the sub-section.
                    in_siteprofile = False

            m = field_pattern.match(s)
            if m is None:
                continue

            name = m.group(1).strip().lower()
            if name in header_keywords:
                continue
            if skips_siteprofile and (in_siteprofile or name.startswith("siteprofile.")):
                continue
            if name in WRAPPER_FIELDS and name not in whole_object_fields:
                continue
            fields.add(name)

    return fields
|
||||
|
||||
|
||||
def main():
    """Compare every JSON sample with its .md document and report the gaps.

    For each ``*.json`` file in ``SAMPLES_DIR`` (processed in sorted order)
    the field names from the sample and from the matching .md document are
    compared. A table counts as an issue only when the sample has fields
    that the document lacks; fields that appear only in the document are
    listed for information.

    A human-readable report is printed to stdout, and the full result list
    is written as JSON to ``docs/reports/json_vs_md_gaps.json`` (relative
    to the current working directory). The report directory is created if
    it does not exist yet.
    """
    suffix = ".json"
    # Strip only the trailing extension; str.replace would also remove any
    # ".json" occurring in the middle of a file name.
    samples = sorted(
        name[:-len(suffix)]
        for name in os.listdir(SAMPLES_DIR)
        if name.endswith(suffix)
    )

    results = []
    for table in samples:
        json_fields = extract_json_fields(table)
        md_fields = extract_md_fields(table)
        results.append({
            "table": table,
            "json_count": len(json_fields),
            "md_count": len(md_fields),
            # Present in the JSON sample but missing from the .md document.
            "json_only": sorted(json_fields - md_fields),
            # Present in the .md document but not in the JSON sample
            # (possibly conditional fields; informational only).
            "md_only": sorted(md_fields - json_fields),
        })

    # Console report.
    print("=" * 80)
    print("JSON 样本 vs .md 文档 字段比对报告")
    print("=" * 80)

    issues = 0
    for r in results:
        if r["json_only"]:
            issues += 1
            print(f"\n❌ {r['table']} — JSON={r['json_count']}, MD={r['md_count']}")
            print(f" JSON 中有但 .md 缺失 ({len(r['json_only'])} 个):")
            for field in r["json_only"]:
                print(f" - {field}")
            if r["md_only"]:
                print(f" .md 中有但 JSON 无 ({len(r['md_only'])} 个,可能是条件性字段):")
                for field in r["md_only"]:
                    print(f" - {field}")
        else:
            status = "✅" if not r["md_only"] else "⚠️"
            extra = ""
            if r["md_only"]:
                extra = f" (.md 多 {len(r['md_only'])} 个条件性字段)"
            print(f"\n{status} {r['table']} — JSON={r['json_count']}, MD={r['md_count']}{extra}")

    print(f"\n{'=' * 80}")
    print(f"总计: {len(results)} 个表, {issues} 个有 JSON→MD 缺失")

    # Machine-readable output for later processing.
    out_path = os.path.join("docs", "reports", "json_vs_md_gaps.json")
    # Make sure the target directory exists so the write cannot fail after
    # the whole comparison has already run.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, ensure_ascii=False, indent=2)
    print(f"\n详细结果已写入: {out_path}")
|
||||
|
||||
|
||||
# Script entry point: run the comparison only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
||||
|
||||
# AI_CHANGELOG:
|
||||
# - 日期: 2026-02-14
|
||||
# - Prompt: P20260214-044500 — "md文档和json数据不对应!全面排查"
|
||||
# - 直接原因: 用户要求全面排查 JSON 样本与 .md 文档的字段一致性
|
||||
# - 变更摘要: 新建脚本,从 JSON 样本提取字段与 .md 文档"响应字段详解"章节比对;
|
||||
# 修复 3 个 bug(type 过滤、siteProfile/tableProfile 例外、roleAreaRelations 包装器)
|
||||
# - 风险与验证: 纯分析脚本,无运行时影响;运行 `python scripts/check_json_vs_md.py` 验证输出
|
||||
Reference in New Issue
Block a user