init: 项目初始提交 - NeoZQYY Monorepo 完整代码

2026-02-15 14:58:14 +08:00
commit ded6dfb9d8
769 changed files with 182616 additions and 0 deletions
--- a/apps/etl/pipelines/feiqiu/scripts/check_json_vs_md.py
+++ b/apps/etl/pipelines/feiqiu/scripts/check_json_vs_md.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+"""
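+Compare the fields of the JSON samples against the API reference docs (.md).
+Report fields that exist in a JSON sample but are missing from the
+"四、响应字段详解" (response field details) section of the matching .md file.
+
+Special handling:
+- settlement_records / recharge_settlements: fields are taken from inside
+  settleList; siteProfile sub-fields are not extracted (it is stored in ODS
+  as a single siteprofile jsonb column)
+- stock_goods_category_tree: fields are taken from inside goodsCategoryList
+- role_area_association: fields are taken from inside roleAreaRelations
+- Nested objects (siteProfile, tableProfile) count as one field name each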
+"""
+import json
+import os
+import re
+import sys
+
+# Both locations are relative paths, so they resolve against the current
+# working directory of the process running this script.
+SAMPLES_DIR = os.path.join("docs", "api-reference", "samples")
+DOCS_DIR = os.path.join("docs", "api-reference")
+
+# Structural wrapper keys that should not take part in the comparison.
+# Each key is listed in its original camelCase and in lowercase, because
+# callers test lowercased names against this set.
+WRAPPER_FIELDS = {"settleList", "siteProfile", "tableProfile",
+                  "goodsCategoryList", "data", "code", "msg",
+                  "settlelist", "siteprofile", "tableprofile",
+                  "goodscategorylist"}
+
+# Table-header keywords to skip when parsing markdown tables.
+# Note: "type" must NOT be added here, because some tables have a real
+# business field named "type".
+CROSS_REF_HEADERS = {"字段名", "类型", "示例值", "说明", "field", "example", "description"}
+
+
+def extract_json_fields(table_name: str) -> set:
+    """Return the lowercased field names found in a table's JSON sample.
+
+    The sample is read from ``SAMPLES_DIR/<table_name>.json``.
+
+    Args:
+        table_name: Base name of the sample file (without ``.json``).
+
+    Returns:
+        A set of lowercased key names. Empty when the sample file does not
+        exist or when the expected inner record list is missing/empty.
+
+    Extraction rules, by table:
+    - ``settlement_records`` / ``recharge_settlements``: keys of the
+      ``settleList`` record (its first element when it is a list). Nested
+      objects such as ``siteProfile`` are kept as a single name; their
+      sub-keys are never descended into.
+    - ``stock_goods_category_tree`` / ``role_area_association``: keys of the
+      first element of ``goodsCategoryList`` / ``roleAreaRelations``, minus
+      every name in ``WRAPPER_FIELDS``.
+    - Anything else: top-level keys, minus ``WRAPPER_FIELDS`` except that
+      ``siteprofile`` and ``tableprofile`` are kept as whole-object fields.
+    """
+    path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
+    if not os.path.exists(path):
+        return set()
+
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    # Settlement tables: the real record lives inside settleList.
+    if table_name in ("settlement_records", "recharge_settlements"):
+        # `or {}` also covers an explicit JSON null, which would otherwise
+        # fail on key iteration.
+        settle = data.get("settleList") or {}
+        if isinstance(settle, list):
+            settle = settle[0] if settle else {}
+        # No wrapper filtering here: siteProfile is intentionally reported
+        # as one field (the whole jsonb column).
+        return {k.lower() for k in settle}
+
+    # Tables whose records are wrapped in a named list: use the first record.
+    inner_list_key = {
+        "stock_goods_category_tree": "goodsCategoryList",
+        "role_area_association": "roleAreaRelations",
+    }.get(table_name)
+    if inner_list_key is not None:
+        records = data.get(inner_list_key, [])
+        if not records:
+            return set()
+        return {k.lower() for k in records[0]
+                if k.lower() not in WRAPPER_FIELDS}
+
+    # Generic case: top-level keys. Wrapper keys are dropped, except the two
+    # nested objects that count as fields in their own right.
+    fields = set()
+    for k in data:
+        kl = k.lower()
+        if kl not in WRAPPER_FIELDS or kl in ("siteprofile", "tableprofile"):
+            fields.add(kl)
+    return fields
+
+
+def extract_md_fields(table_name: str) -> set:
+    """Return the lowercased field names documented for a table.
+
+    Reads ``DOCS_DIR/<table_name>.md`` and scans only the section that starts
+    at a heading beginning with ``## 四、`` and containing ``响应字段``; the
+    scan stops at the next ``## `` heading that does not start with ``## 四``.
+    A field is any table row whose first cell is a back-quoted name.
+
+    Args:
+        table_name: Base name of the markdown file (without ``.md``).
+
+    Returns:
+        A set of lowercased field names. Empty when the file does not exist
+        or the section is absent.
+
+    Rows are skipped when:
+    - the name is a table-header keyword (``CROSS_REF_HEADERS``);
+    - for ``settlement_records`` / ``recharge_settlements``, the row sits
+      under a siteProfile sub-heading or the name starts with
+      ``siteProfile.`` (compared case-insensitively, like every other
+      comparison in this function);
+    - the name is in ``WRAPPER_FIELDS``, other than ``siteprofile`` and
+      ``tableprofile`` which are kept as whole-object fields.
+    """
+    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
+    if not os.path.exists(md_path):
+        return set()
+
+    with open(md_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    field_pattern = re.compile(r'^\|\s*`([^`]+)`\s*\|')
+    siteprofile_header = re.compile(r'^###.*siteProfile', re.IGNORECASE)
+    # Loop invariants, computed once instead of per matched row.
+    header_words = {h.lower() for h in CROSS_REF_HEADERS}
+    kept_wrappers = ("siteprofile", "tableprofile")
+    skip_siteprofile = table_name in ("settlement_records",
+                                      "recharge_settlements")
+
+    fields = set()
+    in_section = False
+    in_siteprofile = False
+
+    for line in lines:
+        s = line.strip()
+
+        if s.startswith("## 四、") and "响应字段" in s:
+            in_section = True
+            in_siteprofile = False
+            continue
+
+        if not in_section:
+            continue
+
+        # Any other level-2 heading ends the section.
+        if s.startswith("## ") and not s.startswith("## 四"):
+            break
+
+        # Track whether we are inside a siteProfile sub-section.
+        if skip_siteprofile:
+            if siteprofile_header.search(s):
+                in_siteprofile = True
+                continue
+            # A level-3 heading that is not siteProfile closes the
+            # sub-section (deeper headings leave the state unchanged).
+            if s.startswith("### "):
+                in_siteprofile = False
+
+        m = field_pattern.match(s)
+        if not m:
+            continue
+
+        name = m.group(1).strip().lower()
+        if name in header_words:
+            continue
+        if skip_siteprofile and (in_siteprofile
+                                 or name.startswith("siteprofile.")):
+            continue
+        if name in WRAPPER_FIELDS and name not in kept_wrappers:
+            continue
+        fields.add(name)
+
+    return fields
+
+
+def main():
+    """Compare every JSON sample with its .md doc and report the gaps.
+
+    For each ``*.json`` file in ``SAMPLES_DIR`` this computes the fields
+    present only in the JSON sample and the fields present only in the
+    markdown doc, prints a human-readable report to stdout, and writes the
+    full result list to ``docs/reports/json_vs_md_gaps.json``.
+
+    Only JSON-only fields count as issues; doc-only fields are reported as
+    possibly conditional fields.
+    """
+    suffix = ".json"
+    # Strip only the trailing extension; str.replace would also remove
+    # ".json" occurring earlier in the file name.
+    samples = sorted(
+        name[:-len(suffix)]
+        for name in os.listdir(SAMPLES_DIR)
+        if name.endswith(suffix)
+    )
+
+    results = []
+    for table in samples:
+        json_fields = extract_json_fields(table)
+        md_fields = extract_md_fields(table)
+
+        # Present in the JSON sample but not documented.
+        json_only = json_fields - md_fields
+        # Documented but absent from the sample (may be conditional fields;
+        # informational only).
+        md_only = md_fields - json_fields
+
+        results.append({
+            "table": table,
+            "json_count": len(json_fields),
+            "md_count": len(md_fields),
+            "json_only": sorted(json_only),
+            "md_only": sorted(md_only),
+        })
+
+    # Console report
+    print("=" * 80)
+    print("JSON 样本 vs .md 文档 字段比对报告")
+    print("=" * 80)
+
+    issues = 0
+    for r in results:
+        if r["json_only"]:
+            issues += 1
+            print(f"\n❌ {r['table']} — JSON={r['json_count']}, MD={r['md_count']}")
+            print(f"   JSON 中有但 .md 缺失 ({len(r['json_only'])} 个):")
+            for name in r["json_only"]:
+                print(f"     - {name}")
+            if r["md_only"]:
+                print(f"   .md 中有但 JSON 无 ({len(r['md_only'])} 个，可能是条件性字段):")
+                for name in r["md_only"]:
+                    print(f"     - {name}")
+        else:
+            status = "✅" if not r["md_only"] else "⚠️"
+            extra = ""
+            if r["md_only"]:
+                extra = f" (.md 多 {len(r['md_only'])} 个条件性字段)"
+            print(f"\n{status} {r['table']} — JSON={r['json_count']}, MD={r['md_count']}{extra}")
+
+    print(f"\n{'=' * 80}")
+    print(f"总计: {len(results)} 个表, {issues} 个有 JSON→MD 缺失")
+
+    # Machine-readable copy of the results for follow-up processing.
+    out_path = os.path.join("docs", "reports", "json_vs_md_gaps.json")
+    # Create the reports directory if needed so the write cannot fail with
+    # FileNotFoundError on a fresh checkout.
+    os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    print(f"\n详细结果已写入: {out_path}")
+
+
+# Run the comparison only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+
+# AI_CHANGELOG:
+# - 日期: 2026-02-14
+# - Prompt: P20260214-044500 — "md文档和json数据不对应！全面排查"
+# - 直接原因: 用户要求全面排查 JSON 样本与 .md 文档的字段一致性
+# - 变更摘要: 新建脚本，从 JSON 样本提取字段与 .md 文档"响应字段详解"章节比对；
+#   修复 3 个 bug（type 过滤、siteProfile/tableProfile 例外、roleAreaRelations 包装器）
+# - 风险与验证: 纯分析脚本，无运行时影响；运行 `python scripts/check_json_vs_md.py` 验证输出