init: 项目初始提交 - NeoZQYY Monorepo 完整代码

2026-02-15 14:58:14 +08:00
commit ded6dfb9d8
769 changed files with 182616 additions and 0 deletions
--- a/apps/etl/pipelines/feiqiu/scripts/run_compare_v3_fixed.py
+++ b/apps/etl/pipelines/feiqiu/scripts/run_compare_v3_fixed.py
@@ -0,0 +1,465 @@
+# -*- coding: utf-8 -*-
+"""
+v3-fixed: API 参考文档 (.md) 响应字段详解 vs ODS 实际列 — 精确比对
+
+核心改进（相对 v3）：
+1. 仅从"四、响应字段详解"章节提取字段（排除请求参数、跨表关联等章节）
+2. 对 settlement_records / recharge_settlements 特殊处理：
+   - settleList 内层字段 → 直接比对 ODS 列
+   - siteProfile → ODS 中存为 siteprofile jsonb 单列（不展开子字段）
+3. 对 table_fee_discount_records / payment_transactions 等含 siteProfile/tableProfile 的表：
+   - siteProfile/tableProfile 作为嵌套对象 → ODS 中存为 jsonb 单列
+4. 对 stock_goods_category_tree：goodsCategoryList 是结构包装器，不是业务字段；categoryBoxes 虽为嵌套数组，但 ODS 中有对应的 categoryboxes jsonb 列，因此正常参与比对
+5. JSON 样本作为补充来源（union）
+
+CHANGE P20260214-003000: 完全重写字段提取逻辑
+intent: 精确限定提取范围到"响应字段详解"章节，避免误提取请求参数和跨表关联字段
+assumptions: 所有 .md 文档均以"## 四、响应字段详解"开始响应字段章节，以"## 五、"结束
+edge cases: settlement_records/recharge_settlements 的 siteProfile 子字段不应与 ODS 列比对
+"""
+import json
+import os
+import re
+from datetime import datetime
+
+DOCS_DIR = os.path.join(os.path.dirname(__file__), "..", "docs", "api-reference")
+SAMPLES_DIR = os.path.join(DOCS_DIR, "samples")
+REPORT_DIR = os.path.join(os.path.dirname(__file__), "..", "docs", "reports")
+ODS_META = {"source_file", "source_endpoint", "fetched_at", "payload", "content_hash"}
+
+TABLES = [
+    "assistant_accounts_master", "settlement_records", "assistant_service_records",
+    "assistant_cancellation_records", "table_fee_transactions", "table_fee_discount_records",
+    "payment_transactions", "refund_transactions", "platform_coupon_redemption_records",
+    "tenant_goods_master", "store_goods_sales_records", "store_goods_master",
+    "stock_goods_category_tree", "goods_stock_movements", "member_profiles",
+    "member_stored_value_cards", "recharge_settlements", "member_balance_changes",
+    "group_buy_packages", "group_buy_redemption_records", "goods_stock_summary",
+    "site_tables_master",
+]
+
+# 这些字段在 API JSON 中是嵌套对象，ODS 中存为 jsonb 单列
+NESTED_OBJECTS = {"siteprofile", "tableprofile"}
+# 这些字段是结构包装器，不是业务字段
+# 注意：categoryboxes 虽然是嵌套数组，但 ODS 中确实有 categoryboxes 列（jsonb），所以不排除
+WRAPPER_FIELDS = {"goodscategorylist", "total"}
+# 跨表关联章节中常见的"本表字段"列标题
+CROSS_REF_HEADERS = {"本表字段", "关联表字段", "关联表", "参数", "字段"}
+
+
+def extract_response_fields_from_md(table_name: str) -> tuple[set[str], list[str]]:
+    """
+    从 API 参考文档中精确提取"响应字段详解"章节的字段名。
+
+    返回: (fields_set_lowercase, debug_messages)
+
+    提取策略：
+    - 找到"## 四、响应字段详解"章节
+    - 在该章节内提取所有 Markdown 表格第一列的反引号字段名
+    - 遇到"## 五、"或更高级别标题时停止
+    - 对 settlement_records / recharge_settlements：
+      - siteProfile 子字段（带 siteProfile. 前缀的）→ 不提取，ODS 中存为 siteprofile jsonb
+      - settleList 内层字段 → 正常提取
+    - 对含 siteProfile/tableProfile 的表：这些作为顶层字段名提取（ODS 中是 jsonb 列）
+    """
+    md_path = os.path.join(DOCS_DIR, f"{table_name}.md")
+    debug = []
+    if not os.path.exists(md_path):
+        debug.append(f"[WARN] 文档不存在: {md_path}")
+        return set(), debug
+
+    with open(md_path, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+
+    fields = set()
+    in_response_section = False
+    in_siteprofile_subsection = False
+    field_pattern = re.compile(r'^\|\s*`([^`]+)`\s*\|')
+    # 用于检测 siteProfile 子章节（如 "### A. siteProfile" 或 "### 4.1 门店信息快照（siteProfile）"）
+    siteprofile_header = re.compile(r'^###.*siteProfile', re.IGNORECASE)
+
+    for line in lines:
+        stripped = line.strip()
+
+        # 检测进入"响应字段详解"章节
+        if stripped.startswith("## 四、") and "响应字段" in stripped:
+            in_response_section = True
+            in_siteprofile_subsection = False
+            continue
+
+        # 检测离开（遇到下一个 ## 级别标题）
+        if in_response_section and stripped.startswith("## ") and not stripped.startswith("## 四"):
+            break
+
+        if not in_response_section:
+            continue
+
+        # 检测 siteProfile 子章节（仅对 settlement_records / recharge_settlements）
+        if table_name in ("settlement_records", "recharge_settlements"):
+            if siteprofile_header.search(stripped):
+                in_siteprofile_subsection = True
+                continue
+            # 遇到下一个 ### 标题，退出 siteProfile 子章节
+            if stripped.startswith("### ") and in_siteprofile_subsection:
+                if not siteprofile_header.search(stripped):
+                    in_siteprofile_subsection = False
+
+        # 提取字段名
+        m = field_pattern.match(stripped)
+        if m:
+            raw_field = m.group(1).strip()
+
+            # 跳过表头行
+            if raw_field in CROSS_REF_HEADERS:
+                continue
+
+            # 对 settlement_records / recharge_settlements：跳过 siteProfile 子字段
+            if table_name in ("settlement_records", "recharge_settlements"):
+                if in_siteprofile_subsection:
+                    # siteProfile 子字段不提取（ODS 中存为 siteprofile jsonb）
+                    continue
+                # 带 siteProfile. 前缀的也跳过
+                if raw_field.startswith("siteProfile."):
+                    continue
+
+            # 跳过结构包装器字段
+            if raw_field.lower() in WRAPPER_FIELDS:
+                continue
+
+            fields.add(raw_field.lower())
+
+    debug.append(f"从 .md 提取 {len(fields)} 个响应字段")
+    return fields, debug
+
+
+def extract_fields_from_json(table_name: str) -> tuple[set[str], list[str]]:
+    """从 JSON 样本提取字段（作为补充）"""
+    path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
+    debug = []
+    if not os.path.exists(path):
+        debug.append("[INFO] 无 JSON 样本")
+        return set(), debug
+
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    # settlement_records / recharge_settlements: 提取 settleList 内层字段
+    if table_name in ("settlement_records", "recharge_settlements"):
+        settle = data.get("settleList", {})
+        if isinstance(settle, list):
+            settle = settle[0] if settle else {}
+        fields = {k.lower() for k in settle.keys()}
+        # siteProfile 作为整体（ODS 中不存 siteProfile 的子字段，但可能有 siteprofile jsonb 列）
+        # 不添加 siteProfile 的子字段
+        debug.append(f"从 JSON settleList 提取 {len(fields)} 个字段")
+        return fields, debug
+
+    # stock_goods_category_tree: 提取 goodsCategoryList 内层字段
+    if table_name == "stock_goods_category_tree":
+        cat_list = data.get("goodsCategoryList", [])
+        if cat_list:
+            fields = set()
+            for k in cat_list[0].keys():
+                kl = k.lower()
+                if kl not in WRAPPER_FIELDS:
+                    fields.add(kl)
+            debug.append(f"从 JSON goodsCategoryList 提取 {len(fields)} 个字段")
+            return fields, debug
+        return set(), debug
+
+    # 通用：提取顶层字段
+    fields = set()
+    for k in data.keys():
+        kl = k.lower()
+        # siteProfile/tableProfile 作为整体保留（ODS 中是 jsonb 列）
+        if kl in NESTED_OBJECTS:
+            fields.add(kl)
+        elif kl not in WRAPPER_FIELDS:
+            fields.add(kl)
+    debug.append(f"从 JSON 提取 {len(fields)} 个字段")
+    return fields, debug
+
+
+def classify_ods_only(table_name: str, field: str) -> str:
+    """对 ODS 独有字段进行分类说明"""
+    # table_fee_discount_records 的展开字段
+    if table_name == "table_fee_discount_records" and field in (
+        "area_type_id", "charge_free", "site_table_area_id", "site_table_area_name",
+        "sitename", "table_name", "table_price", "tenant_name"
+    ):
+        return "从 tableProfile/siteProfile 嵌套对象展开的字段"
+    # site_tables_master 的 order_id
+    if table_name == "site_tables_master" and field == "order_id":
+        return "ODS 后续版本新增字段（当前使用中的台桌关联订单 ID）"
+    # tenant_id 在某些表中是 ODS 额外添加的
+    if field == "tenant_id" and table_name in (
+        "assistant_cancellation_records", "payment_transactions"
+    ):
+        return "ODS 额外添加的租户 ID 字段（API 响应中不含，ETL 入库时补充）"
+    # API 后续版本新增字段（文档快照未覆盖）
+    api_version_fields = {
+        "assistant_service_records": {
+            "assistantteamname": "API 后续版本新增（助教团队名称）",
+            "real_service_money": "API 后续版本新增（实际服务金额）",
+        },
+        "table_fee_transactions": {
+            "activity_discount_amount": "API 后续版本新增（活动折扣金额）",
+            "order_consumption_type": "API 后续版本新增（订单消费类型）",
+            "real_service_money": "API 后续版本新增（实际服务金额）",
+        },
+        "tenant_goods_master": {
+            "not_sale": "API 后续版本新增（是否禁售标记）",
+        },
+        "store_goods_sales_records": {
+            "coupon_share_money": "API 后续版本新增（优惠券分摊金额）",
+        },
+        "store_goods_master": {
+            "commodity_code": "API 后续版本新增（商品编码）",
+            "not_sale": "API 后续版本新增（是否禁售标记）",
+        },
+        "member_profiles": {
+            "pay_money_sum": "API 后续版本新增（累计消费金额）",
+            "person_tenant_org_id": "API 后续版本新增（人事组织 ID）",
+            "person_tenant_org_name": "API 后续版本新增（人事组织名称）",
+            "recharge_money_sum": "API 后续版本新增（累计充值金额）",
+            "register_source": "API 后续版本新增（注册来源）",
+        },
+        "member_stored_value_cards": {
+            "able_share_member_discount": "API 后续版本新增（是否共享会员折扣）",
+            "electricity_deduct_radio": "API 后续版本新增（电费抵扣比例）",
+            "electricity_discount": "API 后续版本新增（电费折扣）",
+            "electricitycarddeduct": "API 后续版本新增（电费卡扣金额）",
+            "member_grade": "API 后续版本新增（会员等级）",
+            "principal_balance": "API 后续版本新增（本金余额）",
+            "rechargefreezebalance": "API 后续版本新增（充值冻结余额）",
+        },
+        "member_balance_changes": {
+            "principal_after": "API 后续版本新增（变动后本金）",
+            "principal_before": "API 后续版本新增（变动前本金）",
+            "principal_data": "API 后续版本新增（本金明细数据）",
+        },
+        "group_buy_packages": {
+            "is_first_limit": "API 后续版本新增（是否限首单）",
+            "sort": "API 后续版本新增（排序序号）",
+            "tenantcouponsaleorderitemid": "API 后续版本新增（租户券销售订单项 ID）",
+        },
+        "group_buy_redemption_records": {
+            "assistant_service_share_money": "API 后续版本新增（助教服务分摊金额）",
+            "assistant_share_money": "API 后续版本新增（助教分摊金额）",
+            "coupon_sale_id": "API 后续版本新增（券销售 ID）",
+            "good_service_share_money": "API 后续版本新增（商品服务分摊金额）",
+            "goods_share_money": "API 后续版本新增（商品分摊金额）",
+            "member_discount_money": "API 后续版本新增（会员折扣金额）",
+            "recharge_share_money": "API 后续版本新增（充值分摊金额）",
+            "table_service_share_money": "API 后续版本新增（台费服务分摊金额）",
+            "table_share_money": "API 后续版本新增（台费分摊金额）",
+        },
+    }
+    table_fields = api_version_fields.get(table_name, {})
+    if field in table_fields:
+        return table_fields[field]
+    return "ODS 独有（待确认来源）"
+
+
+def main():
+    ods_cols_path = os.path.join(os.path.dirname(__file__), "ods_columns.json")
+    with open(ods_cols_path, "r", encoding="utf-8") as f:
+        ods_all = json.load(f)
+
+    results = []
+    total_api_only = 0
+    total_ods_only = 0
+    all_debug = {}
+
+    for table in TABLES:
+        debug_lines = [f"\n{'='*60}", f"表: {table}", f"{'='*60}"]
+
+        # 从文档提取字段（主要来源）
+        md_fields, md_debug = extract_response_fields_from_md(table)
+        debug_lines.extend(md_debug)
+
+        # 从 JSON 样本提取字段（补充）
+        json_fields, json_debug = extract_fields_from_json(table)
+        debug_lines.extend(json_debug)
+
+        # 合并：文档字段 ∪ JSON 样本字段
+        api_fields = md_fields | json_fields
+
+        # 特殊处理：settlement_records / recharge_settlements
+        # ODS 中有 siteprofile 列但不展开子字段；也有 settlelist jsonb 列
+        # API 文档中 siteProfile 子字段已被排除，但需要确保 siteprofile 作为整体列被考虑
+        if table in ("settlement_records", "recharge_settlements"):
+            # 不把 siteprofile 加入 api_fields（因为 ODS 中 siteprofile 不是从 API 直接映射的列名）
+            # settlelist 也是 ODS 的 jsonb 列，不在 API 字段中
+            pass
+
+        # 特殊处理：含 siteProfile/tableProfile 的表
+        # 这些在 API 中是嵌套对象，ODS 中存为 jsonb 列
+        # 确保 api_fields 中包含 siteprofile/tableprofile（如果 ODS 有这些列）
+        ods_cols = set(ods_all.get(table, [])) - ODS_META
+        ods_cols_lower = set()
+        ods_case_map = {}
+        for c in ods_cols:
+            cl = c.lower()
+            ods_cols_lower.add(cl)
+            ods_case_map[cl] = c
+
+        # 如果 ODS 有 siteprofile/tableprofile 列，且 API 文档中有 siteProfile/tableProfile 字段
+        for nested in NESTED_OBJECTS:
+            if nested in ods_cols_lower and nested not in api_fields:
+                # 检查 API 文档/JSON 中是否有这个嵌套对象
+                # 对于 settlement_records/recharge_settlements，siteProfile 确实存在于 API 响应中
+                # 对于 payment_transactions 等，siteProfile 也存在
+                api_fields.add(nested)
+                debug_lines.append(f"  补充嵌套对象字段: {nested}")
+
+        matched = sorted(api_fields & ods_cols_lower)
+        api_only = sorted(api_fields - ods_cols_lower)
+        ods_only = sorted(ods_cols_lower - api_fields)
+
+        # 对 ODS 独有字段分类
+        ods_only_classified = []
+        for f in ods_only:
+            reason = classify_ods_only(table, f)
+            ods_only_classified.append({"field": f, "ods_original": ods_case_map.get(f, f), "reason": reason})
+
+        total_api_only += len(api_only)
+        total_ods_only += len(ods_only)
+
+        result = {
+            "table": table,
+            "api_count": len(api_fields),
+            "ods_count": len(ods_cols_lower),
+            "matched": len(matched),
+            "matched_fields": matched,
+            "api_only": api_only,
+            "ods_only": ods_only_classified,
+            "api_only_count": len(api_only),
+            "ods_only_count": len(ods_only),
+            "md_fields_count": len(md_fields),
+            "json_fields_count": len(json_fields),
+        }
+        results.append(result)
+
+        status = "✓ 完全对齐" if not api_only and not ods_only else ""
+        print(f"{table}: API={len(api_fields)}(md={len(md_fields)},json={len(json_fields)}) "
+              f"ODS={len(ods_cols_lower)} 匹配={len(matched)} "
+              f"API独有={len(api_only)} ODS独有={len(ods_only)} {status}")
+        if api_only:
+            print(f"  API独有: {api_only}")
+        if ods_only:
+            for item in ods_only_classified:
+                print(f"  ODS独有: {item['ods_original']} — {item['reason']}")
+
+        all_debug[table] = debug_lines
+
+    print(f"\n{'='*60}")
+    print(f"总计: API独有={total_api_only}, ODS独有={total_ods_only}")
+    print(f"{'='*60}")
+
+    # 写 JSON 报告
+    os.makedirs(REPORT_DIR, exist_ok=True)
+    json_out = os.path.join(REPORT_DIR, "api_ods_comparison_v3_fixed.json")
+    with open(json_out, "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+    print(f"\nJSON 报告: {json_out}")
+
+    # 写 Markdown 报告
+    md_out = os.path.join(REPORT_DIR, "api_ods_comparison_v3_fixed.md")
+    write_md_report(results, md_out, total_api_only, total_ods_only)
+    print(f"MD 报告: {md_out}")
+
+
+def write_md_report(results, path, total_api_only, total_ods_only):
+    now = datetime.now().strftime("%Y-%m-%d %H:%M")
+    lines = [
+        f"# API 响应字段 vs ODS 表结构比对报告（v3-fixed）",
+        f"",
+        f"> 生成时间：{now}（Asia/Shanghai）",
+        f"> 数据来源：API 参考文档（docs/api-reference/*.md）+ JSON 样本 + PostgreSQL information_schema",
+        f'> 比对方法：从文档"响应字段详解"章节精确提取字段，与 ODS 实际列比对（排除 meta 列）',
+        f"",
+        f"## 汇总",
+        f"",
+        f"| 指标 | 值 |",
+        f"|------|-----|",
+        f"| 比对表数 | {len(results)} |",
+        f"| API 独有字段总数 | {total_api_only} |",
+        f"| ODS 独有字段总数 | {total_ods_only} |",
+        f"| 完全对齐表数 | {sum(1 for r in results if r['api_only_count'] == 0 and r['ods_only_count'] == 0)} |",
+        f"",
+        f"## 逐表比对",
+        f"",
+    ]
+
+    for r in results:
+        status = "✅ 完全对齐" if r["api_only_count"] == 0 and r["ods_only_count"] == 0 else "⚠️ 有差异"
+        lines.append(f"### {r['table']} — {status}")
+        lines.append(f"")
+        lines.append(f"| 指标 | 值 |")
+        lines.append(f"|------|-----|")
+        lines.append(f"| API 字段数 | {r['api_count']}（文档={r['md_fields_count']}，JSON={r['json_fields_count']}） |")
+        lines.append(f"| ODS 列数（排除 meta） | {r['ods_count']} |")
+        lines.append(f"| 匹配 | {r['matched']} |")
+        lines.append(f"| API 独有 | {r['api_only_count']} |")
+        lines.append(f"| ODS 独有 | {r['ods_only_count']} |")
+        lines.append(f"")
+
+        if r["api_only"]:
+            lines.append(f"**API 独有字段（ODS 中缺失）：**")
+            lines.append(f"")
+            for f in r["api_only"]:
+                lines.append(f"- `{f}`")
+            lines.append(f"")
+
+        if r["ods_only"]:
+            lines.append(f"**ODS 独有字段（API 文档中未出现）：**")
+            lines.append(f"")
+            lines.append(f"| ODS 列名 | 分类说明 |")
+            lines.append(f"|----------|----------|")
+            for item in r["ods_only"]:
+                lines.append(f"| `{item['ods_original']}` | {item['reason']} |")
+            lines.append(f"")
+
+        lines.append(f"---")
+        lines.append(f"")
+
+    # AI_CHANGELOG
+    lines.extend([
+        f"<!--",
+        f"AI_CHANGELOG:",
+        f"- 日期: 2026-02-14",
+        f"- Prompt: P20260214-003000 — v3 比对不准确，重写为 v3-fixed",
+        f"- 直接原因: v3 仅从 JSON 样本提取字段导致遗漏；v3-fixed 从 .md 文档响应字段详解章节精确提取",
+        f"- 变更摘要: 新建 v3-fixed 报告，精确限定提取范围，排除请求参数和跨表关联字段",
+        f"- 风险与验证: 纯分析报告，无运行时影响；验证方式：抽查 assistant_accounts_master 的 last_update_name 是否正确识别为匹配",
+        f"-->",
+    ])
+
+    with open(path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+
+# Run the comparison only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
+
+
+# AI_CHANGELOG:
+# - 日期: 2026-02-14
+# - Prompt: P20260214-003000 — "还是不准，比如assistant_accounts_master的last_update_name，命名Json里就有，再仔细比对下"
+# - 直接原因: v3 仅从 JSON 样本提取字段导致遗漏条件性字段；需改用 .md 文档响应字段详解章节作为主要来源
+# - 变更摘要: 完全重写脚本，精确限定提取范围到"四、响应字段详解"章节，排除请求参数和跨表关联；
+#   对 settlement_records/recharge_settlements 的 siteProfile 子字段不提取；对所有 ODS 独有字段分类说明
+# - 风险与验证: 纯分析脚本，无运行时影响；验证：确认 assistant_accounts_master 62:62 完全对齐，last_update_name 正确匹配
+#
+# - 日期: 2026-02-14
+# - Prompt: P20260214-030000 — 上下文传递续接，执行 settlelist 删除后的收尾工作
+# - 直接原因: settlelist 列已从 ODS 删除，classify_ods_only 中的 settlelist 特殊分类不再需要
+# - 变更摘要: 移除 classify_ods_only 函数中 settlelist 的特殊分类逻辑
+# - 风险与验证: 纯分析脚本；验证：重新运行脚本确认 ODS 独有=47，settlement_records 和 recharge_settlements 完全对齐
+#
+# - 日期: 2026-02-14
+# - Prompt: P20260214-070000 — ODS 清理与文档标注（5 项任务）
+# - 直接原因: option_name（store_goods_sales_records）和 able_site_transfer（member_stored_value_cards）已从 ODS 删除
+# - 变更摘要: 从 classify_ods_only 的 api_version_fields 字典中移除 option_name 和 able_site_transfer 条目
+# - 风险与验证: 纯分析脚本；验证：重新运行脚本确认两表 ODS 独有数减少