# Neo-ZQYY/apps/etl/pipelines/feiqiu/scripts/run_compare_v3_fixed.py

# -*- coding: utf-8 -*-
"""
v3-fixed: API 参考文档 (.md) 响应字段详解 vs ODS 实际列 — 精确比对

核心改进（相对 v3）：
1. 仅从"四、响应字段详解"章节提取字段（排除请求参数、跨表关联等章节）
2. 对 settlement_records / recharge_settlements 特殊处理：
   - settleList 内层字段 → 直接比对 ODS 列
   - siteProfile → ODS 中存为 siteprofile jsonb 单列（不展开子字段）
3. 对 table_fee_discount_records / payment_transactions 等含 siteProfile/tableProfile 的表：
   - siteProfile/tableProfile 作为嵌套对象 → ODS 中存为 jsonb 单列
4. 对 stock_goods_category_tree：goodsCategoryList/categoryBoxes 是结构包装器，不是业务字段
5. JSON 样本作为补充来源（union）

CHANGE P20260214-003000: 完全重写字段提取逻辑
intent: 精确限定提取范围到"响应字段详解"章节，避免误提取请求参数和跨表关联字段
assumptions: 所有 .md 文档均以"## 四、响应字段详解"开始响应字段章节，以"## 五、"结束
edge cases: settlement_records/recharge_settlements 的 siteProfile 子字段不应与 ODS 列比对
"""
import json
import os
import re
from datetime import datetime

# All locations are resolved relative to the directory holding this script.
_SCRIPT_DIR = os.path.dirname(__file__)
DOCS_DIR = os.path.join(_SCRIPT_DIR, "..", "docs", "api-reference")
SAMPLES_DIR = os.path.join(DOCS_DIR, "samples")
REPORT_DIR = os.path.join(_SCRIPT_DIR, "..", "docs", "reports")

# ODS bookkeeping columns that are removed from the ODS side before comparing.
ODS_META = {
    "source_file",
    "source_endpoint",
    "fetched_at",
    "payload",
    "content_hash",
}

# Tables to compare; each name doubles as the stem of its .md doc and .json sample.
TABLES = [
    "assistant_accounts_master",
    "settlement_records",
    "assistant_service_records",
    "assistant_cancellation_records",
    "table_fee_transactions",
    "table_fee_discount_records",
    "payment_transactions",
    "refund_transactions",
    "platform_coupon_redemption_records",
    "tenant_goods_master",
    "store_goods_sales_records",
    "store_goods_master",
    "stock_goods_category_tree",
    "goods_stock_movements",
    "member_profiles",
    "member_stored_value_cards",
    "recharge_settlements",
    "member_balance_changes",
    "group_buy_packages",
    "group_buy_redemption_records",
    "goods_stock_summary",
    "site_tables_master",
]

# Nested objects in the API JSON that ODS stores as a single jsonb column.
NESTED_OBJECTS = {
    "siteprofile",
    "tableprofile",
}

# Structural wrappers rather than business fields.
# Note: categoryboxes is a nested array too, but ODS really has a categoryboxes
# (jsonb) column, so it is deliberately NOT listed here.
WRAPPER_FIELDS = {
    "goodscategorylist",
    "total",
}

# Column titles commonly seen in the cross-table-reference chapters; a
# back-ticked first cell equal to one of these is a header, not a field.
CROSS_REF_HEADERS = {
    "本表字段",
    "关联表字段",
    "关联表",
    "参数",
    "字段",
}


def extract_response_fields_from_md(table_name: str) -> tuple[set[str], list[str]]:
    """Collect the field names listed in a table's response-field chapter.

    Reads ``<DOCS_DIR>/<table_name>.md`` and looks only at the chapter whose
    heading starts with "## 四、" and contains "响应字段". Inside that chapter,
    every Markdown table row whose first cell is a back-ticked name contributes
    that name, lower-cased. Scanning ends at the first later "## " heading that
    does not start with "## 四".

    Names are skipped when they appear in CROSS_REF_HEADERS (header rows) or,
    lower-cased, in WRAPPER_FIELDS. For settlement_records and
    recharge_settlements, rows under a "###" heading mentioning siteProfile and
    names prefixed with "siteProfile." are skipped as well.

    Args:
        table_name: Table name; also the stem of the Markdown file to read.

    Returns:
        ``(fields, messages)``: the lower-cased field names and a list of
        debug messages. A missing file yields an empty set and one "[WARN]"
        message.
    """
    messages: list[str] = []
    doc_path = os.path.join(DOCS_DIR, f"{table_name}.md")
    if not os.path.exists(doc_path):
        messages.append(f"[WARN] 文档不存在: {doc_path}")
        return set(), messages

    with open(doc_path, "r", encoding="utf-8") as handle:
        doc_lines = handle.readlines()

    first_cell = re.compile(r'^\|\s*`([^`]+)`\s*\|')
    # Matches sub-headings such as "### A. siteProfile" or
    # "### 4.1 门店信息快照（siteProfile）".
    site_heading = re.compile(r'^###.*siteProfile', re.IGNORECASE)
    # Only these two tables get the siteProfile sub-field suppression.
    hides_site_profile = table_name in ("settlement_records", "recharge_settlements")

    collected = set()
    inside_chapter = False
    inside_site_block = False

    for raw_line in doc_lines:
        text = raw_line.strip()

        # Entering the response-field chapter (re-entering resets sub-state).
        if text.startswith("## 四、") and "响应字段" in text:
            inside_chapter, inside_site_block = True, False
            continue
        if not inside_chapter:
            continue
        # Any other level-2 heading closes the chapter.
        if text.startswith("## ") and not text.startswith("## 四"):
            break

        if hides_site_profile:
            if site_heading.search(text):
                inside_site_block = True
                continue
            # A non-siteProfile "### " heading ends the siteProfile block.
            if text.startswith("### "):
                inside_site_block = False

        hit = first_cell.match(text)
        if hit is None:
            continue
        name = hit.group(1).strip()

        if name in CROSS_REF_HEADERS:
            continue
        if hides_site_profile and (inside_site_block or name.startswith("siteProfile.")):
            continue

        lowered = name.lower()
        if lowered not in WRAPPER_FIELDS:
            collected.add(lowered)

    messages.append(f"从 .md 提取 {len(collected)} 个响应字段")
    return collected, messages


def _first_record(node):
    """Return the dict to read keys from: *node* itself, or its first element.

    A list is reduced to its first element (an empty list to nothing); any
    result that is not a dict (``None``, scalars, ...) becomes an empty dict so
    callers can iterate keys without type checks.
    """
    if isinstance(node, list):
        node = node[0] if node else None
    return node if isinstance(node, dict) else {}


def extract_fields_from_json(table_name: str) -> tuple[set[str], list[str]]:
    """Extract lower-cased field names from a table's JSON sample (supplementary source).

    Reads ``<SAMPLES_DIR>/<table_name>.json``. The sample may be a single
    object or a list of objects (the first one is used).

    - settlement_records / recharge_settlements: keys of ``settleList`` (an
      object, or the first object of a list). Sub-fields of siteProfile are
      not added.
    - stock_goods_category_tree: keys of the first ``goodsCategoryList``
      entry, minus WRAPPER_FIELDS.
    - every other table: top-level keys, minus WRAPPER_FIELDS; names in
      NESTED_OBJECTS are always kept.

    Args:
        table_name: Table name; also the stem of the sample file to read.

    Returns:
        ``(fields, debug)``: the lower-cased field names and a list of debug
        messages. A missing sample yields an empty set and one "[INFO]"
        message.

    Raises:
        json.JSONDecodeError: If the sample file is not valid JSON.
    """
    path = os.path.join(SAMPLES_DIR, f"{table_name}.json")
    debug: list[str] = []
    if not os.path.exists(path):
        debug.append("[INFO] 无 JSON 样本")
        return set(), debug

    with open(path, "r", encoding="utf-8") as f:
        # Tolerate a sample that is a list of records instead of one record.
        data = _first_record(json.load(f))

    # settlement_records / recharge_settlements: use the inner settleList keys.
    if table_name in ("settlement_records", "recharge_settlements"):
        # A missing or null settleList yields no fields instead of crashing.
        settle = _first_record(data.get("settleList"))
        fields = {k.lower() for k in settle}
        debug.append(f"从 JSON settleList 提取 {len(fields)} 个字段")
        return fields, debug

    # stock_goods_category_tree: use the keys of the first category entry.
    if table_name == "stock_goods_category_tree":
        cat_list = data.get("goodsCategoryList")
        if not cat_list:
            debug.append("[INFO] JSON 样本中 goodsCategoryList 为空")
            return set(), debug
        fields = {
            k.lower() for k in _first_record(cat_list)
            if k.lower() not in WRAPPER_FIELDS
        }
        debug.append(f"从 JSON goodsCategoryList 提取 {len(fields)} 个字段")
        return fields, debug

    # Generic case: top-level keys. Nested objects are kept as a whole
    # (per the module notes, ODS stores them as jsonb columns).
    fields = set()
    for k in data:
        kl = k.lower()
        if kl in NESTED_OBJECTS or kl not in WRAPPER_FIELDS:
            fields.add(kl)
    debug.append(f"从 JSON 提取 {len(fields)} 个字段")
    return fields, debug


def classify_ods_only(table_name: str, field: str) -> str:
    """Return a short explanation for a column that exists only on the ODS side.

    Args:
        table_name: ODS table the column belongs to.
        field: Lower-cased column name.

    Returns:
        The explanation registered for this (table, column) pair, or the
        generic "source to be confirmed" text when none is registered.
    """
    fallback = "ODS 独有（待确认来源）"

    # Columns flattened out of the nested tableProfile/siteProfile objects.
    if table_name == "table_fee_discount_records":
        flattened_columns = (
            "area_type_id", "charge_free", "site_table_area_id", "site_table_area_name",
            "sitename", "table_name", "table_price", "tenant_name",
        )
        if field in flattened_columns:
            return "从 tableProfile/siteProfile 嵌套对象展开的字段"
        return fallback

    # order_id on site_tables_master.
    if table_name == "site_tables_master":
        if field == "order_id":
            return "ODS 后续版本新增字段（当前使用中的台桌关联订单 ID）"
        return fallback

    # tenant_id is an extra ODS-side column on these two tables.
    if table_name in ("assistant_cancellation_records", "payment_transactions"):
        if field == "tenant_id":
            return "ODS 额外添加的租户 ID 字段（API 响应中不含，ETL 入库时补充）"
        return fallback

    # Fields added by later API versions (not covered by the doc snapshot).
    later_api_additions = {
        "assistant_service_records": {
            "assistantteamname": "API 后续版本新增（助教团队名称）",
            "real_service_money": "API 后续版本新增（实际服务金额）",
        },
        "table_fee_transactions": {
            "activity_discount_amount": "API 后续版本新增（活动折扣金额）",
            "order_consumption_type": "API 后续版本新增（订单消费类型）",
            "real_service_money": "API 后续版本新增（实际服务金额）",
        },
        "tenant_goods_master": {
            "not_sale": "API 后续版本新增（是否禁售标记）",
        },
        "store_goods_sales_records": {
            "coupon_share_money": "API 后续版本新增（优惠券分摊金额）",
        },
        "store_goods_master": {
            "commodity_code": "API 后续版本新增（商品编码）",
            "not_sale": "API 后续版本新增（是否禁售标记）",
        },
        "member_profiles": {
            "pay_money_sum": "API 后续版本新增（累计消费金额）",
            "person_tenant_org_id": "API 后续版本新增（人事组织 ID）",
            "person_tenant_org_name": "API 后续版本新增（人事组织名称）",
            "recharge_money_sum": "API 后续版本新增（累计充值金额）",
            "register_source": "API 后续版本新增（注册来源）",
        },
        "member_stored_value_cards": {
            "able_share_member_discount": "API 后续版本新增（是否共享会员折扣）",
            "electricity_deduct_radio": "API 后续版本新增（电费抵扣比例）",
            "electricity_discount": "API 后续版本新增（电费折扣）",
            "electricitycarddeduct": "API 后续版本新增（电费卡扣金额）",
            "member_grade": "API 后续版本新增（会员等级）",
            "principal_balance": "API 后续版本新增（本金余额）",
            "rechargefreezebalance": "API 后续版本新增（充值冻结余额）",
        },
        "member_balance_changes": {
            "principal_after": "API 后续版本新增（变动后本金）",
            "principal_before": "API 后续版本新增（变动前本金）",
            "principal_data": "API 后续版本新增（本金明细数据）",
        },
        "group_buy_packages": {
            "is_first_limit": "API 后续版本新增（是否限首单）",
            "sort": "API 后续版本新增（排序序号）",
            "tenantcouponsaleorderitemid": "API 后续版本新增（租户券销售订单项 ID）",
        },
        "group_buy_redemption_records": {
            "assistant_service_share_money": "API 后续版本新增（助教服务分摊金额）",
            "assistant_share_money": "API 后续版本新增（助教分摊金额）",
            "coupon_sale_id": "API 后续版本新增（券销售 ID）",
            "good_service_share_money": "API 后续版本新增（商品服务分摊金额）",
            "goods_share_money": "API 后续版本新增（商品分摊金额）",
            "member_discount_money": "API 后续版本新增（会员折扣金额）",
            "recharge_share_money": "API 后续版本新增（充值分摊金额）",
            "table_service_share_money": "API 后续版本新增（台费服务分摊金额）",
            "table_share_money": "API 后续版本新增（台费分摊金额）",
        },
    }
    return later_api_additions.get(table_name, {}).get(field, fallback)


def main():
    """Compare API response fields with ODS columns for every table in TABLES.

    Steps:
    1. Load ``ods_columns.json`` (next to this script): table name -> column list.
    2. For each table, build the API field set as Markdown-doc fields united
       with JSON-sample fields, and compare it case-insensitively with the ODS
       columns (ODS_META columns removed).
    3. Print a per-table summary and the grand totals to stdout.
    4. Write ``api_ods_comparison_v3_fixed.json`` and ``.md`` into REPORT_DIR.

    "[WARN]" messages returned by the extractors (for example a missing .md
    document), and tables absent from ``ods_columns.json``, are echoed to
    stderr so an incomplete comparison is not mistaken for a real difference.
    """
    import sys  # local import: only needed for the stderr warnings below

    ods_cols_path = os.path.join(os.path.dirname(__file__), "ods_columns.json")
    with open(ods_cols_path, "r", encoding="utf-8") as fh:
        ods_all = json.load(fh)

    results = []
    total_api_only = 0
    total_ods_only = 0

    for table in TABLES:
        # Primary source: the Markdown doc. Supplementary source: the JSON sample.
        md_fields, md_debug = extract_response_fields_from_md(table)
        json_fields, json_debug = extract_fields_from_json(table)

        # Surface problems that would otherwise silently skew the comparison.
        warn_messages = [m for m in md_debug + json_debug if m.startswith("[WARN]")]
        if table not in ods_all:
            warn_messages.append(f"[WARN] ods_columns.json 中缺少表: {table}")
        for message in warn_messages:
            print(f"{table}: {message}", file=sys.stderr)

        # A fresh set, so the extractor results are not mutated below.
        api_fields = md_fields | json_fields

        # Lower-cased ODS column -> original spelling. Sorted so the mapping is
        # deterministic even if two columns differ only by case.
        ods_case_map = {}
        for col in sorted(set(ods_all.get(table, [])) - ODS_META):
            ods_case_map[col.lower()] = col
        ods_cols_lower = set(ods_case_map)

        # siteProfile/tableProfile are nested objects in the API and single
        # jsonb columns in ODS: whenever ODS has such a column, count it as an
        # API field too.
        # NOTE(review): this does not verify that the API doc/sample actually
        # contains the nested object - confirm that is intended.
        api_fields |= NESTED_OBJECTS & ods_cols_lower

        matched = sorted(api_fields & ods_cols_lower)
        api_only = sorted(api_fields - ods_cols_lower)
        ods_only = sorted(ods_cols_lower - api_fields)

        # Attach an explanation to every ODS-only column.
        ods_only_classified = [
            {
                "field": name,
                "ods_original": ods_case_map.get(name, name),
                "reason": classify_ods_only(table, name),
            }
            for name in ods_only
        ]

        total_api_only += len(api_only)
        total_ods_only += len(ods_only)

        results.append({
            "table": table,
            "api_count": len(api_fields),
            "ods_count": len(ods_cols_lower),
            "matched": len(matched),
            "matched_fields": matched,
            "api_only": api_only,
            "ods_only": ods_only_classified,
            "api_only_count": len(api_only),
            "ods_only_count": len(ods_only),
            "md_fields_count": len(md_fields),
            "json_fields_count": len(json_fields),
        })

        status = "✓ 完全对齐" if not api_only and not ods_only else ""
        print(f"{table}: API={len(api_fields)}(md={len(md_fields)},json={len(json_fields)}) "
              f"ODS={len(ods_cols_lower)} 匹配={len(matched)} "
              f"API独有={len(api_only)} ODS独有={len(ods_only)} {status}")
        if api_only:
            print(f"  API独有: {api_only}")
        for item in ods_only_classified:
            print(f"  ODS独有: {item['ods_original']} — {item['reason']}")

    print(f"\n{'='*60}")
    print(f"总计: API独有={total_api_only}, ODS独有={total_ods_only}")
    print(f"{'='*60}")

    # JSON report
    os.makedirs(REPORT_DIR, exist_ok=True)
    json_out = os.path.join(REPORT_DIR, "api_ods_comparison_v3_fixed.json")
    with open(json_out, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=2)
    print(f"\nJSON 报告: {json_out}")

    # Markdown report
    md_out = os.path.join(REPORT_DIR, "api_ods_comparison_v3_fixed.md")
    write_md_report(results, md_out, total_api_only, total_ods_only)
    print(f"MD 报告: {md_out}")


def write_md_report(results, path, total_api_only, total_ods_only):
    """Render the comparison results as a Markdown report and write it to *path*.

    Args:
        results: One dict per table. Keys read here: ``table``, ``api_count``,
            ``ods_count``, ``matched``, ``api_only`` (list of names),
            ``ods_only`` (list of dicts with ``ods_original`` and ``reason``),
            ``api_only_count``, ``ods_only_count``, ``md_fields_count`` and
            ``json_fields_count``.
        path: Destination file; overwritten, written as UTF-8.
        total_api_only: Sum of API-only fields over all tables.
        total_ods_only: Sum of ODS-only columns over all tables.
    """
    from datetime import timedelta, timezone  # local import: extra datetime names

    # The header labels the timestamp "Asia/Shanghai", so compute it in UTC+8
    # explicitly instead of relying on the host's local time zone. A fixed
    # offset is enough here: China Standard Time has no DST.
    shanghai = timezone(timedelta(hours=8))
    now = datetime.now(shanghai).strftime("%Y-%m-%d %H:%M")

    def fully_aligned(row):
        return row["api_only_count"] == 0 and row["ods_only_count"] == 0

    aligned_tables = sum(1 for r in results if fully_aligned(r))

    lines = [
        "# API 响应字段 vs ODS 表结构比对报告（v3-fixed）",
        "",
        f"> 生成时间：{now}（Asia/Shanghai）",
        "> 数据来源：API 参考文档（docs/api-reference/*.md）+ JSON 样本 + PostgreSQL information_schema",
        '> 比对方法：从文档"响应字段详解"章节精确提取字段，与 ODS 实际列比对（排除 meta 列）',
        "",
        "## 汇总",
        "",
        "| 指标 | 值 |",
        "|------|-----|",
        f"| 比对表数 | {len(results)} |",
        f"| API 独有字段总数 | {total_api_only} |",
        f"| ODS 独有字段总数 | {total_ods_only} |",
        f"| 完全对齐表数 | {aligned_tables} |",
        "",
        "## 逐表比对",
        "",
    ]

    for r in results:
        status = "✅ 完全对齐" if fully_aligned(r) else "⚠️ 有差异"
        lines.extend([
            f"### {r['table']} — {status}",
            "",
            "| 指标 | 值 |",
            "|------|-----|",
            f"| API 字段数 | {r['api_count']}（文档={r['md_fields_count']}，JSON={r['json_fields_count']}） |",
            f"| ODS 列数（排除 meta） | {r['ods_count']} |",
            f"| 匹配 | {r['matched']} |",
            f"| API 独有 | {r['api_only_count']} |",
            f"| ODS 独有 | {r['ods_only_count']} |",
            "",
        ])

        if r["api_only"]:
            lines.append("**API 独有字段（ODS 中缺失）：**")
            lines.append("")
            lines.extend(f"- `{name}`" for name in r["api_only"])
            lines.append("")

        if r["ods_only"]:
            lines.append("**ODS 独有字段（API 文档中未出现）：**")
            lines.append("")
            lines.append("| ODS 列名 | 分类说明 |")
            lines.append("|----------|----------|")
            lines.extend(
                f"| `{item['ods_original']}` | {item['reason']} |" for item in r["ods_only"]
            )
            lines.append("")

        lines.append("---")
        lines.append("")

    # AI_CHANGELOG footer, embedded in the report as an HTML comment.
    lines.extend([
        "<!--",
        "AI_CHANGELOG:",
        "- 日期: 2026-02-14",
        "- Prompt: P20260214-003000 — v3 比对不准确，重写为 v3-fixed",
        "- 直接原因: v3 仅从 JSON 样本提取字段导致遗漏；v3-fixed 从 .md 文档响应字段详解章节精确提取",
        "- 变更摘要: 新建 v3-fixed 报告，精确限定提取范围，排除请求参数和跨表关联字段",
        "- 风险与验证: 纯分析报告，无运行时影响；验证方式：抽查 assistant_accounts_master 的 last_update_name 是否正确识别为匹配",
        "-->",
    ])

    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))


if __name__ == "__main__":
    main()


# AI_CHANGELOG:
# - 日期: 2026-02-14
# - Prompt: P20260214-003000 — "还是不准，比如assistant_accounts_master的last_update_name，命名Json里就有，再仔细比对下"
# - 直接原因: v3 仅从 JSON 样本提取字段导致遗漏条件性字段；需改用 .md 文档响应字段详解章节作为主要来源
# - 变更摘要: 完全重写脚本，精确限定提取范围到"四、响应字段详解"章节，排除请求参数和跨表关联；
#   对 settlement_records/recharge_settlements 的 siteProfile 子字段不提取；对所有 ODS 独有字段分类说明
# - 风险与验证: 纯分析脚本，无运行时影响；验证：确认 assistant_accounts_master 62:62 完全对齐，last_update_name 正确匹配
#
# - 日期: 2026-02-14
# - Prompt: P20260214-030000 — 上下文传递续接，执行 settlelist 删除后的收尾工作
# - 直接原因: settlelist 列已从 ODS 删除，classify_ods_only 中的 settlelist 特殊分类不再需要
# - 变更摘要: 移除 classify_ods_only 函数中 settlelist 的特殊分类逻辑
# - 风险与验证: 纯分析脚本；验证：重新运行脚本确认 ODS 独有=47，settlement_records 和 recharge_settlements 完全对齐
#
# - 日期: 2026-02-14
# - Prompt: P20260214-070000 — ODS 清理与文档标注（5 项任务）
# - 直接原因: option_name（store_goods_sales_records）和 able_site_transfer（member_stored_value_cards）已从 ODS 删除
# - 变更摘要: 从 classify_ods_only 的 api_version_fields 字典中移除 option_name 和 able_site_transfer 条目
# - 风险与验证: 纯分析脚本；验证：重新运行脚本确认两表 ODS 独有数减少