微信小程序页面迁移校验之前 P5任务处理之前

2026-03-09 01:19:21 +08:00
parent 263bf96035
commit 6e20987d2f
1112 changed files with 153824 additions and 219694 deletions
--- a/apps/etl/connectors/feiqiu/quality/consistency_checker.py
+++ b/apps/etl/connectors/feiqiu/quality/consistency_checker.py
@@ -71,7 +71,6 @@ class ConsistencyReport:
 ODS_TABLE_TO_JSON_FILE: Dict[str, str] = {
    "assistant_accounts_master": "assistant_accounts_master.json",
    "assistant_service_records": "assistant_service_records.json",
-    "assistant_cancellation_records": "assistant_cancellation_records.json",
    "member_profiles": "member_profiles.json",
    "member_stored_value_cards": "member_stored_value_cards.json",
    "member_balance_changes": "member_balance_changes.json",
@@ -93,6 +92,35 @@ ODS_TABLE_TO_JSON_FILE: Dict[str, str] = {
    "stock_goods_category_tree": "stock_goods_category_tree.json",
 }

+# CHANGE 2026-02-26 | Maps ODS table name → task_code; used to locate the paged JSON under FETCH_ROOT
+# FETCH_ROOT directory layout: {task_code}/{task_code}-{run_id}-{date}-{time}/{ods_table}.json
+ODS_TABLE_TO_TASK_CODE: Dict[str, str] = {
+    "assistant_accounts_master": "ODS_ASSISTANT_ACCOUNT",
+    "assistant_service_records": "ODS_ASSISTANT_LEDGER",
+    "member_profiles": "ODS_MEMBER",
+    "member_stored_value_cards": "ODS_MEMBER_CARD",
+    "member_balance_changes": "ODS_MEMBER_BALANCE",
+    "recharge_settlements": "ODS_RECHARGE_SETTLE",
+    "settlement_records": "ODS_SETTLEMENT_RECORDS",
+    "table_fee_transactions": "ODS_TABLE_USE",
+    "table_fee_discount_records": "ODS_TABLE_FEE_DISCOUNT",
+    "store_goods_sales_records": "ODS_STORE_GOODS_SALES",
+    "store_goods_master": "ODS_STORE_GOODS",
+    "tenant_goods_master": "ODS_TENANT_GOODS",
+    "site_tables_master": "ODS_TABLES",
+    "group_buy_packages": "ODS_GROUP_PACKAGE",
+    "group_buy_redemption_records": "ODS_GROUP_BUY_REDEMPTION",
+    "platform_coupon_redemption_records": "ODS_PLATFORM_COUPON",
+    "payment_transactions": "ODS_PAYMENT",
+    "refund_transactions": "ODS_REFUND",
+    "goods_stock_summary": "ODS_INVENTORY_STOCK",
+    "goods_stock_movements": "ODS_INVENTORY_CHANGE",
+    "stock_goods_category_tree": "ODS_GOODS_CATEGORY",
+    "staff_info_master": "ODS_STAFF_INFO",
+    "settlement_ticket_records": "ODS_SETTLEMENT_TICKET",
+    "json_archive_records": "ODS_JSON_ARCHIVE",
+}
+
 # ODS 元数据列——不来自 API，由 ETL 框架自动填充
 ODS_META_COLUMNS = frozenset({
    "payload", "source_file", "source_endpoint",
@@ -145,6 +173,86 @@ def _extract_records(data: Any) -> list[dict]:
    return []


+def extract_api_fields_from_fetch_root(
+    fetch_root: Path,
+    ods_table: str,
+) -> set[str] | None:
+    """Extract raw API field names from the paged JSON under FETCH_ROOT.
+
+    CHANGE 2026-02-26 | Replaces the API_SAMPLE_CACHE_ROOT dependency of
+    extract_api_fields_from_json by reading the paged JSON that ETL actually fetched.
+    Returns the union of keys of the first 10 records, or None when no usable data is found.
+    Layout: FETCH_ROOT/{task_code}/{task_code}-{run_id}-{date}-{time}/{ods_table}.json
+    Paged JSON shape: { "pages": [{ "response": { "data": { "{listKey}": [...] } } }] }
+    """
+    task_code = ODS_TABLE_TO_TASK_CODE.get(ods_table)
+    if not task_code:
+        return None
+
+    task_dir = fetch_root / task_code
+    if not task_dir.is_dir():
+        return None
+
+    # Pick the newest run directory: sort by name, descending (the name format embeds a timestamp)
+    run_dirs = sorted(
+        (d for d in task_dir.iterdir() if d.is_dir()),
+        key=lambda d: d.name,
+        reverse=True,
+    )
+    if not run_dirs:
+        return None
+
+    # Look for {ods_table}.json in that run directory only; older runs are not consulted
+    json_file = run_dirs[0] / f"{ods_table}.json"
+    if not json_file.exists():
+        return None
+
+    try:
+        with json_file.open("r", encoding="utf-8") as f:
+            data = json.load(f)
+    except (json.JSONDecodeError, OSError):
+        return None
+
+    records = _extract_records_from_paged_json(data)
+    if not records:
+        return None
+
+    all_fields: set[str] = set()
+    for rec in records[:10]:
+        if isinstance(rec, dict):
+            all_fields.update(rec.keys())
+    return all_fields
+
+
+def _extract_records_from_paged_json(data: Any) -> list[dict]:
+    """Extract business records from an ETL paged JSON document.
+
+    Paged JSON shape (records come from the first page that yields any; [] if no page does):
+      { "pages": [{ "response": { "data": { "{listKey}": [record, ...] } } }] }
+    Also accepts the flat cache format of gen_full_dataflow_doc (a bare list / {"data": [...]}).
+    """
+    if not isinstance(data, dict):
+        return _extract_records(data)
+
+    pages = data.get("pages")
+    if not isinstance(pages, list) or not pages:
+        # Fall back to the flat format
+        return _extract_records(data)
+
+    # Take the records from the first page that has data
+    for page in pages:
+        if not isinstance(page, dict):
+            continue
+        response = page.get("response")
+        if not isinstance(response, dict):
+            continue
+        records = _extract_records(response)
+        if records:
+            return records
+
+    return []
+
+
 def check_api_vs_ods_fields(
    api_fields: set[str],
    ods_columns: set[str],
@@ -494,6 +602,7 @@ def run_consistency_check(
    db_conn,
    *,
    api_sample_dir: Path | None = None,
+    fetch_root: Path | None = None,
    include_api_vs_ods: bool = True,
    include_ods_vs_dwd: bool = True,
    sample_limit: int = 5,
@@ -504,7 +613,8 @@ def run_consistency_check(

    参数:
        db_conn: 数据库连接对象（需有 .conn 属性返回 psycopg2 connection）
-        api_sample_dir: API JSON 缓存目录（用于 API vs ODS 检查）
+        api_sample_dir: API JSON 缓存目录（旧方式，兼容保留）
+        fetch_root: FETCH_ROOT 目录（优先使用，从 ETL 实际抓取的分页 JSON 提取字段）
        include_api_vs_ods: 是否执行 API vs ODS 检查
        include_ods_vs_dwd: 是否执行 ODS vs DWD 检查
        sample_limit: 值不一致时的采样行数
@@ -519,16 +629,28 @@ def run_consistency_check(

    with db_conn.conn.cursor() as cur:
        # --- 1. API vs ODS 字段完整性检查 ---
-        if include_api_vs_ods and api_sample_dir:
+        # CHANGE 2026-02-26 | 优先从 FETCH_ROOT 读取实际抓取数据，回退到 api_sample_dir 缓存
+        if include_api_vs_ods and (fetch_root or api_sample_dir):
            for ods_table, json_file in sorted(ODS_TABLE_TO_JSON_FILE.items()):
-                json_path = api_sample_dir / json_file
-                api_fields = extract_api_fields_from_json(json_path)
+                # 优先尝试 FETCH_ROOT（ETL 实际抓取的分页 JSON）
+                api_fields = None
+                source_hint = ""
+                if fetch_root:
+                    api_fields = extract_api_fields_from_fetch_root(fetch_root, ods_table)
+                    source_hint = "FETCH_ROOT"
+
+                # 回退到 api_sample_dir（gen_full_dataflow_doc 缓存）
+                if api_fields is None and api_sample_dir:
+                    json_path = api_sample_dir / json_file
+                    api_fields = extract_api_fields_from_json(json_path)
+                    source_hint = "API_SAMPLE_CACHE"
+
                if api_fields is None:
                    result = TableCheckResult(
                        table_name=f"ods.{ods_table}",
                        check_type="api_vs_ods",
-                        passed=True,  # 无 JSON 缓存时跳过，不算失败
-                        error=f"API JSON 缓存不存在: {json_file}",
+                        passed=True,  # 无 JSON 数据时跳过，不算失败
+                        error=f"无可用 JSON 数据（FETCH_ROOT 和 API 缓存均未找到）",
                    )
                    report.api_vs_ods_results.append(result)
                    continue