feiqiu-ETL/etl_billiards/tasks/manual_ingest_task.py
# -*- coding: utf-8 -*-
import os
import json
from datetime import datetime
from .base_task import BaseTask
from loaders.ods.generic import GenericODSLoader


class ManualIngestTask(BaseTask):
    """
    Task to ingest manually fetched JSON files from a directory into ODS tables.
    """

    FILE_MAPPING = {
        "小票详情": "billiards_ods.ods_ticket_detail",
        "结账记录": "billiards_ods.ods_order_settle",
        "支付记录": "billiards_ods.ods_payment",
        "助教流水": "billiards_ods.ods_assistant_ledger",
        "助教废除": "billiards_ods.ods_assistant_abolish",
        "商品档案": "billiards_ods.ods_goods_ledger",  # Note: this may actually be the dim_product source; mapped to the goods ledger for now, assuming it is sales data
        "库存变化": "billiards_ods.ods_inventory_change",
        "会员档案": "billiards_ods.ods_member",
        "充值记录": "billiards_ods.ods_member_card",  # Approximate mapping
        "团购套餐": "billiards_ods.ods_package_coupon",
        "库存汇总": "billiards_ods.ods_inventory_stock",
    }

    def get_task_code(self) -> str:
        return "MANUAL_INGEST"
    def execute(self) -> dict:
        self.logger.info("Starting Manual Ingest Task")
        # Configurable directory, default to tests/testdata_json for now
        data_dir = self.config.get(
            "manual.data_dir",
            r"c:\dev\LLTQ\ETL\feiqiu-ETL\etl_billiards\tests\testdata_json",
        )
        if not os.path.exists(data_dir):
            self.logger.error(f"Data directory not found: {data_dir}")
            return {"status": "error", "message": "Directory not found"}

        total_files = 0
        total_rows = 0
        for filename in os.listdir(data_dir):
            if not filename.endswith(".json"):
                continue

            # Determine target table
            target_table = None
            for key, table in self.FILE_MAPPING.items():
                if key in filename:
                    target_table = table
                    break
            if not target_table:
                self.logger.warning(f"No mapping found for file: {filename}, skipping.")
                continue
self.logger.info(f"Ingesting {filename} into {target_table}")
try:
with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
data = json.load(f)
if not isinstance(data, list):
data = [data]
# Prepare rows for GenericODSLoader
# We need to adapt the data to what GenericODSLoader expects (or update it)
# GenericODSLoader expects dicts. It handles normalization.
# But we need to ensure the primary keys are present in the payload or extracted.
# The GenericODSLoader might need configuration for PK extraction if it's not standard.
# For now, let's assume the payload IS the row, and we wrap it.
# Actually, GenericODSLoader.upsert_rows expects the raw API result list.
# It calls _normalize_row.
# We need to make sure _normalize_row works for these files.
# Most files have 'id' or similar.
# Let's instantiate a loader for this table
# We need to know the PK for the table.
# This is usually defined in ODS_TASK_CLASSES but here we are dynamic.
# We might need a simpler loader or reuse GenericODSLoader with specific PK config.
# For simplicity, let's use a custom ingestion here that mimics GenericODSLoader but is file-aware.
                rows_to_insert = []
                for item in data:
                    # Extract Store ID (usually in siteProfile or data root)
                    store_id = self._extract_store_id(item) or self.config.get("app.store_id")
                    # Extract PK (id, orderSettleId, etc.)
                    pk_val = self._extract_pk(item, target_table)
                    if not pk_val:
                        # Try to find 'id' in the item
                        pk_val = item.get("id")
                    if not pk_val:
                        # Special case for Ticket Detail
                        if "ods_ticket_detail" in target_table:
                            pk_val = item.get("orderSettleId")
                    if not pk_val:
                        continue

                    row = {
                        "store_id": store_id,
                        "payload": json.dumps(item, ensure_ascii=False),
                        "source_file": filename,
                        "fetched_at": datetime.now(),
                    }
                    # Add specific PK column
                    pk_col = self._get_pk_column(target_table)
                    row[pk_col] = pk_val
                    rows_to_insert.append(row)

                if rows_to_insert:
                    self._bulk_insert(target_table, rows_to_insert)
                    total_rows += len(rows_to_insert)
                total_files += 1
            except Exception as e:
                self.logger.error(f"Error processing {filename}: {e}", exc_info=True)

        return {"status": "success", "files_processed": total_files, "rows_inserted": total_rows}
    def _extract_store_id(self, item):
        # Try common paths
        if "store_id" in item:
            return item["store_id"]
        if "siteProfile" in item and "id" in item["siteProfile"]:
            return item["siteProfile"]["id"]
        if "data" in item and "data" in item["data"] and "siteId" in item["data"]["data"]:
            return item["data"]["data"]["siteId"]
        return None

    def _extract_pk(self, item, table):
        # Helper to find PK based on table
        if "ods_order_settle" in table:
            # Check for nested structure in some files
            if "settleList" in item and "settleList" in item["settleList"]:
                return item["settleList"]["settleList"].get("id")
            return item.get("id")
        return item.get("id")
    def _get_pk_column(self, table):
        # Check more specific table names before their prefixes (e.g. ods_member_card
        # before ods_member), since this is substring matching.
        if "ods_ticket_detail" in table:
            return "order_settle_id"
        if "ods_order_settle" in table:
            return "order_settle_id"
        if "ods_payment" in table:
            return "pay_id"
        if "ods_member_card" in table:
            return "card_id"
        if "ods_member" in table:
            return "member_id"
        if "ods_assistant_ledger" in table:
            return "ledger_id"
        if "ods_assistant_abolish" in table:
            return "abolish_id"
        if "ods_goods_ledger" in table:
            return "order_goods_id"
        if "ods_inventory_change" in table:
            return "change_id"
        if "ods_coupon_verify" in table:
            return "coupon_id"
        if "ods_package_coupon" in table:
            return "package_id"
        return "id"  # Fallback
    def _bulk_insert(self, table, rows):
        if not rows:
            return
        keys = list(rows[0].keys())
        cols = ", ".join(keys)
        vals = ", ".join(f"%({k})s" for k in keys)
        # Determine PK col for conflict
        pk_col = self._get_pk_column(table)
        sql = f"""
            INSERT INTO {table} ({cols})
            VALUES ({vals})
            ON CONFLICT (store_id, {pk_col}) DO UPDATE SET
                payload = EXCLUDED.payload,
                fetched_at = EXCLUDED.fetched_at,
                source_file = EXCLUDED.source_file;
        """
        self.db.batch_execute(sql, rows)
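

# Usage note (assumption): like other tasks in this package, ManualIngestTask is
# expected to be registered under its task code "MANUAL_INGEST" and constructed by
# the task runner, which supplies self.config, self.logger, and self.db via BaseTask.
# The ingest directory can be overridden through the "manual.data_dir" config key;
# otherwise the hardcoded test-data path in execute() is used.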