feiqiu-ETL/etl_billiards/tasks/manual_ingest_task.py
# -*- coding: utf-8 -*-
import os
import json
from datetime import datetime
from .base_task import BaseTask
from loaders.ods.generic import GenericODSLoader


class ManualIngestTask(BaseTask):
    """
    Task to ingest manually fetched JSON files from a directory into ODS tables.
    """

    FILE_MAPPING = {
        "小票详情": "billiards_ods.ods_ticket_detail",
        "结账记录": "billiards_ods.ods_order_settle",
        "支付记录": "billiards_ods.ods_payment",
        "助教流水": "billiards_ods.ods_assistant_ledger",
        "助教废除": "billiards_ods.ods_assistant_abolish",
        "商品档案": "billiards_ods.ods_goods_ledger",  # Note: this may actually be the dim_product source; mapped to the goods ledger for now, assuming it is sales data
        "库存变化": "billiards_ods.ods_inventory_change",
        "会员档案": "billiards_ods.ods_member",
        "充值记录": "billiards_ods.ods_member_card",  # Approximate mapping
        "团购套餐": "billiards_ods.ods_package_coupon",
        "库存汇总": "billiards_ods.ods_inventory_stock",
    }

    def get_task_code(self) -> str:
        return "MANUAL_INGEST"
    def execute(self) -> dict:
        self.logger.info("Starting Manual Ingest Task")
        # Configurable directory, default to tests/testdata_json for now
        data_dir = self.config.get(
            "manual.data_dir",
            r"c:\dev\LLTQ\ETL\feiqiu-ETL\etl_billiards\tests\testdata_json",
        )
        if not os.path.exists(data_dir):
            self.logger.error(f"Data directory not found: {data_dir}")
            return {"status": "error", "message": "Directory not found"}

        total_files = 0
        total_rows = 0
        for filename in os.listdir(data_dir):
            if not filename.endswith(".json"):
                continue

            # Determine target table
            target_table = None
            for key, table in self.FILE_MAPPING.items():
                if key in filename:
                    target_table = table
                    break
            if not target_table:
                self.logger.warning(f"No mapping found for file: {filename}, skipping.")
                continue
self.logger.info(f"Ingesting {filename} into {target_table}")
try:
with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
data = json.load(f)
if not isinstance(data, list):
data = [data]
# Prepare rows for GenericODSLoader
# We need to adapt the data to what GenericODSLoader expects (or update it)
# GenericODSLoader expects dicts. It handles normalization.
# But we need to ensure the primary keys are present in the payload or extracted.
# The GenericODSLoader might need configuration for PK extraction if it's not standard.
# For now, let's assume the payload IS the row, and we wrap it.
# Actually, GenericODSLoader.upsert_rows expects the raw API result list.
# It calls _normalize_row.
# We need to make sure _normalize_row works for these files.
# Most files have 'id' or similar.
# Let's instantiate a loader for this table
# We need to know the PK for the table.
# This is usually defined in ODS_TASK_CLASSES but here we are dynamic.
# We might need a simpler loader or reuse GenericODSLoader with specific PK config.
# For simplicity, let's use a custom ingestion here that mimics GenericODSLoader but is file-aware.
                rows_to_insert = []
                for item in data:
                    # Extract Store ID (usually in siteProfile or data root)
                    store_id = self._extract_store_id(item) or self.config.get("app.store_id")
                    # Extract PK (id, orderSettleId, etc.)
                    pk_val = self._extract_pk(item, target_table)
                    if not pk_val:
                        # Try to find 'id' in the item
                        pk_val = item.get("id")
                    if not pk_val:
                        # Special case for Ticket Detail
                        if "ods_ticket_detail" in target_table:
                            pk_val = item.get("orderSettleId")
                    if not pk_val:
                        continue

                    row = {
                        "store_id": store_id,
                        "payload": json.dumps(item, ensure_ascii=False),
                        "source_file": filename,
                        "fetched_at": datetime.now(),
                    }
                    # Add specific PK column
                    pk_col = self._get_pk_column(target_table)
                    row[pk_col] = pk_val
                    rows_to_insert.append(row)

                if rows_to_insert:
                    self._bulk_insert(target_table, rows_to_insert)
                    total_rows += len(rows_to_insert)
                total_files += 1
            except Exception as e:
                self.logger.error(f"Error processing {filename}: {e}", exc_info=True)

        return {"status": "success", "files_processed": total_files, "rows_inserted": total_rows}
    def _extract_store_id(self, item):
        # Try common paths
        if "store_id" in item:
            return item["store_id"]
        if "siteProfile" in item and "id" in item["siteProfile"]:
            return item["siteProfile"]["id"]
        if "data" in item and "data" in item["data"] and "siteId" in item["data"]["data"]:
            return item["data"]["data"]["siteId"]
        return None

    def _extract_pk(self, item, table):
        # Helper to find PK based on table
        if "ods_order_settle" in table:
            # Check for nested structure in some files
            if "settleList" in item and "settleList" in item["settleList"]:
                return item["settleList"]["settleList"].get("id")
            return item.get("id")
        return item.get("id")
    def _get_pk_column(self, table):
        # Check more specific table names before their prefixes (e.g. ods_member_card
        # before ods_member), since this is substring matching.
        if "ods_ticket_detail" in table:
            return "order_settle_id"
        if "ods_order_settle" in table:
            return "order_settle_id"
        if "ods_payment" in table:
            return "pay_id"
        if "ods_member_card" in table:
            return "card_id"
        if "ods_member" in table:
            return "member_id"
        if "ods_assistant_ledger" in table:
            return "ledger_id"
        if "ods_assistant_abolish" in table:
            return "abolish_id"
        if "ods_goods_ledger" in table:
            return "order_goods_id"
        if "ods_inventory_change" in table:
            return "change_id"
        if "ods_coupon_verify" in table:
            return "coupon_id"
        if "ods_package_coupon" in table:
            return "package_id"
        return "id"  # Fallback
    def _bulk_insert(self, table, rows):
        if not rows:
            return
        keys = list(rows[0].keys())
        cols = ", ".join(keys)
        vals = ", ".join(f"%({k})s" for k in keys)
        # Determine PK col for conflict
        pk_col = self._get_pk_column(table)
        sql = f"""
            INSERT INTO {table} ({cols})
            VALUES ({vals})
            ON CONFLICT (store_id, {pk_col}) DO UPDATE SET
                payload = EXCLUDED.payload,
                fetched_at = EXCLUDED.fetched_at,
                source_file = EXCLUDED.source_file;
        """
        self.db.batch_execute(sql, rows)
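

# Usage note (assumption): like other tasks in this package, ManualIngestTask is
# expected to be registered under its task code "MANUAL_INGEST" and constructed by
# the task runner, which supplies self.config, self.logger, and self.db via BaseTask.
# The ingest directory can be overridden through the "manual.data_dir" config key;
# otherwise the hardcoded test-data path in execute() is used.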