feiqiu-ETL/etl_billiards/backups/manual_ingest_task.py
# -*- coding: utf-8 -*-
"""鎵嬪伐绀轰緥鏁版嵁鐏屽叆锛氭寜 schema_ODS_doc.sql 涓婚敭/鍞竴閿壒閲忓啓鍏?ODS銆?""
from __future__ import annotations
import json
import os
from datetime import datetime
from typing import Any, Iterable
from psycopg2.extras import Json
from .base_task import BaseTask
class ManualIngestTask(BaseTask):
"""湴绀轰緥 JSON 鐏屽叆 ODS锛岀淇濊鍚嶃佷富閿佹彃鍏ュ垪涓?schema_ODS_doc.sql 瀵归綈銆?""
def __init__(self, config, db_connection, api_client, logger):
"""鍒濆鍖栫紦瀛橈紝閬垮厤閲嶅鏌ヨ琛ㄧ粨鏋勩€?""
super().__init__(config, db_connection, api_client, logger)
self._table_columns_cache: dict[str, list[str]] = {}
# File keywords -> target table (matching the sample JSON names under C:\dev\LLTQ\export\temp\source-data-doc).
FILE_MAPPING: list[tuple[tuple[str, ...], str]] = [
(("浼氬憳妗f", "member_profiles"), "billiards_ods.member_profiles"),
(("浣欓鍙樻洿璁板綍", "member_balance_changes"), "billiards_ods.member_balance_changes"),
(("鍌ㄥ€煎崱鍒楄〃", "member_stored_value_cards"), "billiards_ods.member_stored_value_cards"),
(("鍏呭€艰褰?, "recharge_settlements"), "billiards_ods.recharge_settlements"),
(("缁撹处璁板綍", "settlement_records"), "billiards_ods.settlement_records"),
(("鍔╂暀搴熼櫎", "assistant_cancellation_records"), "billiards_ods.assistant_cancellation_records"),
(("鍔╂暀璐﹀彿", "assistant_accounts_master"), "billiards_ods.assistant_accounts_master"),
(("鍔╂暀娴佹按", "assistant_service_records"), "billiards_ods.assistant_service_records"),
(("鍙版鍒楄〃", "site_tables_master"), "billiards_ods.site_tables_master"),
(("鍙拌垂鎵撴姌", "table_fee_discount_records"), "billiards_ods.table_fee_discount_records"),
(("鍙拌垂娴佹按", "table_fee_transactions"), "billiards_ods.table_fee_transactions"),
(("搴撳瓨鍙樺寲璁板綍1", "goods_stock_movements"), "billiards_ods.goods_stock_movements"),
(("搴撳瓨鍙樺寲璁板綍2", "stock_goods_category_tree"), "billiards_ods.stock_goods_category_tree"),
(("搴撳瓨姹囨€?, "goods_stock_summary"), "billiards_ods.goods_stock_summary"),
(("鏀粯璁板綍", "payment_transactions"), "billiards_ods.payment_transactions"),
(("閫€娆捐褰?, "refund_transactions"), "billiards_ods.refund_transactions"),
(("骞冲彴楠屽埜璁板綍", "platform_coupon_redemption_records"), "billiards_ods.platform_coupon_redemption_records"),
(("鍥㈣喘濂楅娴佹按", "group_buy_redemption_records"), "billiards_ods.group_buy_packages_ledger"),
(("鍥㈣喘濂楅", "group_buy_packages"), "billiards_ods.group_buy_packages"),
(("灏忕エ璇︽儏", "settlement_ticket_details"), "billiards_ods.settlement_ticket_details"),
(("闂ㄥ簵鍟嗗搧妗f", "store_goods_master"), "billiards_ods.store_goods_master"),
(("鍟嗗搧妗f", "tenant_goods_master"), "billiards_ods.tenant_goods_master"),
(("闂ㄥ簵鍟嗗搧閿€鍞褰?, "store_goods_sales_records"), "billiards_ods.store_goods_sales_records"),
]
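# Matching is a plain substring test, so a file such as "会员档案_20250101.json"
# (hypothetical name for illustration) resolves to billiards_ods.member_profiles.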
# Table specs: pk = primary-key column (None means plain insert, no conflict update); json_cols = fields stored in dedicated JSONB columns.
TABLE_SPECS: dict[str, dict[str, Any]] = {
"billiards_ods.member_profiles": {"pk": "id"},
"billiards_ods.member_balance_changes": {"pk": "id"},
"billiards_ods.member_stored_value_cards": {"pk": "id"},
"billiards_ods.recharge_settlements": {"pk": None, "json_cols": ["settleList", "siteProfile"]},
"billiards_ods.settlement_records": {"pk": None, "json_cols": ["settleList", "siteProfile"]},
"billiards_ods.assistant_cancellation_records": {"pk": "id", "json_cols": ["siteProfile"]},
"billiards_ods.assistant_accounts_master": {"pk": "id"},
"billiards_ods.assistant_service_records": {"pk": "id", "json_cols": ["siteProfile"]},
"billiards_ods.site_tables_master": {"pk": "id"},
"billiards_ods.table_fee_discount_records": {"pk": "id", "json_cols": ["siteProfile", "tableProfile"]},
"billiards_ods.table_fee_transactions": {"pk": "id", "json_cols": ["siteProfile"]},
"billiards_ods.goods_stock_movements": {"pk": "siteGoodsStockId"},
"billiards_ods.stock_goods_category_tree": {"pk": "id", "json_cols": ["categoryBoxes"]},
"billiards_ods.goods_stock_summary": {"pk": "siteGoodsId"},
"billiards_ods.payment_transactions": {"pk": "id", "json_cols": ["siteProfile"]},
"billiards_ods.refund_transactions": {"pk": "id", "json_cols": ["siteProfile"]},
"billiards_ods.platform_coupon_redemption_records": {"pk": "id"},
"billiards_ods.tenant_goods_master": {"pk": "id"},
"billiards_ods.group_buy_packages": {"pk": "id"},
"billiards_ods.group_buy_packages_ledger": {"pk": "id"},
"billiards_ods.settlement_ticket_details": {
"pk": "orderSettleId",
"json_cols": ["memberProfile", "orderItem", "tenantMemberCardLogs"],
},
"billiards_ods.store_goods_master": {"pk": "id"},
"billiards_ods.store_goods_sales_records": {"pk": "id"},
}
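# Tables with pk=None (recharge_settlements, settlement_records) have no usable
# conflict key, so _ingest_table emits a plain INSERT for them instead of an upsert.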
def get_task_code(self) -> str:
"""杩斿洖浠诲姟缂栫爜銆?""
return "MANUAL_INGEST"
def execute(self, cursor_data: dict | None = None) -> dict:
"""浠庣ず鑼冪洰褰曡鍙?JSON锛屾寜琛?涓婚敭鎵归噺鍏ュ簱銆?""
data_dir = (
self.config.get("manual.data_dir")
or self.config.get("pipeline.ingest_source_dir")
or r"c:\dev\LLTQ\ETL\feiqiu-ETL\etl_billiards\tests\testdata_json"
)
if not os.path.exists(data_dir):
self.logger.error("Data directory not found: %s", data_dir)
return {"status": "error", "message": "Directory not found"}
counts = {"fetched": 0, "inserted": 0, "updated": 0, "skipped": 0, "errors": 0}
for filename in sorted(os.listdir(data_dir)):
if not filename.endswith(".json"):
continue
filepath = os.path.join(data_dir, filename)
try:
with open(filepath, "r", encoding="utf-8") as fh:
raw_entries = json.load(fh)
except Exception:
counts["errors"] += 1
self.logger.exception("Failed to read %s", filename)
continue
if not isinstance(raw_entries, list):
raw_entries = [raw_entries]
records = self._extract_records(raw_entries)
if not records:
counts["skipped"] += 1
continue
target_table = self._match_by_filename(filename)
if not target_table:
self.logger.warning("No mapping found for file: %s", filename)
counts["skipped"] += 1
continue
self.logger.info("Ingesting %s into %s", filename, target_table)
try:
inserted, updated = self._ingest_table(target_table, records, filename)
counts["inserted"] += inserted
counts["updated"] += updated
counts["fetched"] += len(records)
except Exception:
counts["errors"] += 1
self.logger.exception("Error processing %s", filename)
self.db.rollback()
continue
try:
self.db.commit()
except Exception:
self.db.rollback()
raise
return {"status": "SUCCESS", "counts": counts}
# ------------------------------------------------------------------ helpers
def _match_by_filename(self, filename: str) -> str | None:
"""鏍规嵁鏂囦欢鍚嶅叧閿瘝鎵惧埌鐩爣琛ㄣ?""
for keywords, table in self.FILE_MAPPING:
if any(keyword and keyword in filename for keyword in keywords):
return table
return None
def _extract_records(self, raw_entries: Iterable[Any]) -> list[dict]:
"""鍏煎澶氱 JSON 缁撴瀯锛屾彁鍙栨垚璁板綍鍒楄〃銆?""
records: list[dict] = []
for entry in raw_entries:
if isinstance(entry, dict):
# If the entry has "data" alongside other keys (e.g. orderSettleId), keep the outer dict so the primary key is not lost.
preferred = entry
if "data" in entry and not any(k not in {"data", "code"} for k in entry.keys()):
preferred = entry["data"]
data = preferred
if isinstance(data, dict):
list_used = False
for v in data.values():
if isinstance(v, list) and v and isinstance(v[0], dict):
records.extend(v)
list_used = True
break
if list_used:
continue
if isinstance(data, list) and data and isinstance(data[0], dict):
records.extend(data)
elif isinstance(data, dict):
records.append(data)
elif isinstance(entry, list):
records.extend([item for item in entry if isinstance(item, dict)])
return records
def _get_table_columns(self, table: str) -> list[str]:
"""鏌ヨ伅_schema锛岃幏鍙栫洰鏍囪鐨勫叏閮ㄥ垪鍚嶏紙鎸夐搴忥級銆?""
if table in self._table_columns_cache:
return self._table_columns_cache[table]
if "." in table:
schema, name = table.split(".", 1)
else:
schema, name = "public", table
sql = """
SELECT column_name, data_type, udt_name
FROM information_schema.columns
WHERE table_schema = %s AND table_name = %s
ORDER BY ordinal_position
"""
with self.db.conn.cursor() as cur:
cur.execute(sql, (schema, name))
cols = [(r[0], (r[1] or "").lower(), (r[2] or "").lower()) for r in cur.fetchall()]
self._table_columns_cache[table] = cols
return cols
def _ingest_table(self, table: str, records: list[dict], source_file: str) -> tuple[int, int]:
"""鏋勯€?INSERT/ON CONFLICT 璇彞骞舵壒閲忔墽琛屻€?""
spec = self.TABLE_SPECS.get(table)
if not spec:
raise ValueError(f"No table spec for {table}")
pk_col = spec.get("pk")
json_cols = set(spec.get("json_cols", []))
json_cols_lower = {c.lower() for c in json_cols}
columns_info = self._get_table_columns(table)
columns = [c[0] for c in columns_info]
db_json_cols_lower = {
c[0].lower() for c in columns_info if c[1] in ("json", "jsonb") or c[2] in ("json", "jsonb")
}
pk_col_db = None
if pk_col:
pk_col_db = next((c for c in columns if c.lower() == pk_col.lower()), pk_col)
placeholders = ", ".join(["%s"] * len(columns))
col_list = ", ".join(f'"{c}"' for c in columns)
sql = f'INSERT INTO {table} ({col_list}) VALUES ({placeholders})'
if pk_col_db:
update_cols = [c for c in columns if c != pk_col_db]
set_clause = ", ".join(f'"{c}"=EXCLUDED."{c}"' for c in update_cols)
sql += f' ON CONFLICT ("{pk_col_db}") DO UPDATE SET {set_clause}'
sql += " RETURNING (xmax = 0) AS inserted"
params = []
now = datetime.now()
json_dump = lambda v: json.dumps(v, ensure_ascii=False) # noqa: E731
for rec in records:
merged_rec = rec if isinstance(rec, dict) else {}
# Unwrap nested data -> data.data layers to fill in missing fields.
data_part = merged_rec.get("data")
while isinstance(data_part, dict):
merged_rec = {**data_part, **merged_rec}
data_part = data_part.get("data")
pk_val = self._get_value_case_insensitive(merged_rec, pk_col) if pk_col else None
if pk_col and (pk_val is None or pk_val == ""):
continue
row_vals = []
for col_name, data_type, udt in columns_info:
col_lower = col_name.lower()
if col_lower == "payload":
row_vals.append(Json(rec, dumps=json_dump))
continue
if col_lower == "source_file":
row_vals.append(source_file)
continue
if col_lower == "fetched_at":
row_vals.append(merged_rec.get(col_name, now))
continue
value = self._normalize_scalar(self._get_value_case_insensitive(merged_rec, col_name))
if col_lower in json_cols_lower or col_lower in db_json_cols_lower:
row_vals.append(Json(value, dumps=json_dump) if value is not None else None)
continue
casted = self._cast_value(value, data_type)
row_vals.append(casted)
params.append(tuple(row_vals))
if not params:
return 0, 0
inserted = 0
updated = 0
with self.db.conn.cursor() as cur:
for row in params:
cur.execute(sql, row)
try:
flag = cur.fetchone()[0]
except Exception:
flag = None
if flag:
inserted += 1
else:
updated += 1
return inserted, updated
def _get_value_case_insensitive(self, record: dict, col: str):
"""蹇界暐澶у皬鍐欒幏鍙栧硷紝鍏煎 information_schema 灏忓啓鍒楀悕涓?JSON 鍘熷澶у皬鍐欍?""
if record is None:
return None
if col is None:
return None
if col in record:
return record.get(col)
col_lower = col.lower()
for k, v in record.items():
if isinstance(k, str) and k.lower() == col_lower:
return v
return None
def _normalize_scalar(self, value):
"""灏嗙┖瀛楃涓叉爣鍑嗗寲涓?None锛岄伩鍏嶆暟鍊?鏃堕棿瀛楁绫诲瀷閿欒銆?""
if value == "" or value == "{}" or value == "[]":
return None
return value
def _cast_value(self, value, data_type: str):
"""鏍规嵁鍒楃被鍨嬪仛杞婚噺杞崲锛岄伩鍏嶇被鍨嬩笉鍖归厤銆?""
if value is None:
return None
dt = (data_type or "").lower()
if dt in ("integer", "bigint", "smallint"):
if isinstance(value, bool):
return int(value)
try:
return int(value)
except Exception:
return None
if dt in ("numeric", "double precision", "real", "decimal"):
if isinstance(value, bool):
return int(value)
try:
return float(value)
except Exception:
return None
if dt.startswith("timestamp") or dt in ("date", "time", "interval"):
# Only accept strings for date/time columns; numbers and other types become NULL.
return value if isinstance(value, str) else None
return value
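# Minimal usage sketch (hypothetical wiring; the real config, connection, client
# and logger objects are provided by the surrounding ETL runner):
#   task = ManualIngestTask(config, db_connection, api_client, logger)
#   result = task.execute()
#   logger.info("manual ingest finished: %s", result["counts"])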