初始提交:飞球 ETL 系统全量代码
This commit is contained in:
2
tasks/dwd/__init__.py
Normal file
2
tasks/dwd/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""DWD 层装载任务"""
|
||||
79
tasks/dwd/base_dwd_task.py
Normal file
79
tasks/dwd/base_dwd_task.py
Normal file
@@ -0,0 +1,79 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""DWD任务基类"""
|
||||
import json
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
|
||||
from tasks.base_task import BaseTask
|
||||
from models.parsers import TypeParser
|
||||
|
||||
class BaseDwdTask(BaseTask):
|
||||
"""
|
||||
DWD 层任务基类
|
||||
负责从 ODS 表读取数据,供子类清洗和写入事实/维度表
|
||||
"""
|
||||
|
||||
def _get_ods_cursor(self, task_code: str) -> datetime:
|
||||
"""
|
||||
获取上次处理的 ODS 数据的时间点 (fetched_at)
|
||||
这里简化处理,实际应该从 etl_cursor 表读取
|
||||
目前先依赖 BaseTask 的时间窗口逻辑,或者子类自己管理
|
||||
"""
|
||||
# TODO: 对接真正的 CursorManager
|
||||
# 暂时返回一个较早的时间,或者由子类通过 _get_time_window 获取
|
||||
return None
|
||||
|
||||
def iter_ods_rows(
|
||||
self,
|
||||
table_name: str,
|
||||
columns: List[str],
|
||||
start_time: datetime,
|
||||
end_time: datetime,
|
||||
time_col: str = "fetched_at",
|
||||
batch_size: int = 1000
|
||||
) -> Iterator[List[Dict[str, Any]]]:
|
||||
"""
|
||||
分批迭代读取 ODS 表数据
|
||||
|
||||
Args:
|
||||
table_name: ODS 表名
|
||||
columns: 需要查询的字段列表 (必须包含 payload)
|
||||
start_time: 开始时间 (包含)
|
||||
end_time: 结束时间 (包含)
|
||||
time_col: 时间过滤字段,默认 fetched_at
|
||||
batch_size: 批次大小
|
||||
"""
|
||||
offset = 0
|
||||
cols_str = ", ".join(columns)
|
||||
|
||||
while True:
|
||||
sql = f"""
|
||||
SELECT {cols_str}
|
||||
FROM {table_name}
|
||||
WHERE {time_col} >= %s AND {time_col} <= %s
|
||||
ORDER BY {time_col} ASC
|
||||
LIMIT %s OFFSET %s
|
||||
"""
|
||||
|
||||
rows = self.db.query(sql, (start_time, end_time, batch_size, offset))
|
||||
|
||||
if not rows:
|
||||
break
|
||||
|
||||
yield rows
|
||||
|
||||
if len(rows) < batch_size:
|
||||
break
|
||||
|
||||
offset += batch_size
|
||||
|
||||
def parse_payload(self, row: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
解析 ODS 行中的 payload JSON
|
||||
"""
|
||||
payload = row.get("payload")
|
||||
if isinstance(payload, str):
|
||||
return json.loads(payload)
|
||||
elif isinstance(payload, dict):
|
||||
return payload
|
||||
return {}
|
||||
1681
tasks/dwd/dwd_load_task.py
Normal file
1681
tasks/dwd/dwd_load_task.py
Normal file
File diff suppressed because it is too large
Load Diff
105
tasks/dwd/dwd_quality_task.py
Normal file
105
tasks/dwd/dwd_quality_task.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""DWD 质量核对任务:按 dwd_quality_check.md 输出行数/金额对照报表。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Sequence, Tuple
|
||||
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
from tasks.base_task import BaseTask, TaskContext
|
||||
from tasks.dwd.dwd_load_task import DwdLoadTask
|
||||
|
||||
|
||||
class DwdQualityTask(BaseTask):
|
||||
"""对 ODS 与 DWD 进行行数、金额对照核查,生成 JSON 报表。"""
|
||||
|
||||
REPORT_PATH = Path("reports/dwd_quality_report.json")
|
||||
AMOUNT_KEYWORDS = ("amount", "money", "fee", "balance")
|
||||
|
||||
def get_task_code(self) -> str:
|
||||
"""返回任务编码。"""
|
||||
return "DWD_QUALITY_CHECK"
|
||||
|
||||
def extract(self, context: TaskContext) -> dict[str, Any]:
|
||||
"""准备运行时上下文。"""
|
||||
return {"now": datetime.now()}
|
||||
|
||||
def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
|
||||
"""输出行数/金额差异报表到本地文件。"""
|
||||
report: Dict[str, Any] = {
|
||||
"generated_at": extracted["now"].isoformat(),
|
||||
"tables": [],
|
||||
"note": "行数/金额核对,金额字段基于列名包含 amount/money/fee/balance 的数值列自动扫描。",
|
||||
}
|
||||
|
||||
with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
|
||||
for dwd_table, ods_table in DwdLoadTask.TABLE_MAP.items():
|
||||
count_info = self._compare_counts(cur, dwd_table, ods_table)
|
||||
amount_info = self._compare_amounts(cur, dwd_table, ods_table)
|
||||
report["tables"].append(
|
||||
{
|
||||
"dwd_table": dwd_table,
|
||||
"ods_table": ods_table,
|
||||
"count": count_info,
|
||||
"amounts": amount_info,
|
||||
}
|
||||
)
|
||||
|
||||
self.REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
self.REPORT_PATH.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
self.logger.info("DWD 质检报表已生成:%s", self.REPORT_PATH)
|
||||
return {"report_path": str(self.REPORT_PATH)}
|
||||
|
||||
# ---------------------- 辅助方法 ----------------------
|
||||
def _compare_counts(self, cur, dwd_table: str, ods_table: str) -> Dict[str, Any]:
|
||||
"""统计两端行数并返回差异。"""
|
||||
dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
|
||||
ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")
|
||||
cur.execute(f'SELECT COUNT(1) AS cnt FROM "{dwd_schema}"."{dwd_name}"')
|
||||
dwd_cnt = cur.fetchone()["cnt"]
|
||||
cur.execute(f'SELECT COUNT(1) AS cnt FROM "{ods_schema}"."{ods_name}"')
|
||||
ods_cnt = cur.fetchone()["cnt"]
|
||||
return {"dwd": dwd_cnt, "ods": ods_cnt, "diff": dwd_cnt - ods_cnt}
|
||||
|
||||
def _compare_amounts(self, cur, dwd_table: str, ods_table: str) -> List[Dict[str, Any]]:
|
||||
"""扫描金额相关列,生成 ODS 与 DWD 的汇总对照。"""
|
||||
dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
|
||||
ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")
|
||||
|
||||
dwd_amount_cols = self._get_numeric_amount_columns(cur, dwd_schema, dwd_name)
|
||||
ods_amount_cols = self._get_numeric_amount_columns(cur, ods_schema, ods_name)
|
||||
common_amount_cols = sorted(set(dwd_amount_cols) & set(ods_amount_cols))
|
||||
|
||||
results: List[Dict[str, Any]] = []
|
||||
for col in common_amount_cols:
|
||||
cur.execute(f'SELECT COALESCE(SUM("{col}"),0) AS val FROM "{dwd_schema}"."{dwd_name}"')
|
||||
dwd_sum = cur.fetchone()["val"]
|
||||
cur.execute(f'SELECT COALESCE(SUM("{col}"),0) AS val FROM "{ods_schema}"."{ods_name}"')
|
||||
ods_sum = cur.fetchone()["val"]
|
||||
results.append({"column": col, "dwd_sum": float(dwd_sum or 0), "ods_sum": float(ods_sum or 0), "diff": float(dwd_sum or 0) - float(ods_sum or 0)})
|
||||
return results
|
||||
|
||||
def _get_numeric_amount_columns(self, cur, schema: str, table: str) -> List[str]:
|
||||
"""获取列名包含金额关键词的数值型字段。"""
|
||||
cur.execute(
|
||||
"""
|
||||
SELECT column_name
|
||||
FROM information_schema.columns
|
||||
WHERE table_schema = %s
|
||||
AND table_name = %s
|
||||
AND data_type IN ('numeric','double precision','integer','bigint','smallint','real','decimal')
|
||||
""",
|
||||
(schema, table),
|
||||
)
|
||||
cols = [r["column_name"].lower() for r in cur.fetchall()]
|
||||
return [c for c in cols if any(key in c for key in self.AMOUNT_KEYWORDS)]
|
||||
|
||||
def _split_table_name(self, name: str, default_schema: str) -> Tuple[str, str]:
|
||||
"""拆分 schema 与表名,缺省使用 default_schema。"""
|
||||
parts = name.split(".")
|
||||
if len(parts) == 2:
|
||||
return parts[0], parts[1]
|
||||
return default_schema, name
|
||||
110
tasks/dwd/members_dwd_task.py
Normal file
110
tasks/dwd/members_dwd_task.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from .base_dwd_task import BaseDwdTask
|
||||
from loaders.dimensions.member import MemberLoader
|
||||
from models.parsers import TypeParser
|
||||
import json
|
||||
from utils.windowing import build_window_segments
|
||||
|
||||
class MembersDwdTask(BaseDwdTask):
|
||||
"""
|
||||
DWD Task: Process Member Records from ODS to Dimension Table
|
||||
Source: billiards_ods.member_profiles
|
||||
Target: billiards.dim_member
|
||||
"""
|
||||
|
||||
def get_task_code(self) -> str:
|
||||
return "MEMBERS_DWD"
|
||||
|
||||
def execute(self) -> dict:
|
||||
self.logger.info(f"Starting {self.get_task_code()} task")
|
||||
|
||||
base_start, base_end, _ = self._get_time_window()
|
||||
segments = build_window_segments(
|
||||
self.config,
|
||||
base_start,
|
||||
base_end,
|
||||
tz=self.tz,
|
||||
override_only=True,
|
||||
)
|
||||
if not segments:
|
||||
segments = [(base_start, base_end)]
|
||||
|
||||
total_segments = len(segments)
|
||||
if total_segments > 1:
|
||||
self.logger.info(f"{self.get_task_code()}: ????? {total_segments} ?")
|
||||
|
||||
loader = MemberLoader(self.db)
|
||||
store_id = self.config.get("app.store_id")
|
||||
|
||||
total_inserted = 0
|
||||
total_updated = 0
|
||||
|
||||
for idx, (window_start, window_end) in enumerate(segments, start=1):
|
||||
self.logger.info(
|
||||
f"Processing window {idx}/{total_segments}: {window_start} to {window_end}"
|
||||
)
|
||||
batches = self.iter_ods_rows(
|
||||
table_name="billiards_ods.member_profiles",
|
||||
columns=["site_id", "member_id", "payload", "fetched_at"],
|
||||
start_time=window_start,
|
||||
end_time=window_end
|
||||
)
|
||||
|
||||
for batch in batches:
|
||||
if not batch:
|
||||
continue
|
||||
|
||||
parsed_rows = []
|
||||
for row in batch:
|
||||
payload = self.parse_payload(row)
|
||||
if not payload:
|
||||
continue
|
||||
|
||||
parsed = self._parse_member(payload, store_id)
|
||||
if parsed:
|
||||
parsed_rows.append(parsed)
|
||||
|
||||
if parsed_rows:
|
||||
inserted, updated, skipped = loader.upsert_members(parsed_rows, store_id)
|
||||
total_inserted += inserted
|
||||
total_updated += updated
|
||||
|
||||
self.db.commit()
|
||||
|
||||
overall_start = segments[0][0]
|
||||
overall_end = segments[-1][1]
|
||||
|
||||
self.logger.info(
|
||||
f"Task {self.get_task_code()} completed. Inserted: {total_inserted}, Updated: {total_updated}"
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"inserted": total_inserted,
|
||||
"updated": total_updated,
|
||||
"window_start": overall_start.isoformat(),
|
||||
"window_end": overall_end.isoformat()
|
||||
}
|
||||
|
||||
def _parse_member(self, raw: dict, store_id: int) -> dict:
|
||||
"""Parse ODS payload into Dim structure"""
|
||||
try:
|
||||
# 兼容 API 格式(驼峰命名)和手动导入格式
|
||||
member_id = raw.get("id") or raw.get("memberId")
|
||||
if not member_id:
|
||||
return None
|
||||
|
||||
return {
|
||||
"store_id": store_id,
|
||||
"member_id": member_id,
|
||||
"member_name": raw.get("name") or raw.get("memberName"),
|
||||
"phone": raw.get("phone") or raw.get("mobile"),
|
||||
"balance": raw.get("balance", 0),
|
||||
"status": str(raw.get("status", "NORMAL")),
|
||||
"register_time": raw.get("createTime") or raw.get("registerTime"),
|
||||
"raw_data": json.dumps(raw, ensure_ascii=False)
|
||||
}
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error parsing member: {e}")
|
||||
return None
|
||||
|
||||
158
tasks/dwd/payments_dwd_task.py
Normal file
158
tasks/dwd/payments_dwd_task.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from .base_dwd_task import BaseDwdTask
|
||||
from loaders.facts.payment import PaymentLoader
|
||||
from models.parsers import TypeParser
|
||||
import json
|
||||
from utils.windowing import build_window_segments
|
||||
|
||||
class PaymentsDwdTask(BaseDwdTask):
|
||||
"""
|
||||
DWD Task: Process Payment Records from ODS to Fact Table
|
||||
Source: billiards_ods.ods_payment
|
||||
Target: billiards.fact_payment
|
||||
"""
|
||||
|
||||
def get_task_code(self) -> str:
|
||||
return "PAYMENTS_DWD"
|
||||
|
||||
def execute(self) -> dict:
|
||||
self.logger.info(f"Starting {self.get_task_code()} task")
|
||||
|
||||
base_start, base_end, _ = self._get_time_window()
|
||||
segments = build_window_segments(
|
||||
self.config,
|
||||
base_start,
|
||||
base_end,
|
||||
tz=self.tz,
|
||||
override_only=True,
|
||||
)
|
||||
if not segments:
|
||||
segments = [(base_start, base_end)]
|
||||
|
||||
total_segments = len(segments)
|
||||
if total_segments > 1:
|
||||
self.logger.info(f"{self.get_task_code()}: ????? {total_segments} ?")
|
||||
|
||||
loader = PaymentLoader(self.db, logger=self.logger)
|
||||
store_id = self.config.get("app.store_id")
|
||||
|
||||
total_inserted = 0
|
||||
total_updated = 0
|
||||
total_skipped = 0
|
||||
|
||||
for idx, (window_start, window_end) in enumerate(segments, start=1):
|
||||
self.logger.info(
|
||||
f"Processing window {idx}/{total_segments}: {window_start} to {window_end}"
|
||||
)
|
||||
batches = self.iter_ods_rows(
|
||||
table_name="billiards_ods.payment_transactions",
|
||||
columns=["site_id", "pay_id", "payload", "fetched_at"],
|
||||
start_time=window_start,
|
||||
end_time=window_end
|
||||
)
|
||||
|
||||
for batch in batches:
|
||||
if not batch:
|
||||
continue
|
||||
|
||||
parsed_rows = []
|
||||
for row in batch:
|
||||
payload = self.parse_payload(row)
|
||||
if not payload:
|
||||
continue
|
||||
|
||||
parsed = self._parse_payment(payload, store_id)
|
||||
if parsed:
|
||||
parsed_rows.append(parsed)
|
||||
|
||||
if parsed_rows:
|
||||
inserted, updated, skipped = loader.upsert_payments(parsed_rows, store_id)
|
||||
total_inserted += inserted
|
||||
total_updated += updated
|
||||
total_skipped += skipped
|
||||
|
||||
self.db.commit()
|
||||
|
||||
overall_start = segments[0][0]
|
||||
overall_end = segments[-1][1]
|
||||
|
||||
self.logger.info(
|
||||
"Task %s completed. inserted=%s updated=%s skipped=%s",
|
||||
self.get_task_code(),
|
||||
total_inserted,
|
||||
total_updated,
|
||||
total_skipped,
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "SUCCESS",
|
||||
"counts": {
|
||||
"inserted": total_inserted,
|
||||
"updated": total_updated,
|
||||
"skipped": total_skipped,
|
||||
},
|
||||
"window_start": overall_start,
|
||||
"window_end": overall_end,
|
||||
}
|
||||
|
||||
def _parse_payment(self, raw: dict, store_id: int) -> dict:
|
||||
"""Parse ODS payload into Fact structure"""
|
||||
try:
|
||||
pay_id = TypeParser.parse_int(raw.get("payId") or raw.get("id"))
|
||||
if not pay_id:
|
||||
return None
|
||||
|
||||
relate_type = str(raw.get("relateType") or raw.get("relate_type") or "")
|
||||
relate_id = TypeParser.parse_int(raw.get("relateId") or raw.get("relate_id"))
|
||||
|
||||
# 尝试填充结账/交易标识符
|
||||
order_settle_id = TypeParser.parse_int(
|
||||
raw.get("orderSettleId") or raw.get("order_settle_id")
|
||||
)
|
||||
order_trade_no = TypeParser.parse_int(
|
||||
raw.get("orderTradeNo") or raw.get("order_trade_no")
|
||||
)
|
||||
|
||||
if relate_type in {"1", "SETTLE", "ORDER"}:
|
||||
order_settle_id = order_settle_id or relate_id
|
||||
|
||||
return {
|
||||
"store_id": store_id,
|
||||
"pay_id": pay_id,
|
||||
"order_id": TypeParser.parse_int(raw.get("orderId") or raw.get("order_id")),
|
||||
"order_settle_id": order_settle_id,
|
||||
"order_trade_no": order_trade_no,
|
||||
"relate_type": relate_type,
|
||||
"relate_id": relate_id,
|
||||
"site_id": TypeParser.parse_int(
|
||||
raw.get("siteId") or raw.get("site_id") or store_id
|
||||
),
|
||||
"tenant_id": TypeParser.parse_int(raw.get("tenantId") or raw.get("tenant_id")),
|
||||
"create_time": TypeParser.parse_timestamp(
|
||||
raw.get("createTime") or raw.get("create_time"), self.tz
|
||||
),
|
||||
"pay_time": TypeParser.parse_timestamp(raw.get("payTime"), self.tz),
|
||||
"pay_amount": TypeParser.parse_decimal(raw.get("payAmount")),
|
||||
"fee_amount": TypeParser.parse_decimal(
|
||||
raw.get("feeAmount")
|
||||
or raw.get("serviceFee")
|
||||
or raw.get("channelFee")
|
||||
or raw.get("fee_amount")
|
||||
),
|
||||
"discount_amount": TypeParser.parse_decimal(
|
||||
raw.get("discountAmount")
|
||||
or raw.get("couponAmount")
|
||||
or raw.get("discount_amount")
|
||||
),
|
||||
"payment_method": str(raw.get("paymentMethod") or raw.get("payment_method") or ""),
|
||||
"pay_type": raw.get("payType") or raw.get("pay_type"),
|
||||
"online_pay_channel": raw.get("onlinePayChannel") or raw.get("online_pay_channel"),
|
||||
"pay_terminal": raw.get("payTerminal") or raw.get("pay_terminal"),
|
||||
"pay_status": str(raw.get("payStatus") or raw.get("pay_status") or ""),
|
||||
"remark": raw.get("remark"),
|
||||
"raw_data": json.dumps(raw, ensure_ascii=False)
|
||||
}
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error parsing payment: {e}")
|
||||
return None
|
||||
|
||||
85
tasks/dwd/ticket_dwd_task.py
Normal file
85
tasks/dwd/ticket_dwd_task.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from .base_dwd_task import BaseDwdTask
|
||||
from loaders.facts.ticket import TicketLoader
|
||||
from utils.windowing import build_window_segments
|
||||
|
||||
class TicketDwdTask(BaseDwdTask):
|
||||
"""
|
||||
DWD Task: Process Ticket Details from ODS to Fact Tables
|
||||
Source: billiards_ods.ods_ticket_detail
|
||||
Targets:
|
||||
- billiards.fact_order
|
||||
- billiards.fact_order_goods
|
||||
- billiards.fact_table_usage
|
||||
- billiards.fact_assistant_service
|
||||
"""
|
||||
|
||||
def get_task_code(self) -> str:
|
||||
return "TICKET_DWD"
|
||||
|
||||
def execute(self) -> dict:
|
||||
self.logger.info(f"Starting {self.get_task_code()} task")
|
||||
|
||||
base_start, base_end, _ = self._get_time_window()
|
||||
segments = build_window_segments(
|
||||
self.config,
|
||||
base_start,
|
||||
base_end,
|
||||
tz=self.tz,
|
||||
override_only=True,
|
||||
)
|
||||
if not segments:
|
||||
segments = [(base_start, base_end)]
|
||||
|
||||
total_segments = len(segments)
|
||||
if total_segments > 1:
|
||||
self.logger.info(f"{self.get_task_code()}: ????? {total_segments} ?")
|
||||
|
||||
loader = TicketLoader(self.db, logger=self.logger)
|
||||
store_id = self.config.get("app.store_id")
|
||||
|
||||
total_inserted = 0
|
||||
total_errors = 0
|
||||
|
||||
for idx, (window_start, window_end) in enumerate(segments, start=1):
|
||||
self.logger.info(
|
||||
f"Processing window {idx}/{total_segments}: {window_start} to {window_end}"
|
||||
)
|
||||
batches = self.iter_ods_rows(
|
||||
table_name="billiards_ods.settlement_ticket_details",
|
||||
columns=["payload", "fetched_at", "source_file", "record_index"],
|
||||
start_time=window_start,
|
||||
end_time=window_end
|
||||
)
|
||||
|
||||
for batch in batches:
|
||||
if not batch:
|
||||
continue
|
||||
|
||||
tickets = []
|
||||
for row in batch:
|
||||
payload = self.parse_payload(row)
|
||||
if payload:
|
||||
tickets.append(payload)
|
||||
|
||||
inserted, errors = loader.process_tickets(tickets, store_id)
|
||||
total_inserted += inserted
|
||||
total_errors += errors
|
||||
|
||||
self.db.commit()
|
||||
|
||||
overall_start = segments[0][0]
|
||||
overall_end = segments[-1][1]
|
||||
|
||||
self.logger.info(
|
||||
f"Task {self.get_task_code()} completed. Inserted: {total_inserted}, Errors: {total_errors}"
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
"inserted": total_inserted,
|
||||
"errors": total_errors,
|
||||
"window_start": overall_start.isoformat(),
|
||||
"window_end": overall_end.isoformat()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user