初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

2
tasks/dwd/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
"""DWD 层装载任务"""

View File

@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
"""DWD任务基类"""
import json
from typing import Any, Dict, Iterator, List, Optional, Tuple
from datetime import datetime
from tasks.base_task import BaseTask
from models.parsers import TypeParser
class BaseDwdTask(BaseTask):
    """
    Base class for DWD-layer tasks.

    Reads rows from ODS tables in batches so subclasses can clean the
    payloads and load them into fact/dimension tables.
    """

    def _get_ods_cursor(self, task_code: str) -> Optional[datetime]:
        """
        Return the fetched_at watermark of the last processed ODS data.

        Placeholder: a real implementation should read the etl_cursor table.
        For now subclasses rely on BaseTask's time-window logic or manage
        their own cursor.
        """
        # TODO: wire up the real CursorManager.
        return None

    def iter_ods_rows(
        self,
        table_name: str,
        columns: List[str],
        start_time: datetime,
        end_time: datetime,
        time_col: str = "fetched_at",
        batch_size: int = 1000
    ) -> Iterator[List[Dict[str, Any]]]:
        """
        Iterate over an ODS table in fixed-size batches.

        Args:
            table_name: ODS table name. Interpolated into SQL, so it must
                come from trusted task code, never from external input.
            columns: columns to select (must include ``payload``).
            start_time: window start (inclusive).
            end_time: window end (inclusive).
            time_col: timestamp column used for filtering, default fetched_at.
            batch_size: maximum rows per yielded batch.

        Yields:
            Lists of row dicts, at most ``batch_size`` rows each.

        NOTE(review): OFFSET pagination ordered by a non-unique ``time_col``
        is not a stable ordering; concurrent inserts inside the window can
        skip or duplicate rows. This is safe only if the window is closed
        while the task runs — confirm with the scheduler.
        """
        offset = 0
        cols_str = ", ".join(columns)
        while True:
            sql = f"""
                SELECT {cols_str}
                FROM {table_name}
                WHERE {time_col} >= %s AND {time_col} <= %s
                ORDER BY {time_col} ASC
                LIMIT %s OFFSET %s
            """
            rows = self.db.query(sql, (start_time, end_time, batch_size, offset))
            if not rows:
                break
            yield rows
            if len(rows) < batch_size:
                # A short batch means the window is exhausted.
                break
            offset += batch_size

    def parse_payload(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """
        Parse the ``payload`` JSON of an ODS row.

        Returns an empty dict when the payload is missing, malformed, or not
        a JSON object, so callers can skip the row instead of aborting the
        whole batch (previously a malformed payload raised and killed the
        task).
        """
        payload = row.get("payload")
        if isinstance(payload, dict):
            return payload
        if isinstance(payload, str):
            try:
                parsed = json.loads(payload)
            except (json.JSONDecodeError, ValueError):
                # FIX: malformed JSON used to propagate and abort the task.
                self.logger.warning("Unparseable payload JSON, row skipped")
                return {}
            # Guard against top-level arrays/scalars masquerading as payloads.
            return parsed if isinstance(parsed, dict) else {}
        return {}

1681
tasks/dwd/dwd_load_task.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
"""DWD 质量核对任务:按 dwd_quality_check.md 输出行数/金额对照报表。"""
from __future__ import annotations
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Sequence, Tuple
from psycopg2.extras import RealDictCursor
from tasks.base_task import BaseTask, TaskContext
from tasks.dwd.dwd_load_task import DwdLoadTask
class DwdQualityTask(BaseTask):
    """Cross-check row counts and amount sums between ODS and DWD; emit a JSON report."""

    REPORT_PATH = Path("reports/dwd_quality_report.json")
    # A numeric column is treated as an "amount" column when its lowercased
    # name contains one of these keywords.
    AMOUNT_KEYWORDS = ("amount", "money", "fee", "balance")

    def get_task_code(self) -> str:
        """Return the task code."""
        return "DWD_QUALITY_CHECK"

    def extract(self, context: TaskContext) -> dict[str, Any]:
        """Capture the report generation timestamp."""
        return {"now": datetime.now()}

    def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
        """Build the count/amount comparison report and write it to disk.

        Returns:
            Dict with the path of the generated report file.
        """
        report: Dict[str, Any] = {
            "generated_at": extracted["now"].isoformat(),
            "tables": [],
            "note": "行数/金额核对,金额字段基于列名包含 amount/money/fee/balance 的数值列自动扫描。",
        }
        with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
            for dwd_table, ods_table in DwdLoadTask.TABLE_MAP.items():
                count_info = self._compare_counts(cur, dwd_table, ods_table)
                amount_info = self._compare_amounts(cur, dwd_table, ods_table)
                report["tables"].append(
                    {
                        "dwd_table": dwd_table,
                        "ods_table": ods_table,
                        "count": count_info,
                        "amounts": amount_info,
                    }
                )
        self.REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
        self.REPORT_PATH.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
        self.logger.info("DWD 质检报表已生成:%s", self.REPORT_PATH)
        return {"report_path": str(self.REPORT_PATH)}

    # ---------------------- helpers ----------------------

    def _compare_counts(self, cur, dwd_table: str, ods_table: str) -> Dict[str, Any]:
        """Count rows on both sides and return the delta (dwd - ods)."""
        dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
        ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")
        # Identifiers come from DwdLoadTask.TABLE_MAP (trusted config), and are
        # double-quoted to preserve case.
        cur.execute(f'SELECT COUNT(1) AS cnt FROM "{dwd_schema}"."{dwd_name}"')
        dwd_cnt = cur.fetchone()["cnt"]
        cur.execute(f'SELECT COUNT(1) AS cnt FROM "{ods_schema}"."{ods_name}"')
        ods_cnt = cur.fetchone()["cnt"]
        return {"dwd": dwd_cnt, "ods": ods_cnt, "diff": dwd_cnt - ods_cnt}

    def _compare_amounts(self, cur, dwd_table: str, ods_table: str) -> List[Dict[str, Any]]:
        """Sum each amount column present on both sides and return the deltas."""
        dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
        ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")
        dwd_amount_cols = self._get_numeric_amount_columns(cur, dwd_schema, dwd_name)
        ods_amount_cols = self._get_numeric_amount_columns(cur, ods_schema, ods_name)
        common_amount_cols = sorted(set(dwd_amount_cols) & set(ods_amount_cols))
        results: List[Dict[str, Any]] = []
        for col in common_amount_cols:
            cur.execute(f'SELECT COALESCE(SUM("{col}"),0) AS val FROM "{dwd_schema}"."{dwd_name}"')
            dwd_sum = cur.fetchone()["val"]
            cur.execute(f'SELECT COALESCE(SUM("{col}"),0) AS val FROM "{ods_schema}"."{ods_name}"')
            ods_sum = cur.fetchone()["val"]
            # SUM over Decimal columns returns Decimal; normalize to float for JSON.
            dwd_val = float(dwd_sum or 0)
            ods_val = float(ods_sum or 0)
            results.append(
                {"column": col, "dwd_sum": dwd_val, "ods_sum": ods_val, "diff": dwd_val - ods_val}
            )
        return results

    def _get_numeric_amount_columns(self, cur, schema: str, table: str) -> List[str]:
        """Return numeric columns whose name contains an amount keyword.

        FIX: column names are kept in their original catalog case. They are
        later double-quoted in SQL, where a lowercased name would not resolve
        for mixed-case columns in PostgreSQL; only the keyword match is
        case-insensitive.
        """
        cur.execute(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = %s
              AND table_name = %s
              AND data_type IN ('numeric','double precision','integer','bigint','smallint','real','decimal')
            """,
            (schema, table),
        )
        cols = [r["column_name"] for r in cur.fetchall()]
        return [c for c in cols if any(key in c.lower() for key in self.AMOUNT_KEYWORDS)]

    def _split_table_name(self, name: str, default_schema: str) -> Tuple[str, str]:
        """Split "schema.table"; names without a schema get default_schema."""
        parts = name.split(".")
        if len(parts) == 2:
            return parts[0], parts[1]
        return default_schema, name

View File

@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
from .base_dwd_task import BaseDwdTask
from loaders.dimensions.member import MemberLoader
from models.parsers import TypeParser
import json
from utils.windowing import build_window_segments
class MembersDwdTask(BaseDwdTask):
    """
    DWD Task: Process Member Records from ODS to Dimension Table
    Source: billiards_ods.member_profiles
    Target: billiards.dim_member
    """

    def get_task_code(self) -> str:
        """Return the task code."""
        return "MEMBERS_DWD"

    def execute(self) -> dict:
        """Load member profiles from ODS into the member dimension.

        Returns:
            Summary dict with status, insert/update counts, and the overall
            processing window as ISO-8601 strings.
        """
        self.logger.info(f"Starting {self.get_task_code()} task")
        base_start, base_end, _ = self._get_time_window()
        segments = build_window_segments(
            self.config,
            base_start,
            base_end,
            tz=self.tz,
            override_only=True,
        )
        if not segments:
            # No override segments configured: process the base window whole.
            segments = [(base_start, base_end)]
        total_segments = len(segments)
        if total_segments > 1:
            # FIX: original message was mojibake ("????? N ?").
            self.logger.info(
                f"{self.get_task_code()}: window split into {total_segments} segments"
            )
        loader = MemberLoader(self.db)
        store_id = self.config.get("app.store_id")
        total_inserted = 0
        total_updated = 0
        for idx, (window_start, window_end) in enumerate(segments, start=1):
            self.logger.info(
                f"Processing window {idx}/{total_segments}: {window_start} to {window_end}"
            )
            batches = self.iter_ods_rows(
                table_name="billiards_ods.member_profiles",
                columns=["site_id", "member_id", "payload", "fetched_at"],
                start_time=window_start,
                end_time=window_end
            )
            for batch in batches:
                if not batch:
                    continue
                parsed_rows = []
                for row in batch:
                    payload = self.parse_payload(row)
                    if not payload:
                        continue
                    parsed = self._parse_member(payload, store_id)
                    if parsed:
                        parsed_rows.append(parsed)
                if parsed_rows:
                    # The skipped count is not surfaced in this task's summary.
                    inserted, updated, _skipped = loader.upsert_members(parsed_rows, store_id)
                    total_inserted += inserted
                    total_updated += updated
                # Commit per batch so a failure loses at most one batch.
                self.db.commit()
        overall_start = segments[0][0]
        overall_end = segments[-1][1]
        self.logger.info(
            f"Task {self.get_task_code()} completed. Inserted: {total_inserted}, Updated: {total_updated}"
        )
        return {
            "status": "success",
            "inserted": total_inserted,
            "updated": total_updated,
            "window_start": overall_start.isoformat(),
            "window_end": overall_end.isoformat()
        }

    def _parse_member(self, raw: dict, store_id: int) -> dict:
        """Parse an ODS payload into the dimension-row structure.

        Accepts both the API format (camelCase keys) and the manual-import
        format. Returns None when the record has no usable member id or
        parsing fails.
        """
        try:
            member_id = raw.get("id") or raw.get("memberId")
            if not member_id:
                return None
            return {
                "store_id": store_id,
                "member_id": member_id,
                "member_name": raw.get("name") or raw.get("memberName"),
                "phone": raw.get("phone") or raw.get("mobile"),
                "balance": raw.get("balance", 0),
                "status": str(raw.get("status", "NORMAL")),
                "register_time": raw.get("createTime") or raw.get("registerTime"),
                "raw_data": json.dumps(raw, ensure_ascii=False)
            }
        except Exception as e:
            # Best-effort parsing: log and skip the record, never abort the task.
            self.logger.warning(f"Error parsing member: {e}")
            return None

View File

@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
from .base_dwd_task import BaseDwdTask
from loaders.facts.payment import PaymentLoader
from models.parsers import TypeParser
import json
from utils.windowing import build_window_segments
class PaymentsDwdTask(BaseDwdTask):
    """
    DWD Task: Process Payment Records from ODS to Fact Table
    Source: billiards_ods.payment_transactions
    Target: billiards.fact_payment
    """
    # NOTE: docstring source table corrected — the code reads
    # billiards_ods.payment_transactions, not "ods_payment".

    def get_task_code(self) -> str:
        """Return the task code."""
        return "PAYMENTS_DWD"

    def execute(self) -> dict:
        """Load payment transactions from ODS into the payment fact table.

        Returns:
            Summary dict with status, counts, and the overall processing
            window as ISO-8601 strings (consistent with the sibling tasks).
        """
        self.logger.info(f"Starting {self.get_task_code()} task")
        base_start, base_end, _ = self._get_time_window()
        segments = build_window_segments(
            self.config,
            base_start,
            base_end,
            tz=self.tz,
            override_only=True,
        )
        if not segments:
            # No override segments configured: process the base window whole.
            segments = [(base_start, base_end)]
        total_segments = len(segments)
        if total_segments > 1:
            # FIX: original message was mojibake ("????? N ?").
            self.logger.info(
                f"{self.get_task_code()}: window split into {total_segments} segments"
            )
        loader = PaymentLoader(self.db, logger=self.logger)
        store_id = self.config.get("app.store_id")
        total_inserted = 0
        total_updated = 0
        total_skipped = 0
        for idx, (window_start, window_end) in enumerate(segments, start=1):
            self.logger.info(
                f"Processing window {idx}/{total_segments}: {window_start} to {window_end}"
            )
            batches = self.iter_ods_rows(
                table_name="billiards_ods.payment_transactions",
                columns=["site_id", "pay_id", "payload", "fetched_at"],
                start_time=window_start,
                end_time=window_end
            )
            for batch in batches:
                if not batch:
                    continue
                parsed_rows = []
                for row in batch:
                    payload = self.parse_payload(row)
                    if not payload:
                        continue
                    parsed = self._parse_payment(payload, store_id)
                    if parsed:
                        parsed_rows.append(parsed)
                if parsed_rows:
                    inserted, updated, skipped = loader.upsert_payments(parsed_rows, store_id)
                    total_inserted += inserted
                    total_updated += updated
                    total_skipped += skipped
                # Commit per batch so a failure loses at most one batch.
                self.db.commit()
        overall_start = segments[0][0]
        overall_end = segments[-1][1]
        self.logger.info(
            "Task %s completed. inserted=%s updated=%s skipped=%s",
            self.get_task_code(),
            total_inserted,
            total_updated,
            total_skipped,
        )
        return {
            "status": "SUCCESS",
            "counts": {
                "inserted": total_inserted,
                "updated": total_updated,
                "skipped": total_skipped,
            },
            # FIX: sibling DWD tasks return ISO strings here; raw datetimes
            # also break JSON serialization of the result.
            "window_start": overall_start.isoformat(),
            "window_end": overall_end.isoformat(),
        }

    def _parse_payment(self, raw: dict, store_id: int) -> dict:
        """Parse an ODS payload into the fact-row structure.

        Accepts camelCase (API) and snake_case (manual import) keys.
        Returns None when the record has no usable pay id or parsing fails.
        """
        try:
            pay_id = TypeParser.parse_int(raw.get("payId") or raw.get("id"))
            if not pay_id:
                return None
            relate_type = str(raw.get("relateType") or raw.get("relate_type") or "")
            relate_id = TypeParser.parse_int(raw.get("relateId") or raw.get("relate_id"))
            # Try to populate settlement/trade identifiers.
            order_settle_id = TypeParser.parse_int(
                raw.get("orderSettleId") or raw.get("order_settle_id")
            )
            # NOTE(review): trade "no" is parsed as an int — confirm upstream
            # never sends non-numeric trade numbers.
            order_trade_no = TypeParser.parse_int(
                raw.get("orderTradeNo") or raw.get("order_trade_no")
            )
            if relate_type in {"1", "SETTLE", "ORDER"}:
                # For settlement-related payments, relate_id doubles as the
                # settlement id when none was provided explicitly.
                order_settle_id = order_settle_id or relate_id
            return {
                "store_id": store_id,
                "pay_id": pay_id,
                "order_id": TypeParser.parse_int(raw.get("orderId") or raw.get("order_id")),
                "order_settle_id": order_settle_id,
                "order_trade_no": order_trade_no,
                "relate_type": relate_type,
                "relate_id": relate_id,
                "site_id": TypeParser.parse_int(
                    raw.get("siteId") or raw.get("site_id") or store_id
                ),
                "tenant_id": TypeParser.parse_int(raw.get("tenantId") or raw.get("tenant_id")),
                "create_time": TypeParser.parse_timestamp(
                    raw.get("createTime") or raw.get("create_time"), self.tz
                ),
                "pay_time": TypeParser.parse_timestamp(raw.get("payTime"), self.tz),
                "pay_amount": TypeParser.parse_decimal(raw.get("payAmount")),
                "fee_amount": TypeParser.parse_decimal(
                    raw.get("feeAmount")
                    or raw.get("serviceFee")
                    or raw.get("channelFee")
                    or raw.get("fee_amount")
                ),
                "discount_amount": TypeParser.parse_decimal(
                    raw.get("discountAmount")
                    or raw.get("couponAmount")
                    or raw.get("discount_amount")
                ),
                "payment_method": str(raw.get("paymentMethod") or raw.get("payment_method") or ""),
                "pay_type": raw.get("payType") or raw.get("pay_type"),
                "online_pay_channel": raw.get("onlinePayChannel") or raw.get("online_pay_channel"),
                "pay_terminal": raw.get("payTerminal") or raw.get("pay_terminal"),
                "pay_status": str(raw.get("payStatus") or raw.get("pay_status") or ""),
                "remark": raw.get("remark"),
                "raw_data": json.dumps(raw, ensure_ascii=False)
            }
        except Exception as e:
            # Best-effort parsing: log and skip the record, never abort the task.
            self.logger.warning(f"Error parsing payment: {e}")
            return None

View File

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
from .base_dwd_task import BaseDwdTask
from loaders.facts.ticket import TicketLoader
from utils.windowing import build_window_segments
class TicketDwdTask(BaseDwdTask):
    """
    DWD Task: Process Ticket Details from ODS to Fact Tables
    Source: billiards_ods.settlement_ticket_details
    Targets:
        - billiards.fact_order
        - billiards.fact_order_goods
        - billiards.fact_table_usage
        - billiards.fact_assistant_service
    """
    # NOTE: docstring source table corrected — the code reads
    # billiards_ods.settlement_ticket_details, not "ods_ticket_detail".

    def get_task_code(self) -> str:
        """Return the task code."""
        return "TICKET_DWD"

    def execute(self) -> dict:
        """Load settlement ticket details from ODS into the order fact tables.

        Returns:
            Summary dict with status, insert/error counts, and the overall
            processing window as ISO-8601 strings.
        """
        self.logger.info(f"Starting {self.get_task_code()} task")
        base_start, base_end, _ = self._get_time_window()
        segments = build_window_segments(
            self.config,
            base_start,
            base_end,
            tz=self.tz,
            override_only=True,
        )
        if not segments:
            # No override segments configured: process the base window whole.
            segments = [(base_start, base_end)]
        total_segments = len(segments)
        if total_segments > 1:
            # FIX: original message was mojibake ("????? N ?").
            self.logger.info(
                f"{self.get_task_code()}: window split into {total_segments} segments"
            )
        loader = TicketLoader(self.db, logger=self.logger)
        store_id = self.config.get("app.store_id")
        total_inserted = 0
        total_errors = 0
        for idx, (window_start, window_end) in enumerate(segments, start=1):
            self.logger.info(
                f"Processing window {idx}/{total_segments}: {window_start} to {window_end}"
            )
            batches = self.iter_ods_rows(
                table_name="billiards_ods.settlement_ticket_details",
                columns=["payload", "fetched_at", "source_file", "record_index"],
                start_time=window_start,
                end_time=window_end
            )
            for batch in batches:
                if not batch:
                    continue
                tickets = []
                for row in batch:
                    payload = self.parse_payload(row)
                    if payload:
                        tickets.append(payload)
                if tickets:
                    # Skip the loader call for all-unparseable batches,
                    # mirroring the guard used by the sibling DWD tasks.
                    inserted, errors = loader.process_tickets(tickets, store_id)
                    total_inserted += inserted
                    total_errors += errors
                # Commit per batch so a failure loses at most one batch.
                self.db.commit()
        overall_start = segments[0][0]
        overall_end = segments[-1][1]
        self.logger.info(
            f"Task {self.get_task_code()} completed. Inserted: {total_inserted}, Errors: {total_errors}"
        )
        return {
            "status": "success",
            "inserted": total_inserted,
            "errors": total_errors,
            "window_start": overall_start.isoformat(),
            "window_end": overall_end.isoformat()
        }