init: 项目初始提交 - NeoZQYY Monorepo 完整代码
This commit is contained in:
2
apps/etl/pipelines/feiqiu/tasks/dwd/__init__.py
Normal file
2
apps/etl/pipelines/feiqiu/tasks/dwd/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""DWD 层装载任务"""
|
||||
79
apps/etl/pipelines/feiqiu/tasks/dwd/base_dwd_task.py
Normal file
79
apps/etl/pipelines/feiqiu/tasks/dwd/base_dwd_task.py
Normal file
@@ -0,0 +1,79 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""DWD任务基类"""
|
||||
import json
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
|
||||
from tasks.base_task import BaseTask
|
||||
from models.parsers import TypeParser
|
||||
|
||||
class BaseDwdTask(BaseTask):
    """Base class for DWD-layer tasks.

    Reads rows from ODS tables so that subclasses can clean them and write
    them into fact/dimension tables.
    """

    def _get_ods_cursor(self, task_code: str) -> Optional[datetime]:
        """Return the ``fetched_at`` point of the last processed ODS data.

        This is a placeholder that always returns ``None``. The value should
        eventually be read from the etl_cursor table; until then callers rely
        on the time-window logic of ``BaseTask`` or manage the window
        themselves.

        Args:
            task_code: Code of the task whose cursor is requested (unused by
                this placeholder implementation).

        Returns:
            Always ``None`` for now.
        """
        # TODO: hook this up to the real CursorManager.
        # Until then no cursor is tracked here; subclasses are expected to
        # obtain their window through _get_time_window.
        return None

    def iter_ods_rows(
        self,
        table_name: str,
        columns: List[str],
        start_time: datetime,
        end_time: datetime,
        time_col: str = "fetched_at",
        batch_size: int = 1000,
    ) -> Iterator[List[Dict[str, Any]]]:
        """Iterate over an ODS table in batches.

        ``table_name``, ``columns`` and ``time_col`` are interpolated into
        the SQL text as-is, so they must come from trusted code and never
        from external input. Only the time bounds and paging values are
        passed as bound parameters.

        Because this is a generator, argument validation happens when the
        first batch is requested, not when the method is called.

        Args:
            table_name: Name of the ODS table.
            columns: Columns to select (must include ``payload``).
            start_time: Lower bound of the window (inclusive).
            end_time: Upper bound of the window (inclusive).
            time_col: Column used for time filtering and ordering; defaults
                to ``fetched_at``.
            batch_size: Maximum number of rows per yielded batch.

        Yields:
            Non-empty batches of rows as returned by ``self.db.query``.

        Raises:
            ValueError: If ``columns`` is empty or ``batch_size`` is not a
                positive integer.
        """
        if not columns:
            raise ValueError("columns must not be empty")
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size!r}"
            )

        # The statement text does not depend on the page, so build it once.
        cols_str = ", ".join(columns)
        sql = f"""
            SELECT {cols_str}
            FROM {table_name}
            WHERE {time_col} >= %s AND {time_col} <= %s
            ORDER BY {time_col} ASC
            LIMIT %s OFFSET %s
        """

        # NOTE(review): paging uses OFFSET over an ordering on time_col only.
        # If several rows share the same time_col value, or rows are inserted
        # while iterating, pages may overlap or skip rows. Confirm whether a
        # unique tie-breaker column should be added to the ORDER BY.
        offset = 0
        while True:
            rows = self.db.query(sql, (start_time, end_time, batch_size, offset))
            if not rows:
                break

            yield rows

            # A short page means the window is exhausted; skip the extra
            # round trip that would only return an empty result.
            if len(rows) < batch_size:
                break
            offset += batch_size

    def parse_payload(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Decode the ``payload`` field of an ODS row into a dict.

        Args:
            row: ODS row; its ``payload`` entry may be a JSON string or an
                already-decoded dict.

        Returns:
            The payload as a dict. An empty dict is returned when the payload
            is missing or is not (or does not decode to) a JSON object.

        Raises:
            json.JSONDecodeError: If the payload is a string that is not
                valid JSON.
        """
        payload = row.get("payload")
        if isinstance(payload, str):
            payload = json.loads(payload)
        # A JSON document may decode to a list, number, string or None; only
        # an object satisfies the declared return type.
        return payload if isinstance(payload, dict) else {}
|
||||
1698
apps/etl/pipelines/feiqiu/tasks/dwd/dwd_load_task.py
Normal file
1698
apps/etl/pipelines/feiqiu/tasks/dwd/dwd_load_task.py
Normal file
File diff suppressed because it is too large
Load Diff
105
apps/etl/pipelines/feiqiu/tasks/dwd/dwd_quality_task.py
Normal file
105
apps/etl/pipelines/feiqiu/tasks/dwd/dwd_quality_task.py
Normal file
@@ -0,0 +1,105 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""DWD 质量核对任务:按 dwd_quality_check.md 输出行数/金额对照报表。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Sequence, Tuple
|
||||
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
from tasks.base_task import BaseTask, TaskContext
|
||||
from tasks.dwd.dwd_load_task import DwdLoadTask
|
||||
|
||||
|
||||
class DwdQualityTask(BaseTask):
    """Cross-check ODS and DWD row counts and amount totals; write a JSON report."""

    # Destination of the JSON report; a relative path, so it is resolved
    # against the working directory of the running process.
    REPORT_PATH = Path("reports/dwd_quality_report.json")
    # Substrings that mark a column name as amount-related.
    AMOUNT_KEYWORDS = ("amount", "money", "fee", "balance")

    def get_task_code(self) -> str:
        """Return the task code."""
        return "DWD_QUALITY_CHECK"

    def extract(self, context: TaskContext) -> dict[str, Any]:
        """Prepare the runtime context (the current local timestamp)."""
        return {"now": datetime.now()}

    def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
        """Write the row-count / amount comparison report to a local file.

        For every DWD/ODS pair in ``DwdLoadTask.TABLE_MAP`` the row counts
        and the totals of the shared amount columns are compared, and the
        result is serialized as JSON to ``REPORT_PATH``.

        Args:
            extracted: Output of ``extract``; must contain ``"now"``.
            context: Task context (not used here).

        Returns:
            A dict with the report location under ``"report_path"``.
        """
        report: Dict[str, Any] = {
            "generated_at": extracted["now"].isoformat(),
            "tables": [],
            "note": "行数/金额核对,金额字段基于列名包含 amount/money/fee/balance 的数值列自动扫描。",
        }

        with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
            for dwd_table, ods_table in DwdLoadTask.TABLE_MAP.items():
                count_info = self._compare_counts(cur, dwd_table, ods_table)
                amount_info = self._compare_amounts(cur, dwd_table, ods_table)
                report["tables"].append(
                    {
                        "dwd_table": dwd_table,
                        "ods_table": ods_table,
                        "count": count_info,
                        "amounts": amount_info,
                    }
                )

        self.REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
        self.REPORT_PATH.write_text(
            json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        self.logger.info("DWD 质检报表已生成:%s", self.REPORT_PATH)
        return {"report_path": str(self.REPORT_PATH)}

    # ---------------------- Helper methods ----------------------
    def _compare_counts(self, cur, dwd_table: str, ods_table: str) -> Dict[str, Any]:
        """Count the rows on both sides and return the difference.

        Returns:
            ``{"dwd": <count>, "ods": <count>, "diff": dwd - ods}``.
        """
        dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
        ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")
        dwd_cnt = self._count_rows(cur, dwd_schema, dwd_name)
        ods_cnt = self._count_rows(cur, ods_schema, ods_name)
        return {"dwd": dwd_cnt, "ods": ods_cnt, "diff": dwd_cnt - ods_cnt}

    def _compare_amounts(self, cur, dwd_table: str, ods_table: str) -> List[Dict[str, Any]]:
        """Compare the totals of the amount columns shared by ODS and DWD.

        Columns are matched case-insensitively between the two tables, but
        each query uses the exact column name stored in the catalog, because
        a double-quoted identifier is case-sensitive.

        Returns:
            One entry per shared column, sorted by lower-cased column name,
            with the keys ``column``, ``dwd_sum``, ``ods_sum`` and ``diff``.
        """
        dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
        ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")

        dwd_cols = self._get_numeric_amount_column_map(cur, dwd_schema, dwd_name)
        ods_cols = self._get_numeric_amount_column_map(cur, ods_schema, ods_name)

        results: List[Dict[str, Any]] = []
        for col in sorted(dwd_cols.keys() & ods_cols.keys()):
            dwd_sum = self._sum_column(cur, dwd_schema, dwd_name, dwd_cols[col])
            ods_sum = self._sum_column(cur, ods_schema, ods_name, ods_cols[col])
            results.append(
                {
                    "column": col,
                    "dwd_sum": dwd_sum,
                    "ods_sum": ods_sum,
                    "diff": dwd_sum - ods_sum,
                }
            )
        return results

    def _get_numeric_amount_columns(self, cur, schema: str, table: str) -> List[str]:
        """Return the lower-cased names of numeric columns that look like amounts."""
        return list(self._get_numeric_amount_column_map(cur, schema, table))

    def _get_numeric_amount_column_map(self, cur, schema: str, table: str) -> Dict[str, str]:
        """Map lower-cased amount column names to their exact catalog names.

        A column qualifies when its data type is numeric and its lower-cased
        name contains one of ``AMOUNT_KEYWORDS``.
        """
        cur.execute(
            """
            SELECT column_name
            FROM information_schema.columns
            WHERE table_schema = %s
              AND table_name = %s
              AND data_type IN ('numeric','double precision','integer','bigint','smallint','real','decimal')
            """,
            (schema, table),
        )
        mapping: Dict[str, str] = {}
        for record in cur.fetchall():
            actual = record["column_name"]
            lowered = actual.lower()
            if any(key in lowered for key in self.AMOUNT_KEYWORDS):
                mapping[lowered] = actual
        return mapping

    def _count_rows(self, cur, schema: str, table: str) -> int:
        """Return the row count of ``schema.table``."""
        cur.execute(f'SELECT COUNT(1) AS cnt FROM "{schema}"."{table}"')
        return cur.fetchone()["cnt"]

    def _sum_column(self, cur, schema: str, table: str, column: str) -> float:
        """Return the total of ``column`` in ``schema.table`` as a float (0 if empty)."""
        cur.execute(f'SELECT COALESCE(SUM("{column}"),0) AS val FROM "{schema}"."{table}"')
        return float(cur.fetchone()["val"] or 0)

    def _split_table_name(self, name: str, default_schema: str) -> Tuple[str, str]:
        """Split ``schema.table`` into its parts.

        ``default_schema`` is used when ``name`` does not consist of exactly
        two dot-separated parts; in that case ``name`` is returned unchanged
        as the table name.
        """
        parts = name.split(".")
        if len(parts) == 2:
            return parts[0], parts[1]
        return default_schema, name
|
||||
Reference in New Issue
Block a user