init: 项目初始提交 - NeoZQYY Monorepo 完整代码

This commit is contained in:
Neo
2026-02-15 14:58:14 +08:00
commit ded6dfb9d8
769 changed files with 182616 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
"""DWD 层装载任务"""

View File

@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
"""DWD任务基类"""
import json
from typing import Any, Dict, Iterator, List, Optional, Tuple
from datetime import datetime
from tasks.base_task import BaseTask
from models.parsers import TypeParser
class BaseDwdTask(BaseTask):
    """Base class for DWD-layer tasks.

    Provides helpers for reading rows from ODS tables so that subclasses can
    clean them and write them into fact/dimension tables.
    """

    def _get_ods_cursor(self, task_code: str) -> Optional[datetime]:
        """Return the point in time (fetched_at) up to which ODS data was processed.

        This is a placeholder: no cursor store is wired in yet, so the method
        always returns ``None`` and ignores ``task_code``. According to the
        original author's notes the value should eventually be read from the
        etl_cursor table; until then callers are expected to rely on the
        BaseTask time-window logic (``_get_time_window``) or to track their
        own position.

        Args:
            task_code: Code of the task whose cursor is requested (unused for now).

        Returns:
            Always ``None`` in the current implementation.
        """
        # TODO: integrate with the real CursorManager.
        return None

    def iter_ods_rows(
        self,
        table_name: str,
        columns: List[str],
        start_time: datetime,
        end_time: datetime,
        time_col: str = "fetched_at",
        batch_size: int = 1000,
    ) -> Iterator[List[Dict[str, Any]]]:
        """Yield the rows of an ODS table page by page, ordered by ``time_col``.

        Args:
            table_name: Name of the ODS table to read.
            columns: Columns to select (the original notes require that
                ``payload`` is among them).
            start_time: Lower bound of the time window (inclusive).
            end_time: Upper bound of the time window (inclusive).
            time_col: Column used for filtering and ordering; defaults to
                ``fetched_at``.
            batch_size: Maximum number of rows per yielded page.

        Yields:
            The non-empty result of ``self.db.query`` for each page. Iteration
            stops after the first empty page or the first page shorter than
            ``batch_size``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. Because this is a
                generator, the error surfaces when iteration starts.
        """
        if batch_size < 1:
            # LIMIT 0 would silently produce no data and a negative LIMIT is
            # rejected by the database; fail early with a clear message.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # table_name, columns and time_col are interpolated as SQL identifiers,
        # so they must come from trusted code, never from external input.
        cols_str = ", ".join(columns)

        # The statement is identical for every page; only the bound
        # parameters change, so build it once outside the loop.
        sql = (
            "\n"
            f"SELECT {cols_str}\n"
            f"FROM {table_name}\n"
            f"WHERE {time_col} >= %s AND {time_col} <= %s\n"
            f"ORDER BY {time_col} ASC\n"
            "LIMIT %s OFFSET %s\n"
        )

        # NOTE(review): ordering uses time_col only. Rows sharing the same
        # time_col value have no defined relative order, so OFFSET paging may
        # skip or repeat such rows if the database orders them differently
        # between pages. Consider adding a unique tie-breaker column.
        offset = 0
        while True:
            rows = self.db.query(sql, (start_time, end_time, batch_size, offset))
            if not rows:
                break
            yield rows
            if len(rows) < batch_size:
                # A short page means the window is exhausted; skip the extra
                # round trip that would only return an empty result.
                break
            offset += batch_size

    def parse_payload(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """Return the ``payload`` of an ODS row as a Python object.

        Args:
            row: An ODS row; only its ``payload`` key is read.

        Returns:
            The payload itself when it already is a dict; the result of
            ``json.loads`` when it is a string (returned as decoded, so a JSON
            text that is not an object yields a non-dict value); an empty
            dict for any other payload type or when the key is missing.

        Raises:
            json.JSONDecodeError: If the payload is a string that is not
                valid JSON.
        """
        payload = row.get("payload")
        if isinstance(payload, dict):
            return payload
        if isinstance(payload, str):
            return json.loads(payload)
        return {}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
"""DWD 质量核对任务:按 dwd_quality_check.md 输出行数/金额对照报表。"""
from __future__ import annotations
import json
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Sequence, Tuple
from psycopg2.extras import RealDictCursor
from tasks.base_task import BaseTask, TaskContext
from tasks.dwd.dwd_load_task import DwdLoadTask
class DwdQualityTask(BaseTask):
    """Compare row counts and amount totals between ODS and DWD tables and
    write the result as a JSON report."""

    # Location of the generated report, relative to the working directory.
    REPORT_PATH = Path("reports/dwd_quality_report.json")
    # A numeric column counts as an "amount" column when its lower-cased name
    # contains one of these substrings.
    AMOUNT_KEYWORDS = ("amount", "money", "fee", "balance")

    def get_task_code(self) -> str:
        """Return the task code."""
        return "DWD_QUALITY_CHECK"

    def extract(self, context: TaskContext) -> dict[str, Any]:
        """Prepare the runtime context: the current local time under ``"now"``."""
        now = datetime.now()
        return {"now": now}

    def load(self, extracted: dict[str, Any], context: TaskContext) -> dict[str, Any]:
        """Build the count/amount comparison report and write it to REPORT_PATH.

        Every (DWD table, ODS table) pair in ``DwdLoadTask.TABLE_MAP`` is
        checked through one cursor opened with ``RealDictCursor``.

        Args:
            extracted: Output of :meth:`extract`; ``extracted["now"]`` supplies
                the report timestamp.
            context: Task context (not used by this method).

        Returns:
            A dict with the report location under ``"report_path"``.
        """
        table_entries: List[Dict[str, Any]] = []
        report: Dict[str, Any] = {
            "generated_at": extracted["now"].isoformat(),
            "tables": table_entries,
            "note": "行数/金额核对,金额字段基于列名包含 amount/money/fee/balance 的数值列自动扫描。",
        }

        with self.db.conn.cursor(cursor_factory=RealDictCursor) as cur:
            for dwd_table, ods_table in DwdLoadTask.TABLE_MAP.items():
                counts = self._compare_counts(cur, dwd_table, ods_table)
                amounts = self._compare_amounts(cur, dwd_table, ods_table)
                entry = {
                    "dwd_table": dwd_table,
                    "ods_table": ods_table,
                    "count": counts,
                    "amounts": amounts,
                }
                table_entries.append(entry)

        out_path = self.REPORT_PATH
        out_path.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(report, ensure_ascii=False, indent=2)
        out_path.write_text(serialized, encoding="utf-8")
        self.logger.info("DWD 质检报表已生成:%s", out_path)
        return {"report_path": str(out_path)}

    # ---------------------- helpers ----------------------
    def _compare_counts(self, cur, dwd_table: str, ods_table: str) -> Dict[str, Any]:
        """Count the rows on both sides and return the counts plus their difference.

        Returns:
            ``{"dwd": <count>, "ods": <count>, "diff": dwd - ods}``.
        """
        dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
        ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")

        # Query the DWD side first, then the ODS side.
        targets = (
            ("dwd", dwd_schema, dwd_name),
            ("ods", ods_schema, ods_name),
        )
        totals: Dict[str, Any] = {}
        for side, schema, name in targets:
            cur.execute(f'SELECT COUNT(1) AS cnt FROM "{schema}"."{name}"')
            totals[side] = cur.fetchone()["cnt"]

        return {
            "dwd": totals["dwd"],
            "ods": totals["ods"],
            "diff": totals["dwd"] - totals["ods"],
        }

    def _compare_amounts(self, cur, dwd_table: str, ods_table: str) -> List[Dict[str, Any]]:
        """Sum every amount-like column present on both sides and compare the totals.

        Returns:
            One dict per shared column, sorted by column name, holding
            ``column``, ``dwd_sum``, ``ods_sum`` and ``diff`` (all sums as floats).
        """
        dwd_schema, dwd_name = self._split_table_name(dwd_table, default_schema="billiards_dwd")
        ods_schema, ods_name = self._split_table_name(ods_table, default_schema="billiards_ods")

        dwd_cols = set(self._get_numeric_amount_columns(cur, dwd_schema, dwd_name))
        ods_cols = set(self._get_numeric_amount_columns(cur, ods_schema, ods_name))
        shared_cols = sorted(dwd_cols.intersection(ods_cols))

        def fetch_total(schema: str, table: str, column: str) -> Any:
            # Return the raw SUM of one column as reported by the database.
            cur.execute(f'SELECT COALESCE(SUM("{column}"),0) AS val FROM "{schema}"."{table}"')
            return cur.fetchone()["val"]

        results: List[Dict[str, Any]] = []
        for col in shared_cols:
            dwd_raw = fetch_total(dwd_schema, dwd_name, col)
            ods_raw = fetch_total(ods_schema, ods_name, col)
            dwd_total = float(dwd_raw or 0)
            ods_total = float(ods_raw or 0)
            results.append(
                {
                    "column": col,
                    "dwd_sum": dwd_total,
                    "ods_sum": ods_total,
                    "diff": dwd_total - ods_total,
                }
            )
        return results

    def _get_numeric_amount_columns(self, cur, schema: str, table: str) -> List[str]:
        """Return the lower-cased names of numeric columns whose name contains
        one of ``AMOUNT_KEYWORDS``, looked up in ``information_schema.columns``."""
        query = (
            "\n"
            "SELECT column_name\n"
            "FROM information_schema.columns\n"
            "WHERE table_schema = %s\n"
            "AND table_name = %s\n"
            "AND data_type IN ('numeric','double precision','integer','bigint','smallint','real','decimal')\n"
        )
        cur.execute(query, (schema, table))

        matches: List[str] = []
        for record in cur.fetchall():
            lowered = record["column_name"].lower()
            if any(keyword in lowered for keyword in self.AMOUNT_KEYWORDS):
                matches.append(lowered)
        return matches

    def _split_table_name(self, name: str, default_schema: str) -> Tuple[str, str]:
        """Split ``schema.table`` into its two parts.

        A name that does not consist of exactly two dot-separated parts is
        returned unchanged together with ``default_schema``.
        """
        pieces = name.split(".")
        if len(pieces) != 2:
            return default_schema, name
        schema, table = pieces
        return schema, table