init: 项目初始提交 - NeoZQYY Monorepo 完整代码
This commit is contained in:
0
apps/etl/pipelines/feiqiu/utils/__init__.py
Normal file
0
apps/etl/pipelines/feiqiu/utils/__init__.py
Normal file
22
apps/etl/pipelines/feiqiu/utils/helpers.py
Normal file
22
apps/etl/pipelines/feiqiu/utils/helpers.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""通用工具函数"""
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
def ensure_dir(path: Path) -> None:
    """Ensure the directory exists, creating any missing parents."""
    path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def make_surrogate_key(*parts) -> int:
    """Build a surrogate key from arbitrary field values.

    The values are joined with ``|`` (``None`` becomes an empty string),
    hashed with SHA1, and the first 8 bytes of the digest are read as an
    unsigned 64-bit big-endian integer.
    """
    joined = "|".join(str(part) if part is not None else "" for part in parts)
    digest_head = hashlib.sha1(joined.encode("utf-8")).digest()[:8]
    return int.from_bytes(digest_head, "big")
|
||||
|
||||
def now_local(tz) -> datetime:
    """Return the current time in the given timezone (tz may be a tzinfo or None)."""
    return datetime.now(tz)
|
||||
78
apps/etl/pipelines/feiqiu/utils/json_store.py
Normal file
78
apps/etl/pipelines/feiqiu/utils/json_store.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""JSON 归档/读取的通用工具。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Maps a normalized (lowercased, base-prefix-stripped) API endpoint path to
# the canonical archive filename agreed in the API spec document.
ENDPOINT_FILENAME_MAP: dict[str, str] = {
    "/memberprofile/gettenantmemberlist": "member_profiles.json",
    "/memberprofile/getmembercardbalancechange": "member_balance_changes.json",
    "/memberprofile/gettenantmembercardlist": "member_stored_value_cards.json",
    "/site/getrechargesettlelist": "recharge_settlements.json",
    "/assistantperformance/getabolitionassistant": "assistant_cancellation_records.json",
    "/assistantperformance/getorderassistantdetails": "assistant_service_records.json",
    "/personnelmanagement/searchassistantinfo": "assistant_accounts_master.json",
    "/table/getsitetables": "site_tables_master.json",
    "/site/gettaifeeadjustlist": "table_fee_discount_records.json",
    "/site/getsitetableorderdetails": "table_fee_transactions.json",
    "/tenantgoods/querytenantgoods": "tenant_goods_master.json",
    "/packagecoupon/querypackagecouponlist": "group_buy_packages.json",
    "/site/getsitetableusedetails": "group_buy_redemption_records.json",
    "/order/getordersettleticketnew": "settlement_ticket_details.json",
    "/promotion/getofflinecouponconsumepagelist": "platform_coupon_redemption_records.json",
    "/goodsstockmanage/querygoodsoutboundreceipt": "goods_stock_movements.json",
    "/tenantgoodscategory/queryprimarysecondarycategory": "stock_goods_category_tree.json",
    "/tenantgoods/getgoodsstockreport": "goods_stock_summary.json",
    "/paylog/getpayloglistpage": "payment_transactions.json",
    "/site/getallordersettlelist": "settlement_records.json",
    "/order/getrefundpayloglist": "refund_transactions.json",
    "/tenantgoods/getgoodsinventorylist": "store_goods_master.json",
    "/tenantgoods/getgoodssaleslist": "store_goods_sales_records.json",
}
|
||||
|
||||
def endpoint_to_filename(endpoint: str) -> str:
    """Map an API endpoint to its canonical archive filename.

    Endpoints known to ENDPOINT_FILENAME_MAP get the name agreed in the
    API spec document; any other path falls back to "strip leading slash
    -> slashes to double underscores" (normalization already lowercases).
    """
    key = _normalize_endpoint(endpoint)
    try:
        return ENDPOINT_FILENAME_MAP[key]
    except KeyError:
        pass

    fallback = key.strip("/").replace("/", "__").replace(" ", "_")
    return f"{fallback or 'root'}.json"
|
||||
|
||||
|
||||
def dump_json(path: Path, payload: Any, pretty: bool = False):
    """Write *payload* to *path* as UTF-8 JSON.

    Output is compact by default; set ``pretty=True`` for a 2-space
    indented layout. Parent directories are created on demand.
    """
    indent = 2 if pretty else None
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=indent)
|
||||
|
||||
|
||||
def _normalize_endpoint(endpoint: str) -> str:
|
||||
"""标准化 endpoint,提取路径部分并统一小写、去除 base 前缀。"""
|
||||
raw = str(endpoint or "").strip()
|
||||
if not raw:
|
||||
return ""
|
||||
|
||||
parsed = urlparse(raw)
|
||||
path = parsed.path or raw
|
||||
if not path.startswith("/"):
|
||||
path = f"/{path}"
|
||||
|
||||
path = path.rstrip("/") or "/"
|
||||
lowered = path.lower()
|
||||
for prefix in ("/apiprod/admin/v1", "apiprod/admin/v1"):
|
||||
if lowered.startswith(prefix):
|
||||
path = path[len(prefix) :]
|
||||
if not path.startswith("/"):
|
||||
path = f"/{path}"
|
||||
path = path.rstrip("/") or "/"
|
||||
lowered = path.lower()
|
||||
break
|
||||
|
||||
return lowered
|
||||
142
apps/etl/pipelines/feiqiu/utils/logging_utils.py
Normal file
142
apps/etl/pipelines/feiqiu/utils/logging_utils.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""日志配置工具
|
||||
|
||||
提供统一的日志配置和格式化。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Iterator, TextIO
|
||||
|
||||
|
||||
# Unified log line format (CJK-friendly): timestamp, padded level, logger name, message.
UNIFIED_FORMAT = "[%(asctime)s] %(levelname)-5s | %(name)s | %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
|
||||
|
||||
|
||||
class TeeStream:
    """Write-only text stream that fans every write out to several
    underlying streams (e.g. console plus a log file)."""

    def __init__(self, *streams: TextIO) -> None:
        self._streams = streams

    def write(self, data: str) -> int:
        # Mirror the payload to every sink; report the logical write size.
        for sink in self._streams:
            sink.write(data)
        return len(data)

    def flush(self) -> None:
        for sink in self._streams:
            sink.flush()

    def isatty(self) -> bool:
        # Never a terminal, even if one of the sinks is.
        return False

    def fileno(self) -> int:
        # Delegate to the first sink for code that needs a real OS fd.
        return self._streams[0].fileno()
|
||||
|
||||
|
||||
def build_log_path(log_dir: Path, prefix: str, tag: str = "") -> Path:
    """Compose a timestamped log path: <dir>/<prefix>[_<tag>]_<YYYYmmdd_HHMMSS>.log."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    pieces = [prefix]
    if tag:
        pieces.append(tag)
    pieces.append(timestamp)
    return log_dir / ("_".join(pieces) + ".log")
|
||||
|
||||
|
||||
def get_unified_formatter() -> logging.Formatter:
    """Return a Formatter built from the module-wide UNIFIED_FORMAT / DATE_FORMAT."""
    return logging.Formatter(fmt=UNIFIED_FORMAT, datefmt=DATE_FORMAT)
|
||||
|
||||
|
||||
@contextmanager
def configure_logging(
    name: str,
    log_file: Path | None,
    *,
    level: str = "INFO",
    console: bool = True,
    tee_std: bool = True,
) -> Iterator[logging.Logger]:
    """
    Configure logging for a task run.

    Args:
        name: logger name
        log_file: log file path; None means no file output
        level: log level name (unknown names fall back to INFO)
        console: whether to also emit to the console
        tee_std: whether stdout/stderr are mirrored into the log file

    Yields:
        The configured logger. On exit, handlers are flushed, closed and
        removed, the log file is closed, and sys.stdout/sys.stderr are
        restored.
    """
    logger = logging.getLogger(name)
    logger.handlers.clear()
    logger.setLevel(getattr(logging, level.upper(), logging.INFO))
    logger.propagate = False

    formatter = get_unified_formatter()

    # Remember the real std streams so they can be restored in finally.
    original_stdout = sys.stdout
    original_stderr = sys.stderr
    log_fp: TextIO | None = None

    try:
        if log_file:
            log_file.parent.mkdir(parents=True, exist_ok=True)
            # Line-buffered append so external tailing sees output promptly.
            log_fp = open(log_file, "a", encoding="utf-8", buffering=1)
            if tee_std:
                if console:
                    # Mirror std streams to both console and file.
                    sys.stdout = TeeStream(original_stdout, log_fp)
                    sys.stderr = TeeStream(original_stderr, log_fp)
                else:
                    # Redirect std streams into the file only.
                    sys.stdout = log_fp
                    sys.stderr = log_fp
            file_handler = logging.StreamHandler(log_fp)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

        if console:
            # Console handler writes to the *original* stdout so tee'd
            # writes are not duplicated through the replaced sys.stdout.
            console_handler = logging.StreamHandler(original_stdout)
            console_handler.setFormatter(formatter)
            logger.addHandler(console_handler)

        yield logger
    finally:
        for handler in list(logger.handlers):
            handler.flush()
            handler.close()
            logger.removeHandler(handler)
        if log_fp:
            log_fp.flush()
            log_fp.close()
        sys.stdout = original_stdout
        sys.stderr = original_stderr
|
||||
|
||||
|
||||
def setup_root_logger(level: str = "INFO") -> logging.Logger:
    """Reset the root logger: clear existing handlers, apply *level*
    (unknown names fall back to INFO), attach one console handler with
    the unified format, and return the root logger."""
    root_logger = logging.getLogger()
    root_logger.setLevel(getattr(logging, level.upper(), logging.INFO))

    # Drop any previously attached handlers before adding ours.
    root_logger.handlers.clear()

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(get_unified_formatter())
    root_logger.addHandler(stream_handler)

    return root_logger
|
||||
55
apps/etl/pipelines/feiqiu/utils/ods_record_utils.py
Normal file
55
apps/etl/pipelines/feiqiu/utils/ods_record_utils.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Shared helpers for ODS/API record normalization."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
def merge_record_layers(record: dict) -> dict:
    """Flatten nested ``data``/``settleList`` layers into one dict.

    Outer keys win on conflicts. If there is nothing to flatten, the
    original dict object is returned unchanged (not copied).
    """
    flattened = record
    inner = flattened.get("data")
    while isinstance(inner, dict):
        # Spread the outer dict last so its values take precedence.
        flattened = {**inner, **flattened}
        inner = inner.get("data")

    settle = flattened.get("settleList")
    if isinstance(settle, dict):
        flattened = {**settle, **flattened}
    return flattened
|
||||
|
||||
|
||||
def get_value_case_insensitive(record: dict | None, col: str | None):
    """Look up *col* in *record*, falling back to a case-insensitive scan.

    Returns None when either argument is None or when no key matches.
    """
    if record is None or col is None:
        return None

    # Fast path: exact key match.
    if col in record:
        return record.get(col)

    # Slow path: first string key that matches ignoring case.
    wanted = col.lower()
    for key, value in record.items():
        if isinstance(key, str) and key.lower() == wanted:
            return value
    return None
|
||||
|
||||
|
||||
def normalize_pk_value(value):
    """Normalize a primary-key value: all-digit strings become ints.

    ``str.isdigit`` accepts some non-ASCII digit characters that
    ``int()`` rejects, hence the defensive try/except.
    """
    if value is None:
        return None
    if not (isinstance(value, str) and value.isdigit()):
        return value
    try:
        return int(value)
    except Exception:
        return value
|
||||
|
||||
|
||||
def pk_tuple_from_record(record: dict, pk_cols: Iterable[str]) -> tuple | None:
    """Build the primary-key tuple for *record*.

    Nested layers are flattened first; returns None as soon as any PK
    column resolves to None or the empty string.
    """
    flattened = merge_record_layers(record)
    parts = []
    for column in pk_cols:
        part = normalize_pk_value(get_value_case_insensitive(flattened, column))
        if part is None or part == "":
            return None
        parts.append(part)
    return tuple(parts)
|
||||
247
apps/etl/pipelines/feiqiu/utils/reporting.py
Normal file
247
apps/etl/pipelines/feiqiu/utils/reporting.py
Normal file
@@ -0,0 +1,247 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""任务结果汇总与格式化工具。
|
||||
|
||||
提供多种格式的任务报告输出:
|
||||
- 简单文本格式
|
||||
- 详细表格格式(ASCII)
|
||||
- 任务总结报告
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Iterable, List, Optional
|
||||
|
||||
|
||||
def summarize_counts(task_results: Iterable[dict]) -> dict:
    """Aggregate per-task ``counts`` dicts.

    task_results: sequence of dicts shaped like
        {"task_code": str, "counts": {...}}.
    Returns {"total": grand totals, "details": per-task rows}.
    """
    count_keys = ("fetched", "inserted", "updated", "skipped", "errors")
    totals = dict.fromkeys(count_keys, 0)
    details = []

    for result in task_results:
        counts = result.get("counts") or {}
        row = {"task_code": result.get("task_code") or result.get("code") or "UNKNOWN"}
        for key in count_keys:
            # Coerce missing / None / numeric-string values to int.
            amount = int(counts.get(key, 0) or 0)
            row[key] = amount
            totals[key] += amount
        details.append(row)

    return {"total": totals, "details": details}
|
||||
|
||||
|
||||
def format_report(summary: dict) -> str:
    """Render summarize_counts() output as readable text (simple format):
    one TOTAL line followed by one line per task."""

    def _counts_text(row: dict) -> str:
        # Shared tail shared by the TOTAL line and every detail line.
        return (
            f"fetched={row.get('fetched', 0)} "
            f"inserted={row.get('inserted', 0)} "
            f"updated={row.get('updated', 0)} "
            f"skipped={row.get('skipped', 0)} "
            f"errors={row.get('errors', 0)}"
        )

    lines = ["TOTAL " + _counts_text(summary.get("total", {}))]
    for row in summary.get("details", []):
        lines.append(f"{row.get('task_code', 'UNKNOWN')}: " + _counts_text(row))
    return "\n".join(lines)
|
||||
|
||||
|
||||
def format_task_summary(result: dict) -> str:
    """
    Build a formatted task summary report.

    Args:
        result: task execution result dict containing:
            - task_code: task code
            - status: run status
            - start_time: start time (str or datetime)
            - end_time: end time (str or datetime)
            - elapsed_seconds: elapsed seconds
            - counts: counter statistics
            - verification_result: verification result (optional)
            - error_message: error message (optional)

    Returns:
        The formatted summary string (box-drawing border).
    """
    task_code = result.get("task_code", "UNKNOWN")
    status = result.get("status", "未知")
    counts = result.get("counts", {})
    verification = result.get("verification_result")
    error_message = result.get("error_message")

    # Resolve display strings for the start/end times
    start_time = result.get("start_time")
    end_time = result.get("end_time")
    elapsed = result.get("elapsed_seconds", 0)

    if isinstance(start_time, str):
        start_str = start_time[:19]
    elif isinstance(start_time, datetime):
        start_str = start_time.strftime("%Y-%m-%d %H:%M:%S")
    else:
        start_str = "-"

    if isinstance(end_time, str):
        # Keep only the HH:MM:SS portion of a full ISO-like string.
        end_str = end_time[11:19] if len(end_time) >= 19 else end_time
    elif isinstance(end_time, datetime):
        end_str = end_time.strftime("%H:%M:%S")
    else:
        end_str = "-"

    elapsed_str = _format_duration(elapsed)

    # Build the report
    lines = [
        "╔══════════════════════════════════════════════════════════════╗",
        "║ 任务执行总结 ║",
        "╠══════════════════════════════════════════════════════════════╣",
        f"║ 任务代码: {task_code:<50} ║",
        f"║ 执行状态: {status:<50} ║",
        f"║ 执行时间: {start_str} ~ {end_str} ({elapsed_str}){' '*(31-len(elapsed_str))} ║",
        "╠══════════════════════════════════════════════════════════════╣",
        "║ 数据统计 ║",
        f"║ - 获取记录: {counts.get('fetched', 0):>10,} ║",
        f"║ - 新增记录: {counts.get('inserted', 0):>10,} ║",
        f"║ - 更新记录: {counts.get('updated', 0):>10,} ║",
        f"║ - 跳过记录: {counts.get('skipped', 0):>10,} ║",
        f"║ - 错误记录: {counts.get('errors', 0):>10,} ║",
    ]

    # Verification section (only when a result was attached)
    if verification:
        backfilled_missing = verification.get("backfilled_missing_count", verification.get("backfilled_count", 0))
        backfilled_mismatch = verification.get("backfilled_mismatch_count", 0)
        lines.extend([
            "╠══════════════════════════════════════════════════════════════╣",
            "║ 校验结果 ║",
            f"║ - 源数据量: {verification.get('source_count', 0):>10,} ║",
            f"║ - 目标数据量: {verification.get('target_count', 0):>10,} ║",
            f"║ - 缺失补齐: {backfilled_missing:>10,} ║",
            f"║ - 不一致补齐: {backfilled_mismatch:>10,} ║",
        ])

    # Error section (truncated to fit the box)
    if error_message:
        error_str = str(error_message)[:48]
        lines.extend([
            "╠══════════════════════════════════════════════════════════════╣",
            f"║ 错误信息: {error_str:<50} ║",
        ])

    lines.append("╚══════════════════════════════════════════════════════════════╝")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def format_pipeline_summary(
    pipeline_name: str,
    task_results: List[dict],
    start_time: datetime,
    end_time: datetime,
    verification_summary: Optional[dict] = None,
) -> str:
    """
    Build a pipeline-level execution summary report.

    Args:
        pipeline_name: pipeline name
        task_results: per-task result dicts
        start_time: pipeline start time
        end_time: pipeline end time
        verification_summary: verification rollup (optional)

    Returns:
        The formatted pipeline summary string.
    """
    elapsed = (end_time - start_time).total_seconds()
    elapsed_str = _format_duration(elapsed)

    # Roll up counters across all tasks
    summary = summarize_counts(task_results)
    totals = summary.get("total", {})

    # Count successes/failures (status string "成功" marks success)
    success_count = sum(1 for r in task_results if r.get("status") == "成功")
    fail_count = len(task_results) - success_count

    lines = [
        "╔══════════════════════════════════════════════════════════════╗",
        "║ 管道执行总结 ║",
        "╠══════════════════════════════════════════════════════════════╣",
        f"║ 管道名称: {pipeline_name:<50} ║",
        f"║ 任务数量: {len(task_results)} (成功: {success_count}, 失败: {fail_count}){' '*(32-len(str(len(task_results)))-len(str(success_count))-len(str(fail_count)))} ║",
        f"║ 执行时间: {start_time.strftime('%Y-%m-%d %H:%M:%S')} ~ {end_time.strftime('%H:%M:%S')} ({elapsed_str}){' '*(31-len(elapsed_str))} ║",
        "╠══════════════════════════════════════════════════════════════╣",
        "║ 数据汇总 ║",
        f"║ - 总获取: {totals.get('fetched', 0):>12,} ║",
        f"║ - 总新增: {totals.get('inserted', 0):>12,} ║",
        f"║ - 总更新: {totals.get('updated', 0):>12,} ║",
        f"║ - 总跳过: {totals.get('skipped', 0):>12,} ║",
        f"║ - 总错误: {totals.get('errors', 0):>12,} ║",
    ]

    # Verification rollup section (only when provided)
    if verification_summary:
        total_backfilled_missing = verification_summary.get(
            "total_backfilled_missing",
            verification_summary.get("total_backfilled", 0),
        )
        total_backfilled_mismatch = verification_summary.get("total_backfilled_mismatch", 0)
        lines.extend([
            "╠══════════════════════════════════════════════════════════════╣",
            "║ 校验汇总 ║",
            f"║ - 校验表数: {verification_summary.get('total_tables', 0):>10,} ║",
            f"║ - 一致表数: {verification_summary.get('consistent_tables', 0):>10,} ║",
            f"║ - 总补齐数: {verification_summary.get('total_backfilled', 0):>10,} ║",
            f"║ - 缺失补齐: {total_backfilled_missing:>10,} ║",
            f"║ - 不一致补齐: {total_backfilled_mismatch:>8,} ║",
        ])

    # Per-task detail section
    lines.extend([
        "╠══════════════════════════════════════════════════════════════╣",
        "║ 任务明细 ║",
    ])

    for result in task_results[:10]:  # show at most 10 tasks
        task_code = result.get("task_code", "UNKNOWN")[:25]
        status = "✓" if result.get("status") == "成功" else "✗"
        counts = result.get("counts", {})
        fetched = counts.get("fetched", 0)
        lines.append(f"║ {status} {task_code:<25} 获取:{fetched:>6,} ║")

    if len(task_results) > 10:
        lines.append(f"║ ... 还有 {len(task_results) - 10} 个任务 ... ║")

    lines.append("╚══════════════════════════════════════════════════════════════╝")

    return "\n".join(lines)
|
||||
|
||||
|
||||
def _format_duration(seconds: float) -> str:
|
||||
"""格式化时长"""
|
||||
if seconds < 60:
|
||||
return f"{seconds:.1f}秒"
|
||||
elif seconds < 3600:
|
||||
mins = int(seconds // 60)
|
||||
secs = seconds % 60
|
||||
return f"{mins}分{secs:.0f}秒"
|
||||
else:
|
||||
hours = int(seconds // 3600)
|
||||
mins = int((seconds % 3600) // 60)
|
||||
return f"{hours}时{mins}分"
|
||||
292
apps/etl/pipelines/feiqiu/utils/task_logger.py
Normal file
292
apps/etl/pipelines/feiqiu/utils/task_logger.py
Normal file
@@ -0,0 +1,292 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""统一任务日志器
|
||||
|
||||
提供统一的日志输出格式,支持:
|
||||
- 任务开始/结束记录
|
||||
- 进度追踪
|
||||
- 统计计数
|
||||
- 格式化的任务总结
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
# Unified log line format: timestamp, padded level, logger name, message.
UNIFIED_LOG_FORMAT = "[%(asctime)s] %(levelname)-5s | %(name)s | %(message)s"
UNIFIED_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
||||
|
||||
|
||||
class TaskLogger:
    """Task logger that unifies print-style and logging output.

    Tracks run state (start/end/status), the standard ETL counters, any
    extra info reported during the run, and an optional verification
    result, and can render a boxed summary at the end.
    """

    def __init__(
        self,
        task_code: str,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the task logger.

        Args:
            task_code: task code used to prefix every message
            logger: underlying logger; a new "task.<task_code>" logger is
                created when not supplied
        """
        self.task_code = task_code
        self.logger = logger or logging.getLogger(f"task.{task_code}")

        # Run state
        self.start_time: Optional[datetime] = None
        self.end_time: Optional[datetime] = None
        self.status: str = "pending"

        # Counters
        self.counts: Dict[str, int] = {
            "fetched": 0,
            "inserted": 0,
            "updated": 0,
            "skipped": 0,
            "errors": 0,
        }

        # Extra info collected via progress() with unknown keys
        self.extra_info: Dict[str, Any] = {}

        # Verification result (if any)
        self.verification_result: Optional[dict] = None

    def start(self, message: str = "任务开始"):
        """
        Record the task start.

        Args:
            message: start message
        """
        self.start_time = datetime.now()
        self.status = "running"
        self.logger.info(
            "%s | %s | 开始时间: %s",
            self.task_code, message,
            self.start_time.strftime(UNIFIED_DATE_FORMAT)
        )

    def progress(self, message: str, **kwargs):
        """
        Record progress.

        Args:
            message: progress message
            **kwargs: extra statistics; known counter keys accumulate
                (int values are added, non-int values overwrite), unknown
                keys are stored in extra_info
        """
        # Update counters
        for key, value in kwargs.items():
            if key in self.counts:
                if isinstance(value, int):
                    self.counts[key] += value
                else:
                    self.counts[key] = value
            else:
                self.extra_info[key] = value

        # Build the progress string (only non-zero counters are shown)
        counts_str = ", ".join(f"{k}={v}" for k, v in self.counts.items() if v > 0)
        if counts_str:
            self.logger.info("%s | %s | %s", self.task_code, message, counts_str)
        else:
            self.logger.info("%s | %s", self.task_code, message)

    def info(self, message: str, *args):
        """Log an INFO message prefixed with the task code."""
        if args:
            self.logger.info(f"{self.task_code} | {message}", *args)
        else:
            self.logger.info(f"{self.task_code} | {message}")

    def warning(self, message: str, *args):
        """Log a WARNING message prefixed with the task code."""
        if args:
            self.logger.warning(f"{self.task_code} | {message}", *args)
        else:
            self.logger.warning(f"{self.task_code} | {message}")

    def error(self, message: str, *args, exc_info: bool = False):
        """Log an ERROR message prefixed with the task code.

        Also increments the "errors" counter.
        """
        self.counts["errors"] += 1
        if args:
            self.logger.error(f"{self.task_code} | {message}", *args, exc_info=exc_info)
        else:
            self.logger.error(f"{self.task_code} | {message}", exc_info=exc_info)

    def set_counts(self, **counts):
        """Overwrite known counters with the given values."""
        for key, value in counts.items():
            if key in self.counts:
                self.counts[key] = value

    def add_counts(self, **counts):
        """Add the given values to known counters."""
        for key, value in counts.items():
            if key in self.counts:
                self.counts[key] += value

    def set_verification_result(self, result: dict):
        """Attach a verification result for inclusion in the summary."""
        self.verification_result = result

    def end(self, status: str = "成功", error_message: Optional[str] = None) -> str:
        """
        Record the task end and return a formatted summary.

        Args:
            status: status ("成功" / "失败" / "取消")
            error_message: error message (when failed)

        Returns:
            The formatted task summary string.
        """
        self.end_time = datetime.now()
        self.status = status

        # Compute elapsed time (start() may never have been called)
        if self.start_time:
            elapsed = (self.end_time - self.start_time).total_seconds()
            elapsed_str = self._format_duration(elapsed)
        else:
            elapsed = 0
            elapsed_str = "-"

        # Build the summary
        summary = self._format_summary(status, elapsed_str, error_message)

        # Log at a level matching the outcome
        if status == "成功":
            self.logger.info("\n%s", summary)
        else:
            self.logger.error("\n%s", summary)

        return summary

    def _format_duration(self, seconds: float) -> str:
        """Render a duration in seconds as short human-readable text."""
        if seconds < 60:
            return f"{seconds:.1f}秒"
        elif seconds < 3600:
            mins = int(seconds // 60)
            secs = seconds % 60
            return f"{mins}分{secs:.0f}秒"
        else:
            hours = int(seconds // 3600)
            mins = int((seconds % 3600) // 60)
            return f"{hours}时{mins}分"

    def _format_summary(
        self,
        status: str,
        elapsed_str: str,
        error_message: Optional[str] = None,
    ) -> str:
        """Render the boxed task summary from the current state."""
        lines = [
            "╔══════════════════════════════════════════════════════════════╗",
            "║ 任务执行总结 ║",
            "╠══════════════════════════════════════════════════════════════╣",
            f"║ 任务代码: {self.task_code:<50} ║",
            f"║ 执行状态: {status:<50} ║",
        ]

        if self.start_time and self.end_time:
            time_range = f"{self.start_time.strftime('%Y-%m-%d %H:%M:%S')} ~ {self.end_time.strftime('%H:%M:%S')} ({elapsed_str})"
            lines.append(f"║ 执行时间: {time_range:<50} ║")

        lines.extend([
            "╠══════════════════════════════════════════════════════════════╣",
            "║ 数据统计 ║",
            f"║ - 获取记录: {self.counts['fetched']:>10,} ║",
            f"║ - 新增记录: {self.counts['inserted']:>10,} ║",
            f"║ - 更新记录: {self.counts['updated']:>10,} ║",
            f"║ - 跳过记录: {self.counts['skipped']:>10,} ║",
            f"║ - 错误记录: {self.counts['errors']:>10,} ║",
        ])

        # Verification section (only when a result was attached)
        if self.verification_result:
            backfilled_missing = self.verification_result.get(
                "backfilled_missing_count",
                self.verification_result.get("backfilled_count", 0),
            )
            backfilled_mismatch = self.verification_result.get("backfilled_mismatch_count", 0)
            lines.extend([
                "╠══════════════════════════════════════════════════════════════╣",
                "║ 校验结果 ║",
                f"║ - 源数据量: {self.verification_result.get('source_count', 0):>10,} ║",
                f"║ - 目标数据量: {self.verification_result.get('target_count', 0):>10,} ║",
                f"║ - 缺失补齐: {backfilled_missing:>10,} ║",
                f"║ - 不一致补齐: {backfilled_mismatch:>10,} ║",
            ])

        # Error section (truncated to fit the box)
        if error_message:
            lines.extend([
                "╠══════════════════════════════════════════════════════════════╣",
                f"║ 错误信息: {error_message[:50]:<50} ║",
            ])

        lines.append("╚══════════════════════════════════════════════════════════════╝")

        return "\n".join(lines)

    def get_result(self) -> dict:
        """Return the task result as a plain dict (JSON-friendly)."""
        elapsed = 0
        if self.start_time and self.end_time:
            elapsed = (self.end_time - self.start_time).total_seconds()

        return {
            "task_code": self.task_code,
            "status": self.status,
            "start_time": self.start_time.isoformat() if self.start_time else None,
            "end_time": self.end_time.isoformat() if self.end_time else None,
            "elapsed_seconds": elapsed,
            "counts": self.counts.copy(),
            "extra_info": self.extra_info.copy(),
            "verification_result": self.verification_result,
        }
|
||||
|
||||
|
||||
def configure_task_logging(
    name: str = "fq_etl",
    level: str = "INFO",
) -> logging.Logger:
    """
    Build a console-only logger using the unified format.

    Args:
        name: logger name
        level: level name (unknown names fall back to INFO)

    Returns:
        The configured, non-propagating logger.
    """
    task_logger = logging.getLogger(name)
    task_logger.setLevel(getattr(logging, level.upper(), logging.INFO))

    # Start from a clean slate: drop any previously attached handlers.
    task_logger.handlers.clear()

    # Single console handler; the handler itself passes everything through
    # (DEBUG) so the logger level is the effective filter.
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.DEBUG)
    stream_handler.setFormatter(
        logging.Formatter(
            UNIFIED_LOG_FORMAT,
            UNIFIED_DATE_FORMAT,
        )
    )

    task_logger.addHandler(stream_handler)
    task_logger.propagate = False

    return task_logger
|
||||
142
apps/etl/pipelines/feiqiu/utils/windowing.py
Normal file
142
apps/etl/pipelines/feiqiu/utils/windowing.py
Normal file
@@ -0,0 +1,142 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Time window helpers for ETL and validation tasks."""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, time
|
||||
from typing import List, Tuple
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
|
||||
def _ensure_tz(dt: datetime, tz: ZoneInfo | None) -> datetime:
|
||||
if tz is None:
|
||||
return dt
|
||||
if dt.tzinfo is None:
|
||||
return dt.replace(tzinfo=tz)
|
||||
return dt.astimezone(tz)
|
||||
|
||||
|
||||
def _next_month_start(dt: datetime, tz: ZoneInfo | None) -> datetime:
|
||||
year = dt.year
|
||||
month = dt.month
|
||||
if month == 12:
|
||||
year += 1
|
||||
month = 1
|
||||
else:
|
||||
month += 1
|
||||
return datetime(year, month, 1, tzinfo=tz)
|
||||
|
||||
|
||||
def calc_window_minutes(start: datetime, end: datetime) -> int:
    """Whole minutes between start and end.

    Returns 0 for empty/inverted windows; any non-empty window counts as
    at least 1 minute.
    """
    span_seconds = (end - start).total_seconds()
    if span_seconds <= 0:
        return 0
    return max(1, int(span_seconds // 60))
|
||||
|
||||
|
||||
def calc_window_days(start: datetime, end: datetime) -> float:
    """Window length in (fractional) days; 0.0 when end <= start."""
    span = (end - start).total_seconds()
    return span / 86400 if span > 0 else 0.0
|
||||
|
||||
|
||||
def format_window_days(value: float) -> str:
    """Render a day count: near-integers without decimals, otherwise 2 dp.

    None renders as "0".
    """
    if value is None:
        return "0"
    nearest = round(value)
    if abs(value - nearest) < 1e-6:
        return str(int(nearest))
    return f"{value:.2f}"
|
||||
|
||||
|
||||
def split_window(
    start: datetime,
    end: datetime,
    *,
    tz: ZoneInfo | None,
    split_unit: str | None,
    compensation_hours: int | float | None,
    split_days: int | None = None,
) -> List[Tuple[datetime, datetime]]:
    """Split [start, end) into sub-windows.

    The window is first normalized to *tz* and widened by
    *compensation_hours* on both sides. *split_unit* selects the split:
    "day"/"daily" uses *split_days*-day steps (default 1), "week"/"weekly"
    uses fixed 7-day steps, "month"/"monthly" cuts at calendar month
    starts. ""/"none"/"off"/"false"/"0" — and any unrecognized unit —
    yield the whole window as a single segment. Returns [] when the
    (widened) window is empty or inverted.

    Fix vs. previous revision: the "day" and "week" branches were
    copy-paste duplicates differing only in the step size; they now share
    one helper (behavior unchanged).
    """
    start = _ensure_tz(start, tz)
    end = _ensure_tz(end, tz)

    comp = int(compensation_hours or 0)
    if comp:
        start -= timedelta(hours=comp)
        end += timedelta(hours=comp)

    if end <= start:
        return []

    unit = (split_unit or "").strip().lower()
    if unit in ("", "none", "off", "false", "0"):
        return [(start, end)]

    if unit in ("day", "daily", "week", "weekly"):
        # Weekly is just a fixed 7-day step; daily honors split_days.
        step = 7 if unit in ("week", "weekly") else max(1, int(split_days or 1))
        return _fixed_step_windows(start, end, step)

    if unit not in ("month", "monthly"):
        # Unknown unit: fall back to a single segment.
        return [(start, end)]

    # Monthly: cut at each calendar month boundary.
    windows: List[Tuple[datetime, datetime]] = []
    cur = start
    while cur < end:
        boundary = _next_month_start(cur, tz)
        nxt = boundary if boundary < end else end
        if nxt <= cur:
            break
        windows.append((cur, nxt))
        cur = nxt
    return windows


def _fixed_step_windows(
    start: datetime,
    end: datetime,
    step_days: int,
) -> List[Tuple[datetime, datetime]]:
    """Cut [start, end) into consecutive step_days-long segments (last one clamped to end)."""
    windows: List[Tuple[datetime, datetime]] = []
    cur = start
    while cur < end:
        nxt = min(cur + timedelta(days=step_days), end)
        if nxt <= cur:
            break
        windows.append((cur, nxt))
        cur = nxt
    return windows
|
||||
|
||||
|
||||
def build_window_segments(
    cfg,
    start: datetime,
    end: datetime,
    *,
    tz: ZoneInfo | None,
    override_only: bool,
) -> List[Tuple[datetime, datetime]]:
    """Resolve split settings from *cfg* and delegate to split_window().

    When *override_only* is set but no explicit window override
    (run.window_override.start/end) is configured, splitting and
    compensation are disabled so the caller's window is used verbatim.
    """
    split_unit = cfg.get("run.window_split.unit", "month")
    split_days = cfg.get("run.window_split.days", 1)
    compensation_hours = cfg.get("run.window_split.compensation_hours", 0)

    if override_only:
        has_override = bool(
            cfg.get("run.window_override.start") and cfg.get("run.window_override.end")
        )
        if not has_override:
            split_unit = "none"
            compensation_hours = 0

    return split_window(
        start,
        end,
        tz=tz,
        split_unit=split_unit,
        compensation_hours=compensation_hours,
        split_days=split_days,
    )
|
||||
Reference in New Issue
Block a user