ETL 完成
This commit is contained in:
@@ -8,6 +8,8 @@ import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from api.endpoint_routing import plan_calls
|
||||
|
||||
DEFAULT_BROWSER_HEADERS = {
|
||||
"Accept": "application/json, text/plain, */*",
|
||||
"Content-Type": "application/json",
|
||||
@@ -142,7 +144,7 @@ class APIClient:
|
||||
raise ValueError(f"API 返回错误 code={code} msg={msg}")
|
||||
|
||||
# ------------------------------------------------------------------ 分页
|
||||
def iter_paginated(
|
||||
def _iter_paginated_single(
|
||||
self,
|
||||
endpoint: str,
|
||||
params: dict | None,
|
||||
@@ -155,8 +157,7 @@ class APIClient:
|
||||
page_end: int | None = None,
|
||||
) -> Iterable[tuple[int, list, dict, dict]]:
|
||||
"""
|
||||
分页迭代器:逐页拉取数据并产出 (page_no, records, request_params, raw_response)。
|
||||
page_size=None 时不附带分页参数,仅拉取一次。
|
||||
单一 endpoint 的分页迭代器(不包含 recent/former 路由逻辑)。
|
||||
"""
|
||||
base_params = dict(params or {})
|
||||
page = page_start
|
||||
@@ -183,6 +184,42 @@ class APIClient:
|
||||
|
||||
page += 1
|
||||
|
||||
def iter_paginated(
    self,
    endpoint: str,
    params: dict | None,
    page_size: int | None = 200,
    page_field: str = "page",
    size_field: str = "limit",
    data_path: tuple = ("data",),
    list_key: str | Sequence[str] | None = None,
    page_start: int = 1,
    page_end: int | None = None,
) -> Iterable[tuple[int, list, dict, dict]]:
    """
    Paginated iterator with recent/former endpoint routing.

    Yields ``(page_no, records, request_params, raw_response)`` for every
    page fetched. When ``page_size`` is None no paging parameters are sent
    and the endpoint is fetched only once.

    ``plan_calls`` decides which endpoint(s) serve the time window found in
    ``params``; a window that crosses the routing boundary is split into
    several calls that are fetched one after another. All paging arguments,
    including ``page_start`` and ``page_end``, are forwarded unchanged to
    every planned call.

    The yielded ``page_no`` is a running counter across all planned calls,
    not the page index of the individual call, so callers that name output
    files after ``page_no`` never overwrite a page of an earlier segment.
    The counter starts at ``page_start`` so a caller resuming from a later
    page keeps the numbering it would get from a single, unrouted endpoint.
    """
    call_plan = plan_calls(endpoint, params)
    # Start from page_start (not a fixed 1) so resumed runs keep their numbering.
    global_page = page_start

    for call in call_plan:
        # The per-call page number is discarded in favour of the running counter.
        for _, records, request_params, payload in self._iter_paginated_single(
            endpoint=call.endpoint,
            params=call.params,
            page_size=page_size,
            page_field=page_field,
            size_field=size_field,
            data_path=data_path,
            list_key=list_key,
            page_start=page_start,
            page_end=page_end,
        ):
            yield global_page, records, request_params, payload
            global_page += 1
|
||||
|
||||
def get_paginated(
|
||||
self,
|
||||
endpoint: str,
|
||||
|
||||
166
etl_billiards/api/endpoint_routing.py
Normal file
166
etl_billiards/api/endpoint_routing.py
Normal file
@@ -0,0 +1,166 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
“近期记录 / 历史记录(Former)”接口路由规则。
|
||||
|
||||
需求:
|
||||
- 当请求参数包含可定义时间范围的字段时,根据当前时间(北京时间/上海时区)判断:
|
||||
- 3个月(自然月)之前 -> 使用“历史记录”接口
|
||||
- 3个月以内 -> 使用“近期记录”接口
|
||||
- 若时间范围跨越边界 -> 拆分为两段分别请求并合并(由上层分页迭代器顺序产出)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from dateutil import parser as dtparser
|
||||
from dateutil.relativedelta import relativedelta
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
|
||||
ROUTING_TZ = ZoneInfo("Asia/Shanghai")
|
||||
RECENT_MONTHS = 3
|
||||
|
||||
|
||||
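# Maintained per `fetch-test/recent_vs_former_report.md`: a value of None means the endpoint has no Former (historical) counterpart; a value equal to the key means the same endpoint also serves historical data.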
|
||||
RECENT_TO_FORMER_OVERRIDES: dict[str, str | None] = {
|
||||
"/AssistantPerformance/GetAbolitionAssistant": None,
|
||||
"/Site/GetSiteTableUseDetails": "/Site/GetSiteTableUseDetails",
|
||||
"/GoodsStockManage/QueryGoodsOutboundReceipt": "/GoodsStockManage/QueryFormerGoodsOutboundReceipt",
|
||||
"/Promotion/GetOfflineCouponConsumePageList": "/Promotion/GetOfflineCouponConsumePageList",
|
||||
"/Order/GetRefundPayLogList": None,
|
||||
# 已知特殊
|
||||
"/Site/GetAllOrderSettleList": "/Site/GetFormerOrderSettleList",
|
||||
"/PayLog/GetPayLogListPage": "/PayLog/GetFormerPayLogListPage",
|
||||
}
|
||||
|
||||
|
||||
TIME_WINDOW_KEYS: tuple[tuple[str, str], ...] = (
|
||||
("startTime", "endTime"),
|
||||
("rangeStartTime", "rangeEndTime"),
|
||||
("StartPayTime", "EndPayTime"),
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class WindowSpec:
    """Immutable description of the time window found in request params."""

    # Names of the request parameters that carry the window bounds.
    start_key: str
    end_key: str
    # Parsed values of those two parameters.
    start: datetime
    end: datetime
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RoutedCall:
    """One planned request: the endpoint to call and the params to send."""

    # Endpoint path chosen for this call.
    endpoint: str
    # Request parameters to send to that endpoint.
    params: dict
|
||||
|
||||
|
||||
def is_former_endpoint(endpoint: str) -> bool:
    """Return True when the endpoint text contains the marker ``Former``."""
    # Falsy input (None, empty string) is treated as an empty path.
    text = "" if not endpoint else str(endpoint)
    return "Former" in text
|
||||
|
||||
|
||||
def _parse_dt(value: object, tz: ZoneInfo) -> datetime | None:
    """
    Parse a request-parameter value into an aware datetime in ``tz``.

    Returns None when ``value`` is None or blank after stripping. A parsed
    value without tzinfo is stamped with ``tz``; a value that already has
    tzinfo is converted to ``tz``. Errors raised by the dateutil parser for
    unparseable text are not caught here.
    """
    text = "" if value is None else str(value).strip()
    if not text:
        return None

    parsed = dtparser.parse(text)
    if parsed.tzinfo is not None:
        return parsed.astimezone(tz)
    # Naive timestamps are taken to already be wall-clock time in ``tz``.
    return parsed.replace(tzinfo=tz)
|
||||
|
||||
|
||||
def _fmt_dt(dt: datetime, tz: ZoneInfo) -> str:
|
||||
return dt.astimezone(tz).strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
def extract_window_spec(params: dict | None, tz: ZoneInfo = ROUTING_TZ) -> WindowSpec | None:
    """
    Find the first usable time window in ``params``.

    The key pairs in ``TIME_WINDOW_KEYS`` are tried in order. A pair is
    considered when at least one of its keys is present, and it is accepted
    only when both values parse to a datetime. Returns None when ``params``
    is not a non-empty dict or when no pair yields both bounds.
    """
    if not (isinstance(params, dict) and params):
        return None

    for lower_key, upper_key in TIME_WINDOW_KEYS:
        if lower_key not in params and upper_key not in params:
            continue
        lower = _parse_dt(params.get(lower_key), tz)
        upper = _parse_dt(params.get(upper_key), tz)
        if lower is not None and upper is not None:
            return WindowSpec(start_key=lower_key, end_key=upper_key, start=lower, end=upper)

    return None
|
||||
|
||||
|
||||
def derive_former_endpoint(recent_endpoint: str) -> str | None:
    """
    Map a "recent" endpoint path to its "Former" (historical) counterpart.

    Resolution order:
    1. blank input -> None;
    2. an entry in ``RECENT_TO_FORMER_OVERRIDES`` wins (its value may be None);
    3. a path that already looks like a Former endpoint is returned as is;
    4. otherwise the first ``Get`` in the path becomes ``GetFormer``; a path
       without ``Get`` is returned unchanged.
    """
    path = str(recent_endpoint or "").strip()
    if not path:
        return None

    try:
        return RECENT_TO_FORMER_OVERRIDES[path]
    except KeyError:
        pass

    if is_former_endpoint(path):
        return path

    # partition splits on the first "Get"; an empty separator means no match.
    head, marker, tail = path.partition("Get")
    if not marker:
        return path
    return f"{head}GetFormer{tail}"
|
||||
|
||||
|
||||
def recent_boundary(now: datetime, months: int = RECENT_MONTHS) -> datetime:
    """
    Return the calendar-month routing boundary for ``now``.

    The boundary is 00:00:00 on the first day of the month that contains
    ``now`` minus ``months`` months.

    Raises:
        ValueError: if ``now`` has no tzinfo.
    """
    if now.tzinfo is None:
        raise ValueError("now 必须为时区时间")

    start_of_month = {"day": 1, "hour": 0, "minute": 0, "second": 0, "microsecond": 0}
    shifted = now - relativedelta(months=months)
    return shifted.replace(**start_of_month)
|
||||
|
||||
|
||||
def plan_calls(
    endpoint: str,
    params: dict | None,
    *,
    now: datetime | None = None,
    tz: ZoneInfo = ROUTING_TZ,
    months: int = RECENT_MONTHS,
) -> list[RoutedCall]:
    """
    Plan which endpoint(s) to call for ``endpoint`` + ``params``.

    Returns a single call to ``endpoint`` with a copy of ``params`` when no
    routing applies: empty params, an endpoint that is already a Former
    endpoint, no usable time window, or no distinct Former counterpart.

    Otherwise the window is compared with ``recent_boundary``:
    - window ends at or before the boundary -> one call to the Former endpoint;
    - window starts at or after the boundary -> one call to ``endpoint``;
    - window crosses the boundary -> two calls, Former first (start..boundary)
      then ``endpoint`` (boundary..end), with both bounds rewritten via
      ``_fmt_dt``.

    NOTE(review): in the split case both segments include the boundary
    instant; confirm the remote API does not return boundary records twice.
    """
    request_params = dict(params or {})
    unrouted = [RoutedCall(endpoint=endpoint, params=request_params)]

    # Nothing to inspect, or the caller explicitly asked for a Former endpoint.
    if not request_params or is_former_endpoint(endpoint):
        return unrouted

    spec = extract_window_spec(request_params, tz)
    if spec is None:
        return unrouted

    history_endpoint = derive_former_endpoint(endpoint)
    if history_endpoint is None or history_endpoint == endpoint:
        return unrouted

    current = (now or datetime.now(tz)).astimezone(tz)
    cutoff = recent_boundary(current, months=months)

    if spec.end <= cutoff:
        return [RoutedCall(endpoint=history_endpoint, params=request_params)]
    if spec.start >= cutoff:
        return unrouted

    def _segment(lower: datetime, upper: datetime) -> dict:
        # Copy of the request params with the window narrowed to lower..upper.
        narrowed = dict(request_params)
        narrowed[spec.start_key] = _fmt_dt(lower, tz)
        narrowed[spec.end_key] = _fmt_dt(upper, tz)
        return narrowed

    # Crossing the boundary: older part -> Former endpoint, newer part -> recent.
    return [
        RoutedCall(endpoint=history_endpoint, params=_segment(spec.start, cutoff)),
        RoutedCall(endpoint=endpoint, params=_segment(cutoff, spec.end)),
    ]
|
||||
|
||||
@@ -20,6 +20,10 @@ class LocalJsonClient:
|
||||
if not self.base_dir.exists():
|
||||
raise FileNotFoundError(f"JSON 目录不存在: {self.base_dir}")
|
||||
|
||||
def get_source_hint(self, endpoint: str) -> str:
    """Return, as a string, the JSON file path under ``base_dir`` for this endpoint (for source_file lineage)."""
    hint_path = self.base_dir / endpoint_to_filename(endpoint)
    return str(hint_path)
|
||||
|
||||
def iter_paginated(
|
||||
self,
|
||||
endpoint: str,
|
||||
|
||||
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from typing import Any, Iterable, Tuple
|
||||
|
||||
from api.client import APIClient
|
||||
from api.endpoint_routing import plan_calls
|
||||
from utils.json_store import dump_json, endpoint_to_filename
|
||||
|
||||
|
||||
@@ -33,6 +34,10 @@ class RecordingAPIClient:
|
||||
self.last_dump: dict[str, Any] | None = None
|
||||
|
||||
# ------------------------------------------------------------------ public API
|
||||
def get_source_hint(self, endpoint: str) -> str:
    """Return, as a string, the JSON dump path under ``output_dir`` for this endpoint (for source_file lineage)."""
    dump_path = self.output_dir / endpoint_to_filename(endpoint)
    return str(dump_path)
|
||||
|
||||
def iter_paginated(
|
||||
self,
|
||||
endpoint: str,
|
||||
@@ -99,11 +104,18 @@ class RecordingAPIClient:
|
||||
):
|
||||
filename = endpoint_to_filename(endpoint)
|
||||
path = self.output_dir / filename
|
||||
routing_calls = []
|
||||
try:
|
||||
for call in plan_calls(endpoint, params):
|
||||
routing_calls.append({"endpoint": call.endpoint, "params": call.params})
|
||||
except Exception:
|
||||
routing_calls = []
|
||||
payload = {
|
||||
"task_code": self.task_code,
|
||||
"run_id": self.run_id,
|
||||
"endpoint": endpoint,
|
||||
"params": params or {},
|
||||
"endpoint_routing": {"calls": routing_calls} if routing_calls else None,
|
||||
"page_size": page_size,
|
||||
"pages": pages,
|
||||
"total_records": total_records,
|
||||
|
||||
Reference in New Issue
Block a user