ODS 完成
This commit is contained in:
@@ -15,7 +15,7 @@ DEFAULTS = {
|
||||
"name": "",
|
||||
"user": "",
|
||||
"password": "",
|
||||
"connect_timeout_sec": 5,
|
||||
"connect_timeout_sec": 20,
|
||||
"batch_size": 1000,
|
||||
"session": {
|
||||
"timezone": "Asia/Taipei",
|
||||
@@ -25,10 +25,11 @@ DEFAULTS = {
|
||||
},
|
||||
},
|
||||
"api": {
|
||||
"base_url": None,
|
||||
"base_url": "https://pc.ficoo.vip/apiprod/admin/v1",
|
||||
"token": None,
|
||||
"timeout_sec": 20,
|
||||
"page_size": 200,
|
||||
"params": {},
|
||||
"retries": {
|
||||
"max_attempts": 3,
|
||||
"backoff_sec": [1, 2, 4],
|
||||
@@ -71,6 +72,14 @@ DEFAULTS = {
|
||||
"write_pretty_json": False,
|
||||
"max_file_bytes": 50 * 1024 * 1024,
|
||||
},
|
||||
"pipeline": {
|
||||
# 运行流程:FETCH_ONLY(仅在线抓取落盘)、INGEST_ONLY(本地清洗入库)、FULL(抓取 + 清洗入库)
|
||||
"flow": "FULL",
|
||||
# 在线抓取 JSON 输出根目录(按任务、run_id 与时间自动创建子目录)
|
||||
"fetch_root": r"D:\LLZQ\DB\json_fetch",
|
||||
# 本地清洗入库时的 JSON 输入目录(为空则默认使用本次抓取目录)
|
||||
"ingest_source_dir": "",
|
||||
},
|
||||
"clean": {
|
||||
"log_unknown_fields": True,
|
||||
"unknown_fields_limit": 50,
|
||||
@@ -86,14 +95,6 @@ DEFAULTS = {
|
||||
"redact_keys": ["token", "password", "Authorization"],
|
||||
"echo_token_in_logs": False,
|
||||
},
|
||||
"testing": {
|
||||
# ONLINE: 正常实时 ETL;OFFLINE: 读取归档 JSON 做 T/L
|
||||
"mode": "ONLINE",
|
||||
# 离线归档 JSON 所在目录(测试/离线回放使用)
|
||||
"json_archive_dir": "",
|
||||
# 测试运行时用于生成/复制临时 JSON 的目录
|
||||
"temp_json_dir": "",
|
||||
},
|
||||
}
|
||||
|
||||
# 任务代码常量
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
"""环境变量解析"""
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from copy import deepcopy
|
||||
|
||||
ENV_MAP = {
|
||||
@@ -15,18 +16,22 @@ ENV_MAP = {
|
||||
"PG_NAME": ("db.name",),
|
||||
"PG_USER": ("db.user",),
|
||||
"PG_PASSWORD": ("db.password",),
|
||||
"PG_CONNECT_TIMEOUT": ("db.connect_timeout_sec",),
|
||||
"API_BASE": ("api.base_url",),
|
||||
"API_TOKEN": ("api.token",),
|
||||
"FICOO_TOKEN": ("api.token",),
|
||||
"API_TIMEOUT": ("api.timeout_sec",),
|
||||
"API_PAGE_SIZE": ("api.page_size",),
|
||||
"API_PARAMS": ("api.params",),
|
||||
"EXPORT_ROOT": ("io.export_root",),
|
||||
"LOG_ROOT": ("io.log_root",),
|
||||
"RUN_TASKS": ("run.tasks",),
|
||||
"OVERLAP_SECONDS": ("run.overlap_seconds",),
|
||||
"WINDOW_BUSY_MIN": ("run.window_minutes.default_busy",),
|
||||
"WINDOW_IDLE_MIN": ("run.window_minutes.default_idle",),
|
||||
"TEST_MODE": ("testing.mode",),
|
||||
"TEST_JSON_ARCHIVE_DIR": ("testing.json_archive_dir",),
|
||||
"TEST_JSON_TEMP_DIR": ("testing.temp_json_dir",),
|
||||
"PIPELINE_FLOW": ("pipeline.flow",),
|
||||
"JSON_FETCH_ROOT": ("pipeline.fetch_root",),
|
||||
"JSON_SOURCE_DIR": ("pipeline.ingest_source_dir",),
|
||||
}
|
||||
|
||||
def _deep_set(d, dotted_keys, value):
|
||||
@@ -53,13 +58,97 @@ def _coerce_env(v: str):
|
||||
return s
|
||||
return s
|
||||
|
||||
def load_env_overrides(defaults: dict) -> dict:
|
||||
cfg = deepcopy(defaults)
|
||||
def _strip_inline_comment(value: str) -> str:
|
||||
"""去掉未被引号包裹的内联注释"""
|
||||
result = []
|
||||
in_quote = False
|
||||
quote_char = ""
|
||||
escape = False
|
||||
for ch in value:
|
||||
if escape:
|
||||
result.append(ch)
|
||||
escape = False
|
||||
continue
|
||||
if ch == "\\":
|
||||
escape = True
|
||||
result.append(ch)
|
||||
continue
|
||||
if ch in ("'", '"'):
|
||||
if not in_quote:
|
||||
in_quote = True
|
||||
quote_char = ch
|
||||
elif quote_char == ch:
|
||||
in_quote = False
|
||||
quote_char = ""
|
||||
result.append(ch)
|
||||
continue
|
||||
if ch == "#" and not in_quote:
|
||||
break
|
||||
result.append(ch)
|
||||
return "".join(result).rstrip()
|
||||
|
||||
|
||||
def _unquote_value(value: str) -> str:
|
||||
"""处理引号/原始字符串以及尾随逗号"""
|
||||
trimmed = value.strip()
|
||||
trimmed = _strip_inline_comment(trimmed)
|
||||
trimmed = trimmed.rstrip(",").rstrip()
|
||||
if not trimmed:
|
||||
return trimmed
|
||||
if len(trimmed) >= 2 and trimmed[0] in ("'", '"') and trimmed[-1] == trimmed[0]:
|
||||
return trimmed[1:-1]
|
||||
if (
|
||||
len(trimmed) >= 3
|
||||
and trimmed[0] in ("r", "R")
|
||||
and trimmed[1] in ("'", '"')
|
||||
and trimmed[-1] == trimmed[1]
|
||||
):
|
||||
return trimmed[2:-1]
|
||||
return trimmed
|
||||
|
||||
|
||||
def _parse_dotenv_line(line: str) -> tuple[str, str] | None:
|
||||
"""解析 .env 文件中的单行"""
|
||||
stripped = line.strip()
|
||||
if not stripped or stripped.startswith("#"):
|
||||
return None
|
||||
if stripped.startswith("export "):
|
||||
stripped = stripped[len("export ") :].strip()
|
||||
if "=" not in stripped:
|
||||
return None
|
||||
key, value = stripped.split("=", 1)
|
||||
key = key.strip()
|
||||
value = _unquote_value(value)
|
||||
return key, value
|
||||
|
||||
def _load_dotenv_values() -> dict:
|
||||
"""从项目根目录的 .env 文件读取键值"""
|
||||
root = Path(__file__).resolve().parents[1]
|
||||
dotenv_path = root / ".env"
|
||||
if not dotenv_path.exists():
|
||||
return {}
|
||||
values: dict[str, str] = {}
|
||||
for line in dotenv_path.read_text(encoding="utf-8").splitlines():
|
||||
parsed = _parse_dotenv_line(line)
|
||||
if parsed:
|
||||
key, value = parsed
|
||||
values[key] = value
|
||||
return values
|
||||
|
||||
def _apply_env_values(cfg: dict, source: dict):
|
||||
for env_key, dotted in ENV_MAP.items():
|
||||
val = os.environ.get(env_key)
|
||||
val = source.get(env_key)
|
||||
if val is None:
|
||||
continue
|
||||
v2 = _coerce_env(val)
|
||||
for path in dotted:
|
||||
if path == "run.tasks" and isinstance(v2, str):
|
||||
v2 = [item.strip() for item in v2.split(",") if item.strip()]
|
||||
_deep_set(cfg, path.split("."), v2)
|
||||
|
||||
def load_env_overrides(defaults: dict) -> dict:
|
||||
cfg = deepcopy(defaults)
|
||||
# 先读取 .env,再读取真实环境变量,确保 CLI 仍然最高优先级
|
||||
_apply_env_values(cfg, _load_dotenv_values())
|
||||
_apply_env_values(cfg, os.environ)
|
||||
return cfg
|
||||
|
||||
Reference in New Issue
Block a user