ODS 完成

This commit is contained in:
Neo
2025-11-30 07:19:05 +08:00
parent b9b050bb5d
commit a6ad343092
81 changed files with 15695 additions and 227180 deletions

View File

@@ -65,18 +65,18 @@ DEFAULTS = {
"allow_empty_result_advance": True,
},
"io": {
"export_root": r"D:\LLZQ\DB\export",
"log_root": r"D:\LLZQ\DB\logs",
"export_root": r"C:\dev\LLTQ\export\JSON",
"log_root": r"C:\dev\LLTQ\export\LOG",
"manifest_name": "manifest.json",
"ingest_report_name": "ingest_report.json",
"write_pretty_json": False,
"write_pretty_json": True,
"max_file_bytes": 50 * 1024 * 1024,
},
"pipeline": {
# Pipeline flow: FETCH_ONLY (fetch online and write to disk only), INGEST_ONLY (clean and load local files only), FULL (fetch + clean and load)
"flow": "FULL",
# Root directory for JSON fetched online (subdirectories are created automatically by task, run_id and time)
"fetch_root": r"D:\LLZQ\DB\json_fetch",
"fetch_root": r"C:\dev\LLTQ\export\JSON",
# 本地清洗入库时的 JSON 输入目录(为空则默认使用本次抓取目录)
"ingest_source_dir": "",
},
@@ -95,6 +95,12 @@ DEFAULTS = {
"redact_keys": ["token", "password", "Authorization"],
"echo_token_in_logs": False,
},
"ods": {
# ODS 离线重建/回放相关(仅开发/运维使用)
"json_doc_dir": r"C:\dev\LLTQ\export\test-json-doc",
"include_files": "",
"drop_schema_first": True,
},
}
# 任务代码常量

View File

@@ -6,40 +6,55 @@ from pathlib import Path
from copy import deepcopy
# Maps an environment-variable name to a tuple of dotted config paths that the
# variable overrides. Each path is split on "." to locate the nested key.
# Several variables are aliases that target the same path (for example
# API_TOKEN / FICOO_TOKEN, or JSON_FETCH_ROOT / FETCH_ROOT).
ENV_MAP: dict[str, tuple[str, ...]] = {
    # app.*
    "TIMEZONE": ("app.timezone",),
    "STORE_ID": ("app.store_id",),
    "SCHEMA_OLTP": ("app.schema_oltp",),
    "SCHEMA_ETL": ("app.schema_etl",),
    # db.*
    "PG_DSN": ("db.dsn",),
    "PG_HOST": ("db.host",),
    "PG_PORT": ("db.port",),
    "PG_NAME": ("db.name",),
    "PG_USER": ("db.user",),
    "PG_PASSWORD": ("db.password",),
    "PG_CONNECT_TIMEOUT": ("db.connect_timeout_sec",),
    # api.*
    "API_BASE": ("api.base_url",),
    "API_TOKEN": ("api.token",),
    "FICOO_TOKEN": ("api.token",),
    "API_TIMEOUT": ("api.timeout_sec",),
    "API_PAGE_SIZE": ("api.page_size",),
    "API_RETRY_MAX": ("api.retries.max_attempts",),
    "API_RETRY_BACKOFF": ("api.retries.backoff_sec",),
    "API_PARAMS": ("api.params",),
    # io.*
    "EXPORT_ROOT": ("io.export_root",),
    "LOG_ROOT": ("io.log_root",),
    "MANIFEST_NAME": ("io.manifest_name",),
    "INGEST_REPORT_NAME": ("io.ingest_report_name",),
    "WRITE_PRETTY_JSON": ("io.write_pretty_json",),
    # run.*
    "RUN_TASKS": ("run.tasks",),
    "OVERLAP_SECONDS": ("run.overlap_seconds",),
    "WINDOW_BUSY_MIN": ("run.window_minutes.default_busy",),
    "WINDOW_IDLE_MIN": ("run.window_minutes.default_idle",),
    "IDLE_START": ("run.idle_window.start",),
    "IDLE_END": ("run.idle_window.end",),
    "IDLE_WINDOW_START": ("run.idle_window.start",),
    "IDLE_WINDOW_END": ("run.idle_window.end",),
    "ALLOW_EMPTY_RESULT_ADVANCE": ("run.allow_empty_result_advance",),
    "ALLOW_EMPTY_ADVANCE": ("run.allow_empty_result_advance",),
    # pipeline.*
    "PIPELINE_FLOW": ("pipeline.flow",),
    "JSON_FETCH_ROOT": ("pipeline.fetch_root",),
    "JSON_SOURCE_DIR": ("pipeline.ingest_source_dir",),
    "FETCH_ROOT": ("pipeline.fetch_root",),
    "INGEST_SOURCE_DIR": ("pipeline.ingest_source_dir",),
}
def _deep_set(d, dotted_keys, value):
    """Store ``value`` in the nested mapping ``d`` at the path ``dotted_keys``.

    Intermediate levels that are missing are created as empty dicts. An
    intermediate entry that exists but is not a mapping (for example ``None``
    or a scalar placeholder) is replaced by a fresh dict, so the assignment
    succeeds instead of failing with a ``TypeError`` on the next level.

    Args:
        d: Root mapping; modified in place.
        dotted_keys: Sequence of keys, outermost first
            (e.g. ``["db", "host"]``). Must contain at least one key.
        value: Object stored under the final key.
    """
    from collections.abc import MutableMapping

    cur = d
    for key in dotted_keys[:-1]:
        child = cur.get(key)
        if not isinstance(child, MutableMapping):
            # Missing or non-mapping entry: descend into a new dict instead.
            child = {}
            cur[key] = child
        cur = child
    cur[dotted_keys[-1]] = value
def _coerce_env(v: str):
if v is None:
return None
@@ -58,6 +73,7 @@ def _coerce_env(v: str):
return s
return s
def _strip_inline_comment(value: str) -> str:
"""去掉未被引号包裹的内联注释"""
result = []
@@ -121,20 +137,24 @@ def _parse_dotenv_line(line: str) -> tuple[str, str] | None:
value = _unquote_value(value)
return key, value
def _load_dotenv_values() -> dict:
    """Read key/value pairs from the ``.env`` file in the project root.

    The file is looked up one directory above the directory containing this
    module. Loading is skipped entirely when the ``ETL_SKIP_DOTENV``
    environment variable is one of ``1``, ``true``, ``TRUE`` or ``True``.

    Returns:
        A dict of the keys and values parsed from the file; when a key occurs
        more than once, the last occurrence wins. An empty dict is returned
        when loading is skipped or no ``.env`` file is present.
    """
    if os.environ.get("ETL_SKIP_DOTENV") in ("1", "true", "TRUE", "True"):
        return {}

    root = Path(__file__).resolve().parents[1]
    dotenv_path = root / ".env"
    # is_file() rather than exists(): a directory named ".env" cannot be read.
    if not dotenv_path.is_file():
        return {}

    # "utf-8-sig" drops a leading byte-order mark, which would otherwise end
    # up glued to the first key; undecodable bytes are ignored as best-effort.
    text = dotenv_path.read_text(encoding="utf-8-sig", errors="ignore")

    values: dict[str, str] = {}
    for line in text.splitlines():
        parsed = _parse_dotenv_line(line)
        if parsed:
            key, value = parsed
            values[key] = value
    return values
def _apply_env_values(cfg: dict, source: dict):
for env_key, dotted in ENV_MAP.items():
val = source.get(env_key)
@@ -146,6 +166,7 @@ def _apply_env_values(cfg: dict, source: dict):
v2 = [item.strip() for item in v2.split(",") if item.strip()]
_deep_set(cfg, path.split("."), v2)
def load_env_overrides(defaults: dict) -> dict:
cfg = deepcopy(defaults)
# Read .env first, then the real environment variables, so that the CLI still has the highest priority

View File

@@ -49,6 +49,13 @@ class AppConfig:
f"@{cfg['db']['host']}:{cfg['db']['port']}/{cfg['db']['name']}"
)
# connect_timeout 限定 1-20 秒
try:
timeout_sec = int(cfg["db"].get("connect_timeout_sec") or 5)
except Exception:
raise SystemExit("db.connect_timeout_sec 必须为整数")
cfg["db"]["connect_timeout_sec"] = max(1, min(timeout_sec, 20))
# 会话参数
cfg["db"].setdefault("session", {})
sess = cfg["db"]["session"]