# -*- coding: utf-8 -*- """环境变量解析""" import os import json from pathlib import Path from copy import deepcopy ENV_MAP = { "TIMEZONE": ("app.timezone",), "BUSINESS_DAY_START_HOUR": ("app.business_day_start_hour",), "STORE_ID": ("app.store_id",), "SCHEMA_OLTP": ("app.schema_oltp",), "SCHEMA_ETL": ("app.schema_etl",), "PG_DSN": ("db.dsn",), "PG_HOST": ("db.host",), "PG_PORT": ("db.port",), "PG_NAME": ("db.name",), "PG_USER": ("db.user",), "PG_PASSWORD": ("db.password",), "PG_CONNECT_TIMEOUT": ("db.connect_timeout_sec",), "API_BASE": ("api.base_url",), "API_TOKEN": ("api.token",), "FICOO_TOKEN": ("api.token",), "API_TIMEOUT": ("api.timeout_sec",), "API_PAGE_SIZE": ("api.page_size",), "API_RETRY_MAX": ("api.retries.max_attempts",), "API_RETRY_BACKOFF": ("api.retries.backoff_sec",), "API_PARAMS": ("api.params",), "EXPORT_ROOT": ("io.export_root",), "LOG_ROOT": ("io.log_root",), "MANIFEST_NAME": ("io.manifest_name",), "INGEST_REPORT_NAME": ("io.ingest_report_name",), "WRITE_PRETTY_JSON": ("io.write_pretty_json",), "RUN_TASKS": ("run.tasks",), "RUN_DWS_TASKS": ("run.dws_tasks",), "RUN_INDEX_TASKS": ("run.index_tasks",), "INDEX_LOOKBACK_DAYS": ("run.index_lookback_days",), "OVERLAP_SECONDS": ("run.overlap_seconds",), "WINDOW_BUSY_MIN": ("run.window_minutes.default_busy",), "WINDOW_IDLE_MIN": ("run.window_minutes.default_idle",), "IDLE_START": ("run.idle_window.start",), "IDLE_END": ("run.idle_window.end",), "IDLE_WINDOW_START": ("run.idle_window.start",), "IDLE_WINDOW_END": ("run.idle_window.end",), "ALLOW_EMPTY_RESULT_ADVANCE": ("run.allow_empty_result_advance",), "ALLOW_EMPTY_ADVANCE": ("run.allow_empty_result_advance",), "SNAPSHOT_MISSING_DELETE": ("run.snapshot_missing_delete",), "SNAPSHOT_ALLOW_EMPTY_DELETE": ("run.snapshot_allow_empty_delete",), "WINDOW_START": ("run.window_override.start",), "WINDOW_END": ("run.window_override.end",), "WINDOW_SPLIT_UNIT": ("run.window_split.unit",), "WINDOW_SPLIT_DAYS": ("run.window_split.days",), "WINDOW_COMPENSATION_HOURS": ("run.window_split.compensation_hours",), "PIPELINE_FLOW": ("pipeline.flow",), "JSON_FETCH_ROOT": ("pipeline.fetch_root",), "JSON_SOURCE_DIR": ("pipeline.ingest_source_dir",), "FETCH_ROOT": ("pipeline.fetch_root", "io.fetch_root"), "INGEST_SOURCE_DIR": ("pipeline.ingest_source_dir",), "INTEGRITY_MODE": ("integrity.mode",), "INTEGRITY_HISTORY_START": ("integrity.history_start",), "INTEGRITY_HISTORY_END": ("integrity.history_end",), "INTEGRITY_INCLUDE_DIMENSIONS": ("integrity.include_dimensions",), "INTEGRITY_AUTO_CHECK": ("integrity.auto_check",), "INTEGRITY_AUTO_BACKFILL": ("integrity.auto_backfill",), "INTEGRITY_COMPARE_CONTENT": ("integrity.compare_content",), "INTEGRITY_CONTENT_SAMPLE_LIMIT": ("integrity.content_sample_limit",), "INTEGRITY_BACKFILL_MISMATCH": ("integrity.backfill_mismatch",), "INTEGRITY_RECHECK_AFTER_BACKFILL": ("integrity.recheck_after_backfill",), "INTEGRITY_ODS_TASK_CODES": ("integrity.ods_task_codes",), "VERIFY_SKIP_ODS_ON_FETCH": ("verification.skip_ods_when_fetch_before_verify",), "VERIFY_ODS_LOCAL_JSON": ("verification.ods_use_local_json",), "DWD_FACT_UPSERT": ("dwd.fact_upsert",), # DWS 月度/薪资配置 "DWS_MONTHLY_ALLOW_HISTORY": ("dws.monthly.allow_history",), "DWS_MONTHLY_PREV_GRACE_DAYS": ("dws.monthly.prev_month_grace_days",), "DWS_MONTHLY_HISTORY_MONTHS": ("dws.monthly.history_months",), "DWS_MONTHLY_NEW_HIRE_CAP_EFFECTIVE_FROM": ("dws.monthly.new_hire_cap_effective_from",), "DWS_MONTHLY_NEW_HIRE_CAP_DAY": ("dws.monthly.new_hire_cap_day",), "DWS_MONTHLY_NEW_HIRE_MAX_TIER_LEVEL": ("dws.monthly.new_hire_max_tier_level",), "DWS_SALARY_RUN_DAYS": ("dws.salary.run_days",), "DWS_SALARY_ALLOW_OUT_OF_CYCLE": ("dws.salary.allow_out_of_cycle",), "DWS_SALARY_ROOM_COURSE_PRICE": ("dws.salary.room_course_price",), # ODS 离线回放配置 "ODS_JSON_DOC_DIR": ("ods.json_doc_dir",), "ODS_INCLUDE_FILES": ("ods.include_files",), "ODS_DROP_SCHEMA_FIRST": ("ods.drop_schema_first",), # ── 以下为 2026-02-16 补齐:defaults.py 中有定义但此前缺少 ENV 映射的参数 ── # 数据库会话参数 "DB_SESSION_TIMEZONE": ("db.session.timezone",), "DB_STATEMENT_TIMEOUT_MS": ("db.session.statement_timeout_ms",), "DB_LOCK_TIMEOUT_MS": ("db.session.lock_timeout_ms",), "DB_IDLE_IN_TX_TIMEOUT_MS": ("db.session.idle_in_tx_timeout_ms",), # 清洗配置 "CLEAN_LOG_UNKNOWN_FIELDS": ("clean.log_unknown_fields",), "CLEAN_UNKNOWN_FIELDS_LIMIT": ("clean.unknown_fields_limit",), "CLEAN_HASH_ALGO": ("clean.hash_key.algo",), "CLEAN_HASH_SALT": ("clean.hash_key.salt",), "CLEAN_STRICT_NUMERIC": ("clean.strict_numeric",), "CLEAN_ROUND_MONEY_SCALE": ("clean.round_money_scale",), # 安全配置 "SECURITY_REDACT_IN_LOGS": ("security.redact_in_logs",), "SECURITY_REDACT_KEYS": ("security.redact_keys",), "SECURITY_ECHO_TOKEN_IN_LOGS": ("security.echo_token_in_logs",), # IO 文件大小限制 "MAX_FILE_BYTES": ("io.max_file_bytes",), # 完整性检查:强制按月切分 "INTEGRITY_FORCE_MONTHLY_SPLIT": ("integrity.force_monthly_split",), # DWD 事实表 UPSERT 批量参数 "DWD_FACT_UPSERT_BATCH_SIZE": ("dwd.fact_upsert_batch_size",), "DWD_FACT_UPSERT_MIN_BATCH_SIZE": ("dwd.fact_upsert_min_batch_size",), "DWD_FACT_UPSERT_MAX_RETRIES": ("dwd.fact_upsert_max_retries",), "DWD_FACT_UPSERT_RETRY_BACKOFF": ("dwd.fact_upsert_retry_backoff_sec",), "DWD_FACT_UPSERT_LOCK_TIMEOUT_MS": ("dwd.fact_upsert_lock_timeout_ms",), # 运行模式(直接设置,不经旧 pipeline.flow 配置键映射;配置键名保留以兼容旧配置) "DATA_SOURCE": ("run.data_source",), # API 额外请求头(JSON 对象格式) "API_HEADERS_EXTRA": ("api.headers_extra",), # Pipeline 管道限流参数 "PIPELINE_RATE_MIN": ("pipeline.rate_min",), "PIPELINE_RATE_MAX": ("pipeline.rate_max",), } def _deep_set(d, dotted_keys, value): cur = d for k in dotted_keys[:-1]: cur = cur.setdefault(k, {}) cur[dotted_keys[-1]] = value def _coerce_env(v: str): if v is None: return None s = v.strip() if s.lower() in ("true", "false"): return s.lower() == "true" try: if s.isdigit() or (s.startswith("-") and s[1:].isdigit()): return int(s) except Exception: pass if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")): try: return json.loads(s) except Exception: return s return s def _strip_inline_comment(value: str) -> str: """去掉未被引号包裹的内联注释""" result = [] in_quote = False quote_char = "" escape = False for ch in value: if escape: result.append(ch) escape = False continue if ch == "\\": escape = True result.append(ch) continue if ch in ("'", '"'): if not in_quote: in_quote = True quote_char = ch elif quote_char == ch: in_quote = False quote_char = "" result.append(ch) continue if ch == "#" and not in_quote: break result.append(ch) return "".join(result).rstrip() def _unquote_value(value: str) -> str: """处理引号/原始字符串以及尾随逗号""" trimmed = value.strip() trimmed = _strip_inline_comment(trimmed) trimmed = trimmed.rstrip(",").rstrip() if not trimmed: return trimmed if len(trimmed) >= 2 and trimmed[0] in ("'", '"') and trimmed[-1] == trimmed[0]: return trimmed[1:-1] if ( len(trimmed) >= 3 and trimmed[0] in ("r", "R") and trimmed[1] in ("'", '"') and trimmed[-1] == trimmed[1] ): return trimmed[2:-1] return trimmed def _parse_dotenv_line(line: str) -> tuple[str, str] | None: """解析 .env 文件中的单行""" stripped = line.strip() if not stripped or stripped.startswith("#"): return None if stripped.startswith("export "): stripped = stripped[len("export ") :].strip() if "=" not in stripped: return None key, value = stripped.split("=", 1) key = key.strip() value = _unquote_value(value) return key, value def _load_dotenv_values() -> dict: """从项目根目录读取 .env 文件键值""" if os.environ.get("ETL_SKIP_DOTENV") in ("1", "true", "TRUE", "True"): return {} root = Path(__file__).resolve().parents[1] dotenv_path = root / ".env" if not dotenv_path.exists(): return {} values: dict[str, str] = {} for line in dotenv_path.read_text(encoding="utf-8", errors="ignore").splitlines(): parsed = _parse_dotenv_line(line) if parsed: key, value = parsed values[key] = value return values def _apply_env_values(cfg: dict, source: dict): for env_key, dotted in ENV_MAP.items(): val = source.get(env_key) if val is None: continue v2 = _coerce_env(val) for path in dotted: if path in ("run.tasks", "run.dws_tasks", "run.index_tasks") and isinstance(v2, str): v2 = [item.strip() for item in v2.split(",") if item.strip()] _deep_set(cfg, path.split("."), v2) def load_env_overrides(defaults: dict) -> dict: cfg = deepcopy(defaults) # 先读取 .env,再读取真实环境变量,确保 CLI 仍然最高优先级 _apply_env_values(cfg, _load_dotenv_values()) _apply_env_values(cfg, os.environ) return cfg