Neo-ZQYY/apps/etl/connectors/feiqiu/config/env_parser.py

# -*- coding: utf-8 -*-
"""环境变量解析"""
import os
import json
from pathlib import Path
from copy import deepcopy

# Maps an environment variable name to the dotted config path(s) it overrides.
# Each value is a tuple because one variable may feed several paths (see
# FETCH_ROOT). Several variables are aliases for the same path; entries are
# applied in the order listed here, so when two aliases are set in the same
# source the one listed later is written last and wins.
ENV_MAP = {
    "TIMEZONE": ("app.timezone",),
    "BUSINESS_DAY_START_HOUR": ("app.business_day_start_hour",),
    "STORE_ID": ("app.store_id",),
    "SCHEMA_OLTP": ("app.schema_oltp",),
    "SCHEMA_ETL": ("app.schema_etl",),
    "PG_DSN": ("db.dsn",),
    "PG_HOST": ("db.host",),
    "PG_PORT": ("db.port",),
    "PG_NAME": ("db.name",),
    "PG_USER": ("db.user",),
    "PG_PASSWORD": ("db.password",),
    "PG_CONNECT_TIMEOUT": ("db.connect_timeout_sec",),
    "API_BASE": ("api.base_url",),
    # API_TOKEN and FICOO_TOKEN both target api.token.
    "API_TOKEN": ("api.token",),
    "FICOO_TOKEN": ("api.token",),
    "API_TIMEOUT": ("api.timeout_sec",),
    "API_PAGE_SIZE": ("api.page_size",),
    "API_RETRY_MAX": ("api.retries.max_attempts",),
    "API_RETRY_BACKOFF": ("api.retries.backoff_sec",),
    "API_PARAMS": ("api.params",),
    "EXPORT_ROOT": ("io.export_root",),
    "LOG_ROOT": ("io.log_root",),
    "MANIFEST_NAME": ("io.manifest_name",),
    "INGEST_REPORT_NAME": ("io.ingest_report_name",),
    "WRITE_PRETTY_JSON": ("io.write_pretty_json",),
    "RUN_TASKS": ("run.tasks",),
    "RUN_DWS_TASKS": ("run.dws_tasks",),
    "RUN_INDEX_TASKS": ("run.index_tasks",),
    "INDEX_LOOKBACK_DAYS": ("run.index_lookback_days",),
    "OVERLAP_SECONDS": ("run.overlap_seconds",),
    "WINDOW_BUSY_MIN": ("run.window_minutes.default_busy",),
    "WINDOW_IDLE_MIN": ("run.window_minutes.default_idle",),
    # IDLE_* and IDLE_WINDOW_* are two spellings for the same idle-window paths.
    "IDLE_START": ("run.idle_window.start",),
    "IDLE_END": ("run.idle_window.end",),
    "IDLE_WINDOW_START": ("run.idle_window.start",),
    "IDLE_WINDOW_END": ("run.idle_window.end",),
    # Both names below target run.allow_empty_result_advance.
    "ALLOW_EMPTY_RESULT_ADVANCE": ("run.allow_empty_result_advance",),
    "ALLOW_EMPTY_ADVANCE": ("run.allow_empty_result_advance",),
    "SNAPSHOT_MISSING_DELETE": ("run.snapshot_missing_delete",),
    "SNAPSHOT_ALLOW_EMPTY_DELETE": ("run.snapshot_allow_empty_delete",),
    "WINDOW_START": ("run.window_override.start",),
    "WINDOW_END": ("run.window_override.end",),
    "WINDOW_SPLIT_UNIT": ("run.window_split.unit",),
    "WINDOW_SPLIT_DAYS": ("run.window_split.days",),
    "WINDOW_COMPENSATION_HOURS": ("run.window_split.compensation_hours",),
    "PIPELINE_FLOW": ("pipeline.flow",),
    "JSON_FETCH_ROOT": ("pipeline.fetch_root",),
    "JSON_SOURCE_DIR": ("pipeline.ingest_source_dir",),
    # FETCH_ROOT is the only variable that writes to two config paths.
    "FETCH_ROOT": ("pipeline.fetch_root", "io.fetch_root"),
    "INGEST_SOURCE_DIR": ("pipeline.ingest_source_dir",),
    "INTEGRITY_MODE": ("integrity.mode",),
    "INTEGRITY_HISTORY_START": ("integrity.history_start",),
    "INTEGRITY_HISTORY_END": ("integrity.history_end",),
    "INTEGRITY_INCLUDE_DIMENSIONS": ("integrity.include_dimensions",),
    "INTEGRITY_AUTO_CHECK": ("integrity.auto_check",),
    "INTEGRITY_AUTO_BACKFILL": ("integrity.auto_backfill",),
    "INTEGRITY_COMPARE_CONTENT": ("integrity.compare_content",),
    "INTEGRITY_CONTENT_SAMPLE_LIMIT": ("integrity.content_sample_limit",),
    "INTEGRITY_BACKFILL_MISMATCH": ("integrity.backfill_mismatch",),
    "INTEGRITY_RECHECK_AFTER_BACKFILL": ("integrity.recheck_after_backfill",),
    "INTEGRITY_ODS_TASK_CODES": ("integrity.ods_task_codes",),
    "VERIFY_SKIP_ODS_ON_FETCH": ("verification.skip_ods_when_fetch_before_verify",),
    "VERIFY_ODS_LOCAL_JSON": ("verification.ods_use_local_json",),
    "DWD_FACT_UPSERT": ("dwd.fact_upsert",),
    # DWS monthly / salary settings
    "DWS_MONTHLY_ALLOW_HISTORY": ("dws.monthly.allow_history",),
    "DWS_MONTHLY_PREV_GRACE_DAYS": ("dws.monthly.prev_month_grace_days",),
    "DWS_MONTHLY_HISTORY_MONTHS": ("dws.monthly.history_months",),
    "DWS_MONTHLY_NEW_HIRE_CAP_EFFECTIVE_FROM": ("dws.monthly.new_hire_cap_effective_from",),
    "DWS_MONTHLY_NEW_HIRE_CAP_DAY": ("dws.monthly.new_hire_cap_day",),
    "DWS_MONTHLY_NEW_HIRE_MAX_TIER_LEVEL": ("dws.monthly.new_hire_max_tier_level",),
    "DWS_SALARY_RUN_DAYS": ("dws.salary.run_days",),
    "DWS_SALARY_ALLOW_OUT_OF_CYCLE": ("dws.salary.allow_out_of_cycle",),
    "DWS_SALARY_ROOM_COURSE_PRICE": ("dws.salary.room_course_price",),
    # ODS offline replay settings
    "ODS_JSON_DOC_DIR": ("ods.json_doc_dir",),
    "ODS_INCLUDE_FILES": ("ods.include_files",),
    "ODS_DROP_SCHEMA_FIRST": ("ods.drop_schema_first",),
    # ── Added 2026-02-16: parameters defined in defaults.py that previously had no ENV mapping ──
    # Database session parameters
    "DB_SESSION_TIMEZONE": ("db.session.timezone",),
    "DB_STATEMENT_TIMEOUT_MS": ("db.session.statement_timeout_ms",),
    "DB_LOCK_TIMEOUT_MS": ("db.session.lock_timeout_ms",),
    "DB_IDLE_IN_TX_TIMEOUT_MS": ("db.session.idle_in_tx_timeout_ms",),
    # Cleaning settings
    "CLEAN_LOG_UNKNOWN_FIELDS": ("clean.log_unknown_fields",),
    "CLEAN_UNKNOWN_FIELDS_LIMIT": ("clean.unknown_fields_limit",),
    "CLEAN_HASH_ALGO": ("clean.hash_key.algo",),
    "CLEAN_HASH_SALT": ("clean.hash_key.salt",),
    "CLEAN_STRICT_NUMERIC": ("clean.strict_numeric",),
    "CLEAN_ROUND_MONEY_SCALE": ("clean.round_money_scale",),
    # Security settings
    "SECURITY_REDACT_IN_LOGS": ("security.redact_in_logs",),
    "SECURITY_REDACT_KEYS": ("security.redact_keys",),
    "SECURITY_ECHO_TOKEN_IN_LOGS": ("security.echo_token_in_logs",),
    # IO file size limit
    "MAX_FILE_BYTES": ("io.max_file_bytes",),
    # Integrity check: force monthly split
    "INTEGRITY_FORCE_MONTHLY_SPLIT": ("integrity.force_monthly_split",),
    # DWD fact-table UPSERT batching parameters
    "DWD_FACT_UPSERT_BATCH_SIZE": ("dwd.fact_upsert_batch_size",),
    "DWD_FACT_UPSERT_MIN_BATCH_SIZE": ("dwd.fact_upsert_min_batch_size",),
    "DWD_FACT_UPSERT_MAX_RETRIES": ("dwd.fact_upsert_max_retries",),
    "DWD_FACT_UPSERT_RETRY_BACKOFF": ("dwd.fact_upsert_retry_backoff_sec",),
    "DWD_FACT_UPSERT_LOCK_TIMEOUT_MS": ("dwd.fact_upsert_lock_timeout_ms",),
    # Run mode (set directly, not mapped through the legacy pipeline.flow config key;
    # the config key name is kept for compatibility with old configs)
    "DATA_SOURCE": ("run.data_source",),
    # Extra API request headers (JSON object format)
    "API_HEADERS_EXTRA": ("api.headers_extra",),
    # Pipeline rate-limiting parameters
    "PIPELINE_RATE_MIN": ("pipeline.rate_min",),
    "PIPELINE_RATE_MAX": ("pipeline.rate_max",),
}


def _deep_set(d, dotted_keys, value):
    cur = d
    for k in dotted_keys[:-1]:
        cur = cur.setdefault(k, {})
    cur[dotted_keys[-1]] = value


def _coerce_env(v: str):
    if v is None:
        return None
    s = v.strip()
    if s.lower() in ("true", "false"):
        return s.lower() == "true"
    try:
        if s.isdigit() or (s.startswith("-") and s[1:].isdigit()):
            return int(s)
    except Exception:
        pass
    if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
        try:
            return json.loads(s)
        except Exception:
            return s
    return s


def _strip_inline_comment(value: str) -> str:
    """去掉未被引号包裹的内联注释"""
    result = []
    in_quote = False
    quote_char = ""
    escape = False
    for ch in value:
        if escape:
            result.append(ch)
            escape = False
            continue
        if ch == "\\":
            escape = True
            result.append(ch)
            continue
        if ch in ("'", '"'):
            if not in_quote:
                in_quote = True
                quote_char = ch
            elif quote_char == ch:
                in_quote = False
                quote_char = ""
            result.append(ch)
            continue
        if ch == "#" and not in_quote:
            break
        result.append(ch)
    return "".join(result).rstrip()


def _unquote_value(value: str) -> str:
    """Normalise a raw dotenv value: drop comment, trailing commas and quotes.

    The value is stripped, its inline comment removed, and trailing commas
    (plus whitespace before them) trimmed. If what remains is wrapped in a
    matching pair of single or double quotes, or is written like a raw string
    literal (``r'...'`` / ``R"..."``), only the inner text is returned.
    Escape sequences inside the quotes are not interpreted.
    """
    quote_chars = ("'", '"')
    body = _strip_inline_comment(value.strip()).rstrip(",").rstrip()
    size = len(body)

    # Plain quoted form: '...' or "..."
    if size >= 2 and body[0] in quote_chars and body[0] == body[-1]:
        return body[1:-1]

    # Raw-string form: r'...' / R"..."
    is_raw_literal = (
        size >= 3
        and body[0] in ("r", "R")
        and body[1] in quote_chars
        and body[1] == body[-1]
    )
    return body[2:-1] if is_raw_literal else body


def _parse_dotenv_line(line: str) -> tuple[str, str] | None:
    """解析 .env 文件中的单行"""
    stripped = line.strip()
    if not stripped or stripped.startswith("#"):
        return None
    if stripped.startswith("export "):
        stripped = stripped[len("export ") :].strip()
    if "=" not in stripped:
        return None
    key, value = stripped.split("=", 1)
    key = key.strip()
    value = _unquote_value(value)
    return key, value


def _load_dotenv_values() -> dict:
    """从项目根目录读取 .env 文件键值"""
    if os.environ.get("ETL_SKIP_DOTENV") in ("1", "true", "TRUE", "True"):
        return {}
    root = Path(__file__).resolve().parents[1]
    dotenv_path = root / ".env"
    if not dotenv_path.exists():
        return {}
    values: dict[str, str] = {}
    for line in dotenv_path.read_text(encoding="utf-8", errors="ignore").splitlines():
        parsed = _parse_dotenv_line(line)
        if parsed:
            key, value = parsed
            values[key] = value
    return values


# Config paths holding opaque secrets. Their values must stay strings exactly
# as supplied: generic coercion would turn "007" into 7, "true" into True and
# "[1]" into a list, corrupting the credential.
_RAW_STRING_PATHS = frozenset({"db.password", "api.token", "clean.hash_key.salt"})

# Config paths whose string values are comma-separated task lists.
_TASK_LIST_PATHS = frozenset({"run.tasks", "run.dws_tasks", "run.index_tasks"})


def _apply_env_values(cfg: dict, source: dict):
    """Overlay values from ``source`` onto ``cfg`` according to ``ENV_MAP``.

    Args:
        cfg: Nested config dict, modified in place.
        source: Mapping of environment variable name to raw string value
            (a parsed ``.env`` dict or ``os.environ``). Names that are absent
            or map to ``None`` are skipped.

    For every mapped variable the raw value is type-coerced with
    ``_coerce_env`` and written to each of its dotted config paths, with two
    exceptions:

    * secret paths (password, token, hash salt) receive the raw string with
      only surrounding whitespace removed, never a coerced int/bool/JSON value;
    * task-list paths receive a list: a coerced string is split on commas and
      empty items are dropped, while non-string results (e.g. a JSON array)
      are stored unchanged.
    """
    for env_key, dotted in ENV_MAP.items():
        raw = source.get(env_key)
        if raw is None:
            continue
        coerced = _coerce_env(raw)
        for path in dotted:
            # Decide the value per path so one path's special handling never
            # leaks into another path fed by the same variable.
            if path in _RAW_STRING_PATHS:
                value = raw.strip() if isinstance(raw, str) else raw
            elif path in _TASK_LIST_PATHS and isinstance(coerced, str):
                value = [item.strip() for item in coerced.split(",") if item.strip()]
            else:
                value = coerced
            _deep_set(cfg, path.split("."), value)

def load_env_overrides(defaults: dict) -> dict:
    """Return a deep copy of ``defaults`` with environment overrides applied.

    Two layers are overlaid in order: values from the ``.env`` file first,
    then the real process environment, so a real environment variable beats
    the same variable in ``.env``. ``defaults`` itself is not modified.
    """
    merged = deepcopy(defaults)
    # .env goes first and the real environment second; command-line overrides
    # are expected to be applied later by the caller so they keep top priority.
    for layer in (_load_dotenv_values(), os.environ):
        _apply_env_values(merged, layer)
    return merged