# -*- coding: utf-8 -*-
"""CLI main entry point.

Three execution modes are supported:

1. Legacy mode: run an explicit task list directly.
2. Flow mode: ``--flow`` selects a predefined flow type and runs a
   multi-layer ETL.
3. Layers mode: ``--layers`` freely combines ETL layers (ODS,DWD,DWS,INDEX).

``--flow`` and ``--layers`` are mutually exclusive and must not be used
together. ``--pipeline`` is a deprecated alias of ``--flow`` and emits a
DeprecationWarning when used.

Processing modes (``--processing-mode``):

- increment_only: incremental only — process incremental data and stop.
- verify_only: verify and repair — skip increments, verify data consistency
  and backfill automatically.
  - Optional ``--fetch-before-verify``: fetch from the API before verifying.
- increment_verify: increment + verify and repair — incremental processing
  first, then verify and backfill.

Examples:
    # Legacy task mode
    python -m cli.main --tasks ODS_MEMBER,ODS_ORDER

    # Flow mode (increment only)
    python -m cli.main --flow api_full --processing-mode increment_only

    # Flow mode (verify and repair, skipping increments)
    python -m cli.main --flow api_full --processing-mode verify_only

    # Flow mode (verify and repair, fetching from the API first)
    python -m cli.main --flow api_full --processing-mode verify_only --fetch-before-verify

    # Flow mode (increment + verify and repair)
    python -m cli.main --flow api_full --processing-mode increment_verify

    # Flow mode with an explicit time window
    python -m cli.main --flow api_ods_dwd --window-start "2026-02-01" --window-end "2026-02-02"

    # --layers mode: free combination of layers
    python -m cli.main --layers ODS,DWD --store-id 1

    # Run only the DWS + INDEX layers
    python -m cli.main --layers DWS,INDEX
"""
import sys
import argparse
import logging
from datetime import datetime
from pathlib import Path

from config.settings import AppConfig
from orchestration.scheduler import ETLScheduler  # kept; task 9 handles the thin wrapper layer

# New-architecture dependencies
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
from orchestration.cursor_manager import CursorManager
from orchestration.run_tracker import RunTracker
from orchestration.task_registry import default_registry
from orchestration.task_executor import TaskExecutor
from orchestration.flow_runner import FlowRunner
from api.client import APIClient

# Flow options
FLOW_CHOICES = [
    "api_ods",        # API → ODS
    "api_ods_dwd",    # API → ODS → DWD
    "api_full",       # API → ODS → DWD → DWS aggregates → DWS indexes
    "ods_dwd",        # ODS → DWD
    "dwd_dws",        # DWD → DWS aggregates
    "dwd_dws_index",  # DWD → DWS aggregates → DWS indexes
    "dwd_index",      # DWD → DWS indexes
]

# Processing-mode options
PROCESSING_MODE_CHOICES = [
    "increment_only",    # incremental only
    "verify_only",       # verify and repair (skip increments)
    "increment_verify",  # increment + verify and repair
]

# Time-window split options
WINDOW_SPLIT_CHOICES = ["none", "day", "week", "month"]


def setup_logging():
    """Configure root logging and return the application logger.

    Uses the project's unified log format when ``utils.logging_utils`` is
    importable; falls back to an equivalent inline format otherwise.
    """
    try:
        from utils.logging_utils import UNIFIED_FORMAT, DATE_FORMAT
        fmt = UNIFIED_FORMAT
        datefmt = DATE_FORMAT
    except ImportError:
        fmt = "[%(asctime)s] %(levelname)-5s | %(name)s | %(message)s"
        datefmt = "%Y-%m-%d %H:%M:%S"
    logging.basicConfig(level=logging.INFO, format=fmt, datefmt=datefmt)
    return logging.getLogger("etl_billiards")


# Valid layer names for --layers
VALID_LAYERS = {"ODS", "DWD", "DWS", "INDEX"}


def _split_csv(raw: str, transform) -> list[str]:
    """Split a comma-separated string, dropping blanks.

    Args:
        raw: Comma-separated string, e.g. "a, b,,c".
        transform: Callable applied to each stripped item (e.g. ``str.upper``).

    Returns:
        List of transformed, non-empty items.
    """
    return [transform(item.strip()) for item in raw.split(",") if item.strip()]


def parse_layers(raw: str) -> list[str]:
    """Parse a comma-separated layer string and validate it.

    Args:
        raw: Comma-separated layer names, e.g. "ODS,DWD".

    Returns:
        Upper-cased layer names, e.g. ["ODS", "DWD"].

    Raises:
        ValueError: If the string is empty or contains an invalid layer name.
    """
    layers = _split_csv(raw, str.upper)
    if not layers:
        raise ValueError("--layers 不能为空")
    invalid = set(layers) - VALID_LAYERS
    if invalid:
        raise ValueError(f"无效的层名: {invalid},合法值: {sorted(VALID_LAYERS)}")
    return layers


def parse_args():
    """Build the argument parser and parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description="台球场ETL系统",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
  # 传统任务模式
  python -m cli.main --tasks ODS_MEMBER,ODS_ORDER --store-id 1

  # Flow 模式(仅增量)
  python -m cli.main --flow api_ods_dwd --processing-mode increment_only

  # Flow 模式(校验并修复,跳过增量)
  python -m cli.main --flow api_full --processing-mode verify_only

  # Flow 模式(校验并修复,先从 API 获取数据)
  python -m cli.main --flow api_full --processing-mode verify_only --fetch-before-verify

  # Flow 模式(增量+校验并修复)
  python -m cli.main --flow api_full --processing-mode increment_verify

  # 指定时间窗口
  python -m cli.main --flow api_ods --window-start "2026-02-01" --window-end "2026-02-02"

  # --layers 模式:自由组合 ETL 层(与 --flow 互斥)
  python -m cli.main --layers ODS,DWD --store-id 1

  # 仅执行 DWS + INDEX 层
  python -m cli.main --layers DWS,INDEX

  # --pipeline 仍可用(已弃用,建议迁移到 --flow)
  python -m cli.main --pipeline api_full
""",
    )

    # Basic parameters
    parser.add_argument("--store-id", type=int, help="门店ID")
    parser.add_argument("--tasks", help="任务列表,逗号分隔(传统模式)")
    parser.add_argument("--dry-run", action="store_true", help="试运行(不提交)")

    # Flow parameters
    parser.add_argument(
        "--flow",
        choices=FLOW_CHOICES,
        help="Flow 类型(与 --layers 互斥):api_ods, api_ods_dwd, api_full, ods_dwd, dwd_dws, dwd_dws_index, dwd_index",
    )
    # --pipeline is a deprecated alias; mapped to a separate dest so its use can be detected
    parser.add_argument(
        "--pipeline",
        choices=FLOW_CHOICES,
        dest="pipeline_deprecated",
        help="[已弃用] 请使用 --flow。功能与 --flow 相同,使用时输出 DeprecationWarning",
    )
    parser.add_argument(
        "--layers",
        help="ETL 层自由组合,逗号分隔(ODS,DWD,DWS,INDEX),与 --flow/--pipeline 互斥",
    )
    parser.add_argument(
        "--processing-mode",
        dest="processing_mode",
        choices=PROCESSING_MODE_CHOICES,
        default="increment_only",
        help="处理模式:increment_only(仅增量)/ verify_only(校验并修复)/ increment_verify(增量+校验并修复)",
    )
    parser.add_argument(
        "--fetch-before-verify",
        dest="fetch_before_verify",
        action="store_true",
        help="校验前先从 API 获取数据(仅在 verify_only 模式下有效)",
    )
    parser.add_argument(
        "--verify-tables",
        dest="verify_tables",
        help="仅校验指定表(逗号分隔),用于单表验证",
    )
    parser.add_argument(
        "--window-split",
        dest="window_split",
        choices=WINDOW_SPLIT_CHOICES,
        default="none",
        help="时间窗口切分:none(不切分)/ day / week / month",
    )
    parser.add_argument(
        "--lookback-hours",
        dest="lookback_hours",
        type=int,
        default=24,
        help="回溯小时数(默认24小时)",
    )
    parser.add_argument(
        "--overlap-seconds",
        dest="overlap_seconds",
        type=int,
        default=3600,
        help="冗余秒数(默认3600秒=1小时)",
    )

    # Database parameters
    parser.add_argument("--pg-dsn", help="PostgreSQL DSN")
    parser.add_argument("--pg-host", help="PostgreSQL主机")
    parser.add_argument("--pg-port", type=int, help="PostgreSQL端口")
    parser.add_argument("--pg-name", help="PostgreSQL数据库名")
    parser.add_argument("--pg-user", help="PostgreSQL用户名")
    parser.add_argument("--pg-password", help="PostgreSQL密码")

    # API parameters
    parser.add_argument("--api-base", help="API基础URL")
    parser.add_argument("--api-token", "--token", dest="api_token", help="API令牌(Bearer Token)")
    parser.add_argument("--api-timeout", type=int, help="API超时(秒)")
    parser.add_argument("--api-page-size", type=int, help="分页大小")
    parser.add_argument("--api-retry-max", type=int, help="API重试最大次数")

    # Backfill / manual window
    parser.add_argument(
        "--window-start",
        dest="window_start",
        help="固定时间窗口开始(优先级高于游标,例如:2025-07-01 00:00:00)",
    )
    parser.add_argument(
        "--window-end",
        dest="window_end",
        help="固定时间窗口结束(优先级高于游标,推荐用月末+1,例如:2025-08-01 00:00:00)",
    )
    parser.add_argument(
        "--force-window-override",
        action="store_true",
        help="强制使用 window_start/window_end,不走 MAX(fetched_at) 兜底",
    )
    parser.add_argument(
        "--window-split-unit",
        dest="window_split_unit",
        help="窗口切分单位(day/week/month/none),默认来自配置 run.window_split.unit",
    )
    parser.add_argument(
        "--window-split-days",
        dest="window_split_days",
        type=int,
        choices=[1, 10, 30],
        help="按天切分的天数(1/10/30),默认来自配置 run.window_split.days",
    )
    parser.add_argument(
        "--window-compensation-hours",
        dest="window_compensation_hours",
        type=int,
        help="窗口前后补偿小时数,默认来自配置 run.window_split.compensation_hours",
    )

    # Directory parameters
    parser.add_argument("--export-root", help="导出根目录")
    parser.add_argument("--log-root", help="日志根目录")

    # Data-source mode (new parameter, replaces the old --pipeline-flow)
    parser.add_argument(
        "--data-source",
        dest="data_source",
        choices=["online", "offline", "hybrid"],
        default=None,
        help="数据源模式:online(仅在线抓取)/ offline(仅本地入库)/ hybrid(抓取+入库)",
    )
    # Data-source flow (--pipeline-flow is deprecated; use --data-source)
    parser.add_argument("--pipeline-flow", choices=["FULL", "FETCH_ONLY", "INGEST_ONLY"], help="[已弃用] 请使用 --data-source")
    parser.add_argument("--fetch-root", help="抓取JSON输出根目录")
    parser.add_argument("--ingest-source", help="本地清洗入库源目录")
    parser.add_argument("--write-pretty-json", action="store_true", help="抓取JSON美化输出")

    # Run window
    parser.add_argument("--idle-start", help="闲时窗口开始(HH:MM)")
    parser.add_argument("--idle-end", help="闲时窗口结束(HH:MM)")
    parser.add_argument("--allow-empty-advance", action="store_true", help="允许空结果推进窗口")

    # Force full update (skip ODS hash dedup + DWD change diff, write unconditionally)
    parser.add_argument(
        "--force-full",
        dest="force_full",
        action="store_true",
        help="强制全量处理:跳过 ODS hash 去重和 DWD 变更对比,无条件写入",
    )

    return parser.parse_args()


def resolve_data_source(args) -> str:
    """Resolve the data_source value, mapping the deprecated --pipeline-flow.

    Priority: --data-source > --pipeline-flow (deprecated alias) > default
    value "hybrid".
    """
    _FLOW_TO_DATA_SOURCE = {
        "FULL": "hybrid",
        "FETCH_ONLY": "online",
        "INGEST_ONLY": "offline",
    }
    if args.data_source:
        return args.data_source
    if args.pipeline_flow:
        import warnings
        mapped = _FLOW_TO_DATA_SOURCE.get(args.pipeline_flow.upper(), "hybrid")
        warnings.warn(
            f"--pipeline-flow 已弃用,请使用 --data-source {mapped}",
            DeprecationWarning,
            stacklevel=2,
        )
        return mapped
    return "hybrid"  # default


def build_cli_overrides(args) -> dict:
    """Build configuration overrides from command-line arguments.

    Only explicitly supplied options produce override entries, so config-file
    defaults survive when a flag is omitted.
    """
    overrides = {}

    # Basic information
    if args.store_id is not None:
        overrides.setdefault("app", {})["store_id"] = args.store_id

    # Database
    if args.pg_dsn:
        overrides.setdefault("db", {})["dsn"] = args.pg_dsn
    if args.pg_host:
        overrides.setdefault("db", {})["host"] = args.pg_host
    if args.pg_port:
        overrides.setdefault("db", {})["port"] = args.pg_port
    if args.pg_name:
        overrides.setdefault("db", {})["name"] = args.pg_name
    if args.pg_user:
        overrides.setdefault("db", {})["user"] = args.pg_user
    if args.pg_password:
        overrides.setdefault("db", {})["password"] = args.pg_password

    # API
    if args.api_base:
        overrides.setdefault("api", {})["base_url"] = args.api_base
    if args.api_token:
        overrides.setdefault("api", {})["token"] = args.api_token
    if args.api_timeout:
        overrides.setdefault("api", {})["timeout_sec"] = args.api_timeout
    if args.api_page_size:
        overrides.setdefault("api", {})["page_size"] = args.api_page_size
    if args.api_retry_max:
        overrides.setdefault("api", {}).setdefault("retries", {})["max_attempts"] = args.api_retry_max

    # Directories
    if args.export_root:
        overrides.setdefault("io", {})["export_root"] = args.export_root
    if args.log_root:
        overrides.setdefault("io", {})["log_root"] = args.log_root

    # Data-source flow (old --pipeline-flow kept for backward compat; config key unchanged)
    if args.pipeline_flow:
        overrides.setdefault("pipeline", {})["flow"] = args.pipeline_flow.upper()
    # Data-source mode (new parameter)
    data_source = resolve_data_source(args)
    overrides.setdefault("run", {})["data_source"] = data_source
    if args.fetch_root:
        overrides.setdefault("pipeline", {})["fetch_root"] = args.fetch_root
    if args.ingest_source:
        overrides.setdefault("pipeline", {})["ingest_source_dir"] = args.ingest_source
    if args.write_pretty_json:
        overrides.setdefault("io", {})["write_pretty_json"] = True

    # Backfill / manual window
    if args.window_start or args.window_end:
        overrides.setdefault("run", {}).setdefault("window_override", {})
        if args.window_start:
            overrides["run"]["window_override"]["start"] = args.window_start
        if args.window_end:
            overrides["run"]["window_override"]["end"] = args.window_end
    if args.force_window_override:
        overrides.setdefault("run", {})["force_window_override"] = True
    if args.window_split_unit:
        overrides.setdefault("run", {}).setdefault("window_split", {})["unit"] = args.window_split_unit
    if args.window_split_days is not None:
        overrides.setdefault("run", {}).setdefault("window_split", {})["days"] = args.window_split_days
    if args.window_compensation_hours is not None:
        overrides.setdefault("run", {}).setdefault("window_split", {})[
            "compensation_hours"
        ] = args.window_compensation_hours

    # Run window
    if args.idle_start:
        overrides.setdefault("run", {}).setdefault("idle_window", {})["start"] = args.idle_start
    if args.idle_end:
        overrides.setdefault("run", {}).setdefault("idle_window", {})["end"] = args.idle_end
    if args.allow_empty_advance:
        overrides.setdefault("run", {})["allow_empty_result_advance"] = True

    # Force full update
    if args.force_full:
        overrides.setdefault("run", {})["force_full_update"] = True

    # Tasks
    if args.tasks:
        overrides.setdefault("run", {})["tasks"] = _split_csv(args.tasks, str.upper)

    return overrides


def parse_datetime(s: str) -> "datetime | None":
    """Parse a date/time string in one of several accepted formats.

    Args:
        s: Input string; falsy input yields None.

    Returns:
        The parsed naive datetime, or None when *s* is empty/None.

    Raises:
        ValueError: If *s* matches none of the accepted formats.
    """
    if not s:
        return None
    formats = [
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
        "%Y-%m-%d",
        "%Y/%m/%d %H:%M:%S",
        "%Y/%m/%d",
    ]
    for fmt in formats:
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue
    raise ValueError(f"无法解析日期时间: {s}")


def _resolve_window(args, config, logger):
    """Resolve the effective time window for --flow / --layers execution.

    Explicit --window-start/--window-end take precedence. When neither is
    given, a lookback window ending "now" (in the configured timezone) is
    used and also written into run.window_override so ODS tasks use the
    same window.

    Returns:
        Tuple (window_start, window_end); either may be None when only one
        bound was supplied explicitly.
    """
    window_start = parse_datetime(args.window_start) if args.window_start else None
    window_end = parse_datetime(args.window_end) if args.window_end else None
    if window_start is None and window_end is None:
        from datetime import timedelta
        from zoneinfo import ZoneInfo

        tz = ZoneInfo(config.get("app.timezone", "Asia/Shanghai"))
        window_end = datetime.now(tz)
        window_start = window_end - timedelta(hours=args.lookback_hours)
        logger.info("使用回溯时间窗口: %s ~ %s", window_start, window_end)
        # Record the lookback window as window_override so ODS tasks use it
        config.config.setdefault("run", {}).setdefault("window_override", {})
        config.config["run"]["window_override"]["start"] = window_start
        config.config["run"]["window_override"]["end"] = window_end
    return window_start, window_end


def _run_flow_mode(args, config, executor, registry, db_conn, api_client,
                   logger, data_source, *, pipeline, layers):
    """Shared driver for the --flow and --layers execution modes.

    Resolves the time window and CSV filters, assembles a FlowRunner, and
    runs it. ``pipeline`` is the flow name (--flow mode) or None; ``layers``
    is the validated layer list (--layers mode) or None.

    Returns:
        The result dict from FlowRunner.run().
    """
    window_start, window_end = _resolve_window(args, config, logger)
    # Task filter
    task_codes = _split_csv(args.tasks, str.upper) if args.tasks else None
    # Verify-table filter
    verify_tables = _split_csv(args.verify_tables, str.lower) if args.verify_tables else None

    runner = FlowRunner(
        config,
        executor,
        registry,
        db_conn,
        api_client,
        logger,
    )
    run_kwargs = dict(
        pipeline=pipeline,
        processing_mode=args.processing_mode,
        data_source=data_source,
        window_start=window_start,
        window_end=window_end,
        window_split=args.window_split if args.window_split != "none" else None,
        task_codes=task_codes,
        fetch_before_verify=args.fetch_before_verify,
        verify_tables=verify_tables,
    )
    # Only pass `layers` in --layers mode, matching the original call shapes.
    if layers is not None:
        run_kwargs["layers"] = layers
    return runner.run(**run_kwargs)


def main():
    """Program entry point.

    The resource lifecycle (DB connection, API client) is owned by the CLI
    layer via try/finally; TaskExecutor / FlowRunner receive the created
    resources through dependency injection.

    Returns:
        Process exit code: 0 on success, 1 on failure. Exits with code 2
        on mutually-exclusive flag misuse.
    """
    logger = setup_logging()
    args = parse_args()

    # Deprecated --pipeline alias: merge into args.flow (name kept for old callers)
    if args.pipeline_deprecated:
        import warnings
        warnings.warn(
            "--pipeline 参数已弃用,请使用 --flow",
            DeprecationWarning,
            stacklevel=2,
        )
        if args.flow:
            print("错误: --pipeline 和 --flow 不能同时指定", file=sys.stderr)
            sys.exit(2)
        args.flow = args.pipeline_deprecated

    # --layers and --flow are mutually exclusive
    if getattr(args, "layers", None) and args.flow:
        print("错误: --layers 和 --flow 互斥,请只指定其中一个", file=sys.stderr)
        sys.exit(2)

    try:
        # Load configuration
        cli_overrides = build_cli_overrides(args)
        config = AppConfig.load(cli_overrides)
        logger.info("配置加载完成")
        logger.info("门店ID: %s", config.get('app.store_id'))

        # ── Create resources ─────────────────────────────────
        db_conn = DatabaseConnection(
            dsn=config["db"]["dsn"],
            session=config["db"].get("session"),
            connect_timeout=config["db"].get("connect_timeout_sec"),
        )
        api_client = APIClient(
            base_url=config["api"]["base_url"],
            token=config["api"]["token"],
            timeout=config["api"].get("timeout_sec", 20),
            retry_max=config["api"].get("retries", {}).get("max_attempts", 3),
            headers_extra=config["api"].get("headers_extra"),
        )
        try:
            # ── Assemble dependencies ────────────────────────
            db_ops = DatabaseOperations(db_conn)
            cursor_mgr = CursorManager(db_conn)
            run_tracker = RunTracker(db_conn)
            registry = default_registry
            executor = TaskExecutor(
                config,
                db_ops,
                api_client,
                cursor_mgr,
                run_tracker,
                registry,
                logger,
            )
            data_source = resolve_data_source(args)

            # ── Dispatch by execution mode ───────────────────
            if args.flow:
                # Flow mode
                logger.info("执行模式: Flow 模式")
                logger.info("Flow 类型: %s", args.flow)
                logger.info("处理模式: %s", args.processing_mode)
                result = _run_flow_mode(
                    args, config, executor, registry, db_conn, api_client,
                    logger, data_source, pipeline=args.flow, layers=None,
                )
                logger.info("Flow 执行完成: %s", result.get("status"))
            elif getattr(args, "layers", None):
                # --layers mode: free combination of layers
                layers = parse_layers(args.layers)
                logger.info("执行模式: --layers 模式")
                logger.info("层列表: %s", layers)
                logger.info("处理模式: %s", args.processing_mode)
                result = _run_flow_mode(
                    args, config, executor, registry, db_conn, api_client,
                    logger, data_source, pipeline=None, layers=layers,
                )
                logger.info("--layers 执行完成: %s", result.get("status"))
            else:
                # Legacy mode
                logger.info("执行模式: 传统模式")
                task_codes = config.get("run.tasks")
                logger.info("任务列表: %s", task_codes)
                executor.run_tasks(task_codes, data_source=data_source)
        finally:
            # Guarantee resource release (requirements 6.1, 6.4)
            db_conn.close()

        logger.info("ETL运行完成")
        return 0
    except Exception as e:
        logger.error("ETL运行失败: %s", e, exc_info=True)
        return 1


if __name__ == "__main__":
    sys.exit(main())