初始提交:飞球 ETL 系统全量代码

This commit is contained in:
Neo
2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions

0
config/__init__.py Normal file
View File

177
config/defaults.py Normal file
View File

@@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
"""Default configuration values.

Lowest-priority layer: overridden by .env / process environment variables
(see config.env_parser) and finally by CLI overrides (see config.settings).
"""
DEFAULTS = {
    # Application identity and database schema names.
    "app": {
        "timezone": "Asia/Shanghai",
        "store_id": "",  # required; coerced to int and validated at load time
        "schema_oltp": "billiards",
        "schema_etl": "etl_admin",
    },
    # PostgreSQL connection; "dsn" is assembled from the parts below when empty.
    "db": {
        "dsn": "",
        "host": "",
        "port": "",
        "name": "",
        "user": "",
        "password": "",
        "connect_timeout_sec": 20,  # clamped to the 1-20s range during normalization
        "batch_size": 1000,
        # Per-session parameters (timeout values in milliseconds).
        "session": {
            "timezone": "Asia/Shanghai",
            "statement_timeout_ms": 30000,
            "lock_timeout_ms": 5000,
            "idle_in_tx_timeout_ms": 600000,
        },
    },
    # Upstream HTTP API access.
    "api": {
        "base_url": "https://pc.ficoo.vip/apiprod/admin/v1",
        "token": None,  # supplied via the API_TOKEN / FICOO_TOKEN env vars
        "timeout_sec": 20,
        "page_size": 200,
        "params": {},
        # Retry policy: backoff_sec holds one delay per retry attempt.
        "retries": {
            "max_attempts": 3,
            "backoff_sec": [1, 2, 4],
        },
        "headers_extra": {},
    },
    # Run orchestration: task selection and time-window slicing.
    "run": {
        "data_source": "hybrid",  # hybrid | online | offline (see pipeline.flow mapping)
        # Default ODS task set; env RUN_TASKS (comma-separated) overrides.
        "tasks": [
            "PRODUCTS",
            "TABLES",
            "MEMBERS",
            "ASSISTANTS",
            "PACKAGES_DEF",
            "ORDERS",
            "PAYMENTS",
            "REFUNDS",
            "COUPON_USAGE",
            "INVENTORY_CHANGE",
            "TOPUPS",
            "TABLE_DISCOUNT",
            "ASSISTANT_ABOLISH",
            "LEDGER",
        ],
        "dws_tasks": [],
        "index_tasks": [],
        "index_lookback_days": 60,
        # Window lengths (minutes) for busy vs idle periods — see idle_window below.
        "window_minutes": {
            "default_busy": 30,
            "default_idle": 180,
        },
        "overlap_seconds": 600,
        "snapshot_missing_delete": True,
        "snapshot_allow_empty_delete": False,
        "window_split": {
            "unit": "day",
            "days": 10,
            "compensation_hours": 2,
        },
        # Daily low-traffic period (local HH:MM) — presumably drives the
        # busy/idle window choice above; confirm against the scheduler code.
        "idle_window": {
            "start": "04:00",
            "end": "16:00",
        },
        "allow_empty_result_advance": True,
    },
    # Filesystem layout for exported JSON, logs and ingest reports.
    "io": {
        "export_root": "export/JSON",
        "log_root": "export/LOG",
        "fetch_root": "export/JSON",
        "ingest_source_dir": "",
        "manifest_name": "manifest.json",
        "ingest_report_name": "ingest_report.json",
        "write_pretty_json": True,
        "max_file_bytes": 50 * 1024 * 1024,
    },
    # Deprecated section kept for compatibility; mapped onto run/io at load time.
    "pipeline": {
        # Run flow: FETCH_ONLY (online fetch to disk only), INGEST_ONLY (clean
        # and load local JSON), FULL (fetch + clean + load).
        "flow": "FULL",
        # Root directory for fetched JSON (subdirectories are created per task,
        # run_id and timestamp automatically).
        "fetch_root": "export/JSON",
        # JSON input directory for local clean/load (empty = use this run's fetch dir).
        "ingest_source_dir": "",
    },
    # Cleaning/normalization behavior.
    "clean": {
        "log_unknown_fields": True,
        "unknown_fields_limit": 50,
        "hash_key": {
            "algo": "sha1",
            "salt": "",
        },
        "strict_numeric": True,
        "round_money_scale": 2,
    },
    # Log redaction of sensitive values.
    "security": {
        "redact_in_logs": True,
        "redact_keys": ["token", "password", "Authorization"],
        "echo_token_in_logs": False,
    },
    "ods": {
        # ODS offline rebuild/replay settings (development/operations use only).
        "json_doc_dir": "export/test-json-doc",
        "include_files": "",
        "drop_schema_first": True,
    },
    # Data-integrity check / backfill behavior.
    "integrity": {
        "mode": "history",
        "history_start": "2025-07-01",
        "history_end": "",
        "include_dimensions": True,
        "auto_check": False,
        "auto_backfill": False,
        "compare_content": True,
        "content_sample_limit": 50,
        "backfill_mismatch": True,
        "recheck_after_backfill": True,
        "ods_task_codes": "",
        "force_monthly_split": True,
    },
    "verification": {
        "skip_ods_when_fetch_before_verify": True,
        "ods_use_local_json": True,
    },
    # DWS layer: monthly aggregation and salary calculation knobs.
    "dws": {
        "monthly": {
            "allow_history": False,
            "prev_month_grace_days": 5,
            "history_months": 0,
            "new_hire_cap_effective_from": "2026-03-01",
            "new_hire_cap_day": 25,
            "new_hire_max_tier_level": 2,
        },
        "salary": {
            "run_days": 5,
            "allow_out_of_cycle": False,
            "room_course_price": 138,
        },
    },
    # DWD layer: fact-table UPSERT behavior.
    "dwd": {
        "fact_upsert": True,
        # Batch parameters for fact-table backfill UPSERT (tune per lock contention).
        "fact_upsert_batch_size": 1000,
        "fact_upsert_min_batch_size": 100,
        "fact_upsert_max_retries": 2,
        "fact_upsert_retry_backoff_sec": [1, 2, 4],
        # Lock-wait timeout for fact-table backfill only; None means inherit
        # db.session.lock_timeout_ms.
        "fact_upsert_lock_timeout_ms": None,
    },
}
# Task code constants — string values match the entries of DEFAULTS["run"]["tasks"].
TASK_ORDERS = "ORDERS"
TASK_PAYMENTS = "PAYMENTS"
TASK_REFUNDS = "REFUNDS"
TASK_INVENTORY_CHANGE = "INVENTORY_CHANGE"
TASK_COUPON_USAGE = "COUPON_USAGE"
TASK_MEMBERS = "MEMBERS"
TASK_ASSISTANTS = "ASSISTANTS"
TASK_PRODUCTS = "PRODUCTS"
TASK_TABLES = "TABLES"
TASK_PACKAGES_DEF = "PACKAGES_DEF"
TASK_TOPUPS = "TOPUPS"
TASK_TABLE_DISCOUNT = "TABLE_DISCOUNT"
TASK_ASSISTANT_ABOLISH = "ASSISTANT_ABOLISH"
TASK_LEDGER = "LEDGER"

213
config/env_parser.py Normal file
View File

@@ -0,0 +1,213 @@
# -*- coding: utf-8 -*-
"""Environment-variable parsing: .env loading and ENV → config-path mapping."""
import os
import json
from pathlib import Path
from copy import deepcopy
# Maps an environment-variable name to the tuple of dotted config paths it
# overrides. Several variables are aliases for the same path (for example
# API_TOKEN / FICOO_TOKEN, IDLE_START / IDLE_WINDOW_START); later entries in
# the source mapping simply write the same path again.
ENV_MAP = {
    "TIMEZONE": ("app.timezone",),
    "STORE_ID": ("app.store_id",),
    "SCHEMA_OLTP": ("app.schema_oltp",),
    "SCHEMA_ETL": ("app.schema_etl",),
    "PG_DSN": ("db.dsn",),
    "PG_HOST": ("db.host",),
    "PG_PORT": ("db.port",),
    "PG_NAME": ("db.name",),
    "PG_USER": ("db.user",),
    "PG_PASSWORD": ("db.password",),
    "PG_CONNECT_TIMEOUT": ("db.connect_timeout_sec",),
    "API_BASE": ("api.base_url",),
    "API_TOKEN": ("api.token",),
    "FICOO_TOKEN": ("api.token",),
    "API_TIMEOUT": ("api.timeout_sec",),
    "API_PAGE_SIZE": ("api.page_size",),
    "API_RETRY_MAX": ("api.retries.max_attempts",),
    "API_RETRY_BACKOFF": ("api.retries.backoff_sec",),
    "API_PARAMS": ("api.params",),
    "EXPORT_ROOT": ("io.export_root",),
    "LOG_ROOT": ("io.log_root",),
    "MANIFEST_NAME": ("io.manifest_name",),
    "INGEST_REPORT_NAME": ("io.ingest_report_name",),
    "WRITE_PRETTY_JSON": ("io.write_pretty_json",),
    "RUN_TASKS": ("run.tasks",),
    "RUN_DWS_TASKS": ("run.dws_tasks",),
    "RUN_INDEX_TASKS": ("run.index_tasks",),
    "INDEX_LOOKBACK_DAYS": ("run.index_lookback_days",),
    "OVERLAP_SECONDS": ("run.overlap_seconds",),
    "WINDOW_BUSY_MIN": ("run.window_minutes.default_busy",),
    "WINDOW_IDLE_MIN": ("run.window_minutes.default_idle",),
    "IDLE_START": ("run.idle_window.start",),
    "IDLE_END": ("run.idle_window.end",),
    "IDLE_WINDOW_START": ("run.idle_window.start",),
    "IDLE_WINDOW_END": ("run.idle_window.end",),
    "ALLOW_EMPTY_RESULT_ADVANCE": ("run.allow_empty_result_advance",),
    "ALLOW_EMPTY_ADVANCE": ("run.allow_empty_result_advance",),
    "SNAPSHOT_MISSING_DELETE": ("run.snapshot_missing_delete",),
    "SNAPSHOT_ALLOW_EMPTY_DELETE": ("run.snapshot_allow_empty_delete",),
    # run.window_override has no entry in DEFAULTS; _deep_set creates it on demand.
    "WINDOW_START": ("run.window_override.start",),
    "WINDOW_END": ("run.window_override.end",),
    "WINDOW_SPLIT_UNIT": ("run.window_split.unit",),
    "WINDOW_SPLIT_DAYS": ("run.window_split.days",),
    "WINDOW_COMPENSATION_HOURS": ("run.window_split.compensation_hours",),
    "PIPELINE_FLOW": ("pipeline.flow",),
    "JSON_FETCH_ROOT": ("pipeline.fetch_root",),
    "JSON_SOURCE_DIR": ("pipeline.ingest_source_dir",),
    "FETCH_ROOT": ("pipeline.fetch_root",),
    "INGEST_SOURCE_DIR": ("pipeline.ingest_source_dir",),
    "INTEGRITY_MODE": ("integrity.mode",),
    "INTEGRITY_HISTORY_START": ("integrity.history_start",),
    "INTEGRITY_HISTORY_END": ("integrity.history_end",),
    "INTEGRITY_INCLUDE_DIMENSIONS": ("integrity.include_dimensions",),
    "INTEGRITY_AUTO_CHECK": ("integrity.auto_check",),
    "INTEGRITY_AUTO_BACKFILL": ("integrity.auto_backfill",),
    "INTEGRITY_COMPARE_CONTENT": ("integrity.compare_content",),
    "INTEGRITY_CONTENT_SAMPLE_LIMIT": ("integrity.content_sample_limit",),
    "INTEGRITY_BACKFILL_MISMATCH": ("integrity.backfill_mismatch",),
    "INTEGRITY_RECHECK_AFTER_BACKFILL": ("integrity.recheck_after_backfill",),
    "INTEGRITY_ODS_TASK_CODES": ("integrity.ods_task_codes",),
    "VERIFY_SKIP_ODS_ON_FETCH": ("verification.skip_ods_when_fetch_before_verify",),
    "VERIFY_ODS_LOCAL_JSON": ("verification.ods_use_local_json",),
    "DWD_FACT_UPSERT": ("dwd.fact_upsert",),
    # DWS monthly/salary settings
    "DWS_MONTHLY_ALLOW_HISTORY": ("dws.monthly.allow_history",),
    "DWS_MONTHLY_PREV_GRACE_DAYS": ("dws.monthly.prev_month_grace_days",),
    "DWS_MONTHLY_HISTORY_MONTHS": ("dws.monthly.history_months",),
    "DWS_MONTHLY_NEW_HIRE_CAP_EFFECTIVE_FROM": ("dws.monthly.new_hire_cap_effective_from",),
    "DWS_MONTHLY_NEW_HIRE_CAP_DAY": ("dws.monthly.new_hire_cap_day",),
    "DWS_MONTHLY_NEW_HIRE_MAX_TIER_LEVEL": ("dws.monthly.new_hire_max_tier_level",),
    "DWS_SALARY_RUN_DAYS": ("dws.salary.run_days",),
    "DWS_SALARY_ALLOW_OUT_OF_CYCLE": ("dws.salary.allow_out_of_cycle",),
    "DWS_SALARY_ROOM_COURSE_PRICE": ("dws.salary.room_course_price",),
    # ODS offline replay settings
    "ODS_JSON_DOC_DIR": ("ods.json_doc_dir",),
    "ODS_INCLUDE_FILES": ("ods.include_files",),
    "ODS_DROP_SCHEMA_FIRST": ("ods.drop_schema_first",),
}
def _deep_set(d, dotted_keys, value):
cur = d
for k in dotted_keys[:-1]:
cur = cur.setdefault(k, {})
cur[dotted_keys[-1]] = value
def _coerce_env(v: str):
if v is None:
return None
s = v.strip()
if s.lower() in ("true", "false"):
return s.lower() == "true"
try:
if s.isdigit() or (s.startswith("-") and s[1:].isdigit()):
return int(s)
except Exception:
pass
if (s.startswith("{") and s.endswith("}")) or (s.startswith("[") and s.endswith("]")):
try:
return json.loads(s)
except Exception:
return s
return s
def _strip_inline_comment(value: str) -> str:
"""去掉未被引号包裹的内联注释"""
result = []
in_quote = False
quote_char = ""
escape = False
for ch in value:
if escape:
result.append(ch)
escape = False
continue
if ch == "\\":
escape = True
result.append(ch)
continue
if ch in ("'", '"'):
if not in_quote:
in_quote = True
quote_char = ch
elif quote_char == ch:
in_quote = False
quote_char = ""
result.append(ch)
continue
if ch == "#" and not in_quote:
break
result.append(ch)
return "".join(result).rstrip()
def _unquote_value(value: str) -> str:
"""处理引号/原始字符串以及尾随逗号"""
trimmed = value.strip()
trimmed = _strip_inline_comment(trimmed)
trimmed = trimmed.rstrip(",").rstrip()
if not trimmed:
return trimmed
if len(trimmed) >= 2 and trimmed[0] in ("'", '"') and trimmed[-1] == trimmed[0]:
return trimmed[1:-1]
if (
len(trimmed) >= 3
and trimmed[0] in ("r", "R")
and trimmed[1] in ("'", '"')
and trimmed[-1] == trimmed[1]
):
return trimmed[2:-1]
return trimmed
def _parse_dotenv_line(line: str) -> tuple[str, str] | None:
"""解析 .env 文件中的单行"""
stripped = line.strip()
if not stripped or stripped.startswith("#"):
return None
if stripped.startswith("export "):
stripped = stripped[len("export ") :].strip()
if "=" not in stripped:
return None
key, value = stripped.split("=", 1)
key = key.strip()
value = _unquote_value(value)
return key, value
def _load_dotenv_values() -> dict:
"""从项目根目录读取 .env 文件键值"""
if os.environ.get("ETL_SKIP_DOTENV") in ("1", "true", "TRUE", "True"):
return {}
root = Path(__file__).resolve().parents[1]
dotenv_path = root / ".env"
if not dotenv_path.exists():
return {}
values: dict[str, str] = {}
for line in dotenv_path.read_text(encoding="utf-8", errors="ignore").splitlines():
parsed = _parse_dotenv_line(line)
if parsed:
key, value = parsed
values[key] = value
return values
def _apply_env_values(cfg: dict, source: dict):
    """Overlay values from *source* (an env-like mapping) onto *cfg* per ENV_MAP."""
    LIST_PATHS = ("run.tasks", "run.dws_tasks", "run.index_tasks")
    for env_key, target_paths in ENV_MAP.items():
        raw = source.get(env_key)
        if raw is None:
            continue  # variable not present in this source layer
        coerced = _coerce_env(raw)
        for dotted in target_paths:
            # Task lists may arrive as comma-separated strings.
            if dotted in LIST_PATHS and isinstance(coerced, str):
                coerced = [piece.strip() for piece in coerced.split(",") if piece.strip()]
            _deep_set(cfg, dotted.split("."), coerced)
def load_env_overrides(defaults: dict) -> dict:
    """Return a copy of *defaults* overlaid with .env values, then real
    environment variables.

    Precedence (low → high): defaults < .env < process environment. CLI
    overrides are applied later by the caller, so they still win overall.
    """
    merged = deepcopy(defaults)
    for layer in (_load_dotenv_values(), os.environ):
        _apply_env_values(merged, layer)
    return merged

View File

@@ -0,0 +1,3 @@
{
"tasks": {}
}

127
config/settings.py Normal file
View File

@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
"""Configuration management entry point (AppConfig)."""
import warnings
from copy import deepcopy
from .defaults import DEFAULTS
from .env_parser import load_env_overrides
# Maps the deprecated pipeline.flow value onto the new run.data_source value.
_FLOW_TO_DATA_SOURCE = {
    "FULL": "hybrid",
    "FETCH_ONLY": "online",
    "INGEST_ONLY": "offline",
}
class AppConfig:
    """Application configuration manager."""
    def __init__(self, config_dict: dict):
        # Keep the fully merged/normalized configuration mapping for lookups.
        self.config = config_dict
    @classmethod
    def load(cls, cli_overrides: dict | None = None):
        """Load configuration with precedence DEFAULTS < ENV (.env + process env) < CLI."""
        cfg = load_env_overrides(DEFAULTS)
        if cli_overrides:
            cls._deep_merge(cfg, cli_overrides)
        # Normalize derived values (store_id int, DSN assembly, legacy-key
        # mapping), then validate required settings; both raise SystemExit on error.
        cls._normalize(cfg)
        cls._validate(cfg)
        return cls(cfg)
@staticmethod
def _deep_merge(dst, src):
"""深度合并字典"""
for k, v in src.items():
if isinstance(v, dict) and isinstance(dst.get(k), dict):
AppConfig._deep_merge(dst[k], v)
else:
dst[k] = v
    @staticmethod
    def _normalize(cfg):
        """Normalize and back-fill derived configuration values in place."""
        # Coerce store_id to an integer; fail fast with a readable error.
        try:
            cfg["app"]["store_id"] = int(str(cfg["app"]["store_id"]).strip())
        except Exception:
            raise SystemExit("app.store_id 必须为整数")
        # Assemble the DSN from individual parts when no explicit DSN was given.
        if not cfg["db"]["dsn"]:
            cfg["db"]["dsn"] = (
                f"postgresql://{cfg['db']['user']}:{cfg['db']['password']}"
                f"@{cfg['db']['host']}:{cfg['db']['port']}/{cfg['db']['name']}"
            )
        # Clamp connect_timeout to the 1-20 second range (falls back to 5 when
        # the configured value is unset/empty/zero).
        try:
            timeout_sec = int(cfg["db"].get("connect_timeout_sec") or 5)
        except Exception:
            raise SystemExit("db.connect_timeout_sec 必须为整数")
        cfg["db"]["connect_timeout_sec"] = max(1, min(timeout_sec, 20))
        # Session parameters: timezone defaults to app.timezone; timeout values
        # must be integer milliseconds.
        cfg["db"].setdefault("session", {})
        sess = cfg["db"]["session"]
        sess.setdefault("timezone", cfg["app"]["timezone"])
        for k in ("statement_timeout_ms", "lock_timeout_ms", "idle_in_tx_timeout_ms"):
            if k in sess and sess[k] is not None:
                try:
                    sess[k] = int(sess[k])
                except Exception:
                    raise SystemExit(f"db.session.{k} 需为整数毫秒")
        # ── Legacy-key → new-key compatibility mapping ──
        pipeline = cfg.get("pipeline", {})
        run = cfg.setdefault("run", {})
        io = cfg.setdefault("io", {})
        # 1. pipeline.flow → run.data_source
        #    Only applied when the new key is unset or still at its default
        #    ("hybrid"), so an explicit run.data_source always wins.
        old_flow = str(pipeline.get("flow", "")).upper()
        if old_flow in _FLOW_TO_DATA_SOURCE:
            mapped = _FLOW_TO_DATA_SOURCE[old_flow]
            if run.get("data_source", "hybrid") == "hybrid" and mapped != "hybrid":
                run["data_source"] = mapped
                # Deprecation notice so callers migrate off pipeline.flow.
                warnings.warn(
                    f"配置键 pipeline.flow={old_flow} 已弃用,"
                    f"已映射为 run.data_source={mapped}",
                    DeprecationWarning,
                    stacklevel=2,
                )
        # 2. pipeline.fetch_root → io.fetch_root (the new key wins when both set)
        if pipeline.get("fetch_root") and not io.get("fetch_root"):
            io["fetch_root"] = pipeline["fetch_root"]
        # 3. pipeline.ingest_source_dir → io.ingest_source_dir (new key wins)
        if pipeline.get("ingest_source_dir") and not io.get("ingest_source_dir"):
            io["ingest_source_dir"] = pipeline["ingest_source_dir"]
@staticmethod
def _validate(cfg):
"""验证必填配置"""
missing = []
if not cfg["app"]["store_id"]:
missing.append("app.store_id")
if missing:
raise SystemExit("缺少必需配置: " + ", ".join(missing))
def get(self, key: str, default=None):
"""获取配置值(支持点号路径)"""
keys = key.split(".")
val = self.config
for k in keys:
if isinstance(val, dict):
val = val.get(k)
else:
return default
return val if val is not None else default
    def __getitem__(self, key):
        """Dict-style access to a top-level config section (KeyError when absent)."""
        return self.config[key]