初始提交:飞球 ETL 系统全量代码 (Initial commit: full source of the Feiqiu ETL system)
This commit is contained in:
231
scripts/repair/tune_integrity_indexes.py
Normal file
231
scripts/repair/tune_integrity_indexes.py
Normal file
@@ -0,0 +1,231 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Create performance indexes for integrity verification and run ANALYZE.
|
||||
|
||||
Usage:
|
||||
python -m scripts.tune_integrity_indexes
|
||||
python -m scripts.tune_integrity_indexes --dry-run
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Sequence, Set, Tuple
|
||||
|
||||
import psycopg2
|
||||
from psycopg2 import sql
|
||||
|
||||
from config.settings import AppConfig
|
||||
|
||||
|
||||
# Timestamp-like columns to index when present on a table: each match gets a
# single-column index and, when the table has a small primary key, a
# (time, pk) composite index as well (see _plan_indexes).
TIME_CANDIDATES = (
    "pay_time",
    "create_time",
    "start_use_time",
    "scd2_start_time",
    "calc_time",
    "order_date",
    "fetched_at",
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IndexPlan:
    """One planned index: where to create it and on which columns."""

    # Target schema / table the index is created in.
    schema: str
    table: str
    # Final index identifier (already shortened to PostgreSQL's 63-char limit
    # by _short_index_name).
    index_name: str
    # Indexed columns, in definition order.
    columns: Tuple[str, ...]
|
||||
|
||||
|
||||
def _short_index_name(table: str, tag: str, columns: Sequence[str]) -> str:
    """Build an index name that fits PostgreSQL's 63-character identifier limit.

    The natural name is ``idx_<table>_<tag>_<col1>_<col2>...``.  When that is
    too long, the column list is replaced by an 8-hex-digit MD5 fingerprint of
    the full name so that distinct column sets still map to distinct names.

    Args:
        table: Table the index belongs to (used in the name only).
        tag: Short label for the index family (e.g. "time", "pk_current").
        columns: Indexed columns, in order.

    Returns:
        An identifier of at most 63 characters.
    """
    raw = f"idx_{table}_{tag}_{'_'.join(columns)}"
    if len(raw) <= 63:
        return raw
    # MD5 is used only as a cheap stable fingerprint here, not for security.
    digest = hashlib.md5(raw.encode("utf-8")).hexdigest()[:8]
    shortened = f"idx_{table}_{tag}_{digest}"
    if len(shortened) <= 63:
        return shortened
    # Still too long (very long table/tag): trim the prefix but KEEP the
    # digest suffix.  Plain tail truncation (shortened[:63]) could cut off
    # the digest and make two different column sets collide on one name.
    prefix_len = 63 - len(digest) - 1
    return f"{shortened[:prefix_len]}_{digest}"
|
||||
|
||||
|
||||
def _load_table_columns(cur, schema: str, table: str) -> Set[str]:
    """Return every column name defined on ``schema.table``."""
    query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
    """
    cur.execute(query, (schema, table))
    return {row[0] for row in cur.fetchall()}
|
||||
|
||||
|
||||
def _load_pk_columns(cur, schema: str, table: str) -> List[str]:
    """Return the primary-key column names of ``schema.table`` in key order.

    Empty list when the table has no primary key.
    """
    query = """
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON tc.constraint_name = kcu.constraint_name
         AND tc.table_schema = kcu.table_schema
         AND tc.table_name = kcu.table_name
        WHERE tc.table_schema = %s
          AND tc.table_name = %s
          AND tc.constraint_type = 'PRIMARY KEY'
        ORDER BY kcu.ordinal_position
    """
    cur.execute(query, (schema, table))
    rows = cur.fetchall()
    return [row[0] for row in rows]
|
||||
|
||||
|
||||
def _load_tables(cur, schema: str) -> List[str]:
    """List base-table names in *schema*, alphabetically (views excluded)."""
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = %s
          AND table_type = 'BASE TABLE'
        ORDER BY table_name
    """
    cur.execute(query, (schema,))
    rows = cur.fetchall()
    return [row[0] for row in rows]
|
||||
|
||||
|
||||
def _plan_indexes(cur, schema: str, table: str) -> List[IndexPlan]:
    """Plan the performance indexes for one table.

    ODS tables with a ``fetched_at`` column get a ``fetched_at`` index (plus a
    (fetched_at, pk) composite when the PK is small); DWD tables with
    ``scd2_is_current`` get a (pk, scd2_is_current) composite; every table
    additionally gets single-column and (time, pk) composite indexes for each
    TIME_CANDIDATES column it actually has.  Plans are deduplicated by index
    name, last one winning.
    """
    cols = _load_table_columns(cur, schema, table)
    pk_cols = _load_pk_columns(cur, schema, table)
    # "Small" PK: at most 3 columns, all of which really exist on the table.
    small_pk = bool(pk_cols) and len(pk_cols) <= 3 and all(c in cols for c in pk_cols)

    planned: List[IndexPlan] = []

    def _add(tag: str, index_cols: Tuple[str, ...]) -> None:
        # Record one planned index with a length-safe name.
        planned.append(
            IndexPlan(
                schema=schema,
                table=table,
                index_name=_short_index_name(table, tag, index_cols),
                columns=index_cols,
            )
        )

    if schema == "billiards_ods" and "fetched_at" in cols:
        _add("fetched_at", ("fetched_at",))
        if small_pk:
            _add("fetched_pk", ("fetched_at", *pk_cols))

    if (
        schema == "billiards_dwd"
        and pk_cols
        and "scd2_is_current" in cols
        and len(pk_cols) <= 4
    ):
        _add("pk_current", (*pk_cols, "scd2_is_current"))

    for tcol in TIME_CANDIDATES:
        if tcol not in cols:
            continue
        _add("time", (tcol,))
        if small_pk:
            _add("time_pk", (tcol, *pk_cols))

    # Deduplicate by index name (dict overwrite keeps the last plan).
    return list({plan.index_name: plan for plan in planned}.values())
|
||||
|
||||
|
||||
def _create_index(cur, plan: IndexPlan) -> None:
    """Execute CREATE INDEX IF NOT EXISTS for *plan* (idempotent).

    All identifiers are quoted via psycopg2.sql to avoid injection/quoting
    issues with unusual table or column names.
    """
    column_list = sql.SQL(", ").join(sql.Identifier(c) for c in plan.columns)
    command = sql.SQL("CREATE INDEX IF NOT EXISTS {idx} ON {sch}.{tbl} ({cols})").format(
        idx=sql.Identifier(plan.index_name),
        sch=sql.Identifier(plan.schema),
        tbl=sql.Identifier(plan.table),
        cols=column_list,
    )
    cur.execute(command)
|
||||
|
||||
|
||||
def _analyze_table(cur, schema: str, table: str) -> None:
    """Refresh planner statistics for ``schema.table`` via ANALYZE."""
    command = sql.SQL("ANALYZE {sch}.{tbl}").format(
        sch=sql.Identifier(schema),
        tbl=sql.Identifier(table),
    )
    cur.execute(command)
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: plan indexes for the ODS/DWD schemas, optionally
    create them, then ANALYZE the touched tables.

    Returns:
        Process exit code (0 on success).
    """
    ap = argparse.ArgumentParser(description="Tune indexes for integrity verification.")
    ap.add_argument("--dry-run", action="store_true", help="Print planned SQL only.")
    ap.add_argument(
        "--skip-analyze",
        action="store_true",
        help="Create indexes but skip ANALYZE.",
    )
    args = ap.parse_args()

    # Connection settings come from app config; timeout falls back to 10s
    # when the key is missing or falsy.
    cfg = AppConfig.load({})
    dsn = cfg.get("db.dsn")
    timeout_sec = int(cfg.get("db.connect_timeout_sec", 10) or 10)

    with psycopg2.connect(dsn, connect_timeout=timeout_sec) as conn:
        # Explicit transaction control: everything below runs in a single
        # transaction so --dry-run can roll it all back at the end.
        conn.autocommit = False
        with conn.cursor() as cur:
            all_plans: List[IndexPlan] = []
            for schema in ("billiards_ods", "billiards_dwd"):
                for table in _load_tables(cur, schema):
                    all_plans.extend(_plan_indexes(cur, schema, table))

            touched_tables: Set[Tuple[str, str]] = set()
            print(f"planned indexes: {len(all_plans)}")
            for plan in all_plans:
                cols = ", ".join(plan.columns)
                print(f"[INDEX] {plan.schema}.{plan.table} ({cols}) -> {plan.index_name}")
                if not args.dry_run:
                    # Only indexes actually created mark their table as touched.
                    _create_index(cur, plan)
                    touched_tables.add((plan.schema, plan.table))

            if not args.skip_analyze:
                if args.dry_run:
                    # Nothing was created; report which tables WOULD be analyzed.
                    for schema, table in sorted({(p.schema, p.table) for p in all_plans}):
                        print(f"[ANALYZE] {schema}.{table}")
                else:
                    for schema, table in sorted(touched_tables):
                        _analyze_table(cur, schema, table)
                        print(f"[ANALYZE] {schema}.{table}")

        if args.dry_run:
            conn.rollback()
            print("dry-run complete; transaction rolled back")
        else:
            conn.commit()
            print("index tuning complete")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
||||
Reference in New Issue
Block a user