初始提交:飞球 ETL 系统全量代码 (Initial commit: full source of the Feiqiu ETL system)
This commit is contained in:
231
scripts/repair/tune_integrity_indexes.py
Normal file
231
scripts/repair/tune_integrity_indexes.py
Normal file
@@ -0,0 +1,231 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Create performance indexes for integrity verification and run ANALYZE.
|
||||
|
||||
Usage:
|
||||
python -m scripts.tune_integrity_indexes
|
||||
python -m scripts.tune_integrity_indexes --dry-run
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Sequence, Set, Tuple
|
||||
|
||||
import psycopg2
|
||||
from psycopg2 import sql
|
||||
|
||||
from config.settings import AppConfig
|
||||
|
||||
|
||||
# Timestamp-like columns to index when present on a table: each match gets a
# single-column index and, when the table has a small primary key, a
# (time, pk) composite index as well (see _plan_indexes).
TIME_CANDIDATES = (
    "pay_time",
    "create_time",
    "start_use_time",
    "scd2_start_time",
    "calc_time",
    "order_date",
    "fetched_at",
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IndexPlan:
    """One planned index: where to create it and on which columns."""

    # Target schema / table the index is created in.
    schema: str
    table: str
    # Final index identifier (already shortened to PostgreSQL's 63-char limit
    # by _short_index_name).
    index_name: str
    # Indexed columns, in definition order.
    columns: Tuple[str, ...]
|
||||
|
||||
|
||||
def _short_index_name(table: str, tag: str, columns: Sequence[str]) -> str:
    """Build an index name that fits PostgreSQL's 63-character identifier limit.

    The natural name is ``idx_<table>_<tag>_<col1>_<col2>...``.  When that is
    too long, the column list is replaced by an 8-hex-digit MD5 fingerprint of
    the full name so that distinct column sets still map to distinct names.

    Args:
        table: Table the index belongs to (used in the name only).
        tag: Short label for the index family (e.g. "time", "pk_current").
        columns: Indexed columns, in order.

    Returns:
        An identifier of at most 63 characters.
    """
    raw = f"idx_{table}_{tag}_{'_'.join(columns)}"
    if len(raw) <= 63:
        return raw
    # MD5 is used only as a cheap stable fingerprint here, not for security.
    digest = hashlib.md5(raw.encode("utf-8")).hexdigest()[:8]
    shortened = f"idx_{table}_{tag}_{digest}"
    if len(shortened) <= 63:
        return shortened
    # Still too long (very long table/tag): trim the prefix but KEEP the
    # digest suffix.  Plain tail truncation (shortened[:63]) could cut off
    # the digest and make two different column sets collide on one name.
    prefix_len = 63 - len(digest) - 1
    return f"{shortened[:prefix_len]}_{digest}"
|
||||
|
||||
|
||||
def _load_table_columns(cur, schema: str, table: str) -> Set[str]:
    """Return every column name defined on ``schema.table``."""
    query = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
    """
    cur.execute(query, (schema, table))
    return {row[0] for row in cur.fetchall()}
|
||||
|
||||
|
||||
def _load_pk_columns(cur, schema: str, table: str) -> List[str]:
    """Return the primary-key column names of ``schema.table`` in key order.

    Empty list when the table has no primary key.
    """
    query = """
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON tc.constraint_name = kcu.constraint_name
         AND tc.table_schema = kcu.table_schema
         AND tc.table_name = kcu.table_name
        WHERE tc.table_schema = %s
          AND tc.table_name = %s
          AND tc.constraint_type = 'PRIMARY KEY'
        ORDER BY kcu.ordinal_position
    """
    cur.execute(query, (schema, table))
    rows = cur.fetchall()
    return [row[0] for row in rows]
|
||||
|
||||
|
||||
def _load_tables(cur, schema: str) -> List[str]:
    """List base-table names in *schema*, alphabetically (views excluded)."""
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = %s
          AND table_type = 'BASE TABLE'
        ORDER BY table_name
    """
    cur.execute(query, (schema,))
    rows = cur.fetchall()
    return [row[0] for row in rows]
|
||||
|
||||
|
||||
def _plan_indexes(cur, schema: str, table: str) -> List[IndexPlan]:
    """Plan the performance indexes for one table.

    ODS tables with a ``fetched_at`` column get a ``fetched_at`` index (plus a
    (fetched_at, pk) composite when the PK is small); DWD tables with
    ``scd2_is_current`` get a (pk, scd2_is_current) composite; every table
    additionally gets single-column and (time, pk) composite indexes for each
    TIME_CANDIDATES column it actually has.  Plans are deduplicated by index
    name, last one winning.
    """
    cols = _load_table_columns(cur, schema, table)
    pk_cols = _load_pk_columns(cur, schema, table)
    # "Small" PK: at most 3 columns, all of which really exist on the table.
    small_pk = bool(pk_cols) and len(pk_cols) <= 3 and all(c in cols for c in pk_cols)

    planned: List[IndexPlan] = []

    def _add(tag: str, index_cols: Tuple[str, ...]) -> None:
        # Record one planned index with a length-safe name.
        planned.append(
            IndexPlan(
                schema=schema,
                table=table,
                index_name=_short_index_name(table, tag, index_cols),
                columns=index_cols,
            )
        )

    if schema == "billiards_ods" and "fetched_at" in cols:
        _add("fetched_at", ("fetched_at",))
        if small_pk:
            _add("fetched_pk", ("fetched_at", *pk_cols))

    if (
        schema == "billiards_dwd"
        and pk_cols
        and "scd2_is_current" in cols
        and len(pk_cols) <= 4
    ):
        _add("pk_current", (*pk_cols, "scd2_is_current"))

    for tcol in TIME_CANDIDATES:
        if tcol not in cols:
            continue
        _add("time", (tcol,))
        if small_pk:
            _add("time_pk", (tcol, *pk_cols))

    # Deduplicate by index name (dict overwrite keeps the last plan).
    return list({plan.index_name: plan for plan in planned}.values())
|
||||
|
||||
|
||||
def _create_index(cur, plan: IndexPlan) -> None:
    """Execute CREATE INDEX IF NOT EXISTS for *plan* (idempotent).

    All identifiers are quoted via psycopg2.sql to avoid injection/quoting
    issues with unusual table or column names.
    """
    column_list = sql.SQL(", ").join(sql.Identifier(c) for c in plan.columns)
    command = sql.SQL("CREATE INDEX IF NOT EXISTS {idx} ON {sch}.{tbl} ({cols})").format(
        idx=sql.Identifier(plan.index_name),
        sch=sql.Identifier(plan.schema),
        tbl=sql.Identifier(plan.table),
        cols=column_list,
    )
    cur.execute(command)
|
||||
|
||||
|
||||
def _analyze_table(cur, schema: str, table: str) -> None:
    """Refresh planner statistics for ``schema.table`` via ANALYZE."""
    command = sql.SQL("ANALYZE {sch}.{tbl}").format(
        sch=sql.Identifier(schema),
        tbl=sql.Identifier(table),
    )
    cur.execute(command)
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: plan indexes for the ODS/DWD schemas, optionally
    create them, then ANALYZE the touched tables.

    Returns:
        Process exit code (0 on success).
    """
    ap = argparse.ArgumentParser(description="Tune indexes for integrity verification.")
    ap.add_argument("--dry-run", action="store_true", help="Print planned SQL only.")
    ap.add_argument(
        "--skip-analyze",
        action="store_true",
        help="Create indexes but skip ANALYZE.",
    )
    args = ap.parse_args()

    # Connection settings come from app config; timeout falls back to 10s
    # when the key is missing or falsy.
    cfg = AppConfig.load({})
    dsn = cfg.get("db.dsn")
    timeout_sec = int(cfg.get("db.connect_timeout_sec", 10) or 10)

    with psycopg2.connect(dsn, connect_timeout=timeout_sec) as conn:
        # Explicit transaction control: everything below runs in a single
        # transaction so --dry-run can roll it all back at the end.
        conn.autocommit = False
        with conn.cursor() as cur:
            all_plans: List[IndexPlan] = []
            for schema in ("billiards_ods", "billiards_dwd"):
                for table in _load_tables(cur, schema):
                    all_plans.extend(_plan_indexes(cur, schema, table))

            touched_tables: Set[Tuple[str, str]] = set()
            print(f"planned indexes: {len(all_plans)}")
            for plan in all_plans:
                cols = ", ".join(plan.columns)
                print(f"[INDEX] {plan.schema}.{plan.table} ({cols}) -> {plan.index_name}")
                if not args.dry_run:
                    # Only indexes actually created mark their table as touched.
                    _create_index(cur, plan)
                    touched_tables.add((plan.schema, plan.table))

            if not args.skip_analyze:
                if args.dry_run:
                    # Nothing was created; report which tables WOULD be analyzed.
                    for schema, table in sorted({(p.schema, p.table) for p in all_plans}):
                        print(f"[ANALYZE] {schema}.{table}")
                else:
                    for schema, table in sorted(touched_tables):
                        _analyze_table(cur, schema, table)
                        print(f"[ANALYZE] {schema}.{table}")

        if args.dry_run:
            conn.rollback()
            print("dry-run complete; transaction rolled back")
        else:
            conn.commit()
            print("index tuning complete")

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
||||
Reference in New Issue
Block a user