初始提交：飞球 ETL 系统全量代码

2026-02-13 08:05:34 +08:00
commit 3c51f5485d
441 changed files with 117631 additions and 0 deletions
--- a/scripts/repair/repair_ods_content_hash.py
+++ b/scripts/repair/repair_ods_content_hash.py
@@ -0,0 +1,302 @@
+# -*- coding: utf-8 -*-
+"""
+Repair ODS content_hash values by recomputing from payload.
+
+Usage:
+  PYTHONPATH=. python -m scripts.repair.repair_ods_content_hash
+  PYTHONPATH=. python -m scripts.repair.repair_ods_content_hash --schema billiards_ods
+  PYTHONPATH=. python -m scripts.repair.repair_ods_content_hash --tables member_profiles,orders
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Iterable, Sequence
+
+import psycopg2
+from psycopg2.extras import RealDictCursor
+
+# Put PROJECT_ROOT at the front of sys.path (if not already present) before the
+# project-level imports that follow. parents[0] is this file's directory, so
+# parents[1] is the parent of that directory.
+# NOTE(review): this file is added as scripts/repair/repair_ods_content_hash.py,
+# so parents[1] resolves to scripts/ rather than the repository root. Confirm
+# whether parents[2] was intended; the default reports/ directory is also
+# created under PROJECT_ROOT.
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+
+from config.settings import AppConfig
+from database.connection import DatabaseConnection
+from tasks.ods.ods_tasks import BaseOdsTask
+
+
+def _reconfigure_stdout_utf8() -> None:
+    """Switch ``sys.stdout`` to UTF-8 when the stream supports ``reconfigure``.
+
+    This is best-effort: a stream without ``reconfigure`` is left alone and any
+    error raised while reconfiguring is ignored.
+    """
+    reconfigure = getattr(sys.stdout, "reconfigure", None)
+    if reconfigure is None:
+        return
+    try:
+        reconfigure(encoding="utf-8")
+    except Exception:
+        # Deliberately ignored: output encoding is a convenience, not a requirement.
+        pass
+
+
+def _fetch_tables(conn, schema: str) -> list[str]:
+    """Return the names of all base tables in ``schema``, ordered by name.
+
+    Args:
+        conn: Open database connection providing ``cursor()``.
+        schema: Schema name matched against ``information_schema.tables``.
+    """
+    query = """
+        SELECT table_name
+        FROM information_schema.tables
+        WHERE table_schema = %s AND table_type = 'BASE TABLE'
+        ORDER BY table_name
+    """
+    with conn.cursor() as cursor:
+        cursor.execute(query, (schema,))
+        records = cursor.fetchall()
+    return [record[0] for record in records]
+
+
+def _fetch_columns(conn, schema: str, table: str) -> list[str]:
+    """Return the column names of ``schema.table`` in ordinal order.
+
+    Empty or NULL names returned by the catalog query are dropped.
+
+    Args:
+        conn: Open database connection providing ``cursor()``.
+        schema: Schema that owns the table.
+        table: Table whose columns are listed.
+    """
+    query = """
+        SELECT column_name
+        FROM information_schema.columns
+        WHERE table_schema = %s AND table_name = %s
+        ORDER BY ordinal_position
+    """
+    with conn.cursor() as cursor:
+        cursor.execute(query, (schema, table))
+        names = [record[0] for record in cursor.fetchall()]
+    return list(filter(None, names))
+
+
+def _fetch_pk_columns(conn, schema: str, table: str) -> list[str]:
+    """Return the primary-key columns of ``schema.table`` in key order.
+
+    A column named ``content_hash`` (compared case-insensitively) is left out
+    of the result even when it is part of the primary key.
+
+    Args:
+        conn: Open database connection providing ``cursor()``.
+        schema: Schema that owns the table.
+        table: Table whose primary key is inspected.
+    """
+    query = """
+        SELECT kcu.column_name
+        FROM information_schema.table_constraints tc
+        JOIN information_schema.key_column_usage kcu
+          ON tc.constraint_name = kcu.constraint_name
+         AND tc.table_schema = kcu.table_schema
+        WHERE tc.constraint_type = 'PRIMARY KEY'
+          AND tc.table_schema = %s
+          AND tc.table_name = %s
+        ORDER BY kcu.ordinal_position
+    """
+    with conn.cursor() as cursor:
+        cursor.execute(query, (schema, table))
+        key_columns = [record[0] for record in cursor.fetchall()]
+
+    result: list[str] = []
+    for name in key_columns:
+        if name.lower() == "content_hash":
+            continue
+        result.append(name)
+    return result
+
+
+def _fetch_row_count(conn, schema: str, table: str) -> int:
+    """Return ``COUNT(*)`` for ``schema.table``, or 0 when no row comes back.
+
+    The schema and table names are interpolated into the statement as
+    double-quoted identifiers.
+    """
+    query = f'SELECT COUNT(*) FROM "{schema}"."{table}"'
+    with conn.cursor() as cursor:
+        cursor.execute(query)
+        first = cursor.fetchone()
+    if not first:
+        return 0
+    return int(first[0])
+
+
+def _iter_rows(
+    conn,
+    schema: str,
+    table: str,
+    select_cols: Sequence[str],
+    batch_size: int,
+) -> Iterable[dict]:
+    """Yield the rows of ``schema.table`` one at a time as dict-like records.
+
+    Rows are read through a named cursor created with ``RealDictCursor``; its
+    ``itersize`` is set from ``batch_size`` (500 when falsy, never below 1).
+
+    Args:
+        conn: Open database connection providing ``cursor()``.
+        schema: Schema that owns the table.
+        table: Table to read; also used in the cursor name.
+        select_cols: Columns to select. ``ctid`` is emitted unquoted, every
+            other name is double-quoted.
+        batch_size: Requested fetch batch size.
+    """
+    column_list = ", ".join(
+        col if col == "ctid" else f'"{col}"' for col in select_cols
+    )
+    query = f'SELECT {column_list} FROM "{schema}"."{table}"'
+    fetch_size = max(1, int(batch_size or 500))
+    with conn.cursor(name=f"ods_hash_fix_{table}", cursor_factory=RealDictCursor) as cursor:
+        cursor.itersize = fetch_size
+        cursor.execute(query)
+        yield from cursor
+
+
+def _build_report_path(out_arg: str | None) -> Path:
+    """Return the path the JSON report should be written to.
+
+    A non-empty ``out_arg`` is used as given. Otherwise a timestamped file
+    name inside ``PROJECT_ROOT / "reports"`` is returned, and that directory
+    is created if it does not exist.
+    """
+    if not out_arg:
+        target_dir = PROJECT_ROOT / "reports"
+        target_dir.mkdir(parents=True, exist_ok=True)
+        stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
+        return target_dir / f"ods_content_hash_repair_{stamp}.json"
+    return Path(out_arg)
+
+
+def _print_progress(
+    table_label: str,
+    processed: int,
+    total: int,
+    updated: int,
+    skipped: int,
+    conflicts: int,
+    errors: int,
+    missing_hash: int,
+    invalid_payload: int,
+) -> None:
+    """Print one flushed progress line with the current counters for a table.
+
+    The line shows ``processed/total`` when ``total`` is non-zero and only
+    ``processed`` otherwise; the remaining counters are always printed.
+    """
+    checked = f"{processed}/{total}" if total else f"{processed}"
+    print(
+        f"[{table_label}] checked {checked} "
+        f"updated={updated} skipped={skipped} conflicts={conflicts} errors={errors} "
+        f"missing_hash={missing_hash} invalid_payload={invalid_payload}",
+        flush=True,
+    )
+
+
+def _close_connection(db) -> None:
+    """Close the connection exposed as ``db.conn``, ignoring errors raised by the close."""
+    try:
+        db.conn.close()
+    except Exception:
+        # Best-effort cleanup: a failing close must not mask the real outcome.
+        pass
+
+
+def _run_repair(args: argparse.Namespace, db_read, db_write) -> dict[str, Any]:
+    """Scan the selected tables, repair mismatching hashes and return the report.
+
+    Rows are streamed through ``db_read.conn`` and updated one at a time through
+    ``db_write.conn``, addressed by ``ctid``. Tables lacking a ``payload`` or
+    ``content_hash`` column are skipped and do not appear in the report.
+
+    Args:
+        args: Parsed command-line options (schema, tables, batch_size,
+            progress_every, sample_limit, dry_run).
+        db_read: Connection wrapper used for catalog queries and row streaming.
+        db_write: Connection wrapper used for the UPDATE statements.
+
+    Returns:
+        The report dictionary with a ``tables`` list and a ``summary`` section.
+    """
+    # Loop-invariant option values, normalised once.
+    sample_limit = max(0, int(args.sample_limit or 0))
+    progress_every = int(args.progress_every or 0)
+
+    tables = _fetch_tables(db_read.conn, args.schema)
+    if args.tables.strip():
+        whitelist = {t.strip() for t in args.tables.split(",") if t.strip()}
+        unknown = sorted(whitelist.difference(tables))
+        if unknown:
+            # Make typos in --tables visible instead of silently checking nothing.
+            print(
+                f"[{args.schema}] warning: requested tables not found: {', '.join(unknown)}",
+                file=sys.stderr,
+                flush=True,
+            )
+        tables = [t for t in tables if t in whitelist]
+
+    report: dict[str, Any] = {
+        "schema": args.schema,
+        "tables": [],
+        "summary": {
+            "total_tables": len(tables),
+            "checked_tables": 0,
+            "total_rows": 0,
+            "checked_rows": 0,
+            "updated_rows": 0,
+            "skipped_rows": 0,
+            "conflict_rows": 0,
+            "error_rows": 0,
+            "missing_hash_rows": 0,
+            "invalid_payload_rows": 0,
+        },
+    }
+
+    for table in tables:
+        table_label = f"{args.schema}.{table}"
+        cols = _fetch_columns(db_read.conn, args.schema, table)
+        cols_lower = {c.lower() for c in cols}
+        if "payload" not in cols_lower or "content_hash" not in cols_lower:
+            print(f"[{table_label}] skip: missing payload/content_hash", flush=True)
+            continue
+
+        total = _fetch_row_count(db_read.conn, args.schema, table)
+        pk_cols = _fetch_pk_columns(db_read.conn, args.schema, table)
+        select_cols = ["ctid", "content_hash", "payload", *pk_cols]
+        update_sql = f'UPDATE "{args.schema}"."{table}" SET content_hash = %s WHERE ctid = %s'
+
+        processed = 0
+        updated = 0
+        skipped = 0
+        conflicts = 0
+        errors = 0
+        missing_hash = 0
+        invalid_payload = 0
+        samples: list[dict[str, Any]] = []
+
+        print(f"[{table_label}] start: total_rows={total}", flush=True)
+
+        for row in _iter_rows(db_read.conn, args.schema, table, select_cols, args.batch_size):
+            processed += 1
+            content_hash = row.get("content_hash")
+            payload = row.get("payload")
+            recomputed = BaseOdsTask._compute_compare_hash_from_payload(payload)
+            row_ctid = row.get("ctid")
+
+            if not content_hash:
+                missing_hash += 1
+            if not recomputed:
+                invalid_payload += 1
+
+            if not recomputed or content_hash == recomputed:
+                # Nothing to write: no hash could be computed, or it already matches.
+                skipped += 1
+            elif args.dry_run:
+                # Dry run counts the rows that would have been updated.
+                updated += 1
+            else:
+                try:
+                    with db_write.conn.cursor() as cur:
+                        cur.execute(update_sql, (recomputed, row_ctid))
+                    updated += 1
+                except psycopg2.errors.UniqueViolation:
+                    conflicts += 1
+                    if len(samples) < sample_limit:
+                        sample = {k: row.get(k) for k in pk_cols}
+                        sample["content_hash"] = content_hash
+                        sample["recomputed_hash"] = recomputed
+                        samples.append(sample)
+                except psycopg2.Error as exc:
+                    errors += 1
+                    # Show the first few failures per table so the error count
+                    # in the report can be diagnosed; the rest are only counted.
+                    if errors <= sample_limit:
+                        print(
+                            f"[{table_label}] update failed for ctid={row_ctid}: {str(exc).strip()}",
+                            file=sys.stderr,
+                            flush=True,
+                        )
+
+            if progress_every and processed % progress_every == 0:
+                _print_progress(
+                    table_label,
+                    processed,
+                    total,
+                    updated,
+                    skipped,
+                    conflicts,
+                    errors,
+                    missing_hash,
+                    invalid_payload,
+                )
+
+        # Final line for the table unless the last row already triggered one.
+        if processed and (not progress_every or processed % progress_every != 0):
+            _print_progress(
+                table_label,
+                processed,
+                total,
+                updated,
+                skipped,
+                conflicts,
+                errors,
+                missing_hash,
+                invalid_payload,
+            )
+
+        report["tables"].append(
+            {
+                "table": table_label,
+                "total_rows": total,
+                "checked_rows": processed,
+                "updated_rows": updated,
+                "skipped_rows": skipped,
+                "conflict_rows": conflicts,
+                "error_rows": errors,
+                "missing_hash_rows": missing_hash,
+                "invalid_payload_rows": invalid_payload,
+                "conflict_samples": samples,
+            }
+        )
+
+        summary = report["summary"]
+        summary["checked_tables"] += 1
+        summary["total_rows"] += total
+        summary["checked_rows"] += processed
+        summary["updated_rows"] += updated
+        summary["skipped_rows"] += skipped
+        summary["conflict_rows"] += conflicts
+        summary["error_rows"] += errors
+        summary["missing_hash_rows"] += missing_hash
+        summary["invalid_payload_rows"] += invalid_payload
+
+    return report
+
+
+def main() -> int:
+    """Recompute ODS ``content_hash`` values from ``payload`` and write a JSON report.
+
+    Parses the command line, opens one connection for reading and one for
+    writing, runs the repair over the selected tables, closes both
+    connections, and writes the report to ``--out`` or to a timestamped
+    default path.
+
+    Returns:
+        0 when the run completes.
+    """
+    _reconfigure_stdout_utf8()
+    ap = argparse.ArgumentParser(description="Repair ODS content_hash using payload")
+    ap.add_argument("--schema", default="billiards_ods", help="ODS schema name")
+    ap.add_argument("--tables", default="", help="comma-separated table names (optional)")
+    ap.add_argument("--batch-size", type=int, default=500, help="DB fetch batch size")
+    ap.add_argument("--progress-every", type=int, default=100, help="print progress every N rows")
+    ap.add_argument("--sample-limit", type=int, default=10, help="sample conflicts per table")
+    ap.add_argument("--out", default="", help="output report JSON path")
+    ap.add_argument("--dry-run", action="store_true", help="only compute stats, do not update")
+    args = ap.parse_args()
+
+    cfg = AppConfig.load({})
+    db_read = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
+    try:
+        db_write = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
+        try:
+            # Presumably ends any transaction left open by connection setup so
+            # that autocommit can be switched on; failures here are ignored.
+            try:
+                db_write.conn.rollback()
+            except Exception:
+                pass
+            # Autocommit makes every UPDATE its own transaction, so one failing
+            # row does not abort the updates that follow it.
+            db_write.conn.autocommit = True
+
+            report = _run_repair(args, db_read, db_write)
+        finally:
+            _close_connection(db_write)
+    finally:
+        _close_connection(db_read)
+
+    out_path = _build_report_path(args.out)
+    # An explicit --out may point into a directory that does not exist yet.
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    # default=str: conflict samples carry raw primary-key values, which may be
+    # types json cannot encode natively (e.g. Decimal, datetime).
+    out_path.write_text(
+        json.dumps(report, ensure_ascii=False, indent=2, default=str),
+        encoding="utf-8",
+    )
+    print(f"[REPORT] {out_path}", flush=True)
+    return 0
+
+
+# Script entry point: run main() and use its return value as the exit status.
+if __name__ == "__main__":
+    sys.exit(main())