Database: updates to data validation, write logic, and related handling.

This commit is contained in:
Neo
2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions


@@ -32,9 +32,15 @@ from api.client import APIClient
from config.settings import AppConfig
from database.connection import DatabaseConnection
from models.parsers import TypeParser
from tasks.ods_tasks import ENABLED_ODS_CODES, ODS_TASK_SPECS, OdsTaskSpec
from tasks.ods_tasks import BaseOdsTask, ENABLED_ODS_CODES, ODS_TASK_SPECS, OdsTaskSpec
from scripts.check_ods_gaps import run_gap_check
from utils.logging_utils import build_log_path, configure_logging
from utils.ods_record_utils import (
get_value_case_insensitive,
merge_record_layers,
normalize_pk_value,
pk_tuple_from_record,
)
def _reconfigure_stdout_utf8() -> None:
@@ -74,56 +80,26 @@ def _get_spec(code: str) -> Optional[OdsTaskSpec]:
def _merge_record_layers(record: dict) -> dict:
"""展开嵌套的 data 层"""
merged = record
data_part = merged.get("data")
while isinstance(data_part, dict):
merged = {**data_part, **merged}
data_part = data_part.get("data")
settle_inner = merged.get("settleList")
if isinstance(settle_inner, dict):
merged = {**settle_inner, **merged}
return merged
"""Flatten nested data layers into a single dict."""
return merge_record_layers(record)
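# A doctest-style sketch (made-up payload) of the merge behavior above:
# because each pass does {**data_part, **merged}, outer keys win and nested
# "data" layers only contribute keys the outer record lacks.
# >>> merge_record_layers({"id": 1, "data": {"id": 99, "amount": 5}})
# {'id': 1, 'amount': 5, 'data': {'id': 99, 'amount': 5}}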
def _get_value_case_insensitive(record: dict | None, col: str | None):
"""不区分大小写地获取值"""
if record is None or col is None:
return None
if col in record:
return record.get(col)
col_lower = col.lower()
for k, v in record.items():
if isinstance(k, str) and k.lower() == col_lower:
return v
return None
"""Fetch value without case sensitivity."""
return get_value_case_insensitive(record, col)
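# Sketch of the lookup semantics (made-up record): an exact key hit is
# preferred; otherwise keys are compared lowercased, and misses return None.
# >>> get_value_case_insensitive({"OrderId": 7}, "orderid")
# 7
# >>> get_value_case_insensitive({"OrderId": 7}, "amount") is None
# True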
def _normalize_pk_value(value):
"""规范化 PK 值"""
if value is None:
return None
if isinstance(value, str) and value.isdigit():
try:
return int(value)
except Exception:
return value
return value
"""Normalize PK value."""
return normalize_pk_value(value)
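# Sketch of the normalization: digit-only strings collapse to int, so "42"
# and 42 compare equal as PK values; everything else passes through as-is.
# >>> normalize_pk_value("42")
# 42
# >>> normalize_pk_value("A42")
# 'A42'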
def _pk_tuple_from_record(record: dict, pk_cols: List[str]) -> Optional[Tuple]:
"""从记录中提取 PK 元组"""
merged = _merge_record_layers(record)
values = []
for col in pk_cols:
val = _normalize_pk_value(_get_value_case_insensitive(merged, col))
if val is None or val == "":
return None
values.append(val)
return tuple(values)
"""Extract PK tuple from record."""
return pk_tuple_from_record(record, pk_cols)
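# Composed sketch (hypothetical pk_cols): merge layers, resolve each column
# case-insensitively, normalize, and return None if any PK part is empty.
# >>> pk_tuple_from_record({"data": {"OrderId": "42", "Seq": 1}}, ["orderid", "seq"])
# (42, 1)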
def _get_table_pk_columns(conn, table: str) -> List[str]:
def _get_table_pk_columns(conn, table: str, *, include_content_hash: bool = False) -> List[str]:
"""获取表的主键列"""
if "." in table:
schema, name = table.split(".", 1)
@@ -142,7 +118,10 @@ def _get_table_pk_columns(conn, table: str) -> List[str]:
"""
with conn.cursor() as cur:
cur.execute(sql, (schema, name))
return [r[0] for r in cur.fetchall()]
cols = [r[0] for r in cur.fetchall()]
if include_content_hash:
return cols
return [c for c in cols if c.lower() != "content_hash"]
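# The sql assembled above sits outside this hunk; a typical PostgreSQL
# primary-key lookup (an assumption about its shape, not this repo's exact text):
# SELECT kcu.column_name
# FROM information_schema.table_constraints tc
# JOIN information_schema.key_column_usage kcu
#   ON kcu.constraint_name = tc.constraint_name
#   AND kcu.table_schema = tc.table_schema
# WHERE tc.constraint_type = 'PRIMARY KEY'
#   AND tc.table_schema = %s AND tc.table_name = %s
# ORDER BY kcu.ordinal_position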
def _get_table_columns(conn, table: str) -> List[Tuple[str, str, str]]:
@@ -247,6 +226,13 @@ class MissingDataBackfiller:
"""关闭连接"""
if self.db:
self.db.close()
def _ensure_db(self):
"""确保数据库连接可用"""
if self.db and getattr(self.db, "conn", None) is not None:
if getattr(self.db.conn, "closed", 0) == 0:
return
self.db = DatabaseConnection(dsn=self.cfg["db"]["dsn"], session=self.cfg["db"].get("session"))
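# Note: psycopg2 connections report conn.closed == 0 while open, so the
# guard above reconnects only after a real close or drop (assuming
# DatabaseConnection wraps psycopg2, as the VALUES %s insert style suggests).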
def backfill_from_gap_check(
self,
@@ -254,8 +240,10 @@ class MissingDataBackfiller:
start: datetime,
end: datetime,
task_codes: Optional[str] = None,
include_mismatch: bool = False,
page_size: int = 200,
chunk_size: int = 500,
content_sample_limit: int | None = None,
) -> Dict[str, Any]:
"""
Run the gap check and backfill missing data
@@ -292,16 +280,21 @@ class MissingDataBackfiller:
cutoff_overlap_hours=24,
allow_small_window=True,
logger=self.logger,
compare_content=include_mismatch,
content_sample_limit=content_sample_limit or 10000,
)
total_missing = gap_result.get("total_missing", 0)
if total_missing == 0:
self.logger.info("数据完整,无缺失记录")
total_mismatch = gap_result.get("total_mismatch", 0)
if total_missing == 0 and (not include_mismatch or total_mismatch == 0):
self.logger.info("Data complete: no missing/mismatch records")
return {"backfilled": 0, "errors": 0, "details": []}
self.logger.info("缺失检查完成 总缺失=%s", total_missing)
if include_mismatch:
self.logger.info("Missing/mismatch check done missing=%s mismatch=%s", total_missing, total_mismatch)
else:
self.logger.info("Missing check done missing=%s", total_missing)
# Backfill missing data for each task
results = []
total_backfilled = 0
total_errors = 0
@@ -310,13 +303,16 @@ class MissingDataBackfiller:
task_code = task_result.get("task_code")
missing = task_result.get("missing", 0)
missing_samples = task_result.get("missing_samples", [])
mismatch = task_result.get("mismatch", 0) if include_mismatch else 0
mismatch_samples = task_result.get("mismatch_samples", []) if include_mismatch else []
target_samples = list(missing_samples) + list(mismatch_samples)
if missing == 0:
if missing == 0 and mismatch == 0:
continue
self.logger.info(
"开始补全任务 任务=%s 缺失=%s 样本数=%s",
task_code, missing, len(missing_samples)
"Start backfill task task=%s missing=%s mismatch=%s samples=%s",
task_code, missing, mismatch, len(target_samples)
)
try:
@@ -324,7 +320,7 @@ class MissingDataBackfiller:
task_code=task_code,
table=task_result.get("table"),
pk_columns=task_result.get("pk_columns", []),
missing_samples=missing_samples,
pk_samples=target_samples,
start=start,
end=end,
page_size=page_size,
@@ -333,6 +329,7 @@ class MissingDataBackfiller:
results.append({
"task_code": task_code,
"missing": missing,
"mismatch": mismatch,
"backfilled": backfilled,
"error": None,
})
@@ -342,6 +339,7 @@ class MissingDataBackfiller:
results.append({
"task_code": task_code,
"missing": missing,
"mismatch": mismatch,
"backfilled": 0,
"error": str(exc),
})
@@ -354,6 +352,7 @@ class MissingDataBackfiller:
return {
"total_missing": total_missing,
"total_mismatch": total_mismatch,
"backfilled": total_backfilled,
"errors": total_errors,
"details": results,
@@ -365,20 +364,25 @@ class MissingDataBackfiller:
task_code: str,
table: str,
pk_columns: List[str],
missing_samples: List[Dict],
pk_samples: List[Dict],
start: datetime,
end: datetime,
page_size: int,
chunk_size: int,
) -> int:
"""补全单个任务的丢失数据"""
self._ensure_db()
spec = _get_spec(task_code)
if not spec:
self.logger.warning("未找到任务规格 任务=%s", task_code)
return 0
if not pk_columns:
pk_columns = _get_table_pk_columns(self.db.conn, table)
pk_columns = _get_table_pk_columns(self.db.conn, table, include_content_hash=False)
conflict_columns = _get_table_pk_columns(self.db.conn, table, include_content_hash=True)
if not conflict_columns:
conflict_columns = pk_columns
if not pk_columns:
self.logger.warning("未找到主键列 任务=%s 表=%s", task_code, table)
@@ -386,7 +390,7 @@ class MissingDataBackfiller:
# Extract the missing PK values
missing_pks: Set[Tuple] = set()
for sample in missing_samples:
for sample in pk_samples:
pk_tuple = tuple(sample.get(col) for col in pk_columns)
if all(v is not None for v in pk_tuple):
missing_pks.add(pk_tuple)
@@ -410,6 +414,12 @@ class MissingDataBackfiller:
if c[1] in ("json", "jsonb") or c[2] in ("json", "jsonb")
}
col_names = [c[0] for c in cols_info]
# End the read-only transaction so long API fetches do not trip the idle_in_tx timeout
try:
self.db.conn.commit()
except Exception:
self.db.conn.rollback()
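# (idle_in_tx refers to PostgreSQL's idle_in_transaction_session_timeout,
# which terminates sessions that hold a transaction open without issuing
# statements; a slow paginated API pull would otherwise trigger it here.)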
try:
for page_no, records, _, response_payload in self.api.iter_paginated(
@@ -444,9 +454,12 @@ class MissingDataBackfiller:
records=records_to_insert,
cols_info=cols_info,
pk_columns=pk_columns,
conflict_columns=conflict_columns,
db_json_cols_lower=db_json_cols_lower,
)
backfilled += inserted
# Avoid long-transaction blocking and idle_in_tx timeouts
self.db.conn.commit()
self.logger.info(
"已插入 任务=%s 页=%s 数量=%s",
task_code, page_no, inserted
@@ -498,6 +511,7 @@ class MissingDataBackfiller:
records: List[Dict],
cols_info: List[Tuple[str, str, str]],
pk_columns: List[str],
conflict_columns: List[str],
db_json_cols_lower: Set[str],
) -> int:
"""插入记录到数据库"""
@@ -505,10 +519,12 @@ class MissingDataBackfiller:
return 0
col_names = [c[0] for c in cols_info]
needs_content_hash = any(c[0].lower() == "content_hash" for c in cols_info)
quoted_cols = ", ".join(f'"{c}"' for c in col_names)
sql = f"INSERT INTO {table} ({quoted_cols}) VALUES %s"
if pk_columns:
pk_clause = ", ".join(f'"{c}"' for c in pk_columns)
conflict_cols = conflict_columns or pk_columns
if conflict_cols:
pk_clause = ", ".join(f'"{c}"' for c in conflict_cols)
sql += f" ON CONFLICT ({pk_clause}) DO NOTHING"
now = datetime.now(self.tz)
@@ -522,12 +538,20 @@ class MissingDataBackfiller:
if pk_columns:
missing_pk = False
for pk in pk_columns:
if str(pk).lower() == "content_hash":
continue
pk_val = _get_value_case_insensitive(merged_rec, pk)
if pk_val is None or pk_val == "":
missing_pk = True
break
if missing_pk:
continue
content_hash = None
if needs_content_hash:
hash_record = dict(merged_rec)
hash_record["fetched_at"] = now
content_hash = BaseOdsTask._compute_content_hash(hash_record, include_fetched_at=True)
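# _compute_content_hash is the shared helper imported via BaseOdsTask; calling
# it with include_fetched_at=True and the same fetched_at value written to the
# row should keep backfilled hashes comparable to those the regular ODS tasks
# produce. Its exact digest scheme lives in tasks.ods_tasks, not here.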
row_vals: List[Any] = []
for (col_name, data_type, _udt) in cols_info:
@@ -544,6 +568,9 @@ class MissingDataBackfiller:
if col_lower == "fetched_at":
row_vals.append(now)
continue
if col_lower == "content_hash":
row_vals.append(content_hash)
continue
value = _normalize_scalar(_get_value_case_insensitive(merged_rec, col_name))
if col_lower in db_json_cols_lower:
@@ -574,9 +601,11 @@ def run_backfill(
start: datetime,
end: datetime,
task_codes: Optional[str] = None,
include_mismatch: bool = False,
dry_run: bool = False,
page_size: int = 200,
chunk_size: int = 500,
content_sample_limit: int | None = None,
logger: logging.Logger,
) -> Dict[str, Any]:
"""
@@ -601,8 +630,10 @@ def run_backfill(
start=start,
end=end,
task_codes=task_codes,
include_mismatch=include_mismatch,
page_size=page_size,
chunk_size=chunk_size,
content_sample_limit=content_sample_limit,
)
finally:
backfiller.close()
@@ -615,6 +646,8 @@ def main() -> int:
ap.add_argument("--start", default="2025-07-01", help="开始日期 (默认: 2025-07-01)")
ap.add_argument("--end", default="", help="结束日期 (默认: 当前时间)")
ap.add_argument("--task-codes", default="", help="指定任务代码(逗号分隔,留空=全部)")
ap.add_argument("--include-mismatch", action="store_true", help="同时补全内容不一致的记录")
ap.add_argument("--content-sample-limit", type=int, default=None, help="不一致样本上限 (默认: 10000)")
ap.add_argument("--dry-run", action="store_true", help="仅预览,不实际写入")
ap.add_argument("--page-size", type=int, default=200, help="API 分页大小 (默认: 200)")
ap.add_argument("--chunk-size", type=int, default=500, help="数据库批量大小 (默认: 500)")
@@ -646,15 +679,19 @@ def main() -> int:
start=start,
end=end,
task_codes=args.task_codes or None,
include_mismatch=args.include_mismatch,
dry_run=args.dry_run,
page_size=args.page_size,
chunk_size=args.chunk_size,
content_sample_limit=args.content_sample_limit,
logger=logger,
)
logger.info("=" * 60)
logger.info("补全完成!")
logger.info(" 总丢失: %s", result.get("total_missing", 0))
if args.include_mismatch:
logger.info(" 总不一致: %s", result.get("total_mismatch", 0))
logger.info(" 已补全: %s", result.get("backfilled", 0))
logger.info(" 错误数: %s", result.get("errors", 0))
logger.info("=" * 60)
@@ -663,17 +700,19 @@ def main() -> int:
for detail in result.get("details", []):
if detail.get("error"):
logger.error(
" %s: 丢失=%s 补全=%s 错误=%s",
" %s: 丢失=%s 不一致=%s 补全=%s 错误=%s",
detail.get("task_code"),
detail.get("missing"),
detail.get("mismatch", 0),
detail.get("backfilled"),
detail.get("error"),
)
elif detail.get("backfilled", 0) > 0:
logger.info(
" %s: 丢失=%s 补全=%s",
" %s: 丢失=%s 不一致=%s 补全=%s",
detail.get("task_code"),
detail.get("missing"),
detail.get("mismatch", 0),
detail.get("backfilled"),
)