数据库数据校验写入等逻辑更新。

2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions
--- a/etl_billiards/tasks/manual_ingest_task.py
+++ b/etl_billiards/tasks/manual_ingest_task.py
@@ -2,6 +2,7 @@
 """手工示例数据灌入：按 schema_ODS_doc.sql 的表结构写入 ODS。"""
 from __future__ import annotations

+import hashlib
 import json
 import os
 from datetime import datetime
@@ -252,12 +253,17 @@ class ManualIngestTask(BaseTask):
            except Exception:
                pk_index = None

+        has_content_hash = any(c[0].lower() == "content_hash" for c in columns_info)
+
        col_list = ", ".join(f'"{c}"' for c in columns)
        sql_prefix = f"INSERT INTO {table} ({col_list}) VALUES %s"
        if pk_col_db:
-            update_cols = [c for c in columns if c != pk_col_db]
-            set_clause = ", ".join(f'"{c}"=EXCLUDED."{c}"' for c in update_cols)
-            sql_prefix += f' ON CONFLICT ("{pk_col_db}") DO UPDATE SET {set_clause}'
+            if has_content_hash:
+                sql_prefix += f' ON CONFLICT ("{pk_col_db}", "content_hash") DO NOTHING'
+            else:
+                update_cols = [c for c in columns if c != pk_col_db]
+                set_clause = ", ".join(f'"{c}"=EXCLUDED."{c}"' for c in update_cols)
+                sql_prefix += f' ON CONFLICT ("{pk_col_db}") DO UPDATE SET {set_clause}'

        params = []
        now = datetime.now()
@@ -284,6 +290,12 @@ class ManualIngestTask(BaseTask):
            if pk_col and (pk_val is None or pk_val == ""):
                continue

+            content_hash = None
+            if has_content_hash:
+                hash_record = dict(merged_rec)
+                hash_record["fetched_at"] = merged_rec.get("fetched_at", now)
+                content_hash = self._compute_content_hash(hash_record, include_fetched_at=True)
+
            row_vals = []
            for col_name, data_type, udt in columns_info:
                col_lower = col_name.lower()
@@ -296,6 +308,9 @@ class ManualIngestTask(BaseTask):
                if col_lower == "fetched_at":
                    row_vals.append(merged_rec.get(col_name, now))
                    continue
+                if col_lower == "content_hash":
+                    row_vals.append(content_hash)
+                    continue

                value = self._normalize_scalar(self._get_value_case_insensitive(merged_rec, col_name))

@@ -401,3 +416,48 @@ class ManualIngestTask(BaseTask):
        if dt.startswith("timestamp") or dt in ("date", "time", "interval"):
            return value if isinstance(value, str) else None
        return value
+
+    @staticmethod
+    def _hash_default(value):  # json.dumps `default=` hook: text form for values json cannot encode natively
+        if isinstance(value, datetime):
+            return value.isoformat()  # datetimes are rendered via isoformat()
+        return str(value)  # every other unsupported type falls back to its str() form
+
+    @classmethod
+    def _sanitize_record_for_hash(cls, record: dict, *, include_fetched_at: bool) -> dict:  # rebuild `record` without the excluded keys
+        exclude = {  # lower-case key names that are dropped from the result
+            "data",
+            "payload",
+            "source_file",
+            "source_endpoint",
+            "content_hash",
+            "record_index",
+        }
+        if not include_fetched_at:
+            exclude.add("fetched_at")  # "fetched_at" survives only when the caller asks to include it
+
+        def _strip(value):  # recursively rebuild dicts and lists; any other value is returned unchanged
+            if isinstance(value, dict):
+                cleaned = {}
+                for k, v in value.items():
+                    if isinstance(k, str) and k.lower() in exclude:  # case-insensitive match, applied at every nesting depth
+                        continue
+                    cleaned[k] = _strip(v)
+                return cleaned
+            if isinstance(value, list):
+                return [_strip(v) for v in value]
+            return value
+
+        return _strip(record or {})  # a None or empty record yields {}
+
+    @classmethod
+    def _compute_content_hash(cls, record: dict, *, include_fetched_at: bool) -> str:  # SHA-256 hex digest of the sanitized record
+        cleaned = cls._sanitize_record_for_hash(record, include_fetched_at=include_fetched_at)
+        payload = json.dumps(  # serialize to compact JSON text before hashing
+            cleaned,
+            ensure_ascii=False,  # non-ASCII characters are kept as-is and UTF-8 encoded below
+            sort_keys=True,  # dict key order of the input does not influence the digest
+            separators=(",", ":"),  # no whitespace between JSON tokens
+            default=cls._hash_default,  # fallback for values json cannot encode natively
+        )
+        return hashlib.sha256(payload.encode("utf-8")).hexdigest()