数据库 数据校验写入等逻辑更新。

This commit is contained in:
Neo
2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions

View File

@@ -2,6 +2,7 @@
"""手工示例数据灌入:按 schema_ODS_doc.sql 的表结构写入 ODS。"""
from __future__ import annotations
import hashlib
import json
import os
from datetime import datetime
@@ -252,12 +253,17 @@ class ManualIngestTask(BaseTask):
except Exception:
pk_index = None
has_content_hash = any(c[0].lower() == "content_hash" for c in columns_info)
col_list = ", ".join(f'"{c}"' for c in columns)
sql_prefix = f"INSERT INTO {table} ({col_list}) VALUES %s"
if pk_col_db:
update_cols = [c for c in columns if c != pk_col_db]
set_clause = ", ".join(f'"{c}"=EXCLUDED."{c}"' for c in update_cols)
sql_prefix += f' ON CONFLICT ("{pk_col_db}") DO UPDATE SET {set_clause}'
if has_content_hash:
sql_prefix += f' ON CONFLICT ("{pk_col_db}", "content_hash") DO NOTHING'
else:
update_cols = [c for c in columns if c != pk_col_db]
set_clause = ", ".join(f'"{c}"=EXCLUDED."{c}"' for c in update_cols)
sql_prefix += f' ON CONFLICT ("{pk_col_db}") DO UPDATE SET {set_clause}'
params = []
now = datetime.now()
@@ -284,6 +290,12 @@ class ManualIngestTask(BaseTask):
if pk_col and (pk_val is None or pk_val == ""):
continue
content_hash = None
if has_content_hash:
hash_record = dict(merged_rec)
hash_record["fetched_at"] = merged_rec.get("fetched_at", now)
content_hash = self._compute_content_hash(hash_record, include_fetched_at=True)
row_vals = []
for col_name, data_type, udt in columns_info:
col_lower = col_name.lower()
@@ -296,6 +308,9 @@ class ManualIngestTask(BaseTask):
if col_lower == "fetched_at":
row_vals.append(merged_rec.get(col_name, now))
continue
if col_lower == "content_hash":
row_vals.append(content_hash)
continue
value = self._normalize_scalar(self._get_value_case_insensitive(merged_rec, col_name))
@@ -401,3 +416,48 @@ class ManualIngestTask(BaseTask):
if dt.startswith("timestamp") or dt in ("date", "time", "interval"):
return value if isinstance(value, str) else None
return value
@staticmethod
def _hash_default(value):
if isinstance(value, datetime):
return value.isoformat()
return str(value)
@classmethod
def _sanitize_record_for_hash(cls, record: dict, *, include_fetched_at: bool) -> dict:
exclude = {
"data",
"payload",
"source_file",
"source_endpoint",
"content_hash",
"record_index",
}
if not include_fetched_at:
exclude.add("fetched_at")
def _strip(value):
if isinstance(value, dict):
cleaned = {}
for k, v in value.items():
if isinstance(k, str) and k.lower() in exclude:
continue
cleaned[k] = _strip(v)
return cleaned
if isinstance(value, list):
return [_strip(v) for v in value]
return value
return _strip(record or {})
@classmethod
def _compute_content_hash(cls, record: dict, *, include_fetched_at: bool) -> str:
    """Return the SHA-256 hex digest of *record*'s canonical JSON form.

    The record is first filtered by ``_sanitize_record_for_hash``. It is then
    dumped as JSON with sorted keys, compact separators and non-ASCII
    characters left unescaped, so the digest does not depend on key order.
    Values the encoder cannot handle go through ``_hash_default``.

    Args:
        record: The record to fingerprint.
        include_fetched_at: Forwarded to ``_sanitize_record_for_hash``;
            when true, ``fetched_at`` contributes to the digest.

    Returns:
        The hexadecimal SHA-256 digest of the UTF-8 encoded JSON text.
    """
    sanitized = cls._sanitize_record_for_hash(
        record, include_fetched_at=include_fetched_at
    )
    # These options define the canonical form; changing any of them changes
    # every digest this method produces.
    dump_options = {
        "ensure_ascii": False,
        "sort_keys": True,
        "separators": (",", ":"),
        "default": cls._hash_default,
    }
    canonical = json.dumps(sanitized, **dump_options)
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()