数据库 数据校验写入等逻辑更新。
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
"""手工示例数据灌入:按 schema_ODS_doc.sql 的表结构写入 ODS。"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
@@ -252,12 +253,17 @@ class ManualIngestTask(BaseTask):
|
||||
except Exception:
|
||||
pk_index = None
|
||||
|
||||
has_content_hash = any(c[0].lower() == "content_hash" for c in columns_info)
|
||||
|
||||
col_list = ", ".join(f'"{c}"' for c in columns)
|
||||
sql_prefix = f"INSERT INTO {table} ({col_list}) VALUES %s"
|
||||
if pk_col_db:
|
||||
update_cols = [c for c in columns if c != pk_col_db]
|
||||
set_clause = ", ".join(f'"{c}"=EXCLUDED."{c}"' for c in update_cols)
|
||||
sql_prefix += f' ON CONFLICT ("{pk_col_db}") DO UPDATE SET {set_clause}'
|
||||
if has_content_hash:
|
||||
sql_prefix += f' ON CONFLICT ("{pk_col_db}", "content_hash") DO NOTHING'
|
||||
else:
|
||||
update_cols = [c for c in columns if c != pk_col_db]
|
||||
set_clause = ", ".join(f'"{c}"=EXCLUDED."{c}"' for c in update_cols)
|
||||
sql_prefix += f' ON CONFLICT ("{pk_col_db}") DO UPDATE SET {set_clause}'
|
||||
|
||||
params = []
|
||||
now = datetime.now()
|
||||
@@ -284,6 +290,12 @@ class ManualIngestTask(BaseTask):
|
||||
if pk_col and (pk_val is None or pk_val == ""):
|
||||
continue
|
||||
|
||||
content_hash = None
|
||||
if has_content_hash:
|
||||
hash_record = dict(merged_rec)
|
||||
hash_record["fetched_at"] = merged_rec.get("fetched_at", now)
|
||||
content_hash = self._compute_content_hash(hash_record, include_fetched_at=True)
|
||||
|
||||
row_vals = []
|
||||
for col_name, data_type, udt in columns_info:
|
||||
col_lower = col_name.lower()
|
||||
@@ -296,6 +308,9 @@ class ManualIngestTask(BaseTask):
|
||||
if col_lower == "fetched_at":
|
||||
row_vals.append(merged_rec.get(col_name, now))
|
||||
continue
|
||||
if col_lower == "content_hash":
|
||||
row_vals.append(content_hash)
|
||||
continue
|
||||
|
||||
value = self._normalize_scalar(self._get_value_case_insensitive(merged_rec, col_name))
|
||||
|
||||
@@ -401,3 +416,48 @@ class ManualIngestTask(BaseTask):
|
||||
if dt.startswith("timestamp") or dt in ("date", "time", "interval"):
|
||||
return value if isinstance(value, str) else None
|
||||
return value
|
||||
|
||||
@staticmethod
|
||||
def _hash_default(value):
|
||||
if isinstance(value, datetime):
|
||||
return value.isoformat()
|
||||
return str(value)
|
||||
|
||||
@classmethod
|
||||
def _sanitize_record_for_hash(cls, record: dict, *, include_fetched_at: bool) -> dict:
|
||||
exclude = {
|
||||
"data",
|
||||
"payload",
|
||||
"source_file",
|
||||
"source_endpoint",
|
||||
"content_hash",
|
||||
"record_index",
|
||||
}
|
||||
if not include_fetched_at:
|
||||
exclude.add("fetched_at")
|
||||
|
||||
def _strip(value):
|
||||
if isinstance(value, dict):
|
||||
cleaned = {}
|
||||
for k, v in value.items():
|
||||
if isinstance(k, str) and k.lower() in exclude:
|
||||
continue
|
||||
cleaned[k] = _strip(v)
|
||||
return cleaned
|
||||
if isinstance(value, list):
|
||||
return [_strip(v) for v in value]
|
||||
return value
|
||||
|
||||
return _strip(record or {})
|
||||
|
||||
@classmethod
def _compute_content_hash(cls, record: dict, *, include_fetched_at: bool) -> str:
    """Return the SHA-256 hex digest of the record's canonical JSON form.

    The record is first passed through ``_sanitize_record_for_hash``, then
    serialized with sorted keys and compact separators so that key order
    and whitespace do not affect the digest.

    Args:
        record: The record to fingerprint.
        include_fetched_at: Forwarded to ``_sanitize_record_for_hash``.

    Returns:
        The hexadecimal SHA-256 digest of the UTF-8 encoded JSON text.
    """
    hashable = cls._sanitize_record_for_hash(
        record, include_fetched_at=include_fetched_at
    )
    # Values the encoder cannot handle natively go through _hash_default.
    canonical_text = json.dumps(
        hashable,
        default=cls._hash_default,
        separators=(",", ":"),
        sort_keys=True,
        ensure_ascii=False,
    )
    digest = hashlib.sha256()
    digest.update(canonical_text.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user