# -*- coding: utf-8 -*- """ 修复 ODS_ASSISTANT_LEDGER 误删记录(2025-11-21 ~ 2025-11-23)。 背景: run_id 8932(2026-02-24 00:24)快照对比时,recent endpoint 因数据保留期滚动 丢失了 2025-11-21~2025-11-23 的 67 条记录,_mark_missing_as_deleted 将其误标 为 is_delete=1。 修复策略: 1. 调 Former endpoint 拉取 2025-11-01 ~ 2025-11-24 的完整数据 2. 用 ODS 任务的 _insert_records_schema_aware 入库(content_hash 去重保证幂等) 3. 对比 ODS 中 is_delete=1 但 Former 返回 is_delete=0 的记录,INSERT 修正版本行 4. 完成后提示用户跑 DWD 加载 用法: cd apps/etl/connectors/feiqiu python ../../../../scripts/ops/fix_assistant_ledger_misdelete.py [--dry-run] """ from __future__ import annotations import argparse import json import sys from datetime import datetime from pathlib import Path from zoneinfo import ZoneInfo # 加载环境变量 from dotenv import load_dotenv _ROOT = Path(__file__).resolve().parents[2] load_dotenv(_ROOT / ".env", override=False) _FEIQIU_ENV = _ROOT / "apps" / "etl" / "connectors" / "feiqiu" / ".env" if _FEIQIU_ENV.exists(): load_dotenv(_FEIQIU_ENV, override=False) # 确保 ETL 模块可导入 sys.path.insert(0, str(_ROOT / "apps" / "etl" / "connectors" / "feiqiu")) from config.settings import AppConfig from api.client import APIClient from database.connection import DatabaseConnection TZ = ZoneInfo("Asia/Shanghai") FORMER_ENDPOINT = "/AssistantPerformance/GetFormerOrderAssistantDetails" TABLE = "ods.assistant_service_records" STORE_ID = 2790685415443269 WINDOW_START = "2025-11-01 00:00:00" WINDOW_END = "2025-11-24 00:00:00" def parse_args(): p = argparse.ArgumentParser(description="修复 ODS_ASSISTANT_LEDGER 误删记录") p.add_argument("--dry-run", action="store_true", help="仅查询不写入") return p.parse_args() def fetch_former_records(api: APIClient) -> list[dict]: """调 Former endpoint 拉取指定窗口的全部记录。""" params = { "siteId": STORE_ID, "startTime": WINDOW_START, "endTime": WINDOW_END, } all_records, _ = api.get_paginated( endpoint=FORMER_ENDPOINT, params=params, page_size=200, data_path=("data",), list_key="orderAssistantDetails", ) return all_records def find_misdeleted_ids(db: DatabaseConnection) -> set[int]: """查询 ODS 中被误标 is_delete=1 的记录 ID(窗口内最新版本)。""" sql = """ SELECT DISTINCT ON (id) id, is_delete, fetched_at FROM ods.assistant_service_records WHERE create_time >= %s AND create_time < %s ORDER BY id, fetched_at DESC NULLS LAST """ rows = db.query(sql, (WINDOW_START, WINDOW_END)) return {r["id"] for r in rows if r["is_delete"] == 1} def get_table_columns(db: DatabaseConnection) -> list[str]: """获取 ODS 表的列名列表。""" sql = """ SELECT column_name FROM information_schema.columns WHERE table_schema = 'ods' AND table_name = 'assistant_service_records' ORDER BY ordinal_position """ return [r["column_name"] for r in db.query(sql)] def insert_correction_rows( db: DatabaseConnection, former_records: list[dict], misdeleted_ids: set[int], columns: list[str], dry_run: bool, ) -> int: """为误删记录插入修正版本行(is_delete=0,新 fetched_at)。 策略:从 Former API 返回的原始数据构造 ODS 行, content_hash 基于 payload + is_delete=0 计算,ON CONFLICT DO NOTHING 保证幂等。 """ import hashlib now = datetime.now(TZ) corrected = 0 for rec in former_records: rec_id = rec.get("id") if rec_id is None: continue try: rec_id = int(rec_id) except (ValueError, TypeError): continue if rec_id not in misdeleted_ids: continue # 构造 payload JSON payload_json = json.dumps(rec, ensure_ascii=False, sort_keys=True) # content_hash = md5(payload_json + "|is_delete=0") hash_input = payload_json + "|is_delete=0" content_hash = hashlib.md5(hash_input.encode("utf-8")).hexdigest() # 从 payload 提取 create_time raw_ct = rec.get("create_time") or rec.get("createTime") or rec.get("Create_time") create_time_val = None if raw_ct: try: from dateutil import parser as dtparser create_time_val = dtparser.parse(str(raw_ct)) except (ValueError, TypeError): pass if dry_run: print(f" [DRY-RUN] 将修正 id={rec_id}, create_time={create_time_val}, content_hash={content_hash}") corrected += 1 continue # INSERT 修正行(含 create_time) sql = """ INSERT INTO ods.assistant_service_records (id, payload, is_delete, content_hash, fetched_at, source_file, create_time) VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (id, content_hash) DO NOTHING """ from psycopg2.extras import Json as PgJson db.execute(sql, ( rec_id, PgJson(rec, dumps=lambda v: json.dumps(v, ensure_ascii=False)), 0, content_hash, now, f"fix_misdelete_former_{WINDOW_START[:10]}_{WINDOW_END[:10]}", create_time_val, )) corrected += 1 return corrected def main(): args = parse_args() config = AppConfig.load() dsn = config.get("db.dsn") if not dsn: raise RuntimeError("db.dsn 未配置") print(f"=== 修复 ODS_ASSISTANT_LEDGER 误删记录 ===") print(f"窗口: {WINDOW_START} ~ {WINDOW_END}") print(f"Former endpoint: {FORMER_ENDPOINT}") print(f"目标表: {TABLE}") if args.dry_run: print("[DRY-RUN 模式]") print() # 1. 连接数据库 db = DatabaseConnection(dsn, session={"timezone": "Asia/Shanghai"}) print("数据库连接成功") # 2. 查询当前误删记录 misdeleted = find_misdeleted_ids(db) print(f"ODS 中窗口内 is_delete=1 的记录数: {len(misdeleted)}") if not misdeleted: print("无需修复,退出") db.close() return # 3. 调 Former endpoint 拉取数据 api = APIClient( base_url=config.get("api.base_url"), token=config.get("api.token"), timeout=config.get("api.timeout", 20), retry_max=config.get("api.retry_max", 3), ) print(f"正在调用 Former endpoint...") former_records = fetch_former_records(api) print(f"Former endpoint 返回 {len(former_records)} 条记录") # 4. 匹配:Former 返回的记录中,哪些在 ODS 被误标为 is_delete=1 former_ids = set() for rec in former_records: rid = rec.get("id") if rid is not None: try: former_ids.add(int(rid)) except (ValueError, TypeError): pass recoverable = misdeleted & former_ids print(f"可修复记录数: {len(recoverable)} (ODS误删={len(misdeleted)}, Former返回={len(former_ids)})") if not recoverable: print("Former endpoint 未返回任何误删记录,退出") db.close() return # 5. 获取表结构 columns = get_table_columns(db) # 6. 插入修正版本行 corrected = insert_correction_rows(db, former_records, recoverable, columns, args.dry_run) if not args.dry_run: db.commit() print(f"\n已插入 {corrected} 条修正版本行(is_delete=0)") print("\n下一步:跑 DWD 加载以同步修正数据到 DWD 层") print(" cd apps/etl/connectors/feiqiu") print(' python -m cli.main --tasks DWD_LOAD_FROM_ODS --window-start "2025-11-01" --window-end "2025-11-24" --force-window-override') else: print(f"\n[DRY-RUN] 将修正 {corrected} 条记录") db.close() print("\n完成") if __name__ == "__main__": main()