"""诊断:NULL level_name 助教的 SCD2 最早记录 vs daily 最早日期""" import os, sys from pathlib import Path from dotenv import load_dotenv load_dotenv(Path(__file__).resolve().parents[2] / ".env") import psycopg2 import psycopg2.extras dsn = os.environ.get("PG_DSN") if not dsn: raise RuntimeError("PG_DSN 未设置") conn = psycopg2.connect(dsn) cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) # 对比每个 NULL assistant_id 的 SCD2 最早时间 vs daily 最早日期 cur.execute(""" WITH null_aids AS ( SELECT DISTINCT assistant_id FROM dws.dws_assistant_daily_detail WHERE assistant_level_name IS NULL ), scd2_earliest AS ( SELECT da.assistant_id, MIN(da.scd2_start_time) AS earliest_scd2 FROM dwd.dim_assistant da JOIN null_aids n ON da.assistant_id = n.assistant_id GROUP BY da.assistant_id ), daily_earliest AS ( SELECT d.assistant_id, MIN(d.stat_date) AS earliest_daily FROM dws.dws_assistant_daily_detail d JOIN null_aids n ON d.assistant_id = n.assistant_id WHERE d.assistant_level_name IS NULL GROUP BY d.assistant_id ) SELECT de.assistant_id, de.earliest_daily, se.earliest_scd2, se.earliest_scd2::date AS scd2_date, CASE WHEN de.earliest_daily < se.earliest_scd2::date THEN 'DAILY_BEFORE_SCD2' ELSE 'SCD2_COVERS' END AS status FROM daily_earliest de LEFT JOIN scd2_earliest se ON de.assistant_id = se.assistant_id ORDER BY de.earliest_daily """) rows = cur.fetchall() print(f"=== {len(rows)} 个 NULL level_name 助教 ===") before_count = 0 for r in rows: status = r['status'] if status == 'DAILY_BEFORE_SCD2': before_count += 1 print(f" aid={r['assistant_id']}, daily_earliest={r['earliest_daily']}, " f"scd2_earliest={r['scd2_date']}, status={status}") print(f"\n总计: {before_count}/{len(rows)} 个助教的 daily 数据早于 SCD2 首条记录") cur.close() conn.close()