在准备环境前提交此次全部更改。

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

View File

@@ -0,0 +1,193 @@
# -*- coding: utf-8 -*-
"""Run data integrity checks across API -> ODS -> DWD."""
from __future__ import annotations
import argparse
import sys
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
from dateutil import parser as dtparser
from config.settings import AppConfig
from quality.integrity_service import run_history_flow, run_window_flow, write_report
from utils.logging_utils import build_log_path, configure_logging
from utils.windowing import split_window
def _parse_dt(value: str, tz: ZoneInfo) -> datetime:
    """Parse *value* into a timezone-aware datetime expressed in *tz*.

    A naive input is assumed to already be local to *tz*; an aware input is
    converted into *tz*.
    """
    parsed = dtparser.parse(value)
    if parsed.tzinfo is not None:
        return parsed.astimezone(tz)
    return parsed.replace(tzinfo=tz)
def main() -> int:
    """CLI entry point for data integrity checks across API -> ODS -> DWD.

    Builds the argument parser, merges CLI flags with config defaults
    (paired --x/--no-x flags act as a tri-state: neither flag means "use the
    config value"), then dispatches to the windowed or full-history flow and
    writes a JSON report.  Returns the process exit code (0 on success).
    """
    # Best-effort UTF-8 stdout so non-ASCII output survives redirection on
    # platforms with a narrower default encoding.
    if hasattr(sys.stdout, "reconfigure"):
        try:
            sys.stdout.reconfigure(encoding="utf-8")
        except Exception:
            pass
    ap = argparse.ArgumentParser(description="Data integrity checks (API -> ODS -> DWD)")
    ap.add_argument("--mode", choices=["history", "window"], default="history")
    ap.add_argument(
        "--flow",
        choices=["verify", "update_and_verify"],
        default="verify",
        help="verify only or update+verify (auto backfill then optional recheck)",
    )
    ap.add_argument("--start", default="2025-07-01", help="history start date (default: 2025-07-01)")
    ap.add_argument("--end", default="", help="history end datetime (default: last ETL end)")
    ap.add_argument("--window-start", default="", help="window start datetime (mode=window)")
    ap.add_argument("--window-end", default="", help="window end datetime (mode=window)")
    ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
    ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
    # default=None (not False) so "flag absent" stays distinguishable from an
    # explicit choice when merged with the config default below.
    ap.add_argument(
        "--include-dimensions",
        action="store_true",
        default=None,
        help="include dimension tables in ODS->DWD checks",
    )
    ap.add_argument(
        "--no-include-dimensions",
        action="store_true",
        help="exclude dimension tables in ODS->DWD checks",
    )
    ap.add_argument("--ods-task-codes", default="", help="comma-separated ODS task codes for API checks")
    ap.add_argument("--compare-content", action="store_true", help="compare API vs ODS content hash")
    ap.add_argument("--no-compare-content", action="store_true", help="disable content comparison even if enabled in config")
    ap.add_argument("--include-mismatch", action="store_true", help="backfill mismatch records as well")
    ap.add_argument("--no-include-mismatch", action="store_true", help="disable mismatch backfill")
    ap.add_argument("--recheck", action="store_true", help="re-run checks after backfill")
    ap.add_argument("--no-recheck", action="store_true", help="skip recheck after backfill")
    ap.add_argument("--content-sample-limit", type=int, default=None, help="max mismatch samples per table")
    ap.add_argument("--out", default="", help="output JSON path")
    ap.add_argument("--log-file", default="", help="log file path")
    ap.add_argument("--log-dir", default="", help="log directory")
    ap.add_argument("--log-level", default="INFO", help="log level")
    ap.add_argument("--no-log-console", action="store_true", help="disable console logging")
    args = ap.parse_args()
    # Logging destination defaults to a ./logs directory next to this script.
    log_dir = Path(args.log_dir) if args.log_dir else (Path(__file__).resolve().parent / "logs")
    log_file = Path(args.log_file) if args.log_file else build_log_path(log_dir, "data_integrity")
    log_console = not args.no_log_console
    with configure_logging(
        "data_integrity",
        log_file,
        level=args.log_level,
        console=log_console,
        tee_std=True,
    ) as logger:
        cfg = AppConfig.load({})
        tz = ZoneInfo(cfg.get("app.timezone", "Asia/Shanghai"))
        report_path = Path(args.out) if args.out else None
        # Reject contradictory flag pairs up front, before any work starts.
        if args.recheck and args.no_recheck:
            raise SystemExit("cannot set both --recheck and --no-recheck")
        if args.include_mismatch and args.no_include_mismatch:
            raise SystemExit("cannot set both --include-mismatch and --no-include-mismatch")
        if args.include_dimensions and args.no_include_dimensions:
            raise SystemExit("cannot set both --include-dimensions and --no-include-dimensions")
        # Tri-state: None means "defer to the downstream/service default".
        compare_content = None
        if args.compare_content and args.no_compare_content:
            raise SystemExit("cannot set both --compare-content and --no-compare-content")
        if args.compare_content:
            compare_content = True
        elif args.no_compare_content:
            compare_content = False
        # For each toggle: start from the config default, then let an explicit
        # CLI flag override it.
        include_mismatch = cfg.get("integrity.backfill_mismatch", True)
        if args.include_mismatch:
            include_mismatch = True
        elif args.no_include_mismatch:
            include_mismatch = False
        recheck_after_backfill = cfg.get("integrity.recheck_after_backfill", True)
        if args.recheck:
            recheck_after_backfill = True
        elif args.no_recheck:
            recheck_after_backfill = False
        include_dimensions = cfg.get("integrity.include_dimensions", True)
        if args.include_dimensions:
            include_dimensions = True
        elif args.no_include_dimensions:
            include_dimensions = False
        if args.mode == "window":
            # Window mode: verify one explicit range, optionally split into
            # sub-windows (unit/compensation hours come from CLI or config).
            if not args.window_start or not args.window_end:
                raise SystemExit("window-start and window-end are required for mode=window")
            start_dt = _parse_dt(args.window_start, tz)
            end_dt = _parse_dt(args.window_end, tz)
            split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
            comp_hours = args.window_compensation_hours
            if comp_hours is None:
                comp_hours = cfg.get("run.window_split.compensation_hours", 0)
            windows = split_window(
                start_dt,
                end_dt,
                tz=tz,
                split_unit=split_unit,
                compensation_hours=comp_hours,
            )
            if not windows:
                # Fall back to the raw range when splitting yields nothing.
                windows = [(start_dt, end_dt)]
            report, counts = run_window_flow(
                cfg=cfg,
                windows=windows,
                include_dimensions=bool(include_dimensions),
                task_codes=args.ods_task_codes,
                logger=logger,
                compare_content=compare_content,
                content_sample_limit=args.content_sample_limit,
                do_backfill=args.flow == "update_and_verify",
                include_mismatch=bool(include_mismatch),
                recheck_after_backfill=bool(recheck_after_backfill),
                page_size=int(cfg.get("api.page_size") or 200),
                chunk_size=500,
            )
            report_path = write_report(report, prefix="data_integrity_window", tz=tz, report_path=report_path)
            # NOTE(review): report_path is attached to the in-memory report
            # only after write_report, so the written JSON file does not
            # contain it — confirm this is intentional.
            report["report_path"] = report_path
            logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
        else:
            # History mode: verify from --start up to --end; an empty --end is
            # passed through as None (downstream default, e.g. last ETL end).
            start_dt = _parse_dt(args.start, tz)
            if args.end:
                end_dt = _parse_dt(args.end, tz)
            else:
                end_dt = None
            report, counts = run_history_flow(
                cfg=cfg,
                start_dt=start_dt,
                end_dt=end_dt,
                include_dimensions=bool(include_dimensions),
                task_codes=args.ods_task_codes,
                logger=logger,
                compare_content=compare_content,
                content_sample_limit=args.content_sample_limit,
                do_backfill=args.flow == "update_and_verify",
                include_mismatch=bool(include_mismatch),
                recheck_after_backfill=bool(recheck_after_backfill),
                page_size=int(cfg.get("api.page_size") or 200),
                chunk_size=500,
            )
            report_path = write_report(report, prefix="data_integrity_history", tz=tz, report_path=report_path)
            report["report_path"] = report_path
            logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
        logger.info(
            "SUMMARY missing=%s mismatch=%s errors=%s",
            counts.get("missing"),
            counts.get("mismatch"),
            counts.get("errors"),
        )
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
"""Ad-hoc report on record distributions in dwd.dwd_assistant_service_log."""
import sys
sys.path.insert(0, '.')
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
# NOTE(review): sibling scripts call AppConfig.load({}) — confirm the
# zero-argument form is equivalent for this config class.
config = AppConfig.load()
db_conn = DatabaseConnection(config.config['db']['dsn'])
db = DatabaseOperations(db_conn)
# Distribution analysis of service records in the DWD layer
print("=== DWD层服务记录分析 ===")
print()
# 1. Overall statistics: total rows plus distinct members / assistants / pairs.
sql1 = """
SELECT
    COUNT(*) as total_records,
    COUNT(DISTINCT tenant_member_id) as unique_members,
    COUNT(DISTINCT site_assistant_id) as unique_assistants,
    COUNT(DISTINCT (tenant_member_id, site_assistant_id)) as unique_pairs
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
"""
# assumes db.query returns mapping-like rows convertible via dict() — TODO confirm
r = dict(db.query(sql1)[0])
print("总体统计:")
print(f"  总服务记录数: {r['total_records']}")
print(f"  唯一会员数: {r['unique_members']}")
print(f"  唯一助教数: {r['unique_assistants']}")
print(f"  唯一客户-助教对: {r['unique_pairs']}")
# 2. Distribution of distinct members served per assistant (top 10)
print()
print("助教服务会员数分布 (Top 10):")
sql2 = """
SELECT site_assistant_id, COUNT(DISTINCT tenant_member_id) as member_count
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
GROUP BY site_assistant_id
ORDER BY member_count DESC
LIMIT 10
"""
for row in db.query(sql2):
    r = dict(row)
    print(f"  助教 {r['site_assistant_id']}: 服务 {r['member_count']} 个会员")
# 3. Distribution of service counts per member-assistant pair (top 10)
print()
print("客户-助教对 服务次数分布 (Top 10):")
sql3 = """
SELECT tenant_member_id, site_assistant_id, COUNT(*) as service_count
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
GROUP BY tenant_member_id, site_assistant_id
ORDER BY service_count DESC
LIMIT 10
"""
for row in db.query(sql3):
    r = dict(row)
    print(f"  会员 {r['tenant_member_id']} - 助教 {r['site_assistant_id']}: {r['service_count']} 次服务")
# 4. Same aggregate statistics restricted to the last 60 days
print()
print("=== 近60天数据 ===")
sql4 = """
SELECT
    COUNT(*) as total_records,
    COUNT(DISTINCT tenant_member_id) as unique_members,
    COUNT(DISTINCT site_assistant_id) as unique_assistants,
    COUNT(DISTINCT (tenant_member_id, site_assistant_id)) as unique_pairs
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
  AND last_use_time >= NOW() - INTERVAL '60 days'
"""
r4 = dict(db.query(sql4)[0])
print(f"  总服务记录数: {r4['total_records']}")
print(f"  唯一会员数: {r4['unique_members']}")
print(f"  唯一助教数: {r4['unique_assistants']}")
print(f"  唯一客户-助教对: {r4['unique_pairs']}")
# NOTE(review): the connection is closed only on the happy path; an exception
# above leaks it — likely acceptable for an ad-hoc script, but confirm.
db_conn.close()

View File

@@ -0,0 +1,248 @@
# -*- coding: utf-8 -*-
"""
Validate that ODS payload content matches stored content_hash.
Usage:
PYTHONPATH=. python -m scripts.check.check_ods_content_hash
PYTHONPATH=. python -m scripts.check.check_ods_content_hash --schema ods
PYTHONPATH=. python -m scripts.check.check_ods_content_hash --tables member_profiles,orders
"""
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, Sequence
from psycopg2.extras import RealDictCursor
# Make the project importable when this file is executed directly (without
# PYTHONPATH=.).
# NOTE(review): parents[1] is two levels up from this file; per the module
# docstring the script lives under scripts/check/, which would make this the
# scripts/ directory rather than the repo root — the documented PYTHONPATH=.
# invocation masks this. TODO confirm intended location.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from tasks.ods.ods_tasks import BaseOdsTask
def _reconfigure_stdout_utf8() -> None:
if hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass
def _fetch_tables(conn, schema: str) -> list[str]:
    """Return the names of all base tables in *schema*, alphabetically sorted."""
    sql = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = %s AND table_type = 'BASE TABLE'
        ORDER BY table_name
    """
    with conn.cursor() as cur:
        cur.execute(sql, (schema,))
        fetched = cur.fetchall()
    return [record[0] for record in fetched]
def _fetch_columns(conn, schema: str, table: str) -> list[str]:
    """Column names of *schema.table* in ordinal order, with empty names dropped."""
    sql = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(sql, (schema, table))
        fetched = cur.fetchall()
    names = [record[0] for record in fetched]
    return [name for name in names if name]
def _fetch_pk_columns(conn, schema: str, table: str) -> list[str]:
    """Primary-key column names of *schema.table*, excluding content_hash."""
    sql = """
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON tc.constraint_name = kcu.constraint_name
         AND tc.table_schema = kcu.table_schema
        WHERE tc.constraint_type = 'PRIMARY KEY'
          AND tc.table_schema = %s
          AND tc.table_name = %s
        ORDER BY kcu.ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(sql, (schema, table))
        fetched = cur.fetchall()
    # content_hash is compared separately, so it must not act as a row key.
    return [record[0] for record in fetched if record[0].lower() != "content_hash"]
def _fetch_row_count(conn, schema: str, table: str) -> int:
    """Exact row count of *schema.table* via COUNT(*)."""
    sql = f'SELECT COUNT(*) FROM "{schema}"."{table}"'
    with conn.cursor() as cur:
        cur.execute(sql)
        first = cur.fetchone()
    if not first:
        return 0
    return int(first[0])
def _iter_rows(
    conn,
    schema: str,
    table: str,
    select_cols: Sequence[str],
    batch_size: int,
) -> Iterable[dict]:
    """Stream rows of *schema.table* as dicts, fetching *batch_size* rows per round trip.

    A named (server-side) cursor is used so the whole table is never held in
    client memory; RealDictCursor yields each row as a column-name -> value
    mapping restricted to *select_cols*.
    """
    cols_sql = ", ".join(f'"{c}"' for c in select_cols)
    sql = f'SELECT {cols_sql} FROM "{schema}"."{table}"'
    # NOTE(review): the cursor name is derived from the table name only; two
    # concurrent iterations over the same table on one connection would
    # collide — confirm callers never do that.
    with conn.cursor(name=f"ods_hash_{table}", cursor_factory=RealDictCursor) as cur:
        # Guard against zero/None batch sizes before handing it to psycopg2.
        cur.itersize = max(1, int(batch_size or 500))
        cur.execute(sql)
        for row in cur:
            yield row
def _build_report_path(out_arg: str | None) -> Path:
if out_arg:
return Path(out_arg)
reports_dir = PROJECT_ROOT / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
return reports_dir / f"ods_content_hash_check_{ts}.json"
def _print_progress(
table_label: str,
processed: int,
total: int,
mismatched: int,
missing_hash: int,
invalid_payload: int,
) -> None:
if total:
msg = (
f"[{table_label}] checked {processed}/{total} "
f"mismatch={mismatched} missing_hash={missing_hash} invalid_payload={invalid_payload}"
)
else:
msg = (
f"[{table_label}] checked {processed} "
f"mismatch={mismatched} missing_hash={missing_hash} invalid_payload={invalid_payload}"
)
print(msg, flush=True)
def main() -> int:
    """Verify each ODS row's stored content_hash against a hash recomputed from its payload.

    Scans every table in the target schema that has both ``payload`` and
    ``content_hash`` columns, recomputes the hash per row, accumulates
    per-table and overall counters, and writes a JSON report.  Returns 0.
    """
    _reconfigure_stdout_utf8()
    ap = argparse.ArgumentParser(description="Validate ODS payload vs content_hash consistency")
    ap.add_argument("--schema", default="ods", help="ODS schema name")
    ap.add_argument("--tables", default="", help="comma-separated table names (optional)")
    ap.add_argument("--batch-size", type=int, default=500, help="DB fetch batch size")
    ap.add_argument("--progress-every", type=int, default=100, help="print progress every N rows")
    ap.add_argument("--sample-limit", type=int, default=5, help="sample mismatch rows per table")
    ap.add_argument("--out", default="", help="output report JSON path")
    args = ap.parse_args()
    cfg = AppConfig.load({})
    # NOTE(review): the connection is never explicitly closed; it is released
    # only at process exit — confirm that is acceptable here.
    db = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
    conn = db.conn
    tables = _fetch_tables(conn, args.schema)
    if args.tables.strip():
        # Restrict the scan to an explicit whitelist when --tables is given.
        whitelist = {t.strip() for t in args.tables.split(",") if t.strip()}
        tables = [t for t in tables if t in whitelist]
    report = {
        "schema": args.schema,
        "tables": [],
        "summary": {
            "total_tables": 0,
            "checked_tables": 0,
            "total_rows": 0,
            "checked_rows": 0,
            "mismatch_rows": 0,
            "missing_hash_rows": 0,
            "invalid_payload_rows": 0,
        },
    }
    for table in tables:
        table_label = f"{args.schema}.{table}"
        cols = _fetch_columns(conn, args.schema, table)
        cols_lower = {c.lower() for c in cols}
        # Only tables carrying both payload and content_hash can be verified.
        if "payload" not in cols_lower or "content_hash" not in cols_lower:
            print(f"[{table_label}] skip: missing payload/content_hash", flush=True)
            continue
        total = _fetch_row_count(conn, args.schema, table)
        pk_cols = _fetch_pk_columns(conn, args.schema, table)
        select_cols = ["content_hash", "payload", *pk_cols]
        processed = 0
        mismatched = 0       # all problem rows: absent, invalid, or different hash
        missing_hash = 0     # rows with an empty/NULL content_hash
        invalid_payload = 0  # rows whose payload produced no recomputed hash
        samples: list[dict[str, Any]] = []
        print(f"[{table_label}] start: total_rows={total}", flush=True)
        for row in _iter_rows(conn, args.schema, table, select_cols, args.batch_size):
            processed += 1
            content_hash = row.get("content_hash")
            payload = row.get("payload")
            # NOTE(review): relies on a private classmethod of BaseOdsTask —
            # consider promoting it to a public helper.
            recomputed = BaseOdsTask._compute_compare_hash_from_payload(payload)
            row_mismatch = False
            if not content_hash:
                missing_hash += 1
                mismatched += 1
                row_mismatch = True
            elif not recomputed:
                invalid_payload += 1
                mismatched += 1
                row_mismatch = True
            elif content_hash != recomputed:
                mismatched += 1
                row_mismatch = True
            # Keep up to --sample-limit offending rows (PK values + both hashes).
            if row_mismatch and len(samples) < max(0, int(args.sample_limit or 0)):
                sample = {k: row.get(k) for k in pk_cols}
                sample["content_hash"] = content_hash
                sample["recomputed_hash"] = recomputed
                samples.append(sample)
            if args.progress_every and processed % int(args.progress_every) == 0:
                _print_progress(table_label, processed, total, mismatched, missing_hash, invalid_payload)
        # Final progress line, unless the last row already triggered one.
        if processed and (not args.progress_every or processed % int(args.progress_every) != 0):
            _print_progress(table_label, processed, total, mismatched, missing_hash, invalid_payload)
        report["tables"].append(
            {
                "table": table_label,
                "total_rows": total,
                "checked_rows": processed,
                "mismatch_rows": mismatched,
                "missing_hash_rows": missing_hash,
                "invalid_payload_rows": invalid_payload,
                "sample_mismatches": samples,
            }
        )
        report["summary"]["checked_tables"] += 1
        report["summary"]["total_rows"] += total
        report["summary"]["checked_rows"] += processed
        report["summary"]["mismatch_rows"] += mismatched
        report["summary"]["missing_hash_rows"] += missing_hash
        report["summary"]["invalid_payload_rows"] += invalid_payload
    # total_tables counts candidates after whitelist filtering, including
    # tables skipped for missing payload/content_hash columns.
    report["summary"]["total_tables"] = len(tables)
    out_path = _build_report_path(args.out)
    out_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[REPORT] {out_path}", flush=True)
    return 0
if __name__ == "__main__":
raise SystemExit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 export/test-json-doc)
是否包含同名键,并输出每表未命中的字段,便于补充映射或确认确实无源字段。
使用方法:
set PG_DSN=postgresql://... # 如 .env 中配置
python -m scripts.check.check_ods_json_vs_table
"""
from __future__ import annotations
import json
import os
import pathlib
from typing import Dict, Iterable, Set, Tuple
import psycopg2
from tasks.manual_ingest_task import ManualIngestTask
def _flatten_keys(obj, prefix: str = "") -> Set[str]:
"""递归展开 JSON 所有键路径,返回形如 data.assistantInfos.id 的集合。列表不保留索引,仅继续向下展开。"""
keys: Set[str] = set()
if isinstance(obj, dict):
for k, v in obj.items():
new_prefix = f"{prefix}.{k}" if prefix else k
keys.add(new_prefix)
keys |= _flatten_keys(v, new_prefix)
elif isinstance(obj, list):
for item in obj:
keys |= _flatten_keys(item, prefix)
return keys
def _load_json_keys(path: pathlib.Path) -> Tuple[Set[str], dict[str, Set[str]]]:
"""读取单个 JSON 文件并返回展开后的键集合以及末段->路径列表映射,若文件不存在或无法解析则返回空集合。"""
if not path.exists():
return set(), {}
data = json.loads(path.read_text(encoding="utf-8"))
paths = _flatten_keys(data)
last_map: dict[str, Set[str]] = {}
for p in paths:
last = p.split(".")[-1].lower()
last_map.setdefault(last, set()).add(p)
return paths, last_map
def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
    """Read the column names of every table in the ``ods`` schema from the database.

    Returns a mapping of table name -> set of lower-cased column names.
    The connection and cursor are now closed in ``finally`` blocks, so a
    failed query no longer leaks a server-side session (the original closed
    them only on the happy path).
    """
    conn = psycopg2.connect(dsn)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                """
                SELECT table_name, column_name
                FROM information_schema.columns
                WHERE table_schema='ods'
                ORDER BY table_name, ordinal_position
                """
            )
            result: Dict[str, Set[str]] = {}
            for table, col in cur.fetchall():
                result.setdefault(table, set()).add(col.lower())
            return result
        finally:
            cur.close()
    finally:
        conn.close()
def main() -> None:
    """Iterate the ODS tables declared in ManualIngestTask.FILE_MAPPING and report JSON key coverage.

    For each mapped table, loads the corresponding sample JSON document,
    flattens its key paths, and prints which table columns are matched by a
    JSON key's last path segment, which are not, and which JSON keys have no
    corresponding column.
    """
    # NOTE(review): PG_DSN may be unset (None); psycopg2 would then fall back
    # to libpq defaults/environment — confirm that is intended.
    dsn = os.environ.get("PG_DSN")
    json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", "export/test-json-doc"))
    ods_cols_map = _load_ods_columns(dsn)
    print(f"使用 JSON 目录: {json_dir}")
    # NOTE(review): a DSN may embed credentials; printing it risks leaking them.
    print(f"连接 DSN: {dsn}")
    print("=" * 80)
    for keywords, ods_table in ManualIngestTask.FILE_MAPPING:
        # FILE_MAPPING pairs filename keywords with a schema-qualified table;
        # the first keyword doubles as the sample file's base name.
        table = ods_table.split(".")[-1]
        cols = ods_cols_map.get(table, set())
        file_name = f"{keywords[0]}.json"
        file_path = json_dir / file_name
        # keys_full (all dotted paths) is currently unused beyond unpacking.
        keys_full, path_map = _load_json_keys(file_path)
        key_last_parts = set(path_map.keys())
        missing: Set[str] = set()
        extra_keys: Set[str] = set()
        present: Set[str] = set()
        # A column counts as covered when some JSON key path ends with it.
        for col in sorted(cols):
            if col in key_last_parts:
                present.add(col)
            else:
                missing.add(col)
        for k in key_last_parts:
            if k not in cols:
                extra_keys.add(k)
        print(f"[{table}] 文件={file_name} 列数={len(cols)} JSON键(末段)覆盖={len(present)}/{len(cols)}")
        if missing:
            print("  未命中列:", ", ".join(sorted(missing)))
        else:
            print("  未命中列: 无")
        if extra_keys:
            extras = []
            for k in sorted(extra_keys):
                paths = ", ".join(sorted(path_map.get(k, [])))
                extras.append(f"{k} ({paths})")
            print("  JSON 仅有(表无此列):", "; ".join(extras))
        else:
            print("  JSON 仅有(表无此列): 无")
        print("-" * 80)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""验证DWS配置数据"""
import os
from pathlib import Path
from dotenv import load_dotenv
import psycopg2
def main():
    """Print row counts for each DWS configuration table."""
    # Database credentials come from the project-level .env file.
    load_dotenv(Path(__file__).parent.parent / ".env")
    dsn = os.getenv("PG_DSN")
    conn = psycopg2.connect(dsn)
    tables = [
        "cfg_performance_tier",
        "cfg_assistant_level_price",
        "cfg_bonus_rules",
        "cfg_area_category",
        "cfg_skill_type",
    ]
    print("DWS 配置表数据统计:")
    print("-" * 40)
    with conn.cursor() as cur:
        for t in tables:
            cur.execute(f"SELECT COUNT(*) FROM dws.{t}")
            (cnt,) = cur.fetchone()
            print(f"{t}: {cnt}")
    conn.close()
if __name__ == "__main__":
main()