init: 项目初始提交 - NeoZQYY Monorepo 完整代码
This commit is contained in:
193
apps/etl/pipelines/feiqiu/scripts/check/check_data_integrity.py
Normal file
193
apps/etl/pipelines/feiqiu/scripts/check/check_data_integrity.py
Normal file
@@ -0,0 +1,193 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Run data integrity checks across API -> ODS -> DWD."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from dateutil import parser as dtparser
|
||||
|
||||
from config.settings import AppConfig
|
||||
from quality.integrity_service import run_history_flow, run_window_flow, write_report
|
||||
from utils.logging_utils import build_log_path, configure_logging
|
||||
from utils.windowing import split_window
|
||||
|
||||
|
||||
def _parse_dt(value: str, tz: ZoneInfo) -> datetime:
    """Parse *value* into a timezone-aware datetime in *tz*.

    Naive inputs are assumed to already be local to *tz*; aware inputs are
    converted into *tz*.
    """
    parsed = dtparser.parse(value)
    if parsed.tzinfo is not None:
        return parsed.astimezone(tz)
    return parsed.replace(tzinfo=tz)
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point for the API -> ODS -> DWD integrity checks.

    Parses arguments, configures logging, then runs either the history flow
    (default) or the window flow and writes a JSON report.

    Returns:
        0 on completion (contradictory flags abort earlier via SystemExit).
    """
    # Best-effort: force UTF-8 stdout (e.g. Windows consoles default to GBK).
    if hasattr(sys.stdout, "reconfigure"):
        try:
            sys.stdout.reconfigure(encoding="utf-8")
        except Exception:
            pass

    ap = argparse.ArgumentParser(description="Data integrity checks (API -> ODS -> DWD)")
    ap.add_argument("--mode", choices=["history", "window"], default="history")
    ap.add_argument(
        "--flow",
        choices=["verify", "update_and_verify"],
        default="verify",
        help="verify only or update+verify (auto backfill then optional recheck)",
    )
    ap.add_argument("--start", default="2025-07-01", help="history start date (default: 2025-07-01)")
    ap.add_argument("--end", default="", help="history end datetime (default: last ETL end)")
    ap.add_argument("--window-start", default="", help="window start datetime (mode=window)")
    ap.add_argument("--window-end", default="", help="window end datetime (mode=window)")
    ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
    ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
    # Tri-state flag pairs (--x / --no-x): None means "fall back to config".
    ap.add_argument(
        "--include-dimensions",
        action="store_true",
        default=None,
        help="include dimension tables in ODS->DWD checks",
    )
    ap.add_argument(
        "--no-include-dimensions",
        action="store_true",
        help="exclude dimension tables in ODS->DWD checks",
    )
    ap.add_argument("--ods-task-codes", default="", help="comma-separated ODS task codes for API checks")
    ap.add_argument("--compare-content", action="store_true", help="compare API vs ODS content hash")
    ap.add_argument("--no-compare-content", action="store_true", help="disable content comparison even if enabled in config")
    ap.add_argument("--include-mismatch", action="store_true", help="backfill mismatch records as well")
    ap.add_argument("--no-include-mismatch", action="store_true", help="disable mismatch backfill")
    ap.add_argument("--recheck", action="store_true", help="re-run checks after backfill")
    ap.add_argument("--no-recheck", action="store_true", help="skip recheck after backfill")
    ap.add_argument("--content-sample-limit", type=int, default=None, help="max mismatch samples per table")
    ap.add_argument("--out", default="", help="output JSON path")
    ap.add_argument("--log-file", default="", help="log file path")
    ap.add_argument("--log-dir", default="", help="log directory")
    ap.add_argument("--log-level", default="INFO", help="log level")
    ap.add_argument("--no-log-console", action="store_true", help="disable console logging")
    args = ap.parse_args()

    # Default log location: ./logs next to this script.
    log_dir = Path(args.log_dir) if args.log_dir else (Path(__file__).resolve().parent / "logs")
    log_file = Path(args.log_file) if args.log_file else build_log_path(log_dir, "data_integrity")
    log_console = not args.no_log_console

    with configure_logging(
        "data_integrity",
        log_file,
        level=args.log_level,
        console=log_console,
        tee_std=True,
    ) as logger:
        cfg = AppConfig.load({})
        tz = ZoneInfo(cfg.get("app.timezone", "Asia/Shanghai"))
        report_path = Path(args.out) if args.out else None

        # Reject contradictory flag pairs up front.
        if args.recheck and args.no_recheck:
            raise SystemExit("cannot set both --recheck and --no-recheck")
        if args.include_mismatch and args.no_include_mismatch:
            raise SystemExit("cannot set both --include-mismatch and --no-include-mismatch")
        if args.include_dimensions and args.no_include_dimensions:
            raise SystemExit("cannot set both --include-dimensions and --no-include-dimensions")

        # None = "let the flow decide"; True/False only when forced by a flag.
        compare_content = None
        if args.compare_content and args.no_compare_content:
            raise SystemExit("cannot set both --compare-content and --no-compare-content")
        if args.compare_content:
            compare_content = True
        elif args.no_compare_content:
            compare_content = False

        # Config-supplied defaults, each overridable by its CLI flag pair.
        include_mismatch = cfg.get("integrity.backfill_mismatch", True)
        if args.include_mismatch:
            include_mismatch = True
        elif args.no_include_mismatch:
            include_mismatch = False

        recheck_after_backfill = cfg.get("integrity.recheck_after_backfill", True)
        if args.recheck:
            recheck_after_backfill = True
        elif args.no_recheck:
            recheck_after_backfill = False

        include_dimensions = cfg.get("integrity.include_dimensions", True)
        if args.include_dimensions:
            include_dimensions = True
        elif args.no_include_dimensions:
            include_dimensions = False

        if args.mode == "window":
            # Window mode requires an explicit, bounded time range.
            if not args.window_start or not args.window_end:
                raise SystemExit("window-start and window-end are required for mode=window")
            start_dt = _parse_dt(args.window_start, tz)
            end_dt = _parse_dt(args.window_end, tz)
            split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
            comp_hours = args.window_compensation_hours
            if comp_hours is None:
                comp_hours = cfg.get("run.window_split.compensation_hours", 0)

            # Split the requested range into sub-windows (e.g. per month).
            windows = split_window(
                start_dt,
                end_dt,
                tz=tz,
                split_unit=split_unit,
                compensation_hours=comp_hours,
            )
            if not windows:
                # Fall back to one window covering the whole range.
                windows = [(start_dt, end_dt)]

            report, counts = run_window_flow(
                cfg=cfg,
                windows=windows,
                include_dimensions=bool(include_dimensions),
                task_codes=args.ods_task_codes,
                logger=logger,
                compare_content=compare_content,
                content_sample_limit=args.content_sample_limit,
                do_backfill=args.flow == "update_and_verify",
                include_mismatch=bool(include_mismatch),
                recheck_after_backfill=bool(recheck_after_backfill),
                page_size=int(cfg.get("api.page_size") or 200),
                chunk_size=500,
            )
            report_path = write_report(report, prefix="data_integrity_window", tz=tz, report_path=report_path)
            report["report_path"] = report_path
            logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
        else:
            # History mode: end defaults to "last ETL end" inside the flow
            # when --end is not supplied.
            start_dt = _parse_dt(args.start, tz)
            if args.end:
                end_dt = _parse_dt(args.end, tz)
            else:
                end_dt = None
            report, counts = run_history_flow(
                cfg=cfg,
                start_dt=start_dt,
                end_dt=end_dt,
                include_dimensions=bool(include_dimensions),
                task_codes=args.ods_task_codes,
                logger=logger,
                compare_content=compare_content,
                content_sample_limit=args.content_sample_limit,
                do_backfill=args.flow == "update_and_verify",
                include_mismatch=bool(include_mismatch),
                recheck_after_backfill=bool(recheck_after_backfill),
                page_size=int(cfg.get("api.page_size") or 200),
                chunk_size=500,
            )
            report_path = write_report(report, prefix="data_integrity_history", tz=tz, report_path=report_path)
            report["report_path"] = report_path
            logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
        logger.info(
            "SUMMARY missing=%s mismatch=%s errors=%s",
            counts.get("missing"),
            counts.get("mismatch"),
            counts.get("errors"),
        )

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit with main()'s return code when run as a script.
    raise SystemExit(main())
|
||||
82
apps/etl/pipelines/feiqiu/scripts/check/check_dwd_service.py
Normal file
82
apps/etl/pipelines/feiqiu/scripts/check/check_dwd_service.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# -*- coding: utf-8 -*-
"""Ad-hoc report on the distribution of assistant service records in the DWD layer."""
import sys
sys.path.insert(0, '.')
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations

app_config = AppConfig.load()
connection = DatabaseConnection(app_config.config['db']['dsn'])
ops = DatabaseOperations(connection)

# Label/key pairs shared by the overall and the recent-60-days summaries.
SUMMARY_FIELDS = (
    ("总服务记录数", "total_records"),
    ("唯一会员数", "unique_members"),
    ("唯一助教数", "unique_assistants"),
    ("唯一客户-助教对", "unique_pairs"),
)

print("=== DWD层服务记录分析 ===")
print()

# 1. Overall statistics across the whole table.
overall_sql = """
SELECT
    COUNT(*) as total_records,
    COUNT(DISTINCT tenant_member_id) as unique_members,
    COUNT(DISTINCT site_assistant_id) as unique_assistants,
    COUNT(DISTINCT (tenant_member_id, site_assistant_id)) as unique_pairs
FROM billiards_dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
"""
overall = dict(ops.query(overall_sql)[0])
print("总体统计:")
for label, key in SUMMARY_FIELDS:
    print(f" {label}: {overall[key]}")

# 2. Members served per assistant (top 10).
print()
print("助教服务会员数分布 (Top 10):")
per_assistant_sql = """
SELECT site_assistant_id, COUNT(DISTINCT tenant_member_id) as member_count
FROM billiards_dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
GROUP BY site_assistant_id
ORDER BY member_count DESC
LIMIT 10
"""
for record in ops.query(per_assistant_sql):
    entry = dict(record)
    print(f" 助教 {entry['site_assistant_id']}: 服务 {entry['member_count']} 个会员")

# 3. Service counts per member-assistant pair (top 10).
print()
print("客户-助教对 服务次数分布 (Top 10):")
per_pair_sql = """
SELECT tenant_member_id, site_assistant_id, COUNT(*) as service_count
FROM billiards_dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
GROUP BY tenant_member_id, site_assistant_id
ORDER BY service_count DESC
LIMIT 10
"""
for record in ops.query(per_pair_sql):
    entry = dict(record)
    print(f" 会员 {entry['tenant_member_id']} - 助教 {entry['site_assistant_id']}: {entry['service_count']} 次服务")

# 4. Same overall statistics, restricted to the last 60 days.
print()
print("=== 近60天数据 ===")
recent_sql = """
SELECT
    COUNT(*) as total_records,
    COUNT(DISTINCT tenant_member_id) as unique_members,
    COUNT(DISTINCT site_assistant_id) as unique_assistants,
    COUNT(DISTINCT (tenant_member_id, site_assistant_id)) as unique_pairs
FROM billiards_dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
  AND last_use_time >= NOW() - INTERVAL '60 days'
"""
recent = dict(ops.query(recent_sql)[0])
for label, key in SUMMARY_FIELDS:
    print(f" {label}: {recent[key]}")

connection.close()
|
||||
@@ -0,0 +1,248 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Validate that ODS payload content matches stored content_hash.
|
||||
|
||||
Usage:
|
||||
PYTHONPATH=. python -m scripts.check.check_ods_content_hash
|
||||
PYTHONPATH=. python -m scripts.check.check_ods_content_hash --schema billiards_ods
|
||||
PYTHONPATH=. python -m scripts.check.check_ods_content_hash --tables member_profiles,orders
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Sequence
|
||||
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
# Make the pipeline root importable when the script is run directly.
# The file lives at <root>/scripts/check/, so the root is three levels up:
# parents[0]=check, parents[1]=scripts, parents[2]=<root>.
# NOTE(review): was parents[1], which pointed at scripts/ — that broke the
# `config`/`database`/`tasks` imports below when run as a plain script, and
# sent _build_report_path's default reports/ directory to scripts/reports.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from config.settings import AppConfig
from database.connection import DatabaseConnection
from tasks.ods.ods_tasks import BaseOdsTask
|
||||
|
||||
|
||||
def _reconfigure_stdout_utf8() -> None:
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
try:
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _fetch_tables(conn, schema: str) -> list[str]:
|
||||
sql = """
|
||||
SELECT table_name
|
||||
FROM information_schema.tables
|
||||
WHERE table_schema = %s AND table_type = 'BASE TABLE'
|
||||
ORDER BY table_name
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(sql, (schema,))
|
||||
return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
def _fetch_columns(conn, schema: str, table: str) -> list[str]:
|
||||
sql = """
|
||||
SELECT column_name
|
||||
FROM information_schema.columns
|
||||
WHERE table_schema = %s AND table_name = %s
|
||||
ORDER BY ordinal_position
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(sql, (schema, table))
|
||||
cols = [r[0] for r in cur.fetchall()]
|
||||
return [c for c in cols if c]
|
||||
|
||||
|
||||
def _fetch_pk_columns(conn, schema: str, table: str) -> list[str]:
|
||||
sql = """
|
||||
SELECT kcu.column_name
|
||||
FROM information_schema.table_constraints tc
|
||||
JOIN information_schema.key_column_usage kcu
|
||||
ON tc.constraint_name = kcu.constraint_name
|
||||
AND tc.table_schema = kcu.table_schema
|
||||
WHERE tc.constraint_type = 'PRIMARY KEY'
|
||||
AND tc.table_schema = %s
|
||||
AND tc.table_name = %s
|
||||
ORDER BY kcu.ordinal_position
|
||||
"""
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(sql, (schema, table))
|
||||
cols = [r[0] for r in cur.fetchall()]
|
||||
return [c for c in cols if c.lower() != "content_hash"]
|
||||
|
||||
|
||||
def _fetch_row_count(conn, schema: str, table: str) -> int:
|
||||
sql = f'SELECT COUNT(*) FROM "{schema}"."{table}"'
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(sql)
|
||||
row = cur.fetchone()
|
||||
return int(row[0] if row else 0)
|
||||
|
||||
|
||||
def _iter_rows(
    conn,
    schema: str,
    table: str,
    select_cols: Sequence[str],
    batch_size: int,
) -> Iterable[dict]:
    """Stream rows of *schema.table* as dicts via a server-side cursor.

    A named cursor keeps memory bounded on large tables; *batch_size* controls
    how many rows each network round-trip fetches.
    """
    projection = ", ".join(f'"{col}"' for col in select_cols)
    sql = f'SELECT {projection} FROM "{schema}"."{table}"'
    with conn.cursor(name=f"ods_hash_{table}", cursor_factory=RealDictCursor) as cur:
        cur.itersize = max(1, int(batch_size or 500))
        cur.execute(sql)
        yield from cur
|
||||
|
||||
|
||||
def _build_report_path(out_arg: str | None) -> Path:
|
||||
if out_arg:
|
||||
return Path(out_arg)
|
||||
reports_dir = PROJECT_ROOT / "reports"
|
||||
reports_dir.mkdir(parents=True, exist_ok=True)
|
||||
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
return reports_dir / f"ods_content_hash_check_{ts}.json"
|
||||
|
||||
|
||||
def _print_progress(
|
||||
table_label: str,
|
||||
processed: int,
|
||||
total: int,
|
||||
mismatched: int,
|
||||
missing_hash: int,
|
||||
invalid_payload: int,
|
||||
) -> None:
|
||||
if total:
|
||||
msg = (
|
||||
f"[{table_label}] checked {processed}/{total} "
|
||||
f"mismatch={mismatched} missing_hash={missing_hash} invalid_payload={invalid_payload}"
|
||||
)
|
||||
else:
|
||||
msg = (
|
||||
f"[{table_label}] checked {processed} "
|
||||
f"mismatch={mismatched} missing_hash={missing_hash} invalid_payload={invalid_payload}"
|
||||
)
|
||||
print(msg, flush=True)
|
||||
|
||||
|
||||
def main() -> int:
    """Recompute each ODS row's content hash from its payload and compare.

    Iterates every table in the target schema that has both `payload` and
    `content_hash` columns, re-hashes the payload with the same routine the
    ingest tasks use, and writes a JSON report of the mismatches found.

    Returns:
        0 on completion.
    """
    _reconfigure_stdout_utf8()
    ap = argparse.ArgumentParser(description="Validate ODS payload vs content_hash consistency")
    ap.add_argument("--schema", default="billiards_ods", help="ODS schema name")
    ap.add_argument("--tables", default="", help="comma-separated table names (optional)")
    ap.add_argument("--batch-size", type=int, default=500, help="DB fetch batch size")
    ap.add_argument("--progress-every", type=int, default=100, help="print progress every N rows")
    ap.add_argument("--sample-limit", type=int, default=5, help="sample mismatch rows per table")
    ap.add_argument("--out", default="", help="output report JSON path")
    args = ap.parse_args()

    cfg = AppConfig.load({})
    db = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
    conn = db.conn

    # Discover tables; --tables acts as an optional whitelist.
    tables = _fetch_tables(conn, args.schema)
    if args.tables.strip():
        whitelist = {t.strip() for t in args.tables.split(",") if t.strip()}
        tables = [t for t in tables if t in whitelist]

    report = {
        "schema": args.schema,
        "tables": [],
        "summary": {
            "total_tables": 0,
            "checked_tables": 0,
            "total_rows": 0,
            "checked_rows": 0,
            "mismatch_rows": 0,
            "missing_hash_rows": 0,
            "invalid_payload_rows": 0,
        },
    }

    for table in tables:
        table_label = f"{args.schema}.{table}"
        cols = _fetch_columns(conn, args.schema, table)
        cols_lower = {c.lower() for c in cols}
        # Only tables carrying both payload and content_hash are checkable.
        if "payload" not in cols_lower or "content_hash" not in cols_lower:
            print(f"[{table_label}] skip: missing payload/content_hash", flush=True)
            continue

        total = _fetch_row_count(conn, args.schema, table)
        pk_cols = _fetch_pk_columns(conn, args.schema, table)
        select_cols = ["content_hash", "payload", *pk_cols]

        # Per-table counters.
        processed = 0
        mismatched = 0
        missing_hash = 0
        invalid_payload = 0
        samples: list[dict[str, Any]] = []

        print(f"[{table_label}] start: total_rows={total}", flush=True)

        for row in _iter_rows(conn, args.schema, table, select_cols, args.batch_size):
            processed += 1
            content_hash = row.get("content_hash")
            payload = row.get("payload")
            # Recompute with the exact routine used at ingest time.
            recomputed = BaseOdsTask._compute_compare_hash_from_payload(payload)

            # Classify: missing stored hash, unhashable payload, or hash drift.
            # Every failure category also counts as a mismatch.
            row_mismatch = False
            if not content_hash:
                missing_hash += 1
                mismatched += 1
                row_mismatch = True
            elif not recomputed:
                invalid_payload += 1
                mismatched += 1
                row_mismatch = True
            elif content_hash != recomputed:
                mismatched += 1
                row_mismatch = True

            # Keep up to --sample-limit mismatching rows (PK + both hashes).
            if row_mismatch and len(samples) < max(0, int(args.sample_limit or 0)):
                sample = {k: row.get(k) for k in pk_cols}
                sample["content_hash"] = content_hash
                sample["recomputed_hash"] = recomputed
                samples.append(sample)

            if args.progress_every and processed % int(args.progress_every) == 0:
                _print_progress(table_label, processed, total, mismatched, missing_hash, invalid_payload)

        # Final progress line, unless the loop just printed at this count.
        if processed and (not args.progress_every or processed % int(args.progress_every) != 0):
            _print_progress(table_label, processed, total, mismatched, missing_hash, invalid_payload)

        report["tables"].append(
            {
                "table": table_label,
                "total_rows": total,
                "checked_rows": processed,
                "mismatch_rows": mismatched,
                "missing_hash_rows": missing_hash,
                "invalid_payload_rows": invalid_payload,
                "sample_mismatches": samples,
            }
        )

        # Roll the per-table counters into the overall summary.
        report["summary"]["checked_tables"] += 1
        report["summary"]["total_rows"] += total
        report["summary"]["checked_rows"] += processed
        report["summary"]["mismatch_rows"] += mismatched
        report["summary"]["missing_hash_rows"] += missing_hash
        report["summary"]["invalid_payload_rows"] += invalid_payload

    # total_tables counts candidates (post-whitelist), including skipped ones.
    report["summary"]["total_tables"] = len(tables)

    out_path = _build_report_path(args.out)
    out_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[REPORT] {out_path}", flush=True)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit with main()'s return code when run as a script.
    raise SystemExit(main())
|
||||
1004
apps/etl/pipelines/feiqiu/scripts/check/check_ods_gaps.py
Normal file
1004
apps/etl/pipelines/feiqiu/scripts/check/check_ods_gaps.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,117 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 export/test-json-doc)
|
||||
是否包含同名键,并输出每表未命中的字段,便于补充映射或确认确实无源字段。
|
||||
|
||||
使用方法:
|
||||
set PG_DSN=postgresql://... # 如 .env 中配置
|
||||
python -m scripts.check.check_ods_json_vs_table
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Dict, Iterable, Set, Tuple
|
||||
|
||||
import psycopg2
|
||||
|
||||
from tasks.manual_ingest_task import ManualIngestTask
|
||||
|
||||
|
||||
def _flatten_keys(obj, prefix: str = "") -> Set[str]:
|
||||
"""递归展开 JSON 所有键路径,返回形如 data.assistantInfos.id 的集合。列表不保留索引,仅继续向下展开。"""
|
||||
keys: Set[str] = set()
|
||||
if isinstance(obj, dict):
|
||||
for k, v in obj.items():
|
||||
new_prefix = f"{prefix}.{k}" if prefix else k
|
||||
keys.add(new_prefix)
|
||||
keys |= _flatten_keys(v, new_prefix)
|
||||
elif isinstance(obj, list):
|
||||
for item in obj:
|
||||
keys |= _flatten_keys(item, prefix)
|
||||
return keys
|
||||
|
||||
|
||||
def _load_json_keys(path: pathlib.Path) -> Tuple[Set[str], dict[str, Set[str]]]:
|
||||
"""读取单个 JSON 文件并返回展开后的键集合以及末段->路径列表映射,若文件不存在或无法解析则返回空集合。"""
|
||||
if not path.exists():
|
||||
return set(), {}
|
||||
data = json.loads(path.read_text(encoding="utf-8"))
|
||||
paths = _flatten_keys(data)
|
||||
last_map: dict[str, Set[str]] = {}
|
||||
for p in paths:
|
||||
last = p.split(".")[-1].lower()
|
||||
last_map.setdefault(last, set()).add(p)
|
||||
return paths, last_map
|
||||
|
||||
|
||||
def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
    """Read the column sets of every ``billiards_ods.*`` table.

    Args:
        dsn: PostgreSQL connection string.

    Returns:
        Mapping of table name -> set of lower-cased column names.

    The connection and cursor are always closed, even when the query fails
    (the original leaked both on any exception).
    """
    conn = psycopg2.connect(dsn)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                """
                SELECT table_name, column_name
                FROM information_schema.columns
                WHERE table_schema='billiards_ods'
                ORDER BY table_name, ordinal_position
                """
            )
            result: Dict[str, Set[str]] = {}
            for table, col in cur.fetchall():
                # Lower-case to allow case-insensitive matching against JSON keys.
                result.setdefault(table, set()).add(col.lower())
            return result
        finally:
            cur.close()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def main() -> None:
    """Walk ManualIngestTask.FILE_MAPPING, compare each ODS table's columns
    against the keys of its sample JSON file, and print a coverage report.

    Matching is done on the lower-cased final path segment of each JSON key.

    Raises:
        SystemExit: if the PG_DSN environment variable is not set.
    """
    dsn = os.environ.get("PG_DSN")
    # Fail fast with a clear message instead of a confusing driver error
    # (previously a missing PG_DSN was passed straight to psycopg2).
    if not dsn:
        raise SystemExit("PG_DSN environment variable is not set (see .env)")
    json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", "export/test-json-doc"))

    ods_cols_map = _load_ods_columns(dsn)

    print(f"使用 JSON 目录: {json_dir}")
    print(f"连接 DSN: {dsn}")
    print("=" * 80)

    for keywords, ods_table in ManualIngestTask.FILE_MAPPING:
        table = ods_table.split(".")[-1]
        cols = ods_cols_map.get(table, set())
        # The sample file is named after the table's first mapping keyword.
        file_name = f"{keywords[0]}.json"
        file_path = json_dir / file_name
        keys_full, path_map = _load_json_keys(file_path)
        key_last_parts = set(path_map.keys())

        # Partition: columns found in / missing from the JSON, and JSON keys
        # that have no matching table column.
        missing: Set[str] = set()
        extra_keys: Set[str] = set()
        present: Set[str] = set()
        for col in sorted(cols):
            if col in key_last_parts:
                present.add(col)
            else:
                missing.add(col)
        for k in key_last_parts:
            if k not in cols:
                extra_keys.add(k)

        print(f"[{table}] 文件={file_name} 列数={len(cols)} JSON键(末段)覆盖={len(present)}/{len(cols)}")
        if missing:
            print(" 未命中列:", ", ".join(sorted(missing)))
        else:
            print(" 未命中列: 无")
        if extra_keys:
            # Show every full path behind each unmatched last segment.
            extras = []
            for k in sorted(extra_keys):
                paths = ", ".join(sorted(path_map.get(k, [])))
                extras.append(f"{k} ({paths})")
            print(" JSON 仅有(表无此列):", "; ".join(extras))
        else:
            print(" JSON 仅有(表无此列): 无")
        print("-" * 80)
|
||||
|
||||
|
||||
if __name__ == "__main__":  # script entry point
    main()
|
||||
34
apps/etl/pipelines/feiqiu/scripts/check/verify_dws_config.py
Normal file
34
apps/etl/pipelines/feiqiu/scripts/check/verify_dws_config.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""验证DWS配置数据"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
import psycopg2
|
||||
|
||||
def main():
    """Print row counts for the DWS configuration tables.

    Loads PG_DSN from the sibling .env, connects to PostgreSQL, and reports
    the row count of each hard-coded table in billiards_dws.

    Raises:
        SystemExit: if PG_DSN is not configured.
    """
    load_dotenv(Path(__file__).parent.parent / ".env")
    dsn = os.getenv("PG_DSN")
    # Fail fast with a clear message rather than passing None to the driver.
    if not dsn:
        raise SystemExit("PG_DSN is not set (check your .env)")
    conn = psycopg2.connect(dsn)
    try:
        # Fixed whitelist of config tables — the f-string SQL below is safe
        # because table names never come from user input.
        tables = [
            "cfg_performance_tier",
            "cfg_assistant_level_price",
            "cfg_bonus_rules",
            "cfg_area_category",
            "cfg_skill_type"
        ]

        print("DWS 配置表数据统计:")
        print("-" * 40)

        with conn.cursor() as cur:
            for t in tables:
                cur.execute(f"SELECT COUNT(*) FROM billiards_dws.{t}")
                cnt = cur.fetchone()[0]
                print(f"{t}: {cnt} 行")
    finally:
        # Always release the connection, even if a query fails
        # (the original leaked it on any exception).
        conn.close()
|
||||
|
||||
if __name__ == "__main__":  # script entry point
    main()
|
||||
Reference in New Issue
Block a user