在准备环境前提交此次全部更改。

This commit is contained in:
Neo
2026-02-19 08:35:13 +08:00
parent ded6dfb9d8
commit 4eac07da47
1387 changed files with 6107191 additions and 33002 deletions

View File

@@ -0,0 +1,193 @@
# -*- coding: utf-8 -*-
"""Run data integrity checks across API -> ODS -> DWD."""
from __future__ import annotations
import argparse
import sys
from datetime import datetime
from pathlib import Path
from zoneinfo import ZoneInfo
from dateutil import parser as dtparser
from config.settings import AppConfig
from quality.integrity_service import run_history_flow, run_window_flow, write_report
from utils.logging_utils import build_log_path, configure_logging
from utils.windowing import split_window
def _parse_dt(value: str, tz: ZoneInfo) -> datetime:
    """Parse *value* into a timezone-aware datetime expressed in *tz*.

    A naive input is assumed to already be local to *tz*; an aware input is
    converted into *tz*.
    """
    parsed = dtparser.parse(value)
    if parsed.tzinfo is not None:
        return parsed.astimezone(tz)
    return parsed.replace(tzinfo=tz)
def main() -> int:
    """CLI entry point for data integrity checks across API -> ODS -> DWD.

    Builds the argument parser, merges CLI flags with config defaults
    (paired --x/--no-x flags act as a tri-state: neither flag means "use the
    config value"), then dispatches to the windowed or full-history flow and
    writes a JSON report.  Returns the process exit code (0 on success).
    """
    # Best-effort UTF-8 stdout so non-ASCII output survives redirection on
    # platforms with a narrower default encoding.
    if hasattr(sys.stdout, "reconfigure"):
        try:
            sys.stdout.reconfigure(encoding="utf-8")
        except Exception:
            pass
    ap = argparse.ArgumentParser(description="Data integrity checks (API -> ODS -> DWD)")
    ap.add_argument("--mode", choices=["history", "window"], default="history")
    ap.add_argument(
        "--flow",
        choices=["verify", "update_and_verify"],
        default="verify",
        help="verify only or update+verify (auto backfill then optional recheck)",
    )
    ap.add_argument("--start", default="2025-07-01", help="history start date (default: 2025-07-01)")
    ap.add_argument("--end", default="", help="history end datetime (default: last ETL end)")
    ap.add_argument("--window-start", default="", help="window start datetime (mode=window)")
    ap.add_argument("--window-end", default="", help="window end datetime (mode=window)")
    ap.add_argument("--window-split-unit", default="", help="split unit (month/none), default from config")
    ap.add_argument("--window-compensation-hours", type=int, default=None, help="window compensation hours, default from config")
    # default=None (not False) so "flag absent" stays distinguishable from an
    # explicit choice when merged with the config default below.
    ap.add_argument(
        "--include-dimensions",
        action="store_true",
        default=None,
        help="include dimension tables in ODS->DWD checks",
    )
    ap.add_argument(
        "--no-include-dimensions",
        action="store_true",
        help="exclude dimension tables in ODS->DWD checks",
    )
    ap.add_argument("--ods-task-codes", default="", help="comma-separated ODS task codes for API checks")
    ap.add_argument("--compare-content", action="store_true", help="compare API vs ODS content hash")
    ap.add_argument("--no-compare-content", action="store_true", help="disable content comparison even if enabled in config")
    ap.add_argument("--include-mismatch", action="store_true", help="backfill mismatch records as well")
    ap.add_argument("--no-include-mismatch", action="store_true", help="disable mismatch backfill")
    ap.add_argument("--recheck", action="store_true", help="re-run checks after backfill")
    ap.add_argument("--no-recheck", action="store_true", help="skip recheck after backfill")
    ap.add_argument("--content-sample-limit", type=int, default=None, help="max mismatch samples per table")
    ap.add_argument("--out", default="", help="output JSON path")
    ap.add_argument("--log-file", default="", help="log file path")
    ap.add_argument("--log-dir", default="", help="log directory")
    ap.add_argument("--log-level", default="INFO", help="log level")
    ap.add_argument("--no-log-console", action="store_true", help="disable console logging")
    args = ap.parse_args()
    # Logging destination defaults to a ./logs directory next to this script.
    log_dir = Path(args.log_dir) if args.log_dir else (Path(__file__).resolve().parent / "logs")
    log_file = Path(args.log_file) if args.log_file else build_log_path(log_dir, "data_integrity")
    log_console = not args.no_log_console
    with configure_logging(
        "data_integrity",
        log_file,
        level=args.log_level,
        console=log_console,
        tee_std=True,
    ) as logger:
        cfg = AppConfig.load({})
        tz = ZoneInfo(cfg.get("app.timezone", "Asia/Shanghai"))
        report_path = Path(args.out) if args.out else None
        # Reject contradictory flag pairs up front, before any work starts.
        if args.recheck and args.no_recheck:
            raise SystemExit("cannot set both --recheck and --no-recheck")
        if args.include_mismatch and args.no_include_mismatch:
            raise SystemExit("cannot set both --include-mismatch and --no-include-mismatch")
        if args.include_dimensions and args.no_include_dimensions:
            raise SystemExit("cannot set both --include-dimensions and --no-include-dimensions")
        # Tri-state: None means "defer to the downstream/service default".
        compare_content = None
        if args.compare_content and args.no_compare_content:
            raise SystemExit("cannot set both --compare-content and --no-compare-content")
        if args.compare_content:
            compare_content = True
        elif args.no_compare_content:
            compare_content = False
        # For each toggle: start from the config default, then let an explicit
        # CLI flag override it.
        include_mismatch = cfg.get("integrity.backfill_mismatch", True)
        if args.include_mismatch:
            include_mismatch = True
        elif args.no_include_mismatch:
            include_mismatch = False
        recheck_after_backfill = cfg.get("integrity.recheck_after_backfill", True)
        if args.recheck:
            recheck_after_backfill = True
        elif args.no_recheck:
            recheck_after_backfill = False
        include_dimensions = cfg.get("integrity.include_dimensions", True)
        if args.include_dimensions:
            include_dimensions = True
        elif args.no_include_dimensions:
            include_dimensions = False
        if args.mode == "window":
            # Window mode: verify one explicit range, optionally split into
            # sub-windows (unit/compensation hours come from CLI or config).
            if not args.window_start or not args.window_end:
                raise SystemExit("window-start and window-end are required for mode=window")
            start_dt = _parse_dt(args.window_start, tz)
            end_dt = _parse_dt(args.window_end, tz)
            split_unit = (args.window_split_unit or cfg.get("run.window_split.unit", "month") or "month").strip()
            comp_hours = args.window_compensation_hours
            if comp_hours is None:
                comp_hours = cfg.get("run.window_split.compensation_hours", 0)
            windows = split_window(
                start_dt,
                end_dt,
                tz=tz,
                split_unit=split_unit,
                compensation_hours=comp_hours,
            )
            if not windows:
                # Fall back to the raw range when splitting yields nothing.
                windows = [(start_dt, end_dt)]
            report, counts = run_window_flow(
                cfg=cfg,
                windows=windows,
                include_dimensions=bool(include_dimensions),
                task_codes=args.ods_task_codes,
                logger=logger,
                compare_content=compare_content,
                content_sample_limit=args.content_sample_limit,
                do_backfill=args.flow == "update_and_verify",
                include_mismatch=bool(include_mismatch),
                recheck_after_backfill=bool(recheck_after_backfill),
                page_size=int(cfg.get("api.page_size") or 200),
                chunk_size=500,
            )
            report_path = write_report(report, prefix="data_integrity_window", tz=tz, report_path=report_path)
            # NOTE(review): report_path is attached to the in-memory report
            # only after write_report, so the written JSON file does not
            # contain it — confirm this is intentional.
            report["report_path"] = report_path
            logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
        else:
            # History mode: verify from --start up to --end; an empty --end is
            # passed through as None (downstream default, e.g. last ETL end).
            start_dt = _parse_dt(args.start, tz)
            if args.end:
                end_dt = _parse_dt(args.end, tz)
            else:
                end_dt = None
            report, counts = run_history_flow(
                cfg=cfg,
                start_dt=start_dt,
                end_dt=end_dt,
                include_dimensions=bool(include_dimensions),
                task_codes=args.ods_task_codes,
                logger=logger,
                compare_content=compare_content,
                content_sample_limit=args.content_sample_limit,
                do_backfill=args.flow == "update_and_verify",
                include_mismatch=bool(include_mismatch),
                recheck_after_backfill=bool(recheck_after_backfill),
                page_size=int(cfg.get("api.page_size") or 200),
                chunk_size=500,
            )
            report_path = write_report(report, prefix="data_integrity_history", tz=tz, report_path=report_path)
            report["report_path"] = report_path
            logger.info("REPORT_WRITTEN path=%s", report.get("report_path"))
        logger.info(
            "SUMMARY missing=%s mismatch=%s errors=%s",
            counts.get("missing"),
            counts.get("mismatch"),
            counts.get("errors"),
        )
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
"""Ad-hoc report on record distributions in dwd.dwd_assistant_service_log."""
import sys
sys.path.insert(0, '.')
from config.settings import AppConfig
from database.connection import DatabaseConnection
from database.operations import DatabaseOperations
# NOTE(review): sibling scripts call AppConfig.load({}) — confirm the
# zero-argument form is equivalent for this config class.
config = AppConfig.load()
db_conn = DatabaseConnection(config.config['db']['dsn'])
db = DatabaseOperations(db_conn)
# Distribution analysis of service records in the DWD layer
print("=== DWD层服务记录分析 ===")
print()
# 1. Overall statistics: total rows plus distinct members / assistants / pairs.
sql1 = """
SELECT
    COUNT(*) as total_records,
    COUNT(DISTINCT tenant_member_id) as unique_members,
    COUNT(DISTINCT site_assistant_id) as unique_assistants,
    COUNT(DISTINCT (tenant_member_id, site_assistant_id)) as unique_pairs
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
"""
# assumes db.query returns mapping-like rows convertible via dict() — TODO confirm
r = dict(db.query(sql1)[0])
print("总体统计:")
print(f"  总服务记录数: {r['total_records']}")
print(f"  唯一会员数: {r['unique_members']}")
print(f"  唯一助教数: {r['unique_assistants']}")
print(f"  唯一客户-助教对: {r['unique_pairs']}")
# 2. Distribution of distinct members served per assistant (top 10)
print()
print("助教服务会员数分布 (Top 10):")
sql2 = """
SELECT site_assistant_id, COUNT(DISTINCT tenant_member_id) as member_count
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
GROUP BY site_assistant_id
ORDER BY member_count DESC
LIMIT 10
"""
for row in db.query(sql2):
    r = dict(row)
    print(f"  助教 {r['site_assistant_id']}: 服务 {r['member_count']} 个会员")
# 3. Distribution of service counts per member-assistant pair (top 10)
print()
print("客户-助教对 服务次数分布 (Top 10):")
sql3 = """
SELECT tenant_member_id, site_assistant_id, COUNT(*) as service_count
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
GROUP BY tenant_member_id, site_assistant_id
ORDER BY service_count DESC
LIMIT 10
"""
for row in db.query(sql3):
    r = dict(row)
    print(f"  会员 {r['tenant_member_id']} - 助教 {r['site_assistant_id']}: {r['service_count']} 次服务")
# 4. Same aggregate statistics restricted to the last 60 days
print()
print("=== 近60天数据 ===")
sql4 = """
SELECT
    COUNT(*) as total_records,
    COUNT(DISTINCT tenant_member_id) as unique_members,
    COUNT(DISTINCT site_assistant_id) as unique_assistants,
    COUNT(DISTINCT (tenant_member_id, site_assistant_id)) as unique_pairs
FROM dwd.dwd_assistant_service_log
WHERE tenant_member_id > 0 AND is_delete = 0
  AND last_use_time >= NOW() - INTERVAL '60 days'
"""
r4 = dict(db.query(sql4)[0])
print(f"  总服务记录数: {r4['total_records']}")
print(f"  唯一会员数: {r4['unique_members']}")
print(f"  唯一助教数: {r4['unique_assistants']}")
print(f"  唯一客户-助教对: {r4['unique_pairs']}")
# NOTE(review): the connection is closed only on the happy path; an exception
# above leaks it — likely acceptable for an ad-hoc script, but confirm.
db_conn.close()

View File

@@ -0,0 +1,248 @@
# -*- coding: utf-8 -*-
"""
Validate that ODS payload content matches stored content_hash.
Usage:
PYTHONPATH=. python -m scripts.check.check_ods_content_hash
PYTHONPATH=. python -m scripts.check.check_ods_content_hash --schema ods
PYTHONPATH=. python -m scripts.check.check_ods_content_hash --tables member_profiles,orders
"""
from __future__ import annotations
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, Sequence
from psycopg2.extras import RealDictCursor
# Make the project importable when this file is executed directly (without
# PYTHONPATH=.).
# NOTE(review): parents[1] is two levels up from this file; per the module
# docstring the script lives under scripts/check/, which would make this the
# scripts/ directory rather than the repo root — the documented PYTHONPATH=.
# invocation masks this. TODO confirm intended location.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from config.settings import AppConfig
from database.connection import DatabaseConnection
from tasks.ods.ods_tasks import BaseOdsTask
def _reconfigure_stdout_utf8() -> None:
if hasattr(sys.stdout, "reconfigure"):
try:
sys.stdout.reconfigure(encoding="utf-8")
except Exception:
pass
def _fetch_tables(conn, schema: str) -> list[str]:
    """Return the names of all base tables in *schema*, alphabetically sorted."""
    sql = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = %s AND table_type = 'BASE TABLE'
        ORDER BY table_name
    """
    with conn.cursor() as cur:
        cur.execute(sql, (schema,))
        fetched = cur.fetchall()
    return [record[0] for record in fetched]
def _fetch_columns(conn, schema: str, table: str) -> list[str]:
    """Column names of *schema.table* in ordinal order, with empty names dropped."""
    sql = """
        SELECT column_name
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(sql, (schema, table))
        fetched = cur.fetchall()
    names = [record[0] for record in fetched]
    return [name for name in names if name]
def _fetch_pk_columns(conn, schema: str, table: str) -> list[str]:
    """Primary-key column names of *schema.table*, excluding content_hash."""
    sql = """
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON tc.constraint_name = kcu.constraint_name
         AND tc.table_schema = kcu.table_schema
        WHERE tc.constraint_type = 'PRIMARY KEY'
          AND tc.table_schema = %s
          AND tc.table_name = %s
        ORDER BY kcu.ordinal_position
    """
    with conn.cursor() as cur:
        cur.execute(sql, (schema, table))
        fetched = cur.fetchall()
    # content_hash is compared separately, so it must not act as a row key.
    return [record[0] for record in fetched if record[0].lower() != "content_hash"]
def _fetch_row_count(conn, schema: str, table: str) -> int:
    """Exact row count of *schema.table* via COUNT(*)."""
    sql = f'SELECT COUNT(*) FROM "{schema}"."{table}"'
    with conn.cursor() as cur:
        cur.execute(sql)
        first = cur.fetchone()
    if not first:
        return 0
    return int(first[0])
def _iter_rows(
    conn,
    schema: str,
    table: str,
    select_cols: Sequence[str],
    batch_size: int,
) -> Iterable[dict]:
    """Stream rows of *schema.table* as dicts, fetching *batch_size* rows per round trip.

    A named (server-side) cursor is used so the whole table is never held in
    client memory; RealDictCursor yields each row as a column-name -> value
    mapping restricted to *select_cols*.
    """
    cols_sql = ", ".join(f'"{c}"' for c in select_cols)
    sql = f'SELECT {cols_sql} FROM "{schema}"."{table}"'
    # NOTE(review): the cursor name is derived from the table name only; two
    # concurrent iterations over the same table on one connection would
    # collide — confirm callers never do that.
    with conn.cursor(name=f"ods_hash_{table}", cursor_factory=RealDictCursor) as cur:
        # Guard against zero/None batch sizes before handing it to psycopg2.
        cur.itersize = max(1, int(batch_size or 500))
        cur.execute(sql)
        for row in cur:
            yield row
def _build_report_path(out_arg: str | None) -> Path:
if out_arg:
return Path(out_arg)
reports_dir = PROJECT_ROOT / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
return reports_dir / f"ods_content_hash_check_{ts}.json"
def _print_progress(
table_label: str,
processed: int,
total: int,
mismatched: int,
missing_hash: int,
invalid_payload: int,
) -> None:
if total:
msg = (
f"[{table_label}] checked {processed}/{total} "
f"mismatch={mismatched} missing_hash={missing_hash} invalid_payload={invalid_payload}"
)
else:
msg = (
f"[{table_label}] checked {processed} "
f"mismatch={mismatched} missing_hash={missing_hash} invalid_payload={invalid_payload}"
)
print(msg, flush=True)
def main() -> int:
    """Verify each ODS row's stored content_hash against a hash recomputed from its payload.

    Scans every table in the target schema that has both ``payload`` and
    ``content_hash`` columns, recomputes the hash per row, accumulates
    per-table and overall counters, and writes a JSON report.  Returns 0.
    """
    _reconfigure_stdout_utf8()
    ap = argparse.ArgumentParser(description="Validate ODS payload vs content_hash consistency")
    ap.add_argument("--schema", default="ods", help="ODS schema name")
    ap.add_argument("--tables", default="", help="comma-separated table names (optional)")
    ap.add_argument("--batch-size", type=int, default=500, help="DB fetch batch size")
    ap.add_argument("--progress-every", type=int, default=100, help="print progress every N rows")
    ap.add_argument("--sample-limit", type=int, default=5, help="sample mismatch rows per table")
    ap.add_argument("--out", default="", help="output report JSON path")
    args = ap.parse_args()
    cfg = AppConfig.load({})
    # NOTE(review): the connection is never explicitly closed; it is released
    # only at process exit — confirm that is acceptable here.
    db = DatabaseConnection(dsn=cfg["db"]["dsn"], session=cfg["db"].get("session"))
    conn = db.conn
    tables = _fetch_tables(conn, args.schema)
    if args.tables.strip():
        # Restrict the scan to an explicit whitelist when --tables is given.
        whitelist = {t.strip() for t in args.tables.split(",") if t.strip()}
        tables = [t for t in tables if t in whitelist]
    report = {
        "schema": args.schema,
        "tables": [],
        "summary": {
            "total_tables": 0,
            "checked_tables": 0,
            "total_rows": 0,
            "checked_rows": 0,
            "mismatch_rows": 0,
            "missing_hash_rows": 0,
            "invalid_payload_rows": 0,
        },
    }
    for table in tables:
        table_label = f"{args.schema}.{table}"
        cols = _fetch_columns(conn, args.schema, table)
        cols_lower = {c.lower() for c in cols}
        # Only tables carrying both payload and content_hash can be verified.
        if "payload" not in cols_lower or "content_hash" not in cols_lower:
            print(f"[{table_label}] skip: missing payload/content_hash", flush=True)
            continue
        total = _fetch_row_count(conn, args.schema, table)
        pk_cols = _fetch_pk_columns(conn, args.schema, table)
        select_cols = ["content_hash", "payload", *pk_cols]
        processed = 0
        mismatched = 0       # all problem rows: absent, invalid, or different hash
        missing_hash = 0     # rows with an empty/NULL content_hash
        invalid_payload = 0  # rows whose payload produced no recomputed hash
        samples: list[dict[str, Any]] = []
        print(f"[{table_label}] start: total_rows={total}", flush=True)
        for row in _iter_rows(conn, args.schema, table, select_cols, args.batch_size):
            processed += 1
            content_hash = row.get("content_hash")
            payload = row.get("payload")
            # NOTE(review): relies on a private classmethod of BaseOdsTask —
            # consider promoting it to a public helper.
            recomputed = BaseOdsTask._compute_compare_hash_from_payload(payload)
            row_mismatch = False
            if not content_hash:
                missing_hash += 1
                mismatched += 1
                row_mismatch = True
            elif not recomputed:
                invalid_payload += 1
                mismatched += 1
                row_mismatch = True
            elif content_hash != recomputed:
                mismatched += 1
                row_mismatch = True
            # Keep up to --sample-limit offending rows (PK values + both hashes).
            if row_mismatch and len(samples) < max(0, int(args.sample_limit or 0)):
                sample = {k: row.get(k) for k in pk_cols}
                sample["content_hash"] = content_hash
                sample["recomputed_hash"] = recomputed
                samples.append(sample)
            if args.progress_every and processed % int(args.progress_every) == 0:
                _print_progress(table_label, processed, total, mismatched, missing_hash, invalid_payload)
        # Final progress line, unless the last row already triggered one.
        if processed and (not args.progress_every or processed % int(args.progress_every) != 0):
            _print_progress(table_label, processed, total, mismatched, missing_hash, invalid_payload)
        report["tables"].append(
            {
                "table": table_label,
                "total_rows": total,
                "checked_rows": processed,
                "mismatch_rows": mismatched,
                "missing_hash_rows": missing_hash,
                "invalid_payload_rows": invalid_payload,
                "sample_mismatches": samples,
            }
        )
        report["summary"]["checked_tables"] += 1
        report["summary"]["total_rows"] += total
        report["summary"]["checked_rows"] += processed
        report["summary"]["mismatch_rows"] += mismatched
        report["summary"]["missing_hash_rows"] += missing_hash
        report["summary"]["invalid_payload_rows"] += invalid_payload
    # total_tables counts candidates after whitelist filtering, including
    # tables skipped for missing payload/content_hash columns.
    report["summary"]["total_tables"] = len(tables)
    out_path = _build_report_path(args.out)
    out_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[REPORT] {out_path}", flush=True)
    return 0
if __name__ == "__main__":
raise SystemExit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""
ODS JSON 字段核对脚本:对照当前数据库中的 ODS 表字段,检查示例 JSON(默认目录 export/test-json-doc)
是否包含同名键,并输出每表未命中的字段,便于补充映射或确认确实无源字段。
使用方法:
set PG_DSN=postgresql://... # 如 .env 中配置
python -m scripts.check.check_ods_json_vs_table
"""
from __future__ import annotations
import json
import os
import pathlib
from typing import Dict, Iterable, Set, Tuple
import psycopg2
from tasks.manual_ingest_task import ManualIngestTask
def _flatten_keys(obj, prefix: str = "") -> Set[str]:
"""递归展开 JSON 所有键路径,返回形如 data.assistantInfos.id 的集合。列表不保留索引,仅继续向下展开。"""
keys: Set[str] = set()
if isinstance(obj, dict):
for k, v in obj.items():
new_prefix = f"{prefix}.{k}" if prefix else k
keys.add(new_prefix)
keys |= _flatten_keys(v, new_prefix)
elif isinstance(obj, list):
for item in obj:
keys |= _flatten_keys(item, prefix)
return keys
def _load_json_keys(path: pathlib.Path) -> Tuple[Set[str], dict[str, Set[str]]]:
"""读取单个 JSON 文件并返回展开后的键集合以及末段->路径列表映射,若文件不存在或无法解析则返回空集合。"""
if not path.exists():
return set(), {}
data = json.loads(path.read_text(encoding="utf-8"))
paths = _flatten_keys(data)
last_map: dict[str, Set[str]] = {}
for p in paths:
last = p.split(".")[-1].lower()
last_map.setdefault(last, set()).add(p)
return paths, last_map
def _load_ods_columns(dsn: str) -> Dict[str, Set[str]]:
    """Read the column names of every table in the ``ods`` schema from the database.

    Returns a mapping of table name -> set of lower-cased column names.
    The connection and cursor are now closed in ``finally`` blocks, so a
    failed query no longer leaks a server-side session (the original closed
    them only on the happy path).
    """
    conn = psycopg2.connect(dsn)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                """
                SELECT table_name, column_name
                FROM information_schema.columns
                WHERE table_schema='ods'
                ORDER BY table_name, ordinal_position
                """
            )
            result: Dict[str, Set[str]] = {}
            for table, col in cur.fetchall():
                result.setdefault(table, set()).add(col.lower())
            return result
        finally:
            cur.close()
    finally:
        conn.close()
def main() -> None:
    """Iterate the ODS tables declared in ManualIngestTask.FILE_MAPPING and report JSON key coverage.

    For each mapped table, loads the corresponding sample JSON document,
    flattens its key paths, and prints which table columns are matched by a
    JSON key's last path segment, which are not, and which JSON keys have no
    corresponding column.
    """
    # NOTE(review): PG_DSN may be unset (None); psycopg2 would then fall back
    # to libpq defaults/environment — confirm that is intended.
    dsn = os.environ.get("PG_DSN")
    json_dir = pathlib.Path(os.environ.get("JSON_DOC_DIR", "export/test-json-doc"))
    ods_cols_map = _load_ods_columns(dsn)
    print(f"使用 JSON 目录: {json_dir}")
    # NOTE(review): a DSN may embed credentials; printing it risks leaking them.
    print(f"连接 DSN: {dsn}")
    print("=" * 80)
    for keywords, ods_table in ManualIngestTask.FILE_MAPPING:
        # FILE_MAPPING pairs filename keywords with a schema-qualified table;
        # the first keyword doubles as the sample file's base name.
        table = ods_table.split(".")[-1]
        cols = ods_cols_map.get(table, set())
        file_name = f"{keywords[0]}.json"
        file_path = json_dir / file_name
        # keys_full (all dotted paths) is currently unused beyond unpacking.
        keys_full, path_map = _load_json_keys(file_path)
        key_last_parts = set(path_map.keys())
        missing: Set[str] = set()
        extra_keys: Set[str] = set()
        present: Set[str] = set()
        # A column counts as covered when some JSON key path ends with it.
        for col in sorted(cols):
            if col in key_last_parts:
                present.add(col)
            else:
                missing.add(col)
        for k in key_last_parts:
            if k not in cols:
                extra_keys.add(k)
        print(f"[{table}] 文件={file_name} 列数={len(cols)} JSON键(末段)覆盖={len(present)}/{len(cols)}")
        if missing:
            print("  未命中列:", ", ".join(sorted(missing)))
        else:
            print("  未命中列: 无")
        if extra_keys:
            extras = []
            for k in sorted(extra_keys):
                paths = ", ".join(sorted(path_map.get(k, [])))
                extras.append(f"{k} ({paths})")
            print("  JSON 仅有(表无此列):", "; ".join(extras))
        else:
            print("  JSON 仅有(表无此列): 无")
        print("-" * 80)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
"""验证DWS配置数据"""
import os
from pathlib import Path
from dotenv import load_dotenv
import psycopg2
def main():
    """Print row counts for each DWS configuration table."""
    # Database credentials come from the project-level .env file.
    load_dotenv(Path(__file__).parent.parent / ".env")
    dsn = os.getenv("PG_DSN")
    conn = psycopg2.connect(dsn)
    tables = [
        "cfg_performance_tier",
        "cfg_assistant_level_price",
        "cfg_bonus_rules",
        "cfg_area_category",
        "cfg_skill_type",
    ]
    print("DWS 配置表数据统计:")
    print("-" * 40)
    with conn.cursor() as cur:
        for t in tables:
            cur.execute(f"SELECT COUNT(*) FROM dws.{t}")
            (cnt,) = cur.fetchone()
            print(f"{t}: {cnt}")
    conn.close()
if __name__ == "__main__":
main()