在前后端开发联调前 的提交20260223
This commit is contained in:
345
scripts/ops/field_level_report.py
Normal file
345
scripts/ops/field_level_report.py
Normal file
@@ -0,0 +1,345 @@
|
||||
# -*- coding: utf-8 -*-
"""Field-level data-quality sampling report (v2 - performance-optimized).

Strategy: execute only 1-2 SQL statements per table (instead of one query
per column), drastically reducing network round trips.
- Column metadata comes from information_schema.
- One dynamically built SELECT fetches NULL counts for all columns at once.
- Numeric/date/text statistics use a single aggregate SQL statement.
"""
from __future__ import annotations

import os
import sys
from datetime import datetime
from pathlib import Path

import psycopg2
import psycopg2.extras
from dotenv import load_dotenv

# Load .env from the repository root (two directory levels above this script).
load_dotenv(Path(__file__).resolve().parents[2] / ".env")

# Output directory for generated reports; must be defined in .env.
ETL_REPORT_ROOT = os.environ.get("ETL_REPORT_ROOT")
if not ETL_REPORT_ROOT:
    raise RuntimeError("ETL_REPORT_ROOT 未在 .env 中定义")

# PostgreSQL connection string; must be defined in .env.
PG_DSN = os.environ.get("PG_DSN")
if not PG_DSN:
    raise RuntimeError("PG_DSN 未在 .env 中定义")

# Schemas to analyze, in report order.
TARGET_SCHEMAS = ["ods", "dwd", "dws"]
# Skip detailed statistics for these columns (ETL metadata, not business data).
SKIP_STATS_COLS = {"payload", "content_hash", "record_index", "source_file", "source_endpoint"}
|
||||
|
||||
|
||||
def get_conn():
    """Open a read-only PostgreSQL connection that yields dict-style rows."""
    connection = psycopg2.connect(PG_DSN, cursor_factory=psycopg2.extras.RealDictCursor)
    # Read-only session: this script must never mutate the warehouse.
    connection.set_session(readonly=True)
    return connection
|
||||
|
||||
|
||||
def list_tables(conn, schema: str) -> list[str]:
    """Return the names of all base tables in *schema*, alphabetically sorted."""
    query = """
            SELECT table_name FROM information_schema.tables
            WHERE table_schema = %s AND table_type = 'BASE TABLE'
            ORDER BY table_name
        """
    with conn.cursor() as cur:
        cur.execute(query, (schema,))
        rows = cur.fetchall()
    return [row["table_name"] for row in rows]
|
||||
|
||||
|
||||
def get_columns_meta(conn, schema: str, table: str) -> list[dict]:
    """Fetch column metadata for one table, ordered by ordinal position.

    Each dict carries: column_name, udt_name, is_nullable,
    character_maximum_length, numeric_precision, numeric_scale.
    """
    query = """
            SELECT column_name, udt_name, is_nullable,
                   character_maximum_length, numeric_precision, numeric_scale
            FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
            ORDER BY ordinal_position
        """
    with conn.cursor() as cur:
        cur.execute(query, (schema, table))
        return [dict(row) for row in cur.fetchall()]
|
||||
|
||||
|
||||
def analyze_table_batch(conn, schema: str, table: str, columns: list[dict]) -> dict:
    """Profile all columns of ``schema.table`` with as few SQL statements as possible.

    Core idea: build one wide SELECT where every column contributes a handful
    of aggregate expressions, so all statistics arrive in a single round trip.

    Args:
        conn: open psycopg2 connection (RealDictCursor rows expected).
        schema: schema name (comes from information_schema, but quoted anyway).
        table: table name.
        columns: column metadata dicts from get_columns_meta().

    Returns:
        dict with keys "table", "total_rows", "column_count", "columns"
        (per-column stat dicts); an "error" key is added on failure.
    """

    def _quote(name: str) -> str:
        # Double-quote an SQL identifier, escaping embedded quotes, so
        # table/column names with mixed case, spaces or special characters
        # can neither break nor inject into the dynamic SQL below.
        return '"' + name.replace('"', '""') + '"'

    qualified = f"{_quote(schema)}.{_quote(table)}"

    with conn.cursor() as cur:
        # 1) Row count (also probes readability of the table).
        try:
            cur.execute(f"SELECT COUNT(*) AS cnt FROM {qualified}")
            total = cur.fetchone()["cnt"]
        except Exception:
            conn.rollback()
            return {"table": f"{schema}.{table}", "total_rows": -1, "columns": [], "error": "无法读取"}

        if total == 0:
            # Empty table: emit zeroed stats for every column, no further SQL.
            return {
                "table": f"{schema}.{table}",
                "total_rows": 0,
                "column_count": len(columns),
                "columns": [
                    {"column": c["column_name"], "type": c["udt_name"], "total": 0,
                     "null_count": 0, "null_pct": "0%", "distinct": 0, "notes": "空表"}
                    for c in columns
                ],
            }

        # 2) Build the batched aggregate SQL.
        # Per column: COUNT(*) FILTER (WHERE "col" IS NULL) AS null_col
        # Numeric columns: MIN/MAX/AVG
        # Date columns:    MIN/MAX (as text)
        # Text columns:    MIN(LENGTH)/MAX(LENGTH)
        # Bool columns:    COUNT FILTER TRUE/FALSE
        select_parts = [f"{total} AS _total"]
        col_plan = []  # records which stats were requested for each column

        for c in columns:
            cname = c["column_name"]
            udt = c["udt_name"]
            safe = _quote(cname)
            # Result-column alias: identifier-safe variant of the column name.
            alias_base = cname.replace(" ", "_").replace("-", "_")

            plan = {"column": cname, "type": udt, "stats": []}

            # NULL count is collected for every column.
            select_parts.append(f"COUNT(*) FILTER (WHERE {safe} IS NULL) AS null_{alias_base}")
            plan["stats"].append("null")

            # Skip detailed stats for JSONB/bytea and ETL metadata columns.
            if udt in ("jsonb", "json", "bytea") or cname in SKIP_STATS_COLS:
                col_plan.append(plan)
                continue

            if udt in ("int2", "int4", "int8", "float4", "float8", "numeric"):
                select_parts.append(f"MIN({safe}) AS min_{alias_base}")
                select_parts.append(f"MAX({safe}) AS max_{alias_base}")
                select_parts.append(f"ROUND(AVG({safe})::numeric, 2) AS avg_{alias_base}")
                plan["stats"].extend(["min", "max", "avg"])
            elif udt in ("date", "timestamp", "timestamptz"):
                # Compare as text so psycopg2 never has to parse year<1 dates.
                select_parts.append(f"MIN({safe}::text) FILTER (WHERE {safe}::text >= '0001') AS min_{alias_base}")
                select_parts.append(f"MAX({safe}::text) FILTER (WHERE {safe}::text <= '9999') AS max_{alias_base}")
                plan["stats"].extend(["earliest", "latest"])
            elif udt in ("text", "varchar", "bpchar", "name"):
                select_parts.append(f"MIN(LENGTH({safe})) AS minlen_{alias_base}")
                select_parts.append(f"MAX(LENGTH({safe})) AS maxlen_{alias_base}")
                plan["stats"].extend(["min_len", "max_len"])
            elif udt == "bool":
                select_parts.append(f"COUNT(*) FILTER (WHERE {safe} = TRUE) AS true_{alias_base}")
                select_parts.append(f"COUNT(*) FILTER (WHERE {safe} = FALSE) AS false_{alias_base}")
                plan["stats"].extend(["true_count", "false_count"])

            col_plan.append(plan)

        # Execute the batched aggregate.
        sql = f"SELECT {', '.join(select_parts)} FROM {qualified}"
        try:
            cur.execute(sql)
            agg = cur.fetchone()
        except Exception as e:
            conn.rollback()
            return {
                "table": f"{schema}.{table}",
                "total_rows": total,
                "column_count": len(columns),
                "columns": [],
                "error": f"聚合查询失败: {str(e)[:120]}",
            }

        # 3) Unpack the single aggregate row back into per-column dicts.
        results = []
        for plan in col_plan:
            cname = plan["column"]
            udt = plan["type"]
            alias_base = cname.replace(" ", "_").replace("-", "_")

            null_cnt = agg.get(f"null_{alias_base}", 0) or 0
            null_pct = round(null_cnt / total * 100, 1) if total > 0 else 0

            r = {
                "column": cname,
                "type": udt,
                "total": total,
                "null_count": null_cnt,
                "null_pct": f"{null_pct}%",
            }

            if udt in ("jsonb", "json", "bytea"):
                r["samples"] = [f"({udt.upper()})"]
                results.append(r)
                continue
            if cname in SKIP_STATS_COLS:
                r["samples"] = ["(ETL元数据)"]
                results.append(r)
                continue

            if "min" in plan["stats"]:
                r["min"] = agg.get(f"min_{alias_base}")
                r["max"] = agg.get(f"max_{alias_base}")
                r["avg"] = agg.get(f"avg_{alias_base}")
            if "earliest" in plan["stats"]:
                v = agg.get(f"min_{alias_base}")
                r["earliest"] = str(v) if v else None
                v = agg.get(f"max_{alias_base}")
                r["latest"] = str(v) if v else None
            if "min_len" in plan["stats"]:
                r["min_len"] = agg.get(f"minlen_{alias_base}")
                r["max_len"] = agg.get(f"maxlen_{alias_base}")
            if "true_count" in plan["stats"]:
                r["true_count"] = agg.get(f"true_{alias_base}")
                r["false_count"] = agg.get(f"false_{alias_base}")

            results.append(r)

        # 4) DISTINCT counts: per-column queries on small tables, skipped on
        #    big ones (per-column COUNT(DISTINCT) would be too slow there).
        if total <= 3000:
            for r in results:
                cname = r["column"]
                udt = r["type"]
                if udt in ("jsonb", "json", "bytea") or cname in SKIP_STATS_COLS:
                    r["distinct"] = "-"
                    continue
                try:
                    cur.execute(f"SELECT COUNT(DISTINCT {_quote(cname)}) AS d FROM {qualified}")
                    r["distinct"] = cur.fetchone()["d"]
                except Exception:
                    conn.rollback()
                    r["distinct"] = "?"
        else:
            for r in results:
                r["distinct"] = "-"

        return {
            "table": f"{schema}.{table}",
            "total_rows": total,
            "column_count": len(columns),
            "columns": results,
        }
|
||||
|
||||
|
||||
# ── 报告格式化 ────────────────────────────────────────────────
|
||||
|
||||
def fmt_col_row(c: dict) -> str:
    """Render one field's statistics as a single Markdown table row."""
    stat_bits: list[str] = []
    if c.get("min") is not None:
        stat_bits.append(f"min={c['min']}, max={c['max']}, avg={c['avg']}")
    if c.get("earliest") is not None:
        stat_bits.append(f"{c['earliest']} ~ {c['latest']}")
    if c.get("min_len") is not None:
        stat_bits.append(f"len={c['min_len']}~{c['max_len']}")
    if "true_count" in c:
        stat_bits.append(f"T={c['true_count']}, F={c['false_count']}")

    sample_values = c.get("samples", [])
    # At most three samples, each truncated to 40 characters.
    sample_cell = ", ".join(str(s)[:40] for s in sample_values[:3]) if sample_values else "-"

    return "| {} | {} | {} | {} | {} | {} |".format(
        c.get("column", "?"),
        c.get("type", "?"),
        c.get("null_pct", "?"),
        c.get("distinct", "-"),
        "; ".join(stat_bits) if stat_bits else "-",
        sample_cell,
    )
|
||||
|
||||
|
||||
def generate_report(all_results: dict[str, list[dict]]) -> str:
    """Assemble the full Markdown report from per-schema analysis results."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    out: list[str] = [
        "# 字段级数据质量采样报告",
        "",
        f"生成时间: {stamp}",
        "",
    ]

    for schema in TARGET_SCHEMAS:
        schema_tables = all_results.get(schema, [])
        if not schema_tables:
            continue

        # Sum only readable, non-empty tables (failed tables report -1 rows).
        row_sum = sum(t["total_rows"] for t in schema_tables if t["total_rows"] > 0)
        out.append(f"## {schema.upper()} 层({len(schema_tables)} 张表,共 {row_sum:,} 行)")
        out.append("")

        for tbl in schema_tables:
            out.append(f"### {tbl['table']}({tbl['total_rows']:,} 行,{tbl.get('column_count', 0)} 列)")
            out.append("")

            if tbl.get("error"):
                out.extend([f"> ❌ {tbl['error']}", ""])
                continue
            if not tbl["columns"]:
                out.extend(["> 无列信息", ""])
                continue

            out.append("| 字段 | 类型 | NULL率 | 唯一值 | 统计 | 样本 |")
            out.append("|------|------|--------|--------|------|------|")
            out.extend(fmt_col_row(col) for col in tbl["columns"])
            out.append("")

    table_total = sum(len(v) for v in all_results.values())
    column_total = sum(
        tbl.get("column_count", 0)
        for tables in all_results.values()
        for tbl in tables
    )
    out.extend([
        "## 汇总",
        "",
        f"- 分析表数: {table_total}",
        f"- 分析字段数: {column_total}",
        "",
    ])

    return "\n".join(out)
|
||||
|
||||
|
||||
def main():
    """Entry point: profile every table in TARGET_SCHEMAS and write the report."""
    print("=== 字段级数据质量采样分析 (v2) ===")
    conn = get_conn()

    all_results: dict[str, list[dict]] = {}
    for schema in TARGET_SCHEMAS:
        print(f"\n分析 {schema} 层...")
        names = list_tables(conn, schema)
        print(f" {len(names)} 张表")

        collected = []
        for idx, name in enumerate(names, 1):
            meta = get_columns_meta(conn, schema, name)
            print(f" [{idx}/{len(names)}] {schema}.{name} ({len(meta)} 列)...", end="", flush=True)
            outcome = analyze_table_batch(conn, schema, name, meta)
            collected.append(outcome)
            print(f" {outcome['total_rows']:,} 行", end="")
            if outcome.get("error"):
                print(f" ❌ {outcome['error'][:60]}")
            else:
                print(" ✓")
        all_results[schema] = collected

    conn.close()

    print("\n生成报告...")
    report = generate_report(all_results)

    # Write the timestamped Markdown report under ETL_REPORT_ROOT.
    target_dir = Path(ETL_REPORT_ROOT)
    target_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = target_dir / f"field_level_report_{stamp}.md"
    report_path.write_text(report, encoding="utf-8")
    print(f"报告已生成: {report_path}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user