# Source: feiqiu-ETL/tmp/query_schema_and_samples.py (156 lines, 5.7 KiB)
# NOTE: this file was captured from a repository web view; the original
# "Files / Raw / Blame / History" navigation text and the ambiguous-Unicode
# warning were viewer artifacts, not part of the script.
import psycopg2
import os
import json

# SECURITY NOTE(review): the fallback DSN embeds a plaintext password in
# source control. Prefer supplying PG_DSN via the environment; the literal
# is kept only as a backward-compatible default so existing runs still work.
DSN = os.environ.get(
    'PG_DSN',
    'postgresql://local-Python:Neo-local-1991125@100.64.0.4:5432/LLZQ-test',
)

conn = psycopg2.connect(DSN)
cur = conn.cursor()

# Enumerate every table in the billiards_dwd schema, in a stable order.
cur.execute("""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'billiards_dwd'
ORDER BY table_name
""")
tables = [row[0] for row in cur.fetchall()]

# One analysis text file per table is written into this directory.
output_dir = r'c:\dev\LLTQ\ETL\feiqiu-ETL\tmp\table_analysis'
os.makedirs(output_dir, exist_ok=True)
def _quote_ident(name):
    """Return *name* as a double-quoted SQL identifier.

    Table/column names come from information_schema (semi-trusted), but a
    name containing a double quote would break — or inject into — the
    f-string SQL below, so embedded quotes are doubled per the SQL standard.
    """
    return '"' + name.replace('"', '""') + '"'


# Value-distribution analysis is only meaningful for discrete types.
_ENUM_CANDIDATE_TYPES = (
    'integer', 'smallint', 'bigint', 'text', 'character varying', 'boolean',
)

for table in tables:
    print(f"Processing {table}...")
    qtable = _quote_ident(table)

    # 1. Column metadata: type, length/precision, nullability, default.
    cur.execute("""
SELECT
c.column_name,
c.data_type,
c.character_maximum_length,
c.numeric_precision,
c.numeric_scale,
c.is_nullable,
c.column_default
FROM information_schema.columns c
WHERE c.table_schema = 'billiards_dwd' AND c.table_name = %s
ORDER BY c.ordinal_position
""", (table,))
    columns = cur.fetchall()

    # 2. Primary-key columns (may be empty for keyless DWD tables).
    cur.execute("""
SELECT kcu.column_name
FROM information_schema.table_constraints tc
JOIN information_schema.key_column_usage kcu
ON tc.constraint_name = kcu.constraint_name
AND tc.table_schema = kcu.table_schema
WHERE tc.table_schema = 'billiards_dwd'
AND tc.table_name = %s
AND tc.constraint_type = 'PRIMARY KEY'
""", (table,))
    pk_cols = [row[0] for row in cur.fetchall()]

    # 3. Up to 500 sample rows. Best effort: a permission or type error on
    # one table must not abort the whole sweep, so log and continue.
    try:
        cur.execute(f'SELECT * FROM billiards_dwd.{qtable} LIMIT 500')
        sample_rows = cur.fetchall()
        col_names = [desc[0] for desc in cur.description]
    except Exception as e:
        sample_rows = []
        col_names = []
        print(f" Error fetching samples: {e}")

    # 4. Per-column value distribution, used below to flag enum-like columns.
    value_analysis = {}
    for col_info in columns:
        col_name = col_info[0]
        data_type = col_info[1]
        if data_type not in _ENUM_CANDIDATE_TYPES:
            continue
        qcol = _quote_ident(col_name)
        try:
            # Top-20 most frequent non-null values.
            cur.execute(f"""
SELECT {qcol}, COUNT(*) as cnt
FROM billiards_dwd.{qtable}
WHERE {qcol} IS NOT NULL
GROUP BY {qcol}
ORDER BY cnt DESC
LIMIT 20
""")
            distinct_values = cur.fetchall()
            # Row counts: total, non-null, and distinct for this column.
            cur.execute(f"""
SELECT
COUNT(*) as total,
COUNT({qcol}) as non_null,
COUNT(DISTINCT {qcol}) as distinct_count
FROM billiards_dwd.{qtable}
""")
            stats = cur.fetchone()
            value_analysis[col_name] = {
                'total_rows': stats[0],
                'non_null': stats[1],
                'distinct_count': stats[2],
                # Values are stringified and truncated to 100 chars for the report.
                'top_values': [
                    (str(v[0])[:100] if v[0] is not None else 'NULL', v[1])
                    for v in distinct_values
                ],
            }
        except Exception as e:
            # Record the failure per column; one bad column must not abort
            # analysis of the remaining columns.
            value_analysis[col_name] = {'error': str(e)}

    # 5. Write the per-table analysis report.
    with open(os.path.join(output_dir, f'{table}.txt'), 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write(f"Table: billiards_dwd.{table}\n")
        f.write(f"Primary Key: {', '.join(pk_cols) if pk_cols else 'None'}\n")
        f.write("=" * 80 + "\n\n")

        f.write("## COLUMNS\n")
        f.write("-" * 80 + "\n")
        for col in columns:
            col_name, data_type, char_len, num_prec, num_scale, nullable, default = col
            type_str = data_type
            if char_len:
                type_str = f"{data_type}({char_len})"
            elif num_prec and data_type == 'numeric':
                type_str = f"numeric({num_prec},{num_scale or 0})"
            is_pk = "PK" if col_name in pk_cols else ""
            f.write(f"{col_name}: {type_str} | nullable={nullable} | {is_pk}\n")

        f.write("\n## VALUE ANALYSIS (for enum detection)\n")
        f.write("-" * 80 + "\n")
        for col_name, analysis in value_analysis.items():
            if 'error' in analysis:
                f.write(f"\n{col_name}: ERROR - {analysis['error']}\n")
                continue
            f.write(f"\n{col_name}:\n")
            f.write(f" Total: {analysis['total_rows']}, Non-null: {analysis['non_null']}, Distinct: {analysis['distinct_count']}\n")
            # Heuristic: a low distinct count suggests an enum-coded column.
            if 0 < analysis['distinct_count'] <= 15:
                f.write(" *** LIKELY ENUM (distinct <= 15) ***\n")
            # NOTE(review): source indentation was lost; top values are
            # written for every analyzed column here — confirm whether the
            # original printed them only for likely enums.
            f.write(" Top values:\n")
            for val, cnt in analysis['top_values'][:15]:
                f.write(f" {val}: {cnt}\n")

        f.write("\n## SAMPLE DATA (first 10 rows)\n")
        f.write("-" * 80 + "\n")
        if sample_rows:
            f.write(f"Columns: {col_names}\n\n")
            for i, row in enumerate(sample_rows[:10]):
                f.write(f"Row {i+1}:\n")
                for j, val in enumerate(row):
                    val_str = str(val)[:200] if val is not None else 'NULL'
                    f.write(f" {col_names[j]}: {val_str}\n")
                f.write("\n")
        else:
            f.write("No sample data available.\n")
print(f"\nGenerated analysis files for {len(tables)} tables in {output_dir}")

# Release DB resources; close the cursor explicitly before the connection
# (the original leaked the cursor and relied on GC).
cur.close()
conn.close()