import os

import psycopg2

DSN = 'postgresql://local-Python:Neo-local-1991125@100.64.0.4:5432/LLZQ-test'
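# NOTE: the DSN (credentials included) is hardcoded as in the original script.
# As a sketch, it could instead fall back to an environment variable (the name
# 'BILLIARDS_DSN' is hypothetical):
#   DSN = os.environ.get('BILLIARDS_DSN', DSN)
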
conn = psycopg2.connect(DSN)
cur = conn.cursor()

# Fetch all tables in the billiards_dwd schema
cur.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'billiards_dwd'
    ORDER BY table_name
""")
tables = [row[0] for row in cur.fetchall()]
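# NOTE: information_schema.tables also lists views; if only base tables should
# be analysed, the query above could additionally filter on
# table_type = 'BASE TABLE'.
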
output_dir = r'c:\dev\LLTQ\ETL\feiqiu-ETL\tmp\table_analysis'
os.makedirs(output_dir, exist_ok=True)

for table in tables:
    print(f"Processing {table}...")

    # 1. Fetch the column structure
    cur.execute("""
        SELECT
            c.column_name,
            c.data_type,
            c.character_maximum_length,
            c.numeric_precision,
            c.numeric_scale,
            c.is_nullable,
            c.column_default
        FROM information_schema.columns c
        WHERE c.table_schema = 'billiards_dwd' AND c.table_name = %s
        ORDER BY c.ordinal_position
    """, (table,))
    columns = cur.fetchall()

    # 2. Fetch primary-key information
    cur.execute("""
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
            ON tc.constraint_name = kcu.constraint_name
            AND tc.table_schema = kcu.table_schema
        WHERE tc.table_schema = 'billiards_dwd'
            AND tc.table_name = %s
            AND tc.constraint_type = 'PRIMARY KEY'
        ORDER BY kcu.ordinal_position  -- preserve composite-key column order
    """, (table,))
    pk_cols = [row[0] for row in cur.fetchall()]

    # 3. Fetch a 500-row data sample
    try:
        cur.execute(f'SELECT * FROM billiards_dwd."{table}" LIMIT 500')
        sample_rows = cur.fetchall()
        col_names = [desc[0] for desc in cur.description]
    except Exception as e:
        # A failed statement leaves the transaction aborted in PostgreSQL;
        # roll back so the following queries can still run.
        conn.rollback()
        sample_rows = []
        col_names = []
        print(f" Error fetching samples: {e}")

    # 4. Analyse the value distribution of each column (used to identify enums)
    value_analysis = {}
    for col_info in columns:
        col_name = col_info[0]
        data_type = col_info[1]

        # Only run the enum analysis on integer, text and boolean columns
        if data_type in ('integer', 'smallint', 'bigint', 'text', 'character varying', 'boolean'):
            try:
                cur.execute(f"""
                    SELECT "{col_name}", COUNT(*) as cnt
                    FROM billiards_dwd."{table}"
                    WHERE "{col_name}" IS NOT NULL
                    GROUP BY "{col_name}"
                    ORDER BY cnt DESC
                    LIMIT 20
                """)
                distinct_values = cur.fetchall()

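                # NOTE: interpolating identifiers via the f-string assumes the
                # names returned by information_schema contain no embedded
                # double quotes. A stricter sketch would build the statement
                # with psycopg2's sql module instead:
                #   from psycopg2 import sql
                #   query = sql.SQL(
                #       'SELECT {col}, COUNT(*) AS cnt FROM {schema}.{tbl} '
                #       'WHERE {col} IS NOT NULL GROUP BY {col} '
                #       'ORDER BY cnt DESC LIMIT 20'
                #   ).format(col=sql.Identifier(col_name),
                #            schema=sql.Identifier('billiards_dwd'),
                #            tbl=sql.Identifier(table))
                #   cur.execute(query)
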
                # Total, non-null, and distinct counts for the column
                cur.execute(f"""
                    SELECT
                        COUNT(*) as total,
                        COUNT("{col_name}") as non_null,
                        COUNT(DISTINCT "{col_name}") as distinct_count
                    FROM billiards_dwd."{table}"
                """)
                stats = cur.fetchone()

                value_analysis[col_name] = {
                    'total_rows': stats[0],
                    'non_null': stats[1],
                    'distinct_count': stats[2],
                    'top_values': [(str(v[0])[:100] if v[0] is not None else 'NULL', v[1])
                                   for v in distinct_values]
                }
            except Exception as e:
                conn.rollback()  # recover the aborted transaction before continuing
                value_analysis[col_name] = {'error': str(e)}
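
    # NOTE: each analysed column runs two full scans of the table (the GROUP BY
    # and the COUNT query), which can be slow on large tables. A cheaper sketch
    # would sample, e.g. appending TABLESAMPLE SYSTEM (1) to the FROM clause,
    # at the cost of approximate statistics.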

    # 5. Write the analysis file
    with open(os.path.join(output_dir, f'{table}.txt'), 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write(f"Table: billiards_dwd.{table}\n")
        f.write(f"Primary Key: {', '.join(pk_cols) if pk_cols else 'None'}\n")
        f.write("=" * 80 + "\n\n")

        f.write("## COLUMNS\n")
        f.write("-" * 80 + "\n")
        for col in columns:
            col_name, data_type, char_len, num_prec, num_scale, nullable, default = col
            type_str = data_type
            if char_len:
                type_str = f"{data_type}({char_len})"
            elif num_prec and data_type == 'numeric':
                type_str = f"numeric({num_prec},{num_scale or 0})"

            is_pk = "PK" if col_name in pk_cols else ""
            f.write(f"{col_name}: {type_str} | nullable={nullable} | {is_pk}\n")

f.write("\n## VALUE ANALYSIS (for enum detection)\n")
|
||
f.write("-" * 80 + "\n")
|
||
for col_name, analysis in value_analysis.items():
|
||
if 'error' in analysis:
|
||
f.write(f"\n{col_name}: ERROR - {analysis['error']}\n")
|
||
continue
|
||
|
||
f.write(f"\n{col_name}:\n")
|
||
f.write(f" Total: {analysis['total_rows']}, Non-null: {analysis['non_null']}, Distinct: {analysis['distinct_count']}\n")
|
||
|
||
# 如果不同值少于15个,可能是枚举
|
||
if analysis['distinct_count'] <= 15 and analysis['distinct_count'] > 0:
|
||
f.write(f" *** LIKELY ENUM (distinct <= 15) ***\n")
|
||
|
||
f.write(f" Top values:\n")
|
||
for val, cnt in analysis['top_values'][:15]:
|
||
f.write(f" {val}: {cnt}\n")
|
||
|
||
        # Write the first 10 rows of the sample data
        f.write("\n## SAMPLE DATA (first 10 rows)\n")
        f.write("-" * 80 + "\n")
        if sample_rows:
            f.write(f"Columns: {col_names}\n\n")
            for i, row in enumerate(sample_rows[:10]):
                f.write(f"Row {i+1}:\n")
                for j, val in enumerate(row):
                    val_str = str(val)[:200] if val is not None else 'NULL'
                    f.write(f" {col_names[j]}: {val_str}\n")
                f.write("\n")
        else:
            f.write("No sample data available.\n")

print(f"\nGenerated analysis files for {len(tables)} tables in {output_dir}")
|
||
conn.close()
|