Database: update data validation and write logic.
155
tmp/query_schema_and_samples.py
Normal file
@@ -0,0 +1,155 @@
import os

import psycopg2

DSN = 'postgresql://local-Python:Neo-local-1991125@100.64.0.4:5432/LLZQ-test'
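
# NOTE: a hardcoded DSN is acceptable for a throwaway tmp/ script; if this file
# outgrows tmp/, the usual pattern is to read the DSN from the environment, e.g.
#   DSN = os.environ.get('FEIQIU_DSN', DSN)  # 'FEIQIU_DSN' is a hypothetical name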

conn = psycopg2.connect(DSN)
cur = conn.cursor()

# Fetch every table in the target schema
cur.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'billiards_dwd'
    ORDER BY table_name
""")
tables = [row[0] for row in cur.fetchall()]

output_dir = r'c:\dev\LLTQ\ETL\feiqiu-ETL\tmp\table_analysis'
os.makedirs(output_dir, exist_ok=True)

for table in tables:
    print(f"Processing {table}...")

    # 1. Fetch the column definitions
    cur.execute("""
        SELECT
            c.column_name,
            c.data_type,
            c.character_maximum_length,
            c.numeric_precision,
            c.numeric_scale,
            c.is_nullable,
            c.column_default
        FROM information_schema.columns c
        WHERE c.table_schema = 'billiards_dwd' AND c.table_name = %s
        ORDER BY c.ordinal_position
    """, (table,))
    columns = cur.fetchall()

    # 2. Fetch the primary key columns
    cur.execute("""
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
            ON tc.constraint_name = kcu.constraint_name
            AND tc.table_schema = kcu.table_schema
        WHERE tc.table_schema = 'billiards_dwd'
            AND tc.table_name = %s
            AND tc.constraint_type = 'PRIMARY KEY'
    """, (table,))
    pk_cols = [row[0] for row in cur.fetchall()]

    # 3. Fetch up to 500 sample rows
    try:
        cur.execute(f'SELECT * FROM billiards_dwd."{table}" LIMIT 500')
        sample_rows = cur.fetchall()
        col_names = [desc[0] for desc in cur.description]
    except Exception as e:
        conn.rollback()  # clear the failed transaction so later queries still run
        sample_rows = []
        col_names = []
        print(f"  Error fetching samples: {e}")
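
    # psycopg2 leaves the connection in an aborted-transaction state after a
    # failed statement; without the rollback above, every subsequent query on
    # this connection would raise InFailedSqlTransaction.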

    # 4. Analyze each column's value distribution (to spot enum-like fields)
    value_analysis = {}
    for col_info in columns:
        col_name = col_info[0]
        data_type = col_info[1]

        # Only integer, text and boolean columns are candidates for enum analysis
        if data_type in ('integer', 'smallint', 'bigint', 'text', 'character varying', 'boolean'):
            try:
                cur.execute(f"""
                    SELECT "{col_name}", COUNT(*) as cnt
                    FROM billiards_dwd."{table}"
                    WHERE "{col_name}" IS NOT NULL
                    GROUP BY "{col_name}"
                    ORDER BY cnt DESC
                    LIMIT 20
                """)
                distinct_values = cur.fetchall()

                # Total, non-null and distinct counts for the column
                cur.execute(f"""
                    SELECT
                        COUNT(*) as total,
                        COUNT("{col_name}") as non_null,
                        COUNT(DISTINCT "{col_name}") as distinct_count
                    FROM billiards_dwd."{table}"
                """)
                stats = cur.fetchone()

                value_analysis[col_name] = {
                    'total_rows': stats[0],
                    'non_null': stats[1],
                    'distinct_count': stats[2],
                    'top_values': [(str(v[0])[:100] if v[0] is not None else 'NULL', v[1]) for v in distinct_values]
                }
            except Exception as e:
                conn.rollback()  # keep the connection usable for the next column
                value_analysis[col_name] = {'error': str(e)}
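
    # Note: the stats query above costs one full table scan per analyzed column.
    # If that becomes slow, the counts for all columns can be folded into a
    # single scan, e.g. (sketch only; "col_a"/"col_b" stand in for real names):
    #   SELECT COUNT(*),
    #          COUNT("col_a"), COUNT(DISTINCT "col_a"),
    #          COUNT("col_b"), COUNT(DISTINCT "col_b")
    #   FROM billiards_dwd."<table>"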

    # 5. Write the analysis file
    with open(os.path.join(output_dir, f'{table}.txt'), 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write(f"Table: billiards_dwd.{table}\n")
        f.write(f"Primary Key: {', '.join(pk_cols) if pk_cols else 'None'}\n")
        f.write("=" * 80 + "\n\n")

        f.write("## COLUMNS\n")
        f.write("-" * 80 + "\n")
        for col in columns:
            col_name, data_type, char_len, num_prec, num_scale, nullable, default = col
            type_str = data_type
            if char_len:
                type_str = f"{data_type}({char_len})"
            elif num_prec and data_type == 'numeric':
                type_str = f"numeric({num_prec},{num_scale or 0})"

            is_pk = "PK" if col_name in pk_cols else ""
            f.write(f"{col_name}: {type_str} | nullable={nullable} | {is_pk}\n")

        f.write("\n## VALUE ANALYSIS (for enum detection)\n")
        f.write("-" * 80 + "\n")
        for col_name, analysis in value_analysis.items():
            if 'error' in analysis:
                f.write(f"\n{col_name}: ERROR - {analysis['error']}\n")
                continue

            f.write(f"\n{col_name}:\n")
            f.write(f"  Total: {analysis['total_rows']}, Non-null: {analysis['non_null']}, Distinct: {analysis['distinct_count']}\n")

            # 15 or fewer distinct values suggests an enum-like column
            if analysis['distinct_count'] <= 15 and analysis['distinct_count'] > 0:
                f.write("  *** LIKELY ENUM (distinct <= 15) ***\n")

            f.write("  Top values:\n")
            for val, cnt in analysis['top_values'][:15]:
                f.write(f"    {val}: {cnt}\n")

        # Write the first 10 sample rows
        f.write("\n## SAMPLE DATA (first 10 rows)\n")
        f.write("-" * 80 + "\n")
        if sample_rows:
            f.write(f"Columns: {col_names}\n\n")
            for i, row in enumerate(sample_rows[:10]):
                f.write(f"Row {i+1}:\n")
                for j, val in enumerate(row):
                    val_str = str(val)[:200] if val is not None else 'NULL'
                    f.write(f"  {col_names[j]}: {val_str}\n")
                f.write("\n")
        else:
            f.write("No sample data available.\n")

print(f"\nGenerated analysis files for {len(tables)} tables in {output_dir}")
cur.close()
conn.close()
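
# For reference, each generated table_analysis/<table>.txt ends up shaped like
# this (placeholder values, not real output):
#   ================================================================================
#   Table: billiards_dwd.some_table
#   Primary Key: id
#   ================================================================================
#   ## COLUMNS
#   ...
#   ## VALUE ANALYSIS (for enum detection)
#   ...
#   ## SAMPLE DATA (first 10 rows)
#   ...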