import os

import psycopg2

DSN = 'postgresql://local-Python:Neo-local-1991125@100.64.0.4:5432/LLZQ-test'

conn = psycopg2.connect(DSN)
cur = conn.cursor()

# Fetch all tables in the billiards_dwd schema
cur.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'billiards_dwd'
    ORDER BY table_name
""")
tables = [row[0] for row in cur.fetchall()]

output_dir = r'c:\dev\LLTQ\ETL\feiqiu-ETL\tmp\table_analysis'
os.makedirs(output_dir, exist_ok=True)

for table in tables:
    print(f"Processing {table}...")

    # 1. Fetch column definitions
    cur.execute("""
        SELECT c.column_name, c.data_type, c.character_maximum_length,
               c.numeric_precision, c.numeric_scale, c.is_nullable, c.column_default
        FROM information_schema.columns c
        WHERE c.table_schema = 'billiards_dwd' AND c.table_name = %s
        ORDER BY c.ordinal_position
    """, (table,))
    columns = cur.fetchall()

    # 2. Fetch primary key columns
    cur.execute("""
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
          ON tc.constraint_name = kcu.constraint_name
         AND tc.table_schema = kcu.table_schema
        WHERE tc.table_schema = 'billiards_dwd'
          AND tc.table_name = %s
          AND tc.constraint_type = 'PRIMARY KEY'
    """, (table,))
    pk_cols = [row[0] for row in cur.fetchall()]

    # 3. Fetch up to 500 sample rows
    try:
        cur.execute(f'SELECT * FROM billiards_dwd."{table}" LIMIT 500')
        sample_rows = cur.fetchall()
        col_names = [desc[0] for desc in cur.description]
    except Exception as e:
        conn.rollback()  # a failed statement aborts the transaction; roll back so later queries still work
        sample_rows = []
        col_names = []
        print(f"  Error fetching samples: {e}")

    # 4. Analyse each column's value distribution (used to spot enum-like columns)
    value_analysis = {}
    for col_info in columns:
        col_name = col_info[0]
        data_type = col_info[1]
        # Only run the enum analysis on integer, text and boolean columns
        if data_type in ('integer', 'smallint', 'bigint', 'text', 'character varying', 'boolean'):
            try:
                cur.execute(f"""
                    SELECT "{col_name}", COUNT(*) AS cnt
                    FROM billiards_dwd."{table}"
                    WHERE "{col_name}" IS NOT NULL
                    GROUP BY "{col_name}"
                    ORDER BY cnt DESC
                    LIMIT 20
                """)
                distinct_values = cur.fetchall()

                # Total row count, non-null count and distinct count
                cur.execute(f"""
                    SELECT COUNT(*) AS total,
                           COUNT("{col_name}") AS non_null,
                           COUNT(DISTINCT "{col_name}") AS distinct_count
                    FROM billiards_dwd."{table}"
                """)
                stats = cur.fetchone()

                value_analysis[col_name] = {
                    'total_rows': stats[0],
                    'non_null': stats[1],
                    'distinct_count': stats[2],
                    'top_values': [(str(v[0])[:100] if v[0] is not None else 'NULL', v[1])
                                   for v in distinct_values],
                }
            except Exception as e:
                conn.rollback()
                value_analysis[col_name] = {'error': str(e)}

    # 5. Write the analysis file
    with open(os.path.join(output_dir, f'{table}.txt'), 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write(f"Table: billiards_dwd.{table}\n")
        f.write(f"Primary Key: {', '.join(pk_cols) if pk_cols else 'None'}\n")
        f.write("=" * 80 + "\n\n")

        f.write("## COLUMNS\n")
        f.write("-" * 80 + "\n")
        for col in columns:
            col_name, data_type, char_len, num_prec, num_scale, nullable, default = col
            type_str = data_type
            if char_len:
                type_str = f"{data_type}({char_len})"
            elif num_prec and data_type == 'numeric':
                type_str = f"numeric({num_prec},{num_scale or 0})"
            is_pk = "PK" if col_name in pk_cols else ""
            f.write(f"{col_name}: {type_str} | nullable={nullable} | {is_pk}\n")

        f.write("\n## VALUE ANALYSIS (for enum detection)\n")
        f.write("-" * 80 + "\n")
        for col_name, analysis in value_analysis.items():
            if 'error' in analysis:
                f.write(f"\n{col_name}: ERROR - {analysis['error']}\n")
                continue
            f.write(f"\n{col_name}:\n")
            f.write(f"  Total: {analysis['total_rows']}, Non-null: {analysis['non_null']}, Distinct: {analysis['distinct_count']}\n")
            # 15 or fewer distinct values suggests an enum-like column
            if 0 < analysis['distinct_count'] <= 15:
                f.write("  *** LIKELY ENUM (distinct <= 15) ***\n")
            f.write("  Top values:\n")
            for val, cnt in analysis['top_values'][:15]:
                f.write(f"    {val}: {cnt}\n")

        # Write the first 10 sample rows
        f.write("\n## SAMPLE DATA (first 10 rows)\n")
        f.write("-" * 80 + "\n")
        if sample_rows:
            f.write(f"Columns: {col_names}\n\n")
            for i, row in enumerate(sample_rows[:10]):
                f.write(f"Row {i + 1}:\n")
                for j, val in enumerate(row):
                    val_str = str(val)[:200] if val is not None else 'NULL'
                    f.write(f"  {col_names[j]}: {val_str}\n")
                f.write("\n")
        else:
            f.write("No sample data available.\n")

print(f"\nGenerated analysis files for {len(tables)} tables in {output_dir}")
cur.close()
conn.close()