数据库数据校验、写入等逻辑更新。

This commit is contained in:
Neo
2026-02-01 03:46:16 +08:00
parent 9948000b71
commit 076f5755ca
128 changed files with 494310 additions and 2819 deletions

View File

@@ -0,0 +1,155 @@
import psycopg2
import os
import json
# Connection string for the DWD analysis database.
# SECURITY NOTE: credentials were hard-coded here; keep the original value
# as a fallback for backward compatibility, but prefer supplying the DSN
# via the PG_DSN environment variable so secrets stay out of source control.
DSN = os.environ.get(
    'PG_DSN',
    'postgresql://local-Python:Neo-local-1991125@100.64.0.4:5432/LLZQ-test',
)
conn = psycopg2.connect(DSN)
cur = conn.cursor()
# Enumerate every table in the billiards_dwd schema, alphabetically.
cur.execute("""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'billiards_dwd'
ORDER BY table_name
""")
tables = [record[0] for record in cur.fetchall()]

# Per-table analysis reports are written into this directory.
output_dir = r'c:\dev\LLTQ\ETL\feiqiu-ETL\tmp\table_analysis'
os.makedirs(output_dir, exist_ok=True)
# ---------------------------------------------------------------------------
# Per-table analysis helpers.
# NOTE(review): the pasted original had its indentation stripped; the nesting
# below is reconstructed from the code's logic.
# ---------------------------------------------------------------------------

def _fetch_columns(cur, table):
    """Return column metadata (name, type, length, precision, scale,
    nullability, default) for *table* in billiards_dwd, in ordinal order."""
    cur.execute("""
        SELECT
            c.column_name,
            c.data_type,
            c.character_maximum_length,
            c.numeric_precision,
            c.numeric_scale,
            c.is_nullable,
            c.column_default
        FROM information_schema.columns c
        WHERE c.table_schema = 'billiards_dwd' AND c.table_name = %s
        ORDER BY c.ordinal_position
    """, (table,))
    return cur.fetchall()


def _fetch_primary_key(cur, table):
    """Return the list of primary-key column names for *table*."""
    cur.execute("""
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
            ON tc.constraint_name = kcu.constraint_name
            AND tc.table_schema = kcu.table_schema
        WHERE tc.table_schema = 'billiards_dwd'
            AND tc.table_name = %s
            AND tc.constraint_type = 'PRIMARY KEY'
    """, (table,))
    return [row[0] for row in cur.fetchall()]


def _fetch_samples(conn, cur, table):
    """Fetch up to 500 sample rows; return (rows, column_names).

    On failure the transaction is rolled back so the connection does not
    remain in the "current transaction is aborted" state — without the
    rollback, every subsequent query on this connection would fail too.
    """
    try:
        cur.execute(f'SELECT * FROM billiards_dwd."{table}" LIMIT 500')
        return cur.fetchall(), [desc[0] for desc in cur.description]
    except Exception as e:
        conn.rollback()  # BUGFIX: recover the connection after a failed query
        print(f" Error fetching samples: {e}")
        return [], []


def _analyze_values(conn, cur, table, columns):
    """Build per-column value statistics used to spot enum-like columns.

    For each integer / text / boolean column: total row count, non-null
    count, distinct count, and the 20 most frequent values (truncated to
    100 chars each). Returns {column_name: stats_dict_or_error}.
    """
    enum_candidate_types = ('integer', 'smallint', 'bigint', 'text',
                            'character varying', 'boolean')
    analysis = {}
    for col_info in columns:
        col_name, data_type = col_info[0], col_info[1]
        # Only integer, text and boolean types can plausibly be enums.
        if data_type not in enum_candidate_types:
            continue
        try:
            cur.execute(f"""
                SELECT "{col_name}", COUNT(*) as cnt
                FROM billiards_dwd."{table}"
                WHERE "{col_name}" IS NOT NULL
                GROUP BY "{col_name}"
                ORDER BY cnt DESC
                LIMIT 20
            """)
            distinct_values = cur.fetchall()
            # Total rows, non-null count and distinct count for the column.
            cur.execute(f"""
                SELECT
                    COUNT(*) as total,
                    COUNT("{col_name}") as non_null,
                    COUNT(DISTINCT "{col_name}") as distinct_count
                FROM billiards_dwd."{table}"
            """)
            stats = cur.fetchone()
            analysis[col_name] = {
                'total_rows': stats[0],
                'non_null': stats[1],
                'distinct_count': stats[2],
                'top_values': [
                    (str(v[0])[:100] if v[0] is not None else 'NULL', v[1])
                    for v in distinct_values
                ],
            }
        except Exception as e:
            conn.rollback()  # BUGFIX: same aborted-transaction recovery as above
            analysis[col_name] = {'error': str(e)}
    return analysis


def _write_report(path, table, columns, pk_cols, value_analysis,
                  sample_rows, col_names):
    """Write one human-readable analysis report for *table* to *path*."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write(f"Table: billiards_dwd.{table}\n")
        f.write(f"Primary Key: {', '.join(pk_cols) if pk_cols else 'None'}\n")
        f.write("=" * 80 + "\n\n")

        f.write("## COLUMNS\n")
        f.write("-" * 80 + "\n")
        for col in columns:
            col_name, data_type, char_len, num_prec, num_scale, nullable, default = col
            type_str = data_type
            if char_len:
                type_str = f"{data_type}({char_len})"
            elif num_prec and data_type == 'numeric':
                type_str = f"numeric({num_prec},{num_scale or 0})"
            is_pk = "PK" if col_name in pk_cols else ""
            f.write(f"{col_name}: {type_str} | nullable={nullable} | {is_pk}\n")

        f.write("\n## VALUE ANALYSIS (for enum detection)\n")
        f.write("-" * 80 + "\n")
        for col_name, analysis in value_analysis.items():
            if 'error' in analysis:
                f.write(f"\n{col_name}: ERROR - {analysis['error']}\n")
                continue
            f.write(f"\n{col_name}:\n")
            f.write(f" Total: {analysis['total_rows']}, Non-null: {analysis['non_null']}, Distinct: {analysis['distinct_count']}\n")
            # Few distinct values over many rows usually signals an enum.
            if 0 < analysis['distinct_count'] <= 15:
                f.write(" *** LIKELY ENUM (distinct <= 15) ***\n")
            f.write(" Top values:\n")
            for val, cnt in analysis['top_values'][:15]:
                f.write(f" {val}: {cnt}\n")

        # First 10 sample rows, one value per line, truncated to 200 chars.
        f.write("\n## SAMPLE DATA (first 10 rows)\n")
        f.write("-" * 80 + "\n")
        if sample_rows:
            f.write(f"Columns: {col_names}\n\n")
            for i, row in enumerate(sample_rows[:10]):
                f.write(f"Row {i+1}:\n")
                for j, val in enumerate(row):
                    val_str = str(val)[:200] if val is not None else 'NULL'
                    f.write(f" {col_names[j]}: {val_str}\n")
                f.write("\n")
        else:
            f.write("No sample data available.\n")


# Main loop: analyze each table and emit <output_dir>/<table>.txt.
for table in tables:
    print(f"Processing {table}...")
    columns = _fetch_columns(cur, table)
    pk_cols = _fetch_primary_key(cur, table)
    sample_rows, col_names = _fetch_samples(conn, cur, table)
    value_analysis = _analyze_values(conn, cur, table, columns)
    _write_report(os.path.join(output_dir, f'{table}.txt'), table, columns,
                  pk_cols, value_analysis, sample_rows, col_names)
print(f"\nGenerated analysis files for {len(tables)} tables in {output_dir}")
# BUGFIX: the cursor was never closed; close it before the connection to
# release server-side resources cleanly.
cur.close()
conn.close()