Database: update data validation and write logic.
155
tmp/query_schema_and_samples.py
Normal file
@@ -0,0 +1,155 @@
import os

import psycopg2

DSN = 'postgresql://local-Python:Neo-local-1991125@100.64.0.4:5432/LLZQ-test'
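
# NOTE: a hardcoded DSN is acceptable for a throwaway tmp/ script; if this file
# outgrows tmp/, the usual pattern is to read the DSN from the environment, e.g.
#   DSN = os.environ.get('FEIQIU_DSN', DSN)  # 'FEIQIU_DSN' is a hypothetical name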

conn = psycopg2.connect(DSN)
cur = conn.cursor()

# Fetch every table in the target schema
cur.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = 'billiards_dwd'
    ORDER BY table_name
""")
tables = [row[0] for row in cur.fetchall()]

output_dir = r'c:\dev\LLTQ\ETL\feiqiu-ETL\tmp\table_analysis'
os.makedirs(output_dir, exist_ok=True)

for table in tables:
    print(f"Processing {table}...")

    # 1. Fetch the column definitions
    cur.execute("""
        SELECT
            c.column_name,
            c.data_type,
            c.character_maximum_length,
            c.numeric_precision,
            c.numeric_scale,
            c.is_nullable,
            c.column_default
        FROM information_schema.columns c
        WHERE c.table_schema = 'billiards_dwd' AND c.table_name = %s
        ORDER BY c.ordinal_position
    """, (table,))
    columns = cur.fetchall()

    # 2. Fetch the primary key columns
    cur.execute("""
        SELECT kcu.column_name
        FROM information_schema.table_constraints tc
        JOIN information_schema.key_column_usage kcu
            ON tc.constraint_name = kcu.constraint_name
            AND tc.table_schema = kcu.table_schema
        WHERE tc.table_schema = 'billiards_dwd'
            AND tc.table_name = %s
            AND tc.constraint_type = 'PRIMARY KEY'
    """, (table,))
    pk_cols = [row[0] for row in cur.fetchall()]

    # 3. Fetch up to 500 sample rows
    try:
        cur.execute(f'SELECT * FROM billiards_dwd."{table}" LIMIT 500')
        sample_rows = cur.fetchall()
        col_names = [desc[0] for desc in cur.description]
    except Exception as e:
        conn.rollback()  # clear the failed transaction so later queries still run
        sample_rows = []
        col_names = []
        print(f"  Error fetching samples: {e}")
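
    # psycopg2 leaves the connection in an aborted-transaction state after a
    # failed statement; without the rollback above, every subsequent query on
    # this connection would raise InFailedSqlTransaction.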

    # 4. Analyze each column's value distribution (to spot enum-like fields)
    value_analysis = {}
    for col_info in columns:
        col_name = col_info[0]
        data_type = col_info[1]

        # Only integer, text and boolean columns are candidates for enum analysis
        if data_type in ('integer', 'smallint', 'bigint', 'text', 'character varying', 'boolean'):
            try:
                cur.execute(f"""
                    SELECT "{col_name}", COUNT(*) as cnt
                    FROM billiards_dwd."{table}"
                    WHERE "{col_name}" IS NOT NULL
                    GROUP BY "{col_name}"
                    ORDER BY cnt DESC
                    LIMIT 20
                """)
                distinct_values = cur.fetchall()

                # Total, non-null and distinct counts for the column
                cur.execute(f"""
                    SELECT
                        COUNT(*) as total,
                        COUNT("{col_name}") as non_null,
                        COUNT(DISTINCT "{col_name}") as distinct_count
                    FROM billiards_dwd."{table}"
                """)
                stats = cur.fetchone()

                value_analysis[col_name] = {
                    'total_rows': stats[0],
                    'non_null': stats[1],
                    'distinct_count': stats[2],
                    'top_values': [(str(v[0])[:100] if v[0] is not None else 'NULL', v[1]) for v in distinct_values]
                }
            except Exception as e:
                conn.rollback()  # keep the connection usable for the next column
                value_analysis[col_name] = {'error': str(e)}
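
    # Note: the stats query above costs one full table scan per analyzed column.
    # If that becomes slow, the counts for all columns can be folded into a
    # single scan, e.g. (sketch only; "col_a"/"col_b" stand in for real names):
    #   SELECT COUNT(*),
    #          COUNT("col_a"), COUNT(DISTINCT "col_a"),
    #          COUNT("col_b"), COUNT(DISTINCT "col_b")
    #   FROM billiards_dwd."<table>"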

    # 5. Write the analysis file
    with open(os.path.join(output_dir, f'{table}.txt'), 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write(f"Table: billiards_dwd.{table}\n")
        f.write(f"Primary Key: {', '.join(pk_cols) if pk_cols else 'None'}\n")
        f.write("=" * 80 + "\n\n")

        f.write("## COLUMNS\n")
        f.write("-" * 80 + "\n")
        for col in columns:
            col_name, data_type, char_len, num_prec, num_scale, nullable, default = col
            type_str = data_type
            if char_len:
                type_str = f"{data_type}({char_len})"
            elif num_prec and data_type == 'numeric':
                type_str = f"numeric({num_prec},{num_scale or 0})"

            is_pk = "PK" if col_name in pk_cols else ""
            f.write(f"{col_name}: {type_str} | nullable={nullable} | {is_pk}\n")

        f.write("\n## VALUE ANALYSIS (for enum detection)\n")
        f.write("-" * 80 + "\n")
        for col_name, analysis in value_analysis.items():
            if 'error' in analysis:
                f.write(f"\n{col_name}: ERROR - {analysis['error']}\n")
                continue

            f.write(f"\n{col_name}:\n")
            f.write(f"  Total: {analysis['total_rows']}, Non-null: {analysis['non_null']}, Distinct: {analysis['distinct_count']}\n")

            # 15 or fewer distinct values suggests an enum-like column
            if analysis['distinct_count'] <= 15 and analysis['distinct_count'] > 0:
                f.write("  *** LIKELY ENUM (distinct <= 15) ***\n")

            f.write("  Top values:\n")
            for val, cnt in analysis['top_values'][:15]:
                f.write(f"    {val}: {cnt}\n")

        # Write the first 10 sample rows
        f.write("\n## SAMPLE DATA (first 10 rows)\n")
        f.write("-" * 80 + "\n")
        if sample_rows:
            f.write(f"Columns: {col_names}\n\n")
            for i, row in enumerate(sample_rows[:10]):
                f.write(f"Row {i+1}:\n")
                for j, val in enumerate(row):
                    val_str = str(val)[:200] if val is not None else 'NULL'
                    f.write(f"  {col_names[j]}: {val_str}\n")
                f.write("\n")
        else:
            f.write("No sample data available.\n")

print(f"\nGenerated analysis files for {len(tables)} tables in {output_dir}")
cur.close()
conn.close()
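
# For reference, each generated table_analysis/<table>.txt ends up shaped like
# this (placeholder values, not real output):
#   ================================================================================
#   Table: billiards_dwd.some_table
#   Primary Key: id
#   ================================================================================
#   ## COLUMNS
#   ...
#   ## VALUE ANALYSIS (for enum detection)
#   ...
#   ## SAMPLE DATA (first 10 rows)
#   ...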