154 lines
5.5 KiB
Python
154 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
深度调查 ETL 执行时间线,查明凌晨全量更新后为什么还有数据缺失
|
|
"""
|
|
|
|
import os
|
|
import glob
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
from dotenv import load_dotenv
|
|
|
|
def main():
|
|
# 加载环境变量
|
|
load_dotenv()
|
|
|
|
log_root = os.environ.get('LOG_ROOT')
|
|
if not log_root:
|
|
raise RuntimeError("LOG_ROOT 环境变量未设置")
|
|
|
|
log_dir = Path(log_root)
|
|
|
|
print("🔍 深度调查 ETL 执行时间线")
|
|
print("=" * 60)
|
|
|
|
# 获取所有日志文件并按修改时间排序
|
|
log_files = list(log_dir.glob("*.log"))
|
|
log_files.sort(key=lambda x: x.stat().st_mtime)
|
|
|
|
print(f"找到 {len(log_files)} 个日志文件")
|
|
|
|
# 分析最近 24 小时的日志
|
|
now = datetime.now()
|
|
yesterday = now - timedelta(hours=24)
|
|
|
|
recent_logs = []
|
|
for log_file in log_files:
|
|
mtime = datetime.fromtimestamp(log_file.stat().st_mtime)
|
|
if mtime >= yesterday:
|
|
recent_logs.append((log_file, mtime))
|
|
|
|
print(f"\n📅 最近 24 小时内的日志文件 ({len(recent_logs)} 个):")
|
|
for log_file, mtime in recent_logs:
|
|
print(f" {mtime.strftime('%Y-%m-%d %H:%M:%S')} - {log_file.name}")
|
|
|
|
# 分析每个日志文件的关键信息
|
|
print(f"\n🔍 详细分析最近的日志:")
|
|
|
|
for i, (log_file, mtime) in enumerate(recent_logs[-5:]): # 最近5个
|
|
print(f"\n--- 日志 {i+1}: {log_file.name} ---")
|
|
print(f"修改时间: {mtime.strftime('%Y-%m-%d %H:%M:%S')}")
|
|
|
|
try:
|
|
with open(log_file, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
|
|
lines = content.split('\n')
|
|
|
|
# 查找关键信息
|
|
start_time = None
|
|
end_time = None
|
|
flow_type = None
|
|
window_info = None
|
|
dwd_info = None
|
|
errors = []
|
|
|
|
for line in lines:
|
|
if '开始运行任务' in line and 'run_uuid' in line:
|
|
start_time = line.split(']')[0].replace('[', '')
|
|
if 'api_full' in line:
|
|
flow_type = 'api_full (全量)'
|
|
elif 'api_ods_dwd' in line:
|
|
flow_type = 'api_ods_dwd (增量)'
|
|
else:
|
|
flow_type = '未知'
|
|
|
|
if 'ETL运行完成' in line:
|
|
end_time = line.split(']')[0].replace('[', '')
|
|
|
|
if 'force_full' in line or '时间窗口' in line:
|
|
window_info = line.strip()
|
|
|
|
if 'DWD_LOAD_FROM_ODS' in line and ('开始' in line or '完成' in line):
|
|
dwd_info = line.strip()
|
|
|
|
if 'ERROR' in line or '失败' in line:
|
|
errors.append(line.strip())
|
|
|
|
print(f" 执行类型: {flow_type or '未知'}")
|
|
print(f" 开始时间: {start_time or '未找到'}")
|
|
print(f" 结束时间: {end_time or '未找到'}")
|
|
|
|
if window_info:
|
|
print(f" 窗口信息: {window_info}")
|
|
|
|
if dwd_info:
|
|
print(f" DWD 处理: {dwd_info}")
|
|
|
|
if errors:
|
|
print(f" 错误数量: {len(errors)}")
|
|
for error in errors[:3]: # 只显示前3个错误
|
|
print(f" - {error}")
|
|
|
|
# 计算执行时长
|
|
if start_time and end_time:
|
|
try:
|
|
start_dt = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
|
|
end_dt = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
|
|
duration = end_dt - start_dt
|
|
print(f" 执行时长: {duration}")
|
|
except:
|
|
pass
|
|
|
|
except Exception as e:
|
|
print(f" 读取失败: {e}")
|
|
|
|
# 特别关注今天凌晨的执行
|
|
print(f"\n🌅 今天凌晨执行分析:")
|
|
today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
|
|
morning_end = now.replace(hour=8, minute=0, second=0, microsecond=0)
|
|
|
|
morning_logs = []
|
|
for log_file, mtime in recent_logs:
|
|
if today_start <= mtime <= morning_end:
|
|
morning_logs.append((log_file, mtime))
|
|
|
|
if morning_logs:
|
|
print(f"找到 {len(morning_logs)} 个凌晨执行的日志:")
|
|
for log_file, mtime in morning_logs:
|
|
print(f" {mtime.strftime('%H:%M:%S')} - {log_file.name}")
|
|
|
|
# 检查是否包含 DWD 处理
|
|
try:
|
|
with open(log_file, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
|
|
if 'DWD_LOAD_FROM_ODS' in content:
|
|
print(f" ✅ 包含 DWD 处理")
|
|
|
|
# 检查处理的时间范围
|
|
lines = content.split('\n')
|
|
for line in lines:
|
|
if '时间窗口' in line or 'window' in line.lower():
|
|
print(f" 📅 {line.strip()}")
|
|
else:
|
|
print(f" ❌ 未包含 DWD 处理")
|
|
|
|
except Exception as e:
|
|
print(f" 读取失败: {e}")
|
|
else:
|
|
print("❌ 未找到今天凌晨的 ETL 执行记录!")
|
|
print("这可能解释了为什么有数据缺失。")
|
|
|
|
if __name__ == "__main__":
|
|
main() |