微信小程序页面迁移校验之前 P5任务处理之前
This commit is contained in:
154
scripts/ops/_deep_investigate_etl_timeline.py
Normal file
154
scripts/ops/_deep_investigate_etl_timeline.py
Normal file
@@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
深度调查 ETL 执行时间线,查明凌晨全量更新后为什么还有数据缺失
|
||||
"""
|
||||
|
||||
import os
|
||||
import glob
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
from dotenv import load_dotenv
|
||||
|
||||
def main():
|
||||
# 加载环境变量
|
||||
load_dotenv()
|
||||
|
||||
log_root = os.environ.get('LOG_ROOT')
|
||||
if not log_root:
|
||||
raise RuntimeError("LOG_ROOT 环境变量未设置")
|
||||
|
||||
log_dir = Path(log_root)
|
||||
|
||||
print("🔍 深度调查 ETL 执行时间线")
|
||||
print("=" * 60)
|
||||
|
||||
# 获取所有日志文件并按修改时间排序
|
||||
log_files = list(log_dir.glob("*.log"))
|
||||
log_files.sort(key=lambda x: x.stat().st_mtime)
|
||||
|
||||
print(f"找到 {len(log_files)} 个日志文件")
|
||||
|
||||
# 分析最近 24 小时的日志
|
||||
now = datetime.now()
|
||||
yesterday = now - timedelta(hours=24)
|
||||
|
||||
recent_logs = []
|
||||
for log_file in log_files:
|
||||
mtime = datetime.fromtimestamp(log_file.stat().st_mtime)
|
||||
if mtime >= yesterday:
|
||||
recent_logs.append((log_file, mtime))
|
||||
|
||||
print(f"\n📅 最近 24 小时内的日志文件 ({len(recent_logs)} 个):")
|
||||
for log_file, mtime in recent_logs:
|
||||
print(f" {mtime.strftime('%Y-%m-%d %H:%M:%S')} - {log_file.name}")
|
||||
|
||||
# 分析每个日志文件的关键信息
|
||||
print(f"\n🔍 详细分析最近的日志:")
|
||||
|
||||
for i, (log_file, mtime) in enumerate(recent_logs[-5:]): # 最近5个
|
||||
print(f"\n--- 日志 {i+1}: {log_file.name} ---")
|
||||
print(f"修改时间: {mtime.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
try:
|
||||
with open(log_file, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
lines = content.split('\n')
|
||||
|
||||
# 查找关键信息
|
||||
start_time = None
|
||||
end_time = None
|
||||
flow_type = None
|
||||
window_info = None
|
||||
dwd_info = None
|
||||
errors = []
|
||||
|
||||
for line in lines:
|
||||
if '开始运行任务' in line and 'run_uuid' in line:
|
||||
start_time = line.split(']')[0].replace('[', '')
|
||||
if 'api_full' in line:
|
||||
flow_type = 'api_full (全量)'
|
||||
elif 'api_ods_dwd' in line:
|
||||
flow_type = 'api_ods_dwd (增量)'
|
||||
else:
|
||||
flow_type = '未知'
|
||||
|
||||
if 'ETL运行完成' in line:
|
||||
end_time = line.split(']')[0].replace('[', '')
|
||||
|
||||
if 'force_full' in line or '时间窗口' in line:
|
||||
window_info = line.strip()
|
||||
|
||||
if 'DWD_LOAD_FROM_ODS' in line and ('开始' in line or '完成' in line):
|
||||
dwd_info = line.strip()
|
||||
|
||||
if 'ERROR' in line or '失败' in line:
|
||||
errors.append(line.strip())
|
||||
|
||||
print(f" 执行类型: {flow_type or '未知'}")
|
||||
print(f" 开始时间: {start_time or '未找到'}")
|
||||
print(f" 结束时间: {end_time or '未找到'}")
|
||||
|
||||
if window_info:
|
||||
print(f" 窗口信息: {window_info}")
|
||||
|
||||
if dwd_info:
|
||||
print(f" DWD 处理: {dwd_info}")
|
||||
|
||||
if errors:
|
||||
print(f" 错误数量: {len(errors)}")
|
||||
for error in errors[:3]: # 只显示前3个错误
|
||||
print(f" - {error}")
|
||||
|
||||
# 计算执行时长
|
||||
if start_time and end_time:
|
||||
try:
|
||||
start_dt = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
|
||||
end_dt = datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
|
||||
duration = end_dt - start_dt
|
||||
print(f" 执行时长: {duration}")
|
||||
except:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
print(f" 读取失败: {e}")
|
||||
|
||||
# 特别关注今天凌晨的执行
|
||||
print(f"\n🌅 今天凌晨执行分析:")
|
||||
today_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
morning_end = now.replace(hour=8, minute=0, second=0, microsecond=0)
|
||||
|
||||
morning_logs = []
|
||||
for log_file, mtime in recent_logs:
|
||||
if today_start <= mtime <= morning_end:
|
||||
morning_logs.append((log_file, mtime))
|
||||
|
||||
if morning_logs:
|
||||
print(f"找到 {len(morning_logs)} 个凌晨执行的日志:")
|
||||
for log_file, mtime in morning_logs:
|
||||
print(f" {mtime.strftime('%H:%M:%S')} - {log_file.name}")
|
||||
|
||||
# 检查是否包含 DWD 处理
|
||||
try:
|
||||
with open(log_file, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
if 'DWD_LOAD_FROM_ODS' in content:
|
||||
print(f" ✅ 包含 DWD 处理")
|
||||
|
||||
# 检查处理的时间范围
|
||||
lines = content.split('\n')
|
||||
for line in lines:
|
||||
if '时间窗口' in line or 'window' in line.lower():
|
||||
print(f" 📅 {line.strip()}")
|
||||
else:
|
||||
print(f" ❌ 未包含 DWD 处理")
|
||||
|
||||
except Exception as e:
|
||||
print(f" 读取失败: {e}")
|
||||
else:
|
||||
print("❌ 未找到今天凌晨的 ETL 执行记录!")
|
||||
print("这可能解释了为什么有数据缺失。")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user