# -*- coding: utf-8 -*-
# Backend FastAPI watchdog: probes /health and force-restarts the backend when it hangs.
#
# Trigger:
#   - $FailThreshold consecutive failed /health probes
#     (defaults: 12 failures, probed every $CheckInterval = 5 s, i.e. about 60 s)
#
# Behaviour:
#   - kill every process tied to the listening port (uvicorn worker + reloader parent
#     + the pwsh/powershell window hosting them)
#   - wait for the port to be released
#   - start a new uvicorn process (same command format as start-admin.ps1)
#   - skip probing for $StartupGrace seconds (default 60) so the lifespan can finish
#
# Called by start-admin.ps1 after the main startup; the watchdog is killed together
# with the main menu on exit.
#
# CHANGE 2026-05-05 | F1-5a walkthrough found a reload hang:
#   editing core files (services/*, main, ...) while admin-web keeps a WebSocket open
#   makes uvicorn's graceful shutdown wait forever on WebSocket / asyncio task /
#   DB connection release ("Waiting for background tasks to complete").
#   The watchdog is the layer-2 protection: reload hang -> probe fails -> kill -> restart.

param(
    [string]$ProjectRoot = $env:NEOZQYY_ROOT,
    [int]$CheckInterval = 5,        # seconds between probes
    [int]$FailThreshold = 12,       # consecutive failures that trigger a restart (12 x 5 s = 60 s by default)
    [int]$StartupGrace = 60,        # seconds after a restart during which no probe is sent
    [int]$MaxRestartsPerHour = 3,   # restarts allowed per sliding hour; beyond that the watchdog exits
    [int]$Port = 8000,
    [string]$HealthPath = "/health"
)

$ErrorActionPreference = "Continue"

# -- Resolve the project root ------------------------------------------------
# Order: -ProjectRoot / NEOZQYY_ROOT, then NEOZQYY_LAUNCH_DIR, then two levels above this script.
if (-not $ProjectRoot -or -not (Test-Path $ProjectRoot)) {
    if ($env:NEOZQYY_LAUNCH_DIR -and (Test-Path $env:NEOZQYY_LAUNCH_DIR)) {
        $ProjectRoot = $env:NEOZQYY_LAUNCH_DIR.TrimEnd('\')
    } elseif ($PSScriptRoot) {
        $ProjectRoot = Split-Path -Parent (Split-Path -Parent $PSScriptRoot)
    }
}
if (-not $ProjectRoot -or -not (Test-Path $ProjectRoot)) {
    Write-Host "[watchdog] 无法定位 ProjectRoot,退出" -ForegroundColor Red
    exit 1
}

$venvPython = Join-Path $ProjectRoot ".venv\Scripts\python.exe"
$backendDir = Join-Path $ProjectRoot "apps\backend"
if (-not (Test-Path $venvPython)) {
    Write-Host "[watchdog] .venv Python 不存在: $venvPython" -ForegroundColor Red
    exit 1
}

$healthUrl = "http://127.0.0.1:$Port$HealthPath"
# Prefer PowerShell 7 (pwsh) for the backend window when it is on PATH.
$psExe = if (Get-Command pwsh -ErrorAction SilentlyContinue) { "pwsh" } else { "powershell" }

# PS 5.1 in a legacy console (no Windows Terminal session): switch to UTF-8 output.
if ($PSVersionTable.PSVersion.Major -lt 7 -and -not $env:WT_SESSION) {
    chcp 65001 | Out-Null
    [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
}

# Writes one timestamped "[watchdog HH:mm:ss]" line to the host in the given colour.
function Write-Log {
    param(
        [string]$Msg,
        [string]$Color = "Gray"
    )
    $ts = Get-Date -Format "HH:mm:ss"
    Write-Host "[watchdog $ts] $Msg" -ForegroundColor $Color
}

# Probes $HealthPath on 127.0.0.1:$Port once.
# Returns $true only when the response status line reports 200; any timeout,
# connection failure or exception yields $false.
function Test-BackendAlive {
    # CHANGE 2026-05-05 v3 | Own the socket to avoid .NET HttpClient pooling and the system proxy:
    #   - short-lived TcpClient (new per probe, never pooled)
    #   - hand-written HTTP/1.1 GET with "Connection: close" so nothing is reused
    #   - 1.5 s connect/send timeout and 3 s receive timeout, which also catches a
    #     stuck worker (port open but never answering the read)
    $tcp = $null
    try {
        $tcp = [System.Net.Sockets.TcpClient]::new()
        $tcp.SendTimeout = 1500
        $tcp.ReceiveTimeout = 3000

        $iar = $tcp.BeginConnect("127.0.0.1", $Port, $null, $null)
        $ok = $iar.AsyncWaitHandle.WaitOne(1500, $false)
        if (-not $ok) { return $false }
        # Complete the async connect; throws (-> catch -> $false) when the connect failed.
        $tcp.EndConnect($iar)
        if (-not $tcp.Connected) { return $false }

        $stream = $tcp.GetStream()
        $stream.ReadTimeout = 3000
        $stream.WriteTimeout = 1500

        $req = "GET $HealthPath HTTP/1.1`r`nHost: 127.0.0.1:$Port`r`nConnection: close`r`nUser-Agent: NeoZQYY-Watchdog/1.0`r`n`r`n"
        $bytes = [System.Text.Encoding]::ASCII.GetBytes($req)
        $stream.Write($bytes, 0, $bytes.Length)

        # A single Read may return only part of the status line, so keep reading until
        # at least "HTTP/1.x 200" (12 bytes) is available or the peer closes.
        $buf = New-Object byte[] 256
        $n = 0
        while ($n -lt 12) {
            $got = $stream.Read($buf, $n, $buf.Length - $n)
            if ($got -le 0) { break }
            $n += $got
        }
        if ($n -lt 12) { return $false }

        $statusLine = [System.Text.Encoding]::ASCII.GetString($buf, 0, [Math]::Min($n, 80))
        return $statusLine -match "^HTTP/1\.\d 200"
    } catch {
        return $false
    } finally {
        if ($tcp) {
            try { $tcp.Close() } catch { }
        }
    }
}

# Collects the PIDs to kill: every listener on $Port plus its ancestors, walking up
# (at most 5 levels) through python.exe parents until the first pwsh/powershell.
# Expected chain: uvicorn worker (python) -> uvicorn reloader (python) -> pwsh (backend window).
# The hosting shell is included so the old backend window closes too.
# The watchdog's own process ($PID) is never returned: taskkill /T on it would
# terminate the watchdog itself.
function Get-BackendPids {
    $pids = @()
    $shellNames = @("pwsh.exe", "powershell.exe")
    $listeners = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
    foreach ($l in $listeners) {
        $cur = $l.OwningProcess
        $pids += $cur
        for ($i = 0; $i -lt 5; $i++) {
            $proc = Get-CimInstance Win32_Process -Filter "ProcessId=$cur" -ErrorAction SilentlyContinue
            if (-not $proc -or -not $proc.ParentProcessId) { break }
            $parent = Get-CimInstance Win32_Process -Filter "ProcessId=$($proc.ParentProcessId)" -ErrorAction SilentlyContinue
            if (-not $parent) { break }
            # Never climb into the watchdog itself.
            if ($parent.ProcessId -eq $PID) { break }
            # Only python (reloader) or pwsh/powershell (backend window) ancestors are collected.
            if ($parent.Name -notin (@("python.exe") + $shellNames)) { break }
            $pids += $parent.ProcessId
            $cur = $parent.ProcessId
            # Stop at the shell: anything above it (watchdog / explorer) must not be killed.
            if ($parent.Name -in $shellNames) { break }
        }
    }
    return $pids | Where-Object { $_ -ne $PID } | Sort-Object -Unique
}

# Force-kills the process trees found by Get-BackendPids, then waits up to 20 s for
# $Port to stop listening.
# Returns $true when the port is free (or nothing was listening), $false otherwise.
function Stop-StuckBackend {
    Write-Log "强杀 $Port 端口相关进程..." Yellow
    # @() keeps .Count meaningful for zero or one returned PID.
    $targetPids = @(Get-BackendPids)
    if ($targetPids.Count -eq 0) {
        Write-Log "未发现 $Port 端口监听进程,可能已退出" DarkGray
        return $true
    }
    foreach ($id in $targetPids) {
        Write-Log " taskkill /T /F PID=$id"
        # Best effort: a PID may already be gone because an earlier tree-kill took it down.
        taskkill /PID $id /T /F 2>$null | Out-Null
    }

    # Wait for the port to be released.
    $waited = 0
    while ($waited -lt 20) {
        Start-Sleep -Seconds 1
        $waited++
        $still = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
        if (-not $still) {
            Write-Log " 端口 $Port 已释放(等待 ${waited}s)" Green
            return $true
        }
    }
    Write-Log " 端口 $Port 仍占用,放弃自动重启" Red
    return $false
}

# Writes a launcher script to %TEMP% and starts it in a new PowerShell window that
# runs start_uvicorn.py on $Port with the project's venv Python.
function Start-NewBackend {
    Write-Log "启动新 uvicorn 进程..." Yellow
    $ts = Get-Date -Format "yyyyMMdd_HHmmss"
    $beTmp = Join-Path $env:TEMP "neozqyy_watchdog_be_${ts}.ps1"
    $q = [char]39

    # Paths are embedded inside single-quoted literals of the generated script, so any
    # single quote in them has to be doubled.
    $rootQ = $ProjectRoot.Replace("$q", "$q$q")
    $backendQ = $backendDir.Replace("$q", "$q$q")
    $pythonQ = $venvPython.Replace("$q", "$q$q")

    # CHANGE 2026-05-05 v3 | Call start_uvicorn.py; the wildcard strings live inside Python.
    $beLines = @(
        "`$env:NEOZQYY_ROOT = ${q}${rootQ}${q}"
        ""
        "if (`$PSVersionTable.PSVersion.Major -lt 7 -and -not `$env:WT_SESSION) {"
        "    chcp 65001 | Out-Null"
        "    `$env:NO_COLOR = ${q}1${q}"
        "    [Console]::OutputEncoding = [System.Text.Encoding]::UTF8"
        "    Write-Host ${q}[提示] PS 5.1 传统控制台,已启用 UTF-8 + 禁用 ANSI 颜色${q} -ForegroundColor Yellow"
        "}"
        ""
        "Set-Location -LiteralPath ${q}${backendQ}${q}"
        "Write-Host ${q}=== 后端 FastAPI (watchdog 自动重启) ===${q} -ForegroundColor Magenta"
        "Write-Host `"NEOZQYY_ROOT=`$env:NEOZQYY_ROOT`""
        ""
        "& ${q}${pythonQ}${q} ${q}${backendQ}\start_uvicorn.py${q} --port ${Port}"
        "Write-Host ${q}后端已退出,按任意键关闭...${q} -ForegroundColor Red"
        "`$null = `$Host.UI.RawUI.ReadKey(${q}NoEcho,IncludeKeyDown${q})"
    )

    # Always write UTF-8 *with* BOM: Windows PowerShell 5.1 decodes BOM-less scripts with
    # the ANSI code page (garbling the Chinese literals), and "Set-Content -Encoding UTF8"
    # emits no BOM when the watchdog itself runs under PowerShell 7.
    [System.IO.File]::WriteAllLines($beTmp, [string[]]$beLines, [System.Text.UTF8Encoding]::new($true))

    # Start-Process joins -ArgumentList items with spaces and adds no quoting, so the
    # script path is quoted explicitly in case %TEMP% contains spaces.
    Start-Process $psExe -ArgumentList "-NoExit", "-ExecutionPolicy", "Bypass", "-File", "`"$beTmp`"" | Out-Null
    Write-Log "新 uvicorn 进程已 spawn,等待 ${StartupGrace}s 启动" Green
}

# Kills the stuck backend and spawns a new one.
# Returns $false (without starting anything) when the port could not be freed.
function Force-RestartBackend {
    Write-Log "===========================================" Red
    Write-Log "检测到后端卡死,执行强制重启" Red
    Write-Log "===========================================" Red
    if (-not (Stop-StuckBackend)) { return $false }
    Start-Sleep -Seconds 1
    Start-NewBackend
    return $true
}

# -- Main loop -----------------------------------------------------------------
Write-Host ""
Write-Host "========================================" -ForegroundColor Cyan
Write-Host " 后端看门狗 backend-watchdog" -ForegroundColor Cyan
Write-Host "========================================" -ForegroundColor Cyan
Write-Log "ProjectRoot = $ProjectRoot" Gray
Write-Log "HealthURL = $healthUrl" Gray
Write-Log "CheckInterval = ${CheckInterval}s" Gray
Write-Log "FailThreshold = $FailThreshold (累计 $($CheckInterval * $FailThreshold)s 不响应触发重启)" Gray
Write-Log "StartupGrace = ${StartupGrace}s (重启后宽限期)" Gray
Write-Host ""

$failCount = 0
$lastRestart = [DateTime]::MinValue
$startupSeen = $false   # failures are only counted after the first healthy probe
$restartTimes = @()     # restart timestamps in a sliding 1-hour window (plain PS array)

while ($true) {
    # No probing during the grace period that follows a restart.
    $sinceRestart = (Get-Date) - $lastRestart
    if ($sinceRestart.TotalSeconds -lt $StartupGrace) {
        Start-Sleep -Seconds $CheckInterval
        continue
    }

    if (Test-BackendAlive) {
        if (-not $startupSeen) {
            Write-Log "首次健康探针 200,看门狗激活" Green
            $startupSeen = $true
        } elseif ($failCount -gt 0) {
            Write-Log "后端恢复 (此前连续失败 $failCount 次)" Green
        }
        $failCount = 0
    } else {
        if (-not $startupSeen) {
            # No 200 seen yet since startup/restart: keep waiting silently.
            # NOTE(review): if a restarted backend never becomes healthy, the watchdog
            # stays in this branch indefinitely - confirm that this is intended.
            Start-Sleep -Seconds $CheckInterval
            continue
        }
        $failCount++
        Write-Log "健康探针失败 ($failCount/$FailThreshold)" Yellow

        if ($failCount -ge $FailThreshold) {
            # CHANGE 2026-05-05 | Restart rate limit: at most N restarts per hour; beyond
            # that the watchdog stops itself to avoid a restart loop.
            $cutoff = (Get-Date).AddHours(-1)
            $restartTimes = @($restartTimes | Where-Object { $_ -gt $cutoff })
            if ($restartTimes.Count -ge $MaxRestartsPerHour) {
                Write-Log "===========================================" Red
                Write-Log "已达每小时最大重启次数 $MaxRestartsPerHour,看门狗停止" Red
                Write-Log "可能是探针误报或后端持续异常,请人工排查" Red
                Write-Log "===========================================" Red
                exit 1
            }

            if (Force-RestartBackend) {
                $lastRestart = Get-Date
                $restartTimes += $lastRestart
                Write-Log "本小时已重启 $($restartTimes.Count)/$MaxRestartsPerHour 次" DarkYellow
                # Require a fresh 200 from the new process before counting failures again.
                $startupSeen = $false
            }
            $failCount = 0
        }
    }

    Start-Sleep -Seconds $CheckInterval
}