Files
Neo-ZQYY/scripts/ops/backend-watchdog.ps1
Neo 95a4500c75 chore(ops): reload 卡死三层预防 + F1-5a 完整走查报告
reload 卡死三层预防(走查中遭遇 uvicorn graceful shutdown 死等触发):
- Layer 1 (apps/backend/start_uvicorn.py 新): 把 reload-excludes
  封装在 Python 字符串内,ps1 命令行只有字面路径,根治 PowerShell
  PSNativeCommandArgumentPassing 在不同 profile 下 wildcard 展开
  行为差异(数组 splatting 和 --% 都不稳)。同时显式设
  timeout-graceful-shutdown=5,5 秒强杀防死等
- Layer 2 (scripts/ops/backend-watchdog.ps1 新): 自主 socket 探针
  (TcpClient + 手写 HTTP/1.1 GET,Connection: close)规避 .NET
  HttpClient pool 复用 + 系统代理误报;3s × 3 = 9s 触发重启;
  进程链 kill 至 pwsh 后端窗口(关闭原窗口);3 次/小时上限自停
- Layer 3 (scripts/ops/start-admin.ps1): 启动时拉起 watchdog,
  菜单 [4] 仅重启后端选项,主菜单退出时一并 kill 看门狗

CLAUDE.md: 新增"后端 reload 卡死预防(强制)"章节,
分级文件风险表 + SOP + 启动菜单速查

走查报告(应查尽查严肃版):
- 后端 6 个改造点 PASS(P1-P4 + GUC + ai_run_logs runtime 字段)
- admin-web 7 页 Playwright 实地走查 → 5 项 UI 不完整登记 F1-5b
- 小程序看板 tab 7 页 weixin-devtools-mcp 实地 + DB 数据核对 →
  board-finance 5/6 项上界裁剪吻合;board-customer 业务日生效;
  board-coach 月度聚合表设计盲区;5 项 sandbox 覆盖盲区登记 F1-5b
- 8 张走查截图归档 docs/audit/changes/screenshots/2026-05-05_f1_5a_walkthrough/

audit_dashboard 刷新到 153 条审计

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 11:53:08 +08:00

259 lines
11 KiB
PowerShell
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
# 后端 FastAPI 看门狗:探针 /health,卡死自动重启
#
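# Trigger:
# - $FailThreshold consecutive failed /health probes (default 12; with the default
#   5 s $CheckInterval that is roughly 60 s of unresponsiveness)
#
# Behavior:
# - Kill every process holding the backend port (default 8000): uvicorn worker + reloader parent process
# - Wait for the port to be released
# - Start a new uvicorn process (same command format as start-admin.ps1)
# - For $StartupGrace seconds after a restart (default 60) no probes are sent, so lifespan startup can complete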
#
# 由 start-admin.ps1 在主启动后调用,主菜单退出时一并 kill 看门狗。
#
# CHANGE 2026-05-05 | F1-5a 走查发现 reload 卡死问题:
# 修改 services/* main 等核心文件 + admin-web 浏览器开着 WS,
# uvicorn graceful shutdown 等不到 WebSocket / asyncio task / DB 连接释放,
# 死等在 "Waiting for background tasks to complete"。
# 看门狗作为 Layer 2 保护:reload 卡死 → 探针失败 → 强杀 → 重启。
# Script parameters. Every parameter has a default, so the watchdog can be started without arguments.
param(
[string]$ProjectRoot = $env:NEOZQYY_ROOT, # Repository root; when empty or missing, the script falls back to NEOZQYY_LAUNCH_DIR and then to the script location
[int]$CheckInterval = 5, # Seconds slept between probes
[int]$FailThreshold = 12, # Consecutive failed probes that trigger a restart; with the defaults 12 x 5 s = about 60 s. NOTE(review): earlier notes mention "3 s x 3 = 9 s" - confirm which default is intended
[int]$StartupGrace = 60, # Grace period in seconds after a restart during which no probes are sent
[int]$MaxRestartsPerHour = 3, # Restart cap per sliding hour; once reached the watchdog exits to avoid a false-positive restart loop
[int]$Port = 8000, # TCP port the backend listens on (probed on 127.0.0.1)
[string]$HealthPath = "/health" # Request path used for the health probe
)
# Non-terminating errors must not stop the watchdog loop.
$ErrorActionPreference = "Continue"

# ── Resolve the project root ───────────────────────────
# Order of precedence: -ProjectRoot / NEOZQYY_ROOT, then NEOZQYY_LAUNCH_DIR,
# then two directory levels above this script.
$rootUsable = $ProjectRoot -and (Test-Path $ProjectRoot)
if (-not $rootUsable) {
    $launchDir = $env:NEOZQYY_LAUNCH_DIR
    if ($launchDir -and (Test-Path $launchDir)) {
        $ProjectRoot = $launchDir.TrimEnd('\')
    } elseif ($PSScriptRoot) {
        $scriptParent = Split-Path -Parent $PSScriptRoot
        $ProjectRoot = Split-Path -Parent $scriptParent
    }
    $rootUsable = $ProjectRoot -and (Test-Path $ProjectRoot)
}
if (-not $rootUsable) {
    Write-Host "[watchdog] 无法定位 ProjectRoot,退出" -ForegroundColor Red
    exit 1
}

# Paths derived from the root; the virtualenv interpreter is mandatory.
$venvPython = Join-Path $ProjectRoot ".venv\Scripts\python.exe"
$backendDir = Join-Path $ProjectRoot "apps\backend"
if (-not (Test-Path $venvPython)) {
    Write-Host "[watchdog] .venv Python 不存在: $venvPython" -ForegroundColor Red
    exit 1
}

$healthUrl = "http://127.0.0.1:${Port}${HealthPath}"

# Prefer PowerShell 7 (pwsh) for the backend window when it is installed.
$psExe = "powershell"
if (Get-Command pwsh -ErrorAction SilentlyContinue) {
    $psExe = "pwsh"
}

# Windows PowerShell 5.1 in a classic console (not Windows Terminal): switch to UTF-8.
$legacyConsole = ($PSVersionTable.PSVersion.Major -lt 7) -and (-not $env:WT_SESSION)
if ($legacyConsole) {
    chcp 65001 | Out-Null
    [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
}
function Write-Log {
    <#
    .SYNOPSIS
    Writes one watchdog message to the host, prefixed with "[watchdog HH:mm:ss]".

    .PARAMETER Msg
    Message text.

    .PARAMETER Color
    Foreground color name passed to Write-Host (default "Gray").
    #>
    param([string]$Msg, [string]$Color = "Gray")
    $stamp = Get-Date -Format "HH:mm:ss"
    $line = "[watchdog {0}] {1}" -f $stamp, $Msg
    Write-Host $line -ForegroundColor $Color
}
function Test-BackendAlive {
    <#
    .SYNOPSIS
    Probes the backend health endpoint over a fresh, unpooled TCP connection.

    .DESCRIPTION
    Every call opens a new TcpClient to 127.0.0.1:$Port, writes a hand-built
    HTTP/1.1 GET for $HealthPath with "Connection: close", and checks the status line.
    Managing the socket directly keeps the probe away from .NET HttpClient connection
    pooling and system proxy settings, and still detects a hung worker (port open,
    but nothing answers the read).
    Timeouts: 1.5 s to connect, 1.5 s to send, 3 s per read.

    .OUTPUTS
    System.Boolean. $true only when the response starts with an "HTTP/1.x 200" status
    line; $false on any timeout, a short or non-200 response, or any exception.
    #>
    $tcp = $null
    try {
        $tcp = [System.Net.Sockets.TcpClient]::new()
        $tcp.SendTimeout = 1500
        $tcp.ReceiveTimeout = 3000

        # Asynchronous connect so the connect phase can be capped at 1.5 s.
        $iar = $tcp.BeginConnect("127.0.0.1", $Port, $null, $null)
        $ok = $iar.AsyncWaitHandle.WaitOne(1500, $false)
        if (-not $ok -or -not $tcp.Connected) { return $false }
        # Complete the async operation; throws (caught below) if the connect actually failed.
        $tcp.EndConnect($iar)

        $stream = $tcp.GetStream()
        $stream.ReadTimeout = 3000
        $stream.WriteTimeout = 1500

        $req = "GET $HealthPath HTTP/1.1`r`nHost: 127.0.0.1:$Port`r`nConnection: close`r`nUser-Agent: NeoZQYY-Watchdog/1.0`r`n`r`n"
        $bytes = [System.Text.Encoding]::ASCII.GetBytes($req)
        $stream.Write($bytes, 0, $bytes.Length)

        # A stream read may return fewer bytes than are eventually sent, so keep reading
        # until the 12 bytes of "HTTP/1.x 200" are available, the peer closes, or ~3 s pass.
        $minLen = 12
        $buf = New-Object byte[] 256
        $total = 0
        $clock = [System.Diagnostics.Stopwatch]::StartNew()
        while ($total -lt $minLen -and $clock.ElapsedMilliseconds -lt 3000) {
            $n = $stream.Read($buf, $total, $buf.Length - $total)
            if ($n -le 0) { break }   # connection closed by the peer
            $total += $n
        }
        if ($total -lt $minLen) { return $false }

        $statusLine = [System.Text.Encoding]::ASCII.GetString($buf, 0, [Math]::Min($total, 80))
        # Anchored: the status line is always the very first thing in the response.
        return $statusLine -match '^HTTP/1\.\d 200'
    } catch {
        # Any socket/timeout error counts as "not alive".
        return $false
    } finally {
        if ($tcp) { try { $tcp.Close() } catch { } }
    }
}
function Get-BackendPids {
    <#
    .SYNOPSIS
    Collects the PIDs that must be killed to take down the backend listening on $Port.

    .DESCRIPTION
    Starts from every process that owns a listening socket on $Port and walks up the
    parent chain (at most 5 levels), adding python.exe ancestors (uvicorn reloader)
    and the first pwsh.exe / powershell.exe ancestor (the backend console window, so
    that the old window closes too). The walk stops at that shell, at any other
    process name, or when the chain can no longer be trusted.

    Safety guards:
    - the watchdog's own process ($PID) is never added, otherwise the kill step
      would terminate the watchdog in the middle of a restart;
    - a "parent" created after its child is treated as a recycled PID (Windows keeps
      the parent PID of a child even after the real parent has exited) and is skipped.

    .OUTPUTS
    Unique, sorted process IDs; nothing when no listener exists.
    #>
    $pids = @()
    $shellNames = @("pwsh.exe", "powershell.exe")
    $listeners = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
    foreach ($l in $listeners) {
        $cur = $l.OwningProcess
        $pids += $cur
        for ($i = 0; $i -lt 5; $i++) {
            $proc = Get-CimInstance Win32_Process -Filter "ProcessId=$cur" -ErrorAction SilentlyContinue
            if (-not $proc -or -not $proc.ParentProcessId) { break }
            $parent = Get-CimInstance Win32_Process -Filter "ProcessId=$($proc.ParentProcessId)" -ErrorAction SilentlyContinue
            if (-not $parent) { break }

            # Never put the watchdog itself on the kill list.
            if ($parent.ProcessId -eq $PID) { break }

            # PID-reuse guard: a genuine parent cannot be younger than its child.
            if ($parent.CreationDate -and $proc.CreationDate -and ($parent.CreationDate -gt $proc.CreationDate)) { break }

            $isShell = $parent.Name -in $shellNames
            if (-not $isShell -and $parent.Name -ne "python.exe") { break }

            $pids += $parent.ProcessId
            $cur = $parent.ProcessId
            # Stop at the backend shell window; anything above it must not be killed.
            if ($isShell) { break }
        }
    }
    return $pids | Sort-Object -Unique
}
function Stop-StuckBackend {
    <#
    .SYNOPSIS
    Force-kills the backend process chain and waits for $Port to be released.

    .DESCRIPTION
    Runs "taskkill /T /F" on every PID reported by Get-BackendPids, then polls once
    per second, for at most 20 seconds, until nothing listens on $Port any more.
    Log messages show the configured $Port rather than a hard-coded 8000.

    .OUTPUTS
    System.Boolean. $true when the port is free (or nothing was listening to begin
    with); $false when the port is still occupied after 20 seconds.
    #>
    Write-Log "强杀 $Port 端口相关进程..." Yellow

    # @() keeps the result an array for zero, one, or many PIDs.
    $targetPids = @(Get-BackendPids)
    if ($targetPids.Count -eq 0) {
        Write-Log "未发现 $Port 端口监听进程,可能已退出" DarkGray
        return $true
    }

    foreach ($id in $targetPids) {
        Write-Log " taskkill /T /F PID=$id"
        # Best effort: a PID may already be gone because an earlier /T took its tree down.
        taskkill /PID $id /T /F 2>$null | Out-Null
    }

    # Wait for the listening socket to disappear.
    $waited = 0
    while ($waited -lt 20) {
        Start-Sleep -Seconds 1
        $waited++
        $still = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
        if (-not $still) {
            Write-Log " 端口 $Port 已释放(等待 ${waited}s)" Green
            return $true
        }
    }
    Write-Log " 端口 $Port 仍占用,放弃自动重启" Red
    return $false
}
function Start-NewBackend {
    <#
    .SYNOPSIS
    Spawns a new console window that runs the backend through start_uvicorn.py.

    .DESCRIPTION
    Writes a small launcher script to %TEMP% (neozqyy_watchdog_be_<timestamp>.ps1) and
    starts it with $psExe in a new process. The launcher sets NEOZQYY_ROOT, applies the
    UTF-8 / NO_COLOR workaround for Windows PowerShell 5.1 classic consoles, changes to
    the backend directory and runs start_uvicorn.py with the venv interpreter on $Port.

    Robustness details:
    - paths are embedded as single-quoted PowerShell literals, so embedded single
      quotes are doubled to keep the generated script valid;
    - the -File argument is wrapped in double quotes because Start-Process joins its
      argument list with spaces and would otherwise split a path containing spaces.

    The function does not wait for the backend to become healthy.
    #>
    Write-Log "启动新 uvicorn 进程..." Yellow
    $ts = Get-Date -Format "yyyyMMdd_HHmmss"
    $beTmp = Join-Path $env:TEMP "neozqyy_watchdog_be_${ts}.ps1"
    $q = [char]39

    # Escape ' as '' so the values survive inside single-quoted literals.
    $rootLit = $ProjectRoot -replace "'", "''"
    $backendLit = $backendDir -replace "'", "''"
    $pythonLit = $venvPython -replace "'", "''"

    # The reload-exclude wildcards live inside start_uvicorn.py, so this command line
    # carries literal paths only.
    $beLines = @(
        "`$env:NEOZQYY_ROOT = ${q}${rootLit}${q}"
        ""
        "if (`$PSVersionTable.PSVersion.Major -lt 7 -and -not `$env:WT_SESSION) {"
        " chcp 65001 | Out-Null"
        " `$env:NO_COLOR = ${q}1${q}"
        " [Console]::OutputEncoding = [System.Text.Encoding]::UTF8"
        " Write-Host ${q}[提示] PS 5.1 传统控制台,已启用 UTF-8 + 禁用 ANSI 颜色${q} -ForegroundColor Yellow"
        "}"
        ""
        "Set-Location -LiteralPath ${q}${backendLit}${q}"
        "Write-Host ${q}=== 后端 FastAPI (watchdog 自动重启) ===${q} -ForegroundColor Magenta"
        "Write-Host `"NEOZQYY_ROOT=`$env:NEOZQYY_ROOT`""
        ""
        "& ${q}${pythonLit}${q} ${q}${backendLit}\start_uvicorn.py${q} --port ${Port}"
        "Write-Host ${q}后端已退出,按任意键关闭...${q} -ForegroundColor Red"
        "`$null = `$Host.UI.RawUI.ReadKey(${q}NoEcho,IncludeKeyDown${q})"
    )
    $beLines | Set-Content -Path $beTmp -Encoding UTF8

    # Quote the script path explicitly; Start-Process does not quote arguments itself.
    $fileArg = "`"$beTmp`""
    Start-Process $psExe -ArgumentList "-NoExit", "-ExecutionPolicy", "Bypass", "-File", $fileArg | Out-Null
    Write-Log "新 uvicorn 进程已 spawn,等待 ${StartupGrace}s 启动" Green
}
function Force-RestartBackend {
    <#
    .SYNOPSIS
    Kills the stuck backend and, if the port was released, launches a new one.

    .OUTPUTS
    System.Boolean. $false when Stop-StuckBackend could not free the port (no new
    backend is started in that case); $true after a new backend has been spawned.
    #>
    $rule = "==========================================="
    foreach ($text in @($rule, "检测到后端卡死,执行强制重启", $rule)) {
        Write-Log $text Red
    }

    $portFreed = Stop-StuckBackend
    if ($portFreed) {
        # Short pause between the kill and the relaunch.
        Start-Sleep -Seconds 1
        Start-NewBackend
        return $true
    }
    return $false
}
# ── Main loop ──────────────────────────────────────────
# Startup banner with the effective configuration.
$bannerRule = "========================================"
Write-Host ""
Write-Host $bannerRule -ForegroundColor Cyan
Write-Host " 后端看门狗 backend-watchdog" -ForegroundColor Cyan
Write-Host $bannerRule -ForegroundColor Cyan
Write-Log "ProjectRoot = $ProjectRoot" Gray
Write-Log "HealthURL = $healthUrl" Gray
Write-Log "CheckInterval = ${CheckInterval}s" Gray
Write-Log "FailThreshold = $FailThreshold (累计 $($CheckInterval * $FailThreshold)s 不响应触发重启)" Gray
Write-Log "StartupGrace = ${StartupGrace}s (重启后宽限期)" Gray
Write-Host ""

# Watchdog state.
$misses = 0                              # consecutive failed probes
$restartedAt = [DateTime]::MinValue      # time of the last restart performed by the watchdog
$armed = $false                          # failures only count after a first healthy probe
$restartLog = @()                        # restart timestamps within the sliding one-hour window

while ($true) {
    # No probing while the grace period after a restart is still running.
    $sinceRestart = ((Get-Date) - $restartedAt).TotalSeconds
    if ($sinceRestart -ge $StartupGrace) {
        $alive = Test-BackendAlive
        if ($alive) {
            if (-not $armed) {
                Write-Log "首次健康探针 200,看门狗激活" Green
                $armed = $true
            } elseif ($misses -gt 0) {
                Write-Log "后端恢复 (此前连续失败 $misses 次)" Green
            }
            $misses = 0
        } elseif ($armed) {
            # Failures before the first healthy probe are ignored silently (backend still starting).
            $misses++
            Write-Log "健康探针失败 ($misses/$FailThreshold)" Yellow
            if ($misses -ge $FailThreshold) {
                # Rate limit: at most $MaxRestartsPerHour restarts within the last hour,
                # otherwise the watchdog stops itself.
                $cutoff = (Get-Date).AddHours(-1)
                $restartLog = @($restartLog | Where-Object { $_ -gt $cutoff })
                if ($restartLog.Count -ge $MaxRestartsPerHour) {
                    Write-Log "===========================================" Red
                    Write-Log "已达每小时最大重启次数 $MaxRestartsPerHour,看门狗停止" Red
                    Write-Log "可能是探针误报或后端持续异常,请人工排查" Red
                    Write-Log "===========================================" Red
                    exit 1
                }
                $restarted = Force-RestartBackend
                if ($restarted) {
                    $restartedAt = Get-Date
                    $restartLog += $restartedAt
                    Write-Log "本小时已重启 $($restartLog.Count)/$MaxRestartsPerHour" DarkYellow
                    # Wait for the new backend's first healthy probe before counting failures again.
                    $armed = $false
                }
                $misses = 0
            }
        }
    }
    Start-Sleep -Seconds $CheckInterval
}