This commit is contained in:
Neo
2026-03-15 10:15:02 +08:00
parent 2dd217522c
commit 72bb11b34f
916 changed files with 65306 additions and 16102803 deletions

View File

@@ -366,6 +366,9 @@ class TaskQueue:
async def _process_once(self, executor: Any) -> None:
"""单次处理:扫描所有门店的 pending 队列并执行。"""
# CHANGE 2026-03-09 | 每次轮询先回收僵尸 running 任务
self._recover_zombie_tasks()
site_ids = self._get_pending_site_ids()
for site_id in site_ids:
@@ -415,6 +418,13 @@ class TaskQueue:
except Exception:
logger.exception("队列任务执行异常 [%s]", queue_id)
self._mark_failed(queue_id, "执行过程中发生未捕获异常")
finally:
# CHANGE 2026-03-09 | 兜底:确保 task_queue 不会卡在 running
# 背景_update_execution_log 内部异常(如 duration_ms integer 溢出)
# 被吞掉后_update_queue_status_from_log 读到的 execution_log 仍是
# running导致 task_queue 永远卡住,后续任务全部排队。
self._ensure_not_stuck_running(queue_id)
def _get_pending_site_ids(self) -> list[int]:
"""获取所有有 pending 任务的 site_id 列表(仅限本实例入队的)。"""
@@ -484,6 +494,84 @@ class TaskQueue:
finally:
conn.close()
def _ensure_not_stuck_running(self, queue_id: str) -> None:
    """Safety net: force a task_queue row that is still ``running`` to ``failed``.

    CHANGE 2026-03-09 | Guards against an internal failure in
    ``_update_execution_log`` leaving the task_queue row in ``running``
    forever, which would keep every later task waiting in the queue.

    The correction is a single guarded ``UPDATE ... WHERE status = 'running'``,
    so a row that already reached another status is left untouched and no
    separate read-then-write window exists.

    This method is best-effort: every error, including a failure to obtain
    the database connection, is logged and swallowed so that calling it from
    a ``finally`` block can never raise.

    Args:
        queue_id: Value of ``task_queue.id`` for the row to check.
    """
    conn = None
    try:
        # Acquire the connection inside the guarded region: this runs from a
        # caller's ``finally`` block and must not raise on a DB outage.
        conn = get_connection()
        with conn.cursor() as cur:
            cur.execute(
                """
                UPDATE task_queue
                SET status = 'failed', finished_at = NOW(),
                    error_message = %s
                WHERE id = %s AND status = 'running'
                RETURNING id
                """,
                (
                    "[兜底修正] 执行流程结束但状态未同步,"
                    "可能因 execution_log 更新失败",
                    queue_id,
                ),
            )
            # A returned row means the task really was still 'running'.
            corrected = cur.fetchone() is not None
        conn.commit()
        if corrected:
            # Log only after the commit so the message reflects stored state.
            logger.warning(
                "兜底修正task_queue [%s] 执行完毕但仍为 running"
                "强制标记 failed",
                queue_id,
            )
    except Exception:
        logger.exception("_ensure_not_stuck_running 异常 [%s]", queue_id)
    finally:
        if conn is not None:
            conn.close()
def _recover_zombie_tasks(self, max_running_minutes: int = 180) -> None:
    """Mark long-running ``running`` tasks as ``failed`` (zombie recovery).

    CHANGE 2026-03-09 | Called on every polling pass as a last line of
    defence. Scenario: after the backend process crashes or restarts, tasks
    it left in ``running`` would otherwise never be updated again.

    Only rows whose ``enqueued_by`` equals this instance's ``_INSTANCE_ID``
    or is NULL are touched, and only when ``started_at`` is older than the
    threshold.

    This method is best-effort: every error, including a failure to obtain
    the database connection, is logged and swallowed.

    Args:
        max_running_minutes: Age threshold in minutes; a ``running`` task
            started earlier than this many minutes ago is marked ``failed``.
    """
    conn = None
    try:
        conn = get_connection()
        with conn.cursor() as cur:
            # The threshold is bound as a real parameter and multiplied by a
            # one-minute interval. A placeholder inside a quoted literal
            # (INTERVAL '%s minutes') only works with client-side text
            # interpolation of an int and breaks with server-side binding.
            cur.execute(
                """
                UPDATE task_queue
                SET status = 'failed', finished_at = NOW(),
                    error_message = %s
                WHERE status = 'running'
                  AND (enqueued_by = %s OR enqueued_by IS NULL)
                  AND started_at < NOW() - %s * INTERVAL '1 minute'
                RETURNING id
                """,
                (
                    f"[僵尸回收] running 超过 {max_running_minutes} 分钟,"
                    "自动标记 failed",
                    _INSTANCE_ID,
                    max_running_minutes,
                ),
            )
            recovered_ids = [row[0] for row in cur.fetchall()]
        conn.commit()
        if recovered_ids:
            # Log only after the commit so "marked failed" is actually true.
            logger.warning(
                "僵尸回收：%d 个 running 任务超时，已标记 failed: %s",
                len(recovered_ids), recovered_ids,
            )
    except Exception:
        logger.exception("_recover_zombie_tasks 异常")
    finally:
        if conn is not None:
            conn.close()
# ------------------------------------------------------------------
# 生命周期
# ------------------------------------------------------------------