1
This commit is contained in:
@@ -366,6 +366,9 @@ class TaskQueue:
|
||||
|
||||
async def _process_once(self, executor: Any) -> None:
|
||||
"""单次处理:扫描所有门店的 pending 队列并执行。"""
|
||||
# CHANGE 2026-03-09 | 每次轮询先回收僵尸 running 任务
|
||||
self._recover_zombie_tasks()
|
||||
|
||||
site_ids = self._get_pending_site_ids()
|
||||
|
||||
for site_id in site_ids:
|
||||
@@ -415,6 +418,13 @@ class TaskQueue:
|
||||
except Exception:
|
||||
logger.exception("队列任务执行异常 [%s]", queue_id)
|
||||
self._mark_failed(queue_id, "执行过程中发生未捕获异常")
|
||||
finally:
|
||||
# CHANGE 2026-03-09 | 兜底:确保 task_queue 不会卡在 running
|
||||
# 背景:_update_execution_log 内部异常(如 duration_ms integer 溢出)
|
||||
# 被吞掉后,_update_queue_status_from_log 读到的 execution_log 仍是
|
||||
# running,导致 task_queue 永远卡住,后续任务全部排队。
|
||||
self._ensure_not_stuck_running(queue_id)
|
||||
|
||||
|
||||
def _get_pending_site_ids(self) -> list[int]:
|
||||
"""获取所有有 pending 任务的 site_id 列表(仅限本实例入队的)。"""
|
||||
@@ -484,6 +494,84 @@ class TaskQueue:
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def _ensure_not_stuck_running(self, queue_id: str) -> None:
    """Safety net: force a task_queue row that is still ``running`` to ``failed``.

    CHANGE 2026-03-09 | Prevents a task_queue row from staying in the
    running state forever when an exception inside _update_execution_log
    kept the status from being synchronised.

    Args:
        queue_id: Value matched against ``task_queue.id``.

    Any exception raised here is logged and swallowed; the connection is
    closed in every case.
    """
    # Stored in error_message so the forced transition is recognisable later.
    fallback_message = (
        "[兜底修正] 执行流程结束但状态未同步,"
        "可能因 execution_log 更新失败"
    )
    db = get_connection()
    try:
        with db.cursor() as cursor:
            cursor.execute(
                "SELECT status FROM task_queue WHERE id = %s",
                (queue_id,),
            )
            current = cursor.fetchone()
            still_running = bool(current) and current[0] == "running"
            if still_running:
                logger.warning(
                    "兜底修正:task_queue [%s] 执行完毕但仍为 running,"
                    "强制标记 failed",
                    queue_id,
                )
                # The status guard in WHERE turns the UPDATE into a no-op
                # if the row left 'running' after the SELECT above.
                cursor.execute(
                    """
                    UPDATE task_queue
                    SET status = 'failed', finished_at = NOW(),
                        error_message = %s
                    WHERE id = %s AND status = 'running'
                    """,
                    (fallback_message, queue_id),
                )
        db.commit()
    except Exception:
        logger.exception("_ensure_not_stuck_running 异常 [%s]", queue_id)
    finally:
        db.close()
|
||||
|
||||
def _recover_zombie_tasks(self, max_running_minutes: int = 180) -> None:
    """Mark ``running`` tasks older than the threshold as ``failed``.

    CHANGE 2026-03-09 | Run on every polling pass as a last line of
    defence. Scenario: after the backend process crashes or restarts,
    tasks it left in the running state are never updated again.

    Only rows whose ``enqueued_by`` equals this instance's ID, or is NULL,
    are touched.

    Args:
        max_running_minutes: Age in minutes, measured from ``started_at``,
            beyond which a running task is treated as a zombie.

    Any exception raised here is logged and swallowed; the connection is
    closed in every case.
    """
    error_message = (
        f"[僵尸回收] running 超过 {max_running_minutes} 分钟,"
        "自动标记 failed"
    )
    conn = get_connection()
    try:
        with conn.cursor() as cur:
            # The threshold is bound as a real parameter and multiplied by
            # a fixed one-minute interval. A placeholder inside a quoted
            # literal (INTERVAL '%s minutes') only works with client-side
            # interpolation of a bare integer and fails under server-side
            # parameter binding.
            cur.execute(
                """
                UPDATE task_queue
                SET status = 'failed', finished_at = NOW(),
                    error_message = %s
                WHERE status = 'running'
                  AND (enqueued_by = %s OR enqueued_by IS NULL)
                  AND started_at < NOW() - %s * INTERVAL '1 minute'
                RETURNING id
                """,
                (error_message, _INSTANCE_ID, max_running_minutes),
            )
            recovered = cur.fetchall()
        conn.commit()
        # Log only after the commit succeeded, so the message never claims
        # a recovery that was not persisted.
        if recovered:
            ids = [r[0] for r in recovered]
            logger.warning(
                "僵尸回收:%d 个 running 任务超时,已标记 failed: %s",
                len(ids), ids,
            )
    except Exception:
        logger.exception("_recover_zombie_tasks 异常")
    finally:
        conn.close()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 生命周期
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user