1

2026-03-15 10:15:02 +08:00
parent 2dd217522c
commit 72bb11b34f
916 changed files with 65306 additions and 16102803 deletions
--- a/apps/backend/app/services/note_reclassifier.py
+++ b/apps/backend/app/services/note_reclassifier.py
@@ -2,9 +2,14 @@
 """
 备注回溯重分类器（Note Reclassifier）

-召回完成后，回溯检查是否有普通备注需重分类为回访备注。
-查找 service_time 之后的第一条 normal 备注 → 更新为 follow_up →
-触发 AI 应用 6 接口（占位）→ 根据 ai_score 生成 follow_up_visit 任务。
+召回完成后，回溯检查是否有普通备注需重分类为回访备注，并创建回访任务。
+
+流程：
+1. 查找 service_time 之后的第一条 normal 备注
+2. 若找到 → 重分类为 follow_up，任务状态 = completed（回溯完成）
+3. 若未找到 → 任务状态 = active（等待备注）
+4. Keep the ai_analyze_note() placeholder call; its return value only updates ai_score
+5. Conflict check: existing completed → skip; existing active → supersede; otherwise create

 由 trigger_jobs 中的 note_reclassify_backfill 配置驱动（event: recall_completed）。
 """
@@ -62,21 +67,27 @@ def ai_analyze_note(note_id: int) -> int | None:
    return None


-def run(payload: dict | None = None) -> dict:
+def run(payload: dict | None = None, job_id: int | None = None) -> dict:
    """
    备注回溯主流程。

    payload 包含: {site_id, assistant_id, member_id, service_time}

-    1. 查找 biz.notes 中该 (site_id, target_type='member', target_id=member_id)
-       在 service_time 之后提交的第一条 type='normal' 的备注
-    2. 将该备注 type 从 'normal' 更新为 'follow_up'
-    3. 触发 AI 应用 6 接口（P5 实现，本 SPEC 仅定义触发接口）：
-       - 调用 ai_analyze_note(note_id) → 返回 ai_score
-    4. 若 ai_score >= 6：
-       - 生成 follow_up_visit 任务，status='completed'（回溯完成）
-    5. 若 ai_score < 6：
-       - 生成 follow_up_visit 任务，status='active'（需助教重新备注）
+    流程：
+    1. 查找 service_time 之后的第一条 normal 备注 → note_id
+    2. 若 note_id 存在：重分类为 follow_up，task_status = 'completed'（回溯完成）
+    3. 若 note_id 不存在：task_status = 'active'（等待备注）
+    4. 保留 ai_analyze_note() 占位调用，返回值仅更新 ai_score 字段
+    5. 冲突检查（T3）：
+       - 已有 completed → 跳过创建
+       - 已有 active → 旧任务标记 inactive + superseded 历史，创建新任务
+       - 不存在（或仅 inactive/abandoned）→ 正常创建
+    6. 创建 follow_up_visit 任务
+
+    参数:
+        payload: 事件载荷（由 trigger_scheduler 传入）
+        job_id: trigger job ID (passed in by trigger_scheduler). last_run_at is written in the
+                final transaction only; the earlier note/supersede commits are not atomic with it

    返回: {"reclassified_count": int, "tasks_created": int}
    """
@@ -119,84 +130,166 @@ def run(payload: dict | None = None) -> dict:
                note_id = row[0]
        conn.commit()

-        if note_id is None:
-            logger.info(
-                "未找到符合条件的 normal 备注: site_id=%s, member_id=%s",
-                site_id, member_id,
-            )
-            return {"reclassified_count": 0, "tasks_created": 0}
-
-        # ── 2. 将备注 type 从 'normal' 更新为 'follow_up' ──
-        with conn.cursor() as cur:
-            cur.execute("BEGIN")
-            cur.execute(
-                """
-                UPDATE biz.notes
-                SET type = 'follow_up', updated_at = NOW()
-                WHERE id = %s AND type = 'normal'
-                """,
-                (note_id,),
-            )
-        conn.commit()
-        reclassified_count = 1
-
-        # ── 3. 触发 AI 应用 6 接口（占位，当前返回 None） ──
-        ai_score = ai_analyze_note(note_id)
-
-        # ── 4/5. 根据 ai_score 生成 follow_up_visit 任务 ──
-        if ai_score is not None:
-            if ai_score >= 6:
-                # 回溯完成：生成 completed 任务
-                task_status = "completed"
-            else:
-                # 需助教重新备注：生成 active 任务
-                task_status = "active"
-
+        # ── 2. 根据是否找到备注确定任务状态（T4） ──
+        if note_id is not None:
+            # 找到备注 → 重分类为 follow_up
            with conn.cursor() as cur:
                cur.execute("BEGIN")
                cur.execute(
                    """
-                    INSERT INTO biz.coach_tasks
-                        (site_id, assistant_id, member_id, task_type,
-                         status, completed_at, completed_task_type)
-                    VALUES (
-                        %s, %s, %s, 'follow_up_visit',
-                        %s,
-                        CASE WHEN %s = 'completed' THEN NOW() ELSE NULL END,
-                        CASE WHEN %s = 'completed' THEN 'follow_up_visit' ELSE NULL END
-                    )
-                    RETURNING id
+                    UPDATE biz.notes
+                    SET type = 'follow_up', updated_at = NOW()
+                    WHERE id = %s AND type = 'normal'
                    """,
-                    (
-                        site_id, assistant_id, member_id,
-                        task_status, task_status, task_status,
-                    ),
-                )
-                new_task_row = cur.fetchone()
-                new_task_id = new_task_row[0]
-
-                # 记录任务创建历史
-                _insert_history(
-                    cur,
-                    new_task_id,
-                    action="created_by_reclassify",
-                    old_status=None,
-                    new_status=task_status,
-                    old_task_type=None,
-                    new_task_type="follow_up_visit",
-                    detail={
-                        "note_id": note_id,
-                        "ai_score": ai_score,
-                        "source": "note_reclassifier",
-                    },
+                    (note_id,),
                )
            conn.commit()
-            tasks_created = 1
+            reclassified_count = 1
+
+            # 保留 AI 占位调用，返回值仅用于更新 ai_score 字段
+            ai_score = ai_analyze_note(note_id)
+            if ai_score is not None:
+                with conn.cursor() as cur:
+                    cur.execute("BEGIN")
+                    cur.execute(
+                        """
+                        UPDATE biz.notes
+                        SET ai_score = %s, updated_at = NOW()
+                        WHERE id = %s
+                        """,
+                        (ai_score, note_id),
+                    )
+                conn.commit()
+
+            # 有备注 → 回溯完成
+            task_status = "completed"
        else:
-            # AI 未就绪，跳过任务创建
+            # 未找到备注 → 等待备注
            logger.info(
-                "AI 接口未就绪，跳过任务创建: note_id=%s", note_id
+                "未找到符合条件的 normal 备注: site_id=%s, member_id=%s",
+                site_id, member_id,
            )
+            ai_score = None
+            task_status = "active"
+
+        # ── 3. 冲突检查（T3）：查询已有 follow_up_visit 任务 ──
+        with conn.cursor() as cur:
+            cur.execute(
+                """
+                SELECT id, status
+                FROM biz.coach_tasks
+                WHERE site_id = %s AND assistant_id = %s AND member_id = %s
+                  AND task_type = 'follow_up_visit'
+                  AND status IN ('active', 'completed')
+                ORDER BY CASE WHEN status = 'completed' THEN 0 ELSE 1 END
+                LIMIT 1
+                """,
+                (site_id, assistant_id, member_id),
+            )
+            existing = cur.fetchone()
+        conn.commit()
+
+        if existing:
+            existing_id, existing_status = existing
+            if existing_status == "completed":
+                # 已完成 → 跳过创建（回访完成语义已满足）
+                logger.info(
+                    "已存在 completed 回访任务 id=%s，跳过创建: "
+                    "site_id=%s, assistant_id=%s, member_id=%s",
+                    existing_id, site_id, assistant_id, member_id,
+                )
+                # 事务安全（T5）：即使跳过创建，handler 仍成功，更新 last_run_at
+                if job_id is not None:
+                    from app.services.trigger_scheduler import (
+                        update_job_last_run_at,
+                    )
+
+                    with conn.cursor() as cur:
+                        cur.execute("BEGIN")
+                        update_job_last_run_at(cur, job_id)
+                    conn.commit()
+                return {
+                    "reclassified_count": reclassified_count,
+                    "tasks_created": 0,
+                }
+            elif existing_status == "active":
+                # 顶替：旧任务 → inactive + superseded 历史
+                with conn.cursor() as cur:
+                    cur.execute("BEGIN")
+                    cur.execute(
+                        """
+                        UPDATE biz.coach_tasks
+                        SET status = 'inactive', updated_at = NOW()
+                        WHERE id = %s AND status = 'active'
+                        """,
+                        (existing_id,),
+                    )
+                    _insert_history(
+                        cur,
+                        existing_id,
+                        action="superseded",
+                        old_status="active",
+                        new_status="inactive",
+                        detail={
+                            "reason": "new_reclassify_task_supersedes",
+                            "source": "note_reclassifier",
+                        },
+                    )
+                conn.commit()
+                logger.info(
+                    "顶替旧 active 回访任务 id=%s → inactive: "
+                    "site_id=%s, assistant_id=%s, member_id=%s",
+                    existing_id, site_id, assistant_id, member_id,
+                )
+
+        # ── 4. 创建 follow_up_visit 任务 ──
+        with conn.cursor() as cur:
+            cur.execute("BEGIN")
+            cur.execute(
+                """
+                INSERT INTO biz.coach_tasks
+                    (site_id, assistant_id, member_id, task_type,
+                     status, completed_at, completed_task_type)
+                VALUES (
+                    %s, %s, %s, 'follow_up_visit',
+                    %s,
+                    CASE WHEN %s = 'completed' THEN NOW() ELSE NULL END,
+                    CASE WHEN %s = 'completed' THEN 'follow_up_visit' ELSE NULL END
+                )
+                RETURNING id
+                """,
+                (
+                    site_id, assistant_id, member_id,
+                    task_status, task_status, task_status,
+                ),
+            )
+            new_task_row = cur.fetchone()
+            new_task_id = new_task_row[0]
+
+            # 记录任务创建历史
+            _insert_history(
+                cur,
+                new_task_id,
+                action="created_by_reclassify",
+                old_status=None,
+                new_status=task_status,
+                old_task_type=None,
+                new_task_type="follow_up_visit",
+                detail={
+                    "note_id": note_id,
+                    "ai_score": ai_score,
+                    "source": "note_reclassifier",
+                },
+            )
+
+            # 事务安全（T5）：在最终 commit 前更新 last_run_at
+            if job_id is not None:
+                from app.services.trigger_scheduler import update_job_last_run_at
+
+                update_job_last_run_at(cur, job_id)
+
+        conn.commit()
+        tasks_created = 1

    except Exception:
        logger.exception(
@@ -215,3 +308,4 @@ def run(payload: dict | None = None) -> dict:
        "reclassified_count": reclassified_count,
        "tasks_created": tasks_created,
    }
+
--- a/apps/backend/app/services/note_service.py
+++ b/apps/backend/app/services/note_service.py
@@ -81,8 +81,8 @@ async def create_note(
       - 否则 → type='normal'
    3. INSERT INTO biz.notes
    4. 若 type='follow_up'：
-       - 触发 AI 应用 6 分析（P5 实现）
-       - 若 ai_score >= 6 且关联任务 status='active' → 标记任务 completed
+       - 保留 AI 占位调用（P5 接入时调用链不变），返回值仅更新 ai_score
+       - 不论 ai_score 如何，有备注即标记关联 active 回访任务 completed
    5. 返回创建的备注记录

    注意：星星评分不参与回访完成判定，不参与 AI 分析，仅存储。
@@ -171,8 +171,9 @@ async def create_note(
                "updated_at": row[13].isoformat() if row[13] else None,
            }

-            # 若 type='follow_up'，触发 AI 分析并可能标记任务完成
+            # 若 type='follow_up'，触发 AI 分析并标记回访任务完成
            if note_type == "follow_up" and task_id is not None:
+                # 保留 AI 占位调用（P5 接入时调用链不变）
                ai_score = ai_analyze_note(note["id"])

                if ai_score is not None:
@@ -187,32 +188,32 @@ async def create_note(
                    )
                    note["ai_score"] = ai_score

-                    # 若 ai_score >= 6 且关联任务 status='active' → 标记任务 completed
-                    if ai_score >= 6 and task_info and task_info["status"] == "active":
-                        cur.execute(
-                            """
-                            UPDATE biz.coach_tasks
-                            SET status = 'completed',
-                                completed_at = NOW(),
-                                completed_task_type = task_type,
-                                updated_at = NOW()
-                            WHERE id = %s AND status = 'active'
-                            """,
-                            (task_id,),
-                        )
-                        _record_history(
-                            cur,
-                            task_id,
-                            action="completed_by_note",
-                            old_status="active",
-                            new_status="completed",
-                            old_task_type=task_info["task_type"],
-                            new_task_type=task_info["task_type"],
-                            detail={
-                                "note_id": note["id"],
-                                "ai_score": ai_score,
-                            },
-                        )
+                # 不论 ai_score 如何，有备注即标记回访任务完成（T4）
+                if task_info and task_info["status"] == "active":
+                    cur.execute(
+                        """
+                        UPDATE biz.coach_tasks
+                        SET status = 'completed',
+                            completed_at = NOW(),
+                            completed_task_type = task_type,
+                            updated_at = NOW()
+                        WHERE id = %s AND status = 'active'
+                        """,
+                        (task_id,),
+                    )
+                    _record_history(
+                        cur,
+                        task_id,
+                        action="completed_by_note",
+                        old_status="active",
+                        new_status="completed",
+                        old_task_type=task_info["task_type"],
+                        new_task_type=task_info["task_type"],
+                        detail={
+                            "note_id": note["id"],
+                            "ai_score": ai_score,
+                        },
+                    )

        conn.commit()
        return note
--- a/apps/backend/app/services/recall_detector.py
+++ b/apps/backend/app/services/recall_detector.py
@@ -52,7 +52,7 @@ def _insert_history(
    )


-def run(payload: dict | None = None) -> dict:
+def run(payload: dict | None = None, job_id: int | None = None) -> dict:
    """
    召回完成检测主流程。

@@ -69,6 +69,11 @@ def run(payload: dict | None = None) -> dict:
    6. 记录 coach_task_history
    7. 触发 fire_event('recall_completed', {site_id, assistant_id, member_id, service_time})

+    参数:
+        payload: 事件载荷（event 触发时由 trigger_scheduler 传入）
+        job_id: trigger job ID (passed in by trigger_scheduler). When set, last_run_at is updated in
+                its own transaction after record processing, not atomically with the data changes
+
    返回: {"completed_count": int}
    """
    completed_count = 0
@@ -111,6 +116,17 @@ def run(payload: dict | None = None) -> dict:
                )
                conn.rollback()

+        # ── 事务安全（T5）：handler 成功后更新 last_run_at ──
+        # job_id 由 trigger_scheduler 传入，在 handler 最终事务中更新
+        # handler 异常时此处不会执行（异常向上传播），last_run_at 不变
+        if job_id is not None:
+            from app.services.trigger_scheduler import update_job_last_run_at
+
+            with conn.cursor() as cur:
+                cur.execute("BEGIN")
+                update_job_last_run_at(cur, job_id)
+            conn.commit()
+
    finally:
        conn.close()

@@ -193,7 +209,7 @@ def _process_service_record(
    with conn.cursor() as cur:
        cur.execute("BEGIN")

-        # 查找匹配的 active 任务
+        # 查找匹配的 active 召回类任务（仅完成召回任务，回访/关系构建不在此处理）
        cur.execute(
            """
            SELECT id, task_type
@@ -202,6 +218,7 @@ def _process_service_record(
              AND assistant_id = %s
              AND member_id = %s
              AND status = 'active'
+              AND task_type IN ('high_priority_recall', 'priority_recall')
            """,
            (site_id, assistant_id, member_id),
        )
--- a/apps/backend/app/services/task_executor.py
+++ b/apps/backend/app/services/task_executor.py
@@ -314,22 +314,55 @@ class TaskExecutor:
    async def cancel(self, execution_id: str) -> bool:
        """向子进程发送终止信号。

+        如果进程仍在内存中，发送 terminate 信号；
+        如果进程已不在内存中（如后端重启后），但数据库中仍为 running，
+        则直接将数据库状态标记为 cancelled（幽灵记录兜底）。
+
        Returns:
-            True 表示成功发送终止信号，False 表示进程不存在或已退出。
+            True 表示成功取消，False 表示任务不存在或已完成。
        """
        proc = self._processes.get(execution_id)
-        if proc is None:
-            return False
-        # subprocess.Popen: poll() 返回 None 表示仍在运行
-        if proc.poll() is not None:
-            return False
+        if proc is not None:
+            # 进程仍在内存中
+            if proc.poll() is not None:
+                return False
+            logger.info("取消 ETL 子进程 [%s], pid=%s", execution_id, proc.pid)
+            try:
+                proc.terminate()
+            except ProcessLookupError:
+                return False
+            return True

-        logger.info("取消 ETL 子进程 [%s], pid=%s", execution_id, proc.pid)
+        # 进程不在内存中（后端重启等场景），尝试兜底修正数据库幽灵记录
        try:
-            proc.terminate()
-        except ProcessLookupError:
-            return False
-        return True
+            conn = get_connection()
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(
+                        """
+                        UPDATE task_execution_log
+                        SET status      = 'cancelled',
+                            finished_at = NOW(),
+                            error_log   = COALESCE(error_log, '')
+                                || E'\n[cancel 兜底] 进程已不在内存中，标记为 cancelled'
+                        WHERE id = %s AND status = 'running'
+                        """,
+                        (execution_id,),
+                    )
+                    updated = cur.rowcount
+                conn.commit()
+            finally:
+                conn.close()
+            if updated:
+                logger.info(
+                    "兜底取消 execution_log [%s]：数据库状态从 running → cancelled",
+                    execution_id,
+                )
+                return True
+        except Exception:
+            logger.exception("兜底取消 execution_log [%s] 失败", execution_id)
+        return False
+

    # ------------------------------------------------------------------
    # 数据库操作（同步，在线程池中执行也可，此处简单直连）
--- a/apps/backend/app/services/task_manager.py
+++ b/apps/backend/app/services/task_manager.py
@@ -121,13 +121,13 @@ def _verify_task_ownership(

 async def get_task_list(user_id: int, site_id: int) -> list[dict]:
    """
-    获取助教的活跃任务列表。
+    获取助教的任务列表（含有效 + 已放弃）。

    1. 通过 auth.user_assistant_binding 获取 assistant_id
-    2. 查询 biz.coach_tasks WHERE status='active'
+    2. 查询 biz.coach_tasks WHERE status IN ('active', 'abandoned')
    3. 通过 FDW 读取客户基本信息（dim_member）和 RS 指数
    4. 计算爱心 icon 档位
-    5. 排序：is_pinned DESC, priority_score DESC, created_at ASC
+    5. 排序：abandoned 排最后 → is_pinned DESC → priority_score DESC → created_at ASC

    FDW 查询需要 SET LOCAL app.current_site_id。
    """
@@ -135,17 +135,21 @@ async def get_task_list(user_id: int, site_id: int) -> list[dict]:
    try:
        assistant_id = _get_assistant_id(conn, user_id, site_id)

-        # 查询活跃任务
+        # 查询有效 + 已放弃任务（abandoned 排最后）
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT id, task_type, status, priority_score, is_pinned,
-                       expires_at, created_at, member_id
+                       expires_at, created_at, member_id, abandon_reason
                FROM biz.coach_tasks
                WHERE site_id = %s
                  AND assistant_id = %s
-                  AND status = 'active'
-                ORDER BY is_pinned DESC, priority_score DESC NULLS LAST, created_at ASC
+                  AND status IN ('active', 'abandoned')
+                ORDER BY
+                    CASE WHEN status = 'abandoned' THEN 1 ELSE 0 END ASC,
+                    is_pinned DESC,
+                    priority_score DESC NULLS LAST,
+                    created_at ASC
                """,
                (site_id, assistant_id),
            )
@@ -201,7 +205,7 @@ async def get_task_list(user_id: int, site_id: int) -> list[dict]:
        result = []
        for task_row in tasks:
            (task_id, task_type, status, priority_score,
-             is_pinned, expires_at, created_at, member_id) = task_row
+             is_pinned, expires_at, created_at, member_id, abandon_reason) = task_row

            info = member_info_map.get(member_id, {})
            rs_score = rs_map.get(member_id, Decimal("0"))
@@ -220,6 +224,7 @@ async def get_task_list(user_id: int, site_id: int) -> list[dict]:
                "member_phone": info.get("member_phone"),
                "rs_score": float(rs_score),
                "heart_icon": heart_icon,
+                "abandon_reason": abandon_reason,
            })

        return result
@@ -372,6 +377,7 @@ async def cancel_abandon(task_id: int, user_id: int, site_id: int) -> dict:
                """
                UPDATE biz.coach_tasks
                SET status = 'active',
+                    is_pinned = FALSE,
                    abandon_reason = NULL,
                    updated_at = NOW()
                WHERE id = %s
@@ -389,7 +395,7 @@ async def cancel_abandon(task_id: int, user_id: int, site_id: int) -> dict:
            )
        conn.commit()

-        return {"id": task_id, "status": "active"}
+        return {"id": task_id, "status": "active", "is_pinned": False}

    finally:
        conn.close()
--- a/apps/backend/app/services/task_queue.py
+++ b/apps/backend/app/services/task_queue.py
@@ -366,6 +366,9 @@ class TaskQueue:

    async def _process_once(self, executor: Any) -> None:
        """单次处理：扫描所有门店的 pending 队列并执行。"""
+        # CHANGE 2026-03-09 | 每次轮询先回收僵尸 running 任务
+        self._recover_zombie_tasks()
+
        site_ids = self._get_pending_site_ids()

        for site_id in site_ids:
@@ -415,6 +418,13 @@ class TaskQueue:
        except Exception:
            logger.exception("队列任务执行异常 [%s]", queue_id)
            self._mark_failed(queue_id, "执行过程中发生未捕获异常")
+        finally:
+            # CHANGE 2026-03-09 | 兜底：确保 task_queue 不会卡在 running
+            # 背景：_update_execution_log 内部异常（如 duration_ms integer 溢出）
+            # 被吞掉后，_update_queue_status_from_log 读到的 execution_log 仍是
+            # running，导致 task_queue 永远卡住，后续任务全部排队。
+            self._ensure_not_stuck_running(queue_id)
+

    def _get_pending_site_ids(self) -> list[int]:
        """获取所有有 pending 任务的 site_id 列表（仅限本实例入队的）。"""
@@ -484,6 +494,84 @@ class TaskQueue:
        finally:
            conn.close()

+    def _ensure_not_stuck_running(self, queue_id: str) -> None:
+        """Safety net: if the task_queue row is still 'running', force it to 'failed'.
+
+        CHANGE 2026-03-09 | Keeps an internal _update_execution_log failure from
+        leaving the task_queue row stuck in 'running' forever.
+        """
+        conn = get_connection()
+        try:
+            with conn.cursor() as cur:
+                cur.execute(
+                    "SELECT status FROM task_queue WHERE id = %s",
+                    (queue_id,),
+                )
+                row = cur.fetchone()
+                if row and row[0] == "running":
+                    logger.warning(
+                        "兜底修正：task_queue [%s] 执行完毕但仍为 running，"
+                        "强制标记 failed",
+                        queue_id,
+                    )
+                    cur.execute(
+                        """
+                        UPDATE task_queue
+                        SET status = 'failed', finished_at = NOW(),
+                            error_message = %s
+                        WHERE id = %s AND status = 'running'
+                        """,
+                        (
+                            "[兜底修正] 执行流程结束但状态未同步，"
+                            "可能因 execution_log 更新失败",
+                            queue_id,
+                        ),
+                    )
+            conn.commit()
+        except Exception:
+            logger.exception("_ensure_not_stuck_running 异常 [%s]", queue_id)
+        finally:
+            conn.close()
+
+    def _recover_zombie_tasks(self, max_running_minutes: int = 180) -> None:
+        """Force-fail zombie tasks: rows left in 'running' longer than the threshold.
+
+        CHANGE 2026-03-09 | Run from _process_once on every poll as a last line of defence for rows
+        orphaned by a backend crash/restart; the threshold is bound via make_interval(), not a literal.
+        """
+        conn = get_connection()
+        try:
+            with conn.cursor() as cur:
+                cur.execute(
+                    """
+                    UPDATE task_queue
+                    SET status = 'failed', finished_at = NOW(),
+                        error_message = %s
+                    WHERE status = 'running'
+                      AND (enqueued_by = %s OR enqueued_by IS NULL)
+                      AND started_at < NOW() - make_interval(mins => %s)
+                    RETURNING id
+                    """,
+                    (
+                        f"[僵尸回收] running 超过 {max_running_minutes} 分钟，"
+                        "自动标记 failed",
+                        _INSTANCE_ID,
+                        max_running_minutes,
+                    ),
+                )
+                recovered = cur.fetchall()
+                if recovered:
+                    ids = [r[0] for r in recovered]
+                    logger.warning(
+                        "僵尸回收：%d 个 running 任务超时，已标记 failed: %s",
+                        len(ids), ids,
+                    )
+            conn.commit()
+        except Exception:
+            logger.exception("_recover_zombie_tasks 异常")
+        finally:
+            conn.close()
+
    # ------------------------------------------------------------------
    # 生命周期
    # ------------------------------------------------------------------
--- a/apps/backend/app/services/task_registry.py
+++ b/apps/backend/app/services/task_registry.py
@@ -86,6 +86,9 @@ DWS_TASKS: list[TaskDefinition] = [
    TaskDefinition("DWS_ASSISTANT_FINANCE", "助教财务汇总", "汇总助教财务数据", "助教", "DWS"),
    TaskDefinition("DWS_MEMBER_CONSUMPTION", "会员消费分析", "汇总会员消费数据", "会员", "DWS"),
    TaskDefinition("DWS_MEMBER_VISIT", "会员到店分析", "汇总会员到店频次", "会员", "DWS"),
+    # CHANGE [2026-03-09] intent: 注册项目标签任务，与 ETL 侧 task_registry 同步；全量重建不依赖日期窗口
+    TaskDefinition("DWS_ASSISTANT_PROJECT_TAG", "助教项目标签", "按时间窗口计算助教各项目时长占比标签", "助教", "DWS", requires_window=False),
+    TaskDefinition("DWS_MEMBER_PROJECT_TAG", "客户项目标签", "按时间窗口计算客户各项目消费时长占比标签", "会员", "DWS", requires_window=False),
    TaskDefinition("DWS_FINANCE_DAILY", "财务日报", "汇总每日财务数据", "财务", "DWS"),
    TaskDefinition("DWS_FINANCE_RECHARGE", "充值汇总", "汇总充值数据", "财务", "DWS"),
    TaskDefinition("DWS_FINANCE_INCOME_STRUCTURE", "收入结构", "分析收入结构", "财务", "DWS"),
--- a/apps/backend/app/services/trigger_scheduler.py
+++ b/apps/backend/app/services/trigger_scheduler.py
@@ -31,6 +31,20 @@ def register_job(job_type: str, handler: Callable) -> None:
    _JOB_REGISTRY[job_type] = handler


+def update_job_last_run_at(cur, job_id: int) -> None:
+    """
+    Set last_run_at = NOW() for one trigger job, using the caller's cursor.
+
+    Handlers call this before their final commit so the update joins that transaction.
+    Nothing is committed here: if the caller commits, last_run_at is committed with it;
+    if the caller rolls back, the last_run_at update is rolled back as well.
+    """
+    cur.execute(
+        "UPDATE biz.trigger_jobs SET last_run_at = NOW() WHERE id = %s",
+        (job_id,),
+    )
+
+
 def fire_event(event_name: str, payload: dict[str, Any] | None = None) -> int:
    """
    触发事件驱动型任务。
@@ -38,6 +52,10 @@ def fire_event(event_name: str, payload: dict[str, Any] | None = None) -> int:
    查找 trigger_condition='event' 且 trigger_config.event_name 匹配的 enabled job，
    立即执行对应的 handler。

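+    Transaction safety: job_id is passed to the handler, which updates last_run_at just
+    before its final commit, so the timestamp commits together with that last transaction.
+    This function no longer writes last_run_at; a handler that raises is only logged here.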
+
    返回: 执行的 job 数量
    """
    conn = _get_connection()
@@ -55,6 +73,7 @@ def fire_event(event_name: str, payload: dict[str, Any] | None = None) -> int:
                (event_name,),
            )
            rows = cur.fetchall()
+        conn.commit()

        for job_id, job_type, job_name in rows:
            handler = _JOB_REGISTRY.get(job_type)
@@ -64,18 +83,11 @@ def fire_event(event_name: str, payload: dict[str, Any] | None = None) -> int:
                )
                continue
            try:
-                handler(payload=payload)
+                # 将 job_id 传入 handler，handler 在最终 commit 前更新 last_run_at
+                handler(payload=payload, job_id=job_id)
                executed += 1
-                # 更新 last_run_at
-                with conn.cursor() as cur:
-                    cur.execute(
-                        "UPDATE biz.trigger_jobs SET last_run_at = NOW() WHERE id = %s",
-                        (job_id,),
-                    )
-                conn.commit()
            except Exception:
                logger.exception("触发器 %s 执行失败", job_name)
-                conn.rollback()

    finally:
        conn.close()
@@ -87,6 +99,11 @@ def check_scheduled_jobs() -> int:
    检查 cron/interval 类型的到期 job 并执行。

    由 Scheduler 后台循环调用。
+
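+    Transaction safety: conn and job_id are passed to the handler so it can update
+    last_run_at before its final commit. next_run_at is computed and written here, in a
+    new transaction after the handler returns; on failure the open transaction is rolled back.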
+
    返回: 执行的 job 数量
    """
    conn = _get_connection()
@@ -104,6 +121,7 @@ def check_scheduled_jobs() -> int:
                """,
            )
            rows = cur.fetchall()
+        conn.commit()

        for job_id, job_type, job_name, trigger_condition, trigger_config in rows:
            handler = _JOB_REGISTRY.get(job_type)
@@ -111,11 +129,12 @@ def check_scheduled_jobs() -> int:
                logger.warning("未注册的 job_type: %s", job_type)
                continue
            try:
-                handler()
-                executed += 1
-                # 计算 next_run_at 并更新
+                # Handler must accept conn= and job_id=. NOTE(review): the run(payload, job_id) handlers have no conn parameter — verify
+                handler(conn=conn, job_id=job_id)
+                # 计算 next_run_at 并更新（在 handler commit 后的新事务中）
                next_run = _calculate_next_run(trigger_condition, trigger_config)
                with conn.cursor() as cur:
+                    cur.execute("BEGIN")
                    cur.execute(
                        """
                        UPDATE biz.trigger_jobs
@@ -125,6 +144,7 @@ def check_scheduled_jobs() -> int:
                        (next_run, job_id),
                    )
                conn.commit()
+                executed += 1
            except Exception:
                logger.exception("触发器 %s 执行失败", job_name)
                conn.rollback()
@@ -156,6 +176,6 @@ def _calculate_next_run(
            from apps.backend.app.services.scheduler import _parse_simple_cron

        return _parse_simple_cron(
-            trigger_config.get("cron_expression", "0 4 * * *"), now
+            trigger_config.get("cron_expression", "0 7 * * *"), now
        )
    return None  # event 类型无 next_run_at