feat: 累积功能变更 — 聊天集成、租户管理、小程序更新、ETL 增强、迁移脚本

包含多个会话的累积代码变更： - backend: AI 聊天服务、触发器调度、认证增强、WebSocket、调度器最小间隔 - admin-web: ETL 状态页、任务管理、调度配置、登录优化 - miniprogram: 看板页面、聊天集成、UI 组件、导航更新 - etl: DWS 新任务（finance_area_daily/board_cache）、连接器增强 - tenant-admin: 项目初始化 - db: 19 个迁移脚本（etl_feiqiu 11 + zqyy_app 8） - packages/shared: 枚举和工具函数更新 - tools: 数据库工具、报表生成、健康检查 - docs: PRD/架构/部署/合约文档更新 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-06 00:03:48 +08:00
parent 70324d8542
commit 6f8f12314f
515 changed files with 76604 additions and 7456 deletions
--- a/apps/backend/app/services/task_executor.py
+++ b/apps/backend/app/services/task_executor.py
@@ -26,6 +26,7 @@ from typing import Any
 # 禁止 from ..config import ETL_PROJECT_PATH（值拷贝，reload 后过期）
 from .. import config as _config_module
 from ..database import get_connection
+from psycopg2.extras import Json
 from ..schemas.tasks import TaskConfigSchema
 from ..services.cli_builder import cli_builder

@@ -184,6 +185,7 @@ class TaskExecutor:
            started_at=started_at,
            command=command_str_with_host,
            schedule_id=schedule_id,
+            config_json=config.model_dump(mode="json"),
        )

        exit_code: int | None = None
@@ -249,6 +251,9 @@ class TaskExecutor:
                error_log="\n".join(stderr_lines),
            )

+            # CHANGE 2026-03-22 | 释放内存缓冲区，防止长期运行内存泄漏
+            self.cleanup(execution_id)
+
    def _run_subprocess(
        self,
        cmd: list[str],
@@ -379,6 +384,7 @@ class TaskExecutor:
        started_at: datetime,
        command: str,
        schedule_id: str | None = None,
+        config_json: dict | None = None,
    ) -> None:
        """插入一条执行日志记录（running 状态）。"""
        try:
@@ -396,12 +402,13 @@ class TaskExecutor:
                        if row and row[0]:
                            effective_schedule_id = str(row[0])

+                    # CHANGE 2026-03-22 | 存储完整 TaskConfig JSON，供 rerun 还原原始参数
                    cur.execute(
                        """
                        INSERT INTO task_execution_log
                            (id, queue_id, site_id, task_codes, status,
-                             started_at, command, schedule_id)
-                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+                             started_at, command, schedule_id, config)
+                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """,
                        (
                            execution_id,
@@ -412,6 +419,7 @@ class TaskExecutor:
                            started_at,
                            command,
                            effective_schedule_id,
+                            Json(config_json) if config_json else None,
                        ),
                    )
                conn.commit()
@@ -475,6 +483,115 @@ class TaskExecutor:
        self._log_buffers.pop(execution_id, None)
        self._subscribers.pop(execution_id, None)

+    # ------------------------------------------------------------------
+    # Graceful shutdown: terminate all subprocesses (status write-back is left to execute() / recover_stale())
+    # ------------------------------------------------------------------
+
+    async def shutdown(self, timeout: float = 3.0) -> int:
+        """Terminate all tracked subprocesses without blocking the event loop.
+
+        Sends terminate to each live process, polls until all have exited or
+        ``timeout`` elapses, then kills the survivors. The wait yields via
+        ``asyncio.sleep`` instead of a blocking ``proc.wait`` so coroutines
+        such as ``execute()``'s ``finally`` cleanup can run meanwhile.
+
+        Args:
+            timeout: Seconds to wait for processes to exit before force-kill.
+
+        Returns:
+            Number of processes tracked in ``self._processes`` at entry.
+        """
+        import asyncio
+        import time
+
+        count = len(self._processes)
+        if not count:
+            return 0
+
+        logger.info(
+            "优雅关闭：终止 %d 个运行中的子进程，超时 %.1fs", count, timeout
+        )
+
+        # Ask every live process to exit.
+        for eid, proc in list(self._processes.items()):
+            if proc.poll() is None:
+                try:
+                    proc.terminate()
+                    logger.info("已发送 terminate 信号: %s (pid=%s)", eid, proc.pid)
+                except ProcessLookupError:
+                    pass  # already gone; nothing to signal
+
+        # Poll against one shared deadline, yielding to the loop between checks.
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            if all(p.poll() is not None for p in list(self._processes.values())):
+                break
+            await asyncio.sleep(0.05)
+
+        # Force-kill whatever is still alive after the deadline.
+        for eid, proc in list(self._processes.items()):
+            if proc.poll() is None:
+                try:
+                    proc.kill()
+                    logger.warning("强制 kill: %s (pid=%s)", eid, proc.pid)
+                except ProcessLookupError:
+                    pass  # already gone; nothing to signal
+
+        # Status write-back is not done here: execute() is expected to do it,
+        # and recover_stale() marks leftover rows on the next startup.
+        logger.info("优雅关闭完成，已终止 %d 个子进程", count)
+        return count
+
+    # ------------------------------------------------------------------
+    # 启动时僵尸任务清理
+    # ------------------------------------------------------------------
+
+    def recover_stale(self) -> int:
+        """Mark this host's orphaned ``running`` log rows as ``interrupted``.
+
+        Only rows whose ``command`` starts with this instance's ``[hostname]``
+        tag are touched. LIKE metacharacters in the tag are backslash-escaped
+        so a hostname containing ``_`` or ``%`` cannot match another host.
+
+        Returns:
+            Number of rows updated, or 0 if the cleanup itself failed.
+        """
+        host_tag = f"[{_INSTANCE_HOST}]"
+        for special in ("\\", "%", "_"):  # backslash first: default LIKE escape
+            host_tag = host_tag.replace(special, "\\" + special)
+        try:
+            conn = get_connection()
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(
+                        """
+                        UPDATE task_execution_log
+                        SET status      = 'interrupted',
+                            finished_at = NOW(),
+                            error_log   = COALESCE(error_log, '')
+                                || E'\n[recover_stale] 后端重启，进程已丢失，标记为 interrupted'
+                        WHERE status = 'running'
+                          AND command LIKE %s
+                        RETURNING id
+                        """,
+                        (f"{host_tag}%",),
+                    )
+                    rows = cur.fetchall()
+                conn.commit()
+            finally:
+                conn.close()
+            if rows:
+                logger.warning(
+                    "启动清理：%d 条僵尸任务标记为 interrupted: %s",
+                    len(rows), ", ".join(str(r[0]) for r in rows),
+                )
+            else:
+                logger.info("启动清理：无僵尸任务")
+            return len(rows)
+        except Exception:
+            logger.exception("启动清理僵尸任务失败")
+            return 0
+

 # 全局单例
 task_executor = TaskExecutor()