feat: 累积功能变更 — 聊天集成、租户管理、小程序更新、ETL 增强、迁移脚本
包含多个会话的累积代码变更: - backend: AI 聊天服务、触发器调度、认证增强、WebSocket、调度器最小间隔 - admin-web: ETL 状态页、任务管理、调度配置、登录优化 - miniprogram: 看板页面、聊天集成、UI 组件、导航更新 - etl: DWS 新任务(finance_area_daily/board_cache)、连接器增强 - tenant-admin: 项目初始化 - db: 19 个迁移脚本(etl_feiqiu 11 + zqyy_app 8) - packages/shared: 枚举和工具函数更新 - tools: 数据库工具、报表生成、健康检查 - docs: PRD/架构/部署/合约文档更新 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
10
apps/backend/app/trace/__init__.py
Normal file
10
apps/backend/app/trace/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
# -*- coding: utf-8 -*-
"""
Dev Trace Log — full request-chain tracing package.

Enabled only in dev/test environments; production disables it via
DEV_TRACE_ENABLED=false.
"""

from app.trace.config import TraceConfig, get_trace_config

__all__ = ["TraceConfig", "get_trace_config"]
|
||||
277
apps/backend/app/trace/cleanup.py
Normal file
277
apps/backend/app/trace/cleanup.py
Normal file
@@ -0,0 +1,277 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
日志自动清理模块
|
||||
|
||||
提供同步清理函数和定时任务调度器:
|
||||
- cleanup_old_logs: 保留最新 N 个日期目录(按记录日期排序,非日历天数),
|
||||
超出总量上限时额外清理
|
||||
- cleanup_date_range: 按日期范围清理指定目录
|
||||
- schedule_daily_cleanup: 返回 async 函数,每天凌晨 2:00 执行清理
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import shutil
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.trace.config import get_trace_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Name format of the per-day log directories (e.g. "2026-03-21").
_DATE_FORMAT = "%Y-%m-%d"
|
||||
|
||||
|
||||
def _is_date_dir(name: str) -> bool:
|
||||
"""判断目录名是否为 YYYY-MM-DD 格式的日期目录。"""
|
||||
try:
|
||||
datetime.strptime(name, _DATE_FORMAT)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def _parse_date(name: str) -> datetime:
|
||||
"""将 YYYY-MM-DD 格式的目录名解析为 datetime。"""
|
||||
return datetime.strptime(name, _DATE_FORMAT)
|
||||
|
||||
|
||||
def _dir_size(path: Path) -> int:
|
||||
"""递归计算目录总大小(字节)。"""
|
||||
total = 0
|
||||
try:
|
||||
for f in path.rglob("*"):
|
||||
if f.is_file():
|
||||
total += f.stat().st_size
|
||||
except OSError:
|
||||
pass
|
||||
return total
|
||||
|
||||
|
||||
def _update_root_index(base_dir: Path, deleted_dirs: list[str]) -> None:
|
||||
"""更新根目录的 _index.json,移除已删除目录的引用。
|
||||
|
||||
根索引结构示例:
|
||||
{
|
||||
"dates": {
|
||||
"2026-03-20": { "files": {...} },
|
||||
"2026-03-21": { "files": {...} }
|
||||
}
|
||||
}
|
||||
也兼容日期目录内的 _index.json(files 字典)。
|
||||
"""
|
||||
index_path = base_dir / "_index.json"
|
||||
if not index_path.exists():
|
||||
return
|
||||
|
||||
try:
|
||||
index: dict[str, Any] = json.loads(index_path.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return
|
||||
|
||||
changed = False
|
||||
|
||||
# 移除 dates 字典中的已删除日期
|
||||
if "dates" in index and isinstance(index["dates"], dict):
|
||||
for d in deleted_dirs:
|
||||
if d in index["dates"]:
|
||||
del index["dates"][d]
|
||||
changed = True
|
||||
|
||||
# 兼容:如果根索引直接用日期作为 key(扁平结构)
|
||||
for d in deleted_dirs:
|
||||
if d in index and d != "dates":
|
||||
del index[d]
|
||||
changed = True
|
||||
|
||||
if changed:
|
||||
index_path.write_text(
|
||||
json.dumps(index, ensure_ascii=False, indent=2),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
|
||||
def cleanup_old_logs(
    base_dir: str | None = None,
    retention_days: int | None = None,
    max_total_mb: int = 200,
) -> dict[str, Any]:
    """Keep the newest N date directories, delete the rest; trim further
    from the oldest entries when the total size exceeds the cap.

    Strategy: date directories are sorted by name (not by calendar age);
    the newest ``retention_days`` entries survive regardless of gaps in
    the dates. If the survivors still exceed ``max_total_mb``, deletion
    continues from the oldest entry until the total drops below the cap.

    Args:
        base_dir: log root directory; defaults to TraceConfig.log_dir
        retention_days: number of date directories to keep; defaults to
            TraceConfig.retention_days
        max_total_mb: total-size cap in MB; extra cleanup starts from the
            oldest directory when exceeded

    Returns:
        Cleanup summary dict::

            {
                "deleted_dirs": ["2026-03-15", ...],
                "deleted_count": 2,
                "freed_bytes": 12345
            }
    """
    cfg = get_trace_config()
    if base_dir is None:
        base_dir = cfg.log_dir
    if retention_days is None:
        retention_days = cfg.retention_days

    base_path = Path(base_dir)
    if not base_path.exists():
        return {"deleted_dirs": [], "deleted_count": 0, "freed_bytes": 0}

    # Collect date directories sorted by name — equivalent to ascending
    # date order because the names are zero-padded YYYY-MM-DD.
    date_dirs = sorted(
        [d for d in base_path.iterdir() if d.is_dir() and _is_date_dir(d.name)],
        key=lambda d: d.name,
    )

    deleted_dirs: list[str] = []
    freed_bytes = 0

    # Step 1: keep the newest retention_days entries, delete the rest.
    if len(date_dirs) > retention_days:
        to_delete = date_dirs[: len(date_dirs) - retention_days]
        for entry in to_delete:
            # Measure before deleting so freed_bytes can be reported.
            size = _dir_size(entry)
            try:
                shutil.rmtree(entry)
                deleted_dirs.append(entry.name)
                freed_bytes += size
            except OSError:
                logger.warning("清理日期目录失败: %s", entry, exc_info=True)
        # Narrow the working list to the survivors for step 2.
        date_dirs = date_dirs[len(date_dirs) - retention_days :]

    # Step 2: total-size cap protection.
    max_bytes = max_total_mb * 1024 * 1024
    total_size = sum(_dir_size(d) for d in date_dirs)
    if total_size > max_bytes:
        logger.warning(
            "日志总量 %.1f MB 超过上限 %d MB,启动额外清理",
            total_size / (1024 * 1024),
            max_total_mb,
        )
        # Iterate over a copy: date_dirs is mutated inside the loop.
        for entry in list(date_dirs):
            if total_size <= max_bytes:
                break
            size = _dir_size(entry)
            try:
                shutil.rmtree(entry)
                deleted_dirs.append(entry.name)
                freed_bytes += size
                total_size -= size
                date_dirs.remove(entry)
            except OSError:
                logger.warning("清理日期目录失败: %s", entry, exc_info=True)

    # Drop references to the deleted directories from the root index.
    if deleted_dirs:
        _update_root_index(base_path, deleted_dirs)

    return {
        "deleted_dirs": deleted_dirs,
        "deleted_count": len(deleted_dirs),
        "freed_bytes": freed_bytes,
    }
|
||||
|
||||
|
||||
def cleanup_date_range(
    start_date: str,
    end_date: str,
    base_dir: str | None = None,
) -> dict[str, Any]:
    """Delete date directories whose date falls within [start_date, end_date].

    Args:
        start_date: inclusive range start, formatted YYYY-MM-DD
        end_date: inclusive range end, formatted YYYY-MM-DD
        base_dir: log root directory; defaults to TraceConfig.log_dir

    Returns:
        Cleanup summary dict with the same shape as cleanup_old_logs.
    """
    if base_dir is None:
        base_dir = get_trace_config().log_dir

    root = Path(base_dir)
    if not root.exists():
        return {"deleted_dirs": [], "deleted_count": 0, "freed_bytes": 0}

    lower = datetime.strptime(start_date, _DATE_FORMAT)
    upper = datetime.strptime(end_date, _DATE_FORMAT)

    removed: list[str] = []
    reclaimed = 0

    for child in sorted(root.iterdir()):
        if not child.is_dir() or not _is_date_dir(child.name):
            continue
        if not (lower <= _parse_date(child.name) <= upper):
            continue

        # Measure before deleting so freed bytes can be reported.
        nbytes = _dir_size(child)
        try:
            shutil.rmtree(child)
        except OSError:
            logger.warning("清理日期目录失败: %s", child, exc_info=True)
        else:
            removed.append(child.name)
            reclaimed += nbytes

    if removed:
        _update_root_index(root, removed)

    return {
        "deleted_dirs": removed,
        "deleted_count": len(removed),
        "freed_bytes": reclaimed,
    }
|
||||
|
||||
|
||||
def schedule_daily_cleanup():
    """Build an async task that runs cleanup_old_logs every day at 02:00.

    Returns a coroutine function suitable for registration during the app
    lifespan; it sleeps until the next 02:00, runs the cleanup, and loops.
    """

    async def _daily_cleanup_loop() -> None:
        """Wait for the next 02:00 local time, clean up, repeat forever."""
        while True:
            now = datetime.now()
            next_run = now.replace(hour=2, minute=0, second=0, microsecond=0)
            # Today's 02:00 already passed — schedule for tomorrow.
            if next_run <= now:
                next_run += timedelta(days=1)

            delay = (next_run - now).total_seconds()
            logger.info("日志清理定时任务:将在 %.0f 秒后执行(%s)", delay, next_run.isoformat())

            await asyncio.sleep(delay)

            try:
                outcome = cleanup_old_logs()
                if outcome["deleted_count"] > 0:
                    logger.info(
                        "日志自动清理完成:删除 %d 个目录,释放 %d 字节",
                        outcome["deleted_count"],
                        outcome["freed_bytes"],
                    )
                else:
                    logger.debug("日志自动清理:无过期目录需要清理")
            except Exception:
                # Cleanup must never kill the scheduler loop.
                logger.warning("日志自动清理失败", exc_info=True)

    return _daily_cleanup_loop
|
||||
136
apps/backend/app/trace/config.py
Normal file
136
apps/backend/app/trace/config.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Dev Trace Log 配置模块
|
||||
|
||||
从环境变量读取 trace 相关配置,支持运行时动态修改(内存状态)。
|
||||
重启后回退到 .env 配置值。使用单例模式,全局唯一实例。
|
||||
|
||||
环境变量缺失时使用合理默认值:
|
||||
- DEV_TRACE_ENABLED: false(默认关闭,显式开启才采集)
|
||||
- DEV_TRACE_LOG_DIR: export/dev-trace-logs
|
||||
- DEV_TRACE_LOG_RETENTION_DAYS: 7
|
||||
- DEV_TRACE_LOG_SQL: true
|
||||
- DEV_TRACE_LOG_PARAMS: true
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import threading
|
||||
from typing import Any
|
||||
|
||||
|
||||
def _parse_bool(value: str, default: bool) -> bool:
|
||||
"""解析布尔型环境变量,支持 true/1/yes(不区分大小写)。"""
|
||||
if not value:
|
||||
return default
|
||||
return value.strip().lower() in ("true", "1", "yes")
|
||||
|
||||
|
||||
class TraceConfig:
    """Full-chain trace-log configuration (singleton).

    Attributes are initialized from environment variables and may be
    changed at runtime via update_settings(). A process restart reverts
    to the .env values because _init_from_env() re-reads the environment.
    """

    _instance: TraceConfig | None = None
    _lock: threading.Lock = threading.Lock()

    def __new__(cls) -> TraceConfig:
        # Double-checked locking: thread-safe lazy singleton creation.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    instance = super().__new__(cls)
                    instance._init_from_env()
                    cls._instance = instance
        return cls._instance

    def _init_from_env(self) -> None:
        """Read the environment into baseline values and runtime values.

        Baseline values mirror the raw .env settings; reset_to_env()
        reverts the runtime values back to them.
        """
        self._env_enabled: bool = _parse_bool(
            os.environ.get("DEV_TRACE_ENABLED", ""), default=False
        )
        raw_log_dir: str = os.environ.get(
            "DEV_TRACE_LOG_DIR", "export/dev-trace-logs"
        )
        # CHANGE 2026-03-22 | trace log path fix
        # Resolve relative paths against the project root so the result does
        # not depend on the backend's cwd (apps/backend/).
        if not os.path.isabs(raw_log_dir):
            project_root = os.environ.get("NEOZQYY_ROOT", "")
            if project_root:
                raw_log_dir = os.path.join(project_root, raw_log_dir)
        self._env_log_dir: str = raw_log_dir
        try:
            self._env_retention_days: int = int(
                os.environ.get("DEV_TRACE_LOG_RETENTION_DAYS", "7")
            )
        except ValueError:
            # A malformed env value must not crash startup; fall back to
            # the documented default of 7.
            self._env_retention_days = 7
        self._env_log_sql: bool = _parse_bool(
            os.environ.get("DEV_TRACE_LOG_SQL", ""), default=True
        )
        self._env_log_params: bool = _parse_bool(
            os.environ.get("DEV_TRACE_LOG_PARAMS", ""), default=True
        )

        # Runtime values start equal to the baseline; the API may change
        # them in memory only.
        self.enabled: bool = self._env_enabled
        self.log_dir: str = self._env_log_dir
        self.retention_days: int = self._env_retention_days
        self.log_sql: bool = self._env_log_sql
        self.log_params: bool = self._env_log_params

    # ------------------------------------------------------------------
    # Runtime toggles
    # ------------------------------------------------------------------

    def update_settings(
        self,
        *,
        enabled: bool | None = None,
        retention_days: int | None = None,
        log_sql: bool | None = None,
        log_params: bool | None = None,
    ) -> None:
        """Update in-memory runtime settings (not persisted to .env)."""
        if enabled is not None:
            self.enabled = enabled
        if retention_days is not None:
            self.retention_days = retention_days
        if log_sql is not None:
            self.log_sql = log_sql
        if log_params is not None:
            self.log_params = log_params

    def get_settings(self) -> dict[str, Any]:
        """Return the current runtime settings as a dict."""
        return {
            "enabled": self.enabled,
            "log_dir": self.log_dir,
            "retention_days": self.retention_days,
            "log_sql": self.log_sql,
            "log_params": self.log_params,
        }

    def reset_to_env(self) -> None:
        """Reset runtime values back to the .env baseline."""
        self.enabled = self._env_enabled
        self.log_dir = self._env_log_dir
        self.retention_days = self._env_retention_days
        self.log_sql = self._env_log_sql
        self.log_params = self._env_log_params

    # ------------------------------------------------------------------
    # Test helper: reset the singleton (tests only)
    # ------------------------------------------------------------------

    @classmethod
    def _reset_singleton(cls) -> None:
        """Drop the singleton so the next access re-reads the environment."""
        with cls._lock:
            cls._instance = None
|
||||
|
||||
|
||||
def get_trace_config() -> TraceConfig:
    """Return the global TraceConfig singleton."""
    return TraceConfig()
|
||||
150
apps/backend/app/trace/context.py
Normal file
150
apps/backend/app/trace/context.py
Normal file
@@ -0,0 +1,150 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
TraceContext 与 TraceSpan 数据模型
|
||||
|
||||
基于 contextvars 的请求级追踪上下文,存储 request_id 和有序 span 列表。
|
||||
支持 HTTP / SSE / WebSocket / 后台 Job 四种 trace 类型,
|
||||
每种类型使用不同的 request_id 前缀以便区分。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import contextvars
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Span 类型常量(字符串常量类,不用 Enum)
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
class SpanType:
    """Constant values for every supported span type."""

    # Inbound HTTP request lifecycle
    HTTP_IN = "HTTP_IN"
    AUTH = "AUTH"
    ROUTE = "ROUTE"
    SERVICE = "SERVICE"
    # Database
    DB_QUERY = "DB_QUERY"
    DB_CONN = "DB_CONN"
    DB_CONN_RELEASE = "DB_CONN_RELEASE"
    # Outbound HTTP
    HTTP_OUT = "HTTP_OUT"
    # Errors / middleware
    ERROR = "ERROR"
    DB_ERROR = "DB_ERROR"
    MIDDLEWARE = "MIDDLEWARE"
    MIDDLEWARE_ERROR = "MIDDLEWARE_ERROR"
    # Server-sent events
    SSE_START = "SSE_START"
    SSE_EVENT = "SSE_EVENT"
    SSE_END = "SSE_END"
    # AI calls
    AI_CALL = "AI_CALL"
    AI_STREAM = "AI_STREAM"
    AI_ERROR = "AI_ERROR"
    # WebSocket
    WS_CONNECT = "WS_CONNECT"
    WS_MESSAGE = "WS_MESSAGE"
    WS_DISCONNECT = "WS_DISCONNECT"
    # Background jobs
    JOB_START = "JOB_START"
    JOB_END = "JOB_END"
    JOB_ERROR = "JOB_ERROR"
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Trace 类型常量
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
class TraceType:
    """Trace categories: HTTP / SSE / WebSocket / background job."""

    HTTP = "http"
    SSE = "sse"
    WS = "ws"
    JOB = "job"
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# 数据模型
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
@dataclass
class TraceSpan:
    """A single trace node recording one function call at some layer."""

    span_type: str  # one of the SpanType constants
    module: str  # module path, e.g. "xcx_tasks"
    function: str  # function name, e.g. "get_task_list"
    description_zh: str  # human-readable description (Chinese)
    description_en: str  # human-readable description (English)
    params: dict[str, Any]  # call arguments
    result_summary: str  # short summary of the result
    duration_ms: float  # elapsed time in milliseconds
    timestamp: str  # ISO-format timestamp
    extra: dict[str, Any] = field(default_factory=dict)  # extra info (SQL, ...)
|
||||
|
||||
|
||||
@dataclass
class TraceContext:
    """Per-request trace context holding the request_id and ordered spans."""

    # 12-hex-char id by default; the create_*_trace factories produce the
    # prefixed variants (ws_/job_).
    request_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    trace_type: str = TraceType.HTTP  # http / sse / ws / job
    start_time: datetime = field(default_factory=datetime.now)
    method: str = ""
    path: str = ""
    user_id: int | None = None
    site_id: int | None = None
    spans: list[TraceSpan] = field(default_factory=list)

    def add_span(self, span: TraceSpan) -> None:
        """Append a span, preserving insertion order."""
        self.spans.append(span)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# contextvars 存储
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
# Context variable holding the active TraceContext for the current
# request/task; None when no trace is active.
trace_context_var: contextvars.ContextVar[TraceContext | None] = contextvars.ContextVar(
    "trace_context", default=None
)
|
||||
|
||||
|
||||
def get_current_trace() -> TraceContext | None:
    """Return the current request's TraceContext, or None if absent."""
    return trace_context_var.get()
|
||||
|
||||
|
||||
def set_current_trace(ctx: TraceContext) -> contextvars.Token:
    """Install *ctx* as the current TraceContext; return a Token to restore."""
    return trace_context_var.set(ctx)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# 工厂函数:创建不同类型的 TraceContext
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def create_http_trace(method: str, path: str) -> TraceContext:
    """Create a TraceContext for an HTTP request (request_id = uuid hex[:12])."""
    # request_id defaults to uuid4().hex[:12] via the dataclass factory.
    return TraceContext(
        trace_type=TraceType.HTTP,
        method=method,
        path=path,
    )
|
||||
|
||||
|
||||
def create_ws_trace() -> TraceContext:
    """Create a TraceContext for a WebSocket connection (ws_-prefixed id)."""
    ws_id = f"ws_{uuid.uuid4().hex[:12]}"
    return TraceContext(request_id=ws_id, trace_type=TraceType.WS)
|
||||
|
||||
|
||||
def create_job_trace(job_name: str) -> TraceContext:
    """Create a TraceContext for a background job (job_-prefixed id).

    The job name is stored in the ``path`` field.
    """
    job_id = f"job_{uuid.uuid4().hex[:12]}"
    return TraceContext(request_id=job_id, trace_type=TraceType.JOB, path=job_name)
|
||||
240
apps/backend/app/trace/coverage.py
Normal file
240
apps/backend/app/trace/coverage.py
Normal file
@@ -0,0 +1,240 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Trace 覆盖率扫描器
|
||||
|
||||
通过文件内容搜索(grep 式)检测 trace 覆盖情况,不使用 AST 解析。
|
||||
扫描维度:路由覆盖、Service 覆盖、Job 覆盖、SSE/WS 覆盖。
|
||||
扫描结果缓存在内存中,支持手动刷新。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Backend "app" package directory, derived from this file's location so it
# works regardless of the process cwd.
_APP_DIR = Path(__file__).resolve().parent.parent  # apps/backend/app/

# ── Module-level cache ────────────────────────────────────────

# Last scan result; None until run_coverage_scan() has been called.
_cached_result: dict[str, Any] | None = None
|
||||
|
||||
|
||||
def get_cached_coverage() -> dict[str, Any] | None:
    """Return the cached coverage-scan result, or None if never scanned."""
    return _cached_result
|
||||
|
||||
|
||||
def run_coverage_scan() -> dict[str, Any]:
    """Run all coverage scans, refresh the module cache, return the result."""
    global _cached_result

    snapshot: dict[str, Any] = {
        "scan_time": datetime.now().isoformat(),
        "routes": _scan_routes(),
        "services": _scan_services(),
        "jobs": _scan_jobs(),
        "sse_endpoints": _scan_sse(),
        "ws_endpoints": _scan_ws(),
    }
    _cached_result = snapshot
    return snapshot
|
||||
|
||||
|
||||
# ── 路由覆盖扫描 ──────────────────────────────────────────
|
||||
|
||||
# Matches @router.get/post/put/delete/patch decorators.
_ROUTE_DECORATOR_RE = re.compile(r"@router\.(get|post|put|delete|patch)\(")
# Matches the (async) def following a route decorator, capturing its name.
_FUNC_DEF_RE = re.compile(r"(?:async\s+)?def\s+(\w+)\s*\(")
|
||||
|
||||
|
||||
def _scan_routes() -> dict[str, Any]:
    """Count route functions in xcx_*.py router files.

    Coverage rule: every route in an xcx_* file is considered covered by
    TraceMiddleware, because the middleware intercepts the /api/xcx/ prefix.
    """
    routers_dir = _APP_DIR / "routers"
    if not routers_dir.exists():
        return {"total": 0, "covered": 0, "uncovered": []}

    total_routes: list[str] = []
    covered_routes: list[str] = []
    uncovered_routes: list[str] = []

    for py_file in sorted(routers_dir.glob("xcx_*.py")):
        module_name = py_file.stem
        lines = py_file.read_text(encoding="utf-8").splitlines()

        for idx, line in enumerate(lines):
            if not _ROUTE_DECORATOR_RE.search(line):
                continue
            # The decorated function may sit a few lines below (other
            # decorators can come in between); look ahead at most 5 lines.
            for candidate in lines[idx : idx + 5]:
                found = _FUNC_DEF_RE.search(candidate)
                if found:
                    route_id = f"{module_name}.{found.group(1)}"
                    total_routes.append(route_id)
                    # xcx_* routes are covered by TraceMiddleware by default.
                    covered_routes.append(route_id)
                    break

    return {
        "total": len(total_routes),
        "covered": len(covered_routes),
        "uncovered": uncovered_routes,
    }
|
||||
|
||||
|
||||
# ── Service 覆盖扫描 ─────────────────────────────────────
|
||||
|
||||
|
||||
def _scan_services() -> dict[str, Any]:
    """Scan public module-level functions in services/ for @trace_service."""
    services_dir = _APP_DIR / "services"
    if not services_dir.exists():
        return {"total": 0, "covered": 0, "uncovered": []}

    total_funcs: list[str] = []
    covered_funcs: list[str] = []
    uncovered_funcs: list[str] = []

    for py_file in sorted(services_dir.rglob("*.py")):
        # Underscore-prefixed modules (including __init__.py) are private.
        if py_file.name.startswith("_"):
            continue
        module_name = py_file.stem
        content = py_file.read_text(encoding="utf-8")
        lines = content.splitlines()
        # Cheap pre-check: a file that never mentions trace_service cannot
        # contain decorated functions.
        mentions_trace = "trace_service" in content

        for idx, raw_line in enumerate(lines):
            # Module-level public function: def at column 0, name not
            # underscore-prefixed.
            found = re.match(r"^(?:async\s+)?def\s+([a-zA-Z]\w*)\s*\(", raw_line.rstrip())
            if not found:
                continue
            func_id = f"{module_name}.{found.group(1)}"
            total_funcs.append(func_id)

            # Look back at most 5 lines for the @trace_service decorator.
            decorated = mentions_trace and any(
                "@trace_service" in lines[k] for k in range(max(0, idx - 5), idx)
            )
            (covered_funcs if decorated else uncovered_funcs).append(func_id)

    return {
        "total": len(total_funcs),
        "covered": len(covered_funcs),
        "uncovered": uncovered_funcs,
    }
|
||||
|
||||
|
||||
# ── Job 覆盖扫描 ─────────────────────────────────────────
|
||||
|
||||
# The 4 known background-job handlers registered in main.py.
_KNOWN_JOBS = [
    "task_generator",
    "task_expiry_check",
    "recall_completion_check",
    "note_reclassify_backfill",
]
|
||||
|
||||
|
||||
def _scan_jobs() -> dict[str, Any]:
    """Check whether the known job handlers in main.py are wrapped by trace_job.

    A job counts as covered when main.py contains a trace_job("name")
    wrapper for it; both quoting styles are accepted so a formatter change
    (double → single quotes) does not flip the coverage report.
    """
    main_py = _APP_DIR / "main.py"
    covered: list[str] = []
    uncovered: list[str] = []

    if not main_py.exists():
        return {"total": len(_KNOWN_JOBS), "covered": 0, "uncovered": _KNOWN_JOBS[:]}

    content = main_py.read_text(encoding="utf-8")

    for job_name in _KNOWN_JOBS:
        # Accept trace_job("name") and trace_job('name').
        if (
            f'trace_job("{job_name}")' in content
            or f"trace_job('{job_name}')" in content
        ):
            covered.append(job_name)
        else:
            uncovered.append(job_name)

    return {
        "total": len(_KNOWN_JOBS),
        "covered": len(covered),
        "uncovered": uncovered,
    }
|
||||
|
||||
|
||||
# ── SSE 覆盖扫描 ─────────────────────────────────────────
|
||||
|
||||
|
||||
def _scan_sse() -> dict[str, Any]:
    """Check whether the SSE endpoint (xcx_chat.py) integrates sse_wrapper."""
    endpoint = "xcx_chat.chat_stream"  # currently the only SSE endpoint
    chat_file = _APP_DIR / "routers" / "xcx_chat.py"

    if not chat_file.exists():
        return {"total": 1, "covered": 0, "uncovered": [endpoint]}

    # Any mention of sse_wrapper (import or call) counts as integrated.
    integrated = "sse_wrapper" in chat_file.read_text(encoding="utf-8")
    return {
        "total": 1,
        "covered": 1 if integrated else 0,
        "uncovered": [] if integrated else [endpoint],
    }
|
||||
|
||||
|
||||
# ── WebSocket 覆盖扫描 ───────────────────────────────────
|
||||
|
||||
|
||||
def _scan_ws() -> dict[str, Any]:
    """Check whether the WebSocket endpoint (ws/logs.py) integrates ws_wrapper."""
    endpoint = "ws_logs.ws_logs"  # currently the only WS endpoint
    ws_file = _APP_DIR / "ws" / "logs.py"

    if not ws_file.exists():
        return {"total": 1, "covered": 0, "uncovered": [endpoint]}

    # Any mention of ws_wrapper (import or call) counts as integrated.
    integrated = "ws_wrapper" in ws_file.read_text(encoding="utf-8")
    return {
        "total": 1,
        "covered": 1 if integrated else 0,
        "uncovered": [] if integrated else [endpoint],
    }
|
||||
222
apps/backend/app/trace/db_wrapper.py
Normal file
222
apps/backend/app/trace/db_wrapper.py
Normal file
@@ -0,0 +1,222 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
数据库连接包装器 — 拦截 cursor.execute() 和连接生命周期
|
||||
|
||||
包装 psycopg2 连接对象,自动记录以下 span:
|
||||
- DB_QUERY: 每条 SQL 的语句、参数、行数、耗时、调用来源
|
||||
- DB_CONN: 连接获取耗时(在 database.py 中记录)
|
||||
- DB_CONN_RELEASE: 连接释放
|
||||
- DB_ERROR: 数据库异常(PostgreSQL 错误码、消息、触发 SQL)
|
||||
|
||||
当 DEV_TRACE_LOG_SQL=false 时,SQL 语句替换为 "[SQL hidden]"。
|
||||
当无活跃 TraceContext 时零开销直接委托原始对象。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import psycopg2
|
||||
|
||||
from app.trace.config import get_trace_config
|
||||
from app.trace.context import (
|
||||
SpanType,
|
||||
TraceSpan,
|
||||
get_current_trace,
|
||||
)
|
||||
|
||||
|
||||
# Placeholder stored in spans when SQL logging is disabled.
_SQL_HIDDEN = "[SQL hidden]"
|
||||
|
||||
|
||||
def _get_caller_info() -> tuple[str, str]:
|
||||
"""获取调用来源(跳过 wrapper 自身的栈帧)。
|
||||
|
||||
返回 (module, function) 元组。
|
||||
"""
|
||||
frame = inspect.currentframe()
|
||||
try:
|
||||
# 向上跳 3 层:_get_caller_info → execute → 调用方
|
||||
caller = frame
|
||||
for _ in range(3):
|
||||
if caller is not None:
|
||||
caller = caller.f_back
|
||||
if caller is not None:
|
||||
module = caller.f_globals.get("__name__", "unknown")
|
||||
func_name = caller.f_code.co_name
|
||||
return module, func_name
|
||||
finally:
|
||||
del frame
|
||||
return "unknown", "unknown"
|
||||
|
||||
|
||||
def _safe_sql_repr(sql: Any) -> str:
|
||||
"""安全地将 SQL 转为字符串表示。"""
|
||||
if sql is None:
|
||||
return ""
|
||||
if isinstance(sql, bytes):
|
||||
return sql.decode("utf-8", errors="replace")
|
||||
return str(sql)
|
||||
|
||||
|
||||
def _safe_params_repr(params: Any) -> Any:
    """Convert SQL parameters into a serializable representation."""
    if params is None:
        return None
    if isinstance(params, dict):
        return {str(key): _safe_param_value(val) for key, val in params.items()}
    if isinstance(params, (list, tuple)):
        return [_safe_param_value(item) for item in params]
    return str(params)
|
||||
|
||||
|
||||
def _safe_param_value(value: Any) -> Any:
|
||||
"""安全地将单个参数值转为可序列化的表示。"""
|
||||
if isinstance(value, (str, int, float, bool, type(None))):
|
||||
return value
|
||||
return str(value)
|
||||
|
||||
|
||||
class TracedCursor:
    """Wrap a psycopg2 cursor; execute() records DB_QUERY / DB_ERROR spans.

    Methods not implemented here are delegated to the underlying cursor
    via __getattr__.
    """

    def __init__(self, cursor: Any) -> None:
        # object.__setattr__ avoids recursing through our own __getattr__.
        object.__setattr__(self, "_cursor", cursor)

    def execute(self, sql: Any, params: Any = None) -> None:
        """Intercept execute: record a DB_QUERY span, or DB_ERROR on failure.

        With no active TraceContext this delegates directly (near-zero
        overhead). When SQL logging is disabled in the config, the SQL
        text in the span is replaced by the "[SQL hidden]" placeholder.
        """
        ctx = get_current_trace()
        if ctx is None:
            # No active trace: plain pass-through.
            self._cursor.execute(sql, params)
            return

        config = get_trace_config()
        caller_module, caller_func = _get_caller_info()
        sql_str = _safe_sql_repr(sql)
        display_sql = sql_str if config.log_sql else _SQL_HIDDEN

        start = time.perf_counter()
        try:
            self._cursor.execute(sql, params)
        except psycopg2.Error as exc:
            elapsed = (time.perf_counter() - start) * 1000
            # Record a DB_ERROR span carrying the PostgreSQL error details,
            # then re-raise so callers still see the exception.
            ctx.add_span(TraceSpan(
                span_type=SpanType.DB_ERROR,
                module=caller_module,
                function=caller_func,
                description_zh=f"数据库异常: {type(exc).__name__}",
                description_en=f"Database error: {type(exc).__name__}",
                params={},
                result_summary=str(exc).strip()[:200],
                duration_ms=elapsed,
                timestamp=datetime.now().isoformat(),
                extra={
                    "pgcode": getattr(exc, "pgcode", None),
                    "pgerror": (getattr(exc, "pgerror", None) or "")[:500],
                    "sql": display_sql,
                },
            ))
            raise

        elapsed = (time.perf_counter() - start) * 1000
        # A negative rowcount (no result set) is reported as 0.
        row_count = self._cursor.rowcount if self._cursor.rowcount >= 0 else 0

        ctx.add_span(TraceSpan(
            span_type=SpanType.DB_QUERY,
            module=caller_module,
            function=caller_func,
            description_zh=f"执行 SQL 查询,返回 {row_count} 行",
            description_en=f"Executed SQL query, returned {row_count} rows",
            params={"params": _safe_params_repr(params)} if config.log_sql else {},
            result_summary=f"{row_count} 行",
            duration_ms=elapsed,
            timestamp=datetime.now().isoformat(),
            extra={
                "sql": display_sql,
                "params": _safe_params_repr(params) if config.log_sql else None,
                "row_count": row_count,
                "caller": f"{caller_module}.{caller_func}",
            },
        ))

    def __getattr__(self, name: str) -> Any:
        """Delegate anything not wrapped here to the underlying cursor."""
        return getattr(self._cursor, name)

    def __iter__(self):
        # Dunder protocols must be forwarded explicitly: implicit special-
        # method lookup happens on the type and bypasses __getattr__.
        return iter(self._cursor)

    def __next__(self):
        return next(self._cursor)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self._cursor.close()
|
||||
|
||||
|
||||
class TracedConnection:
    """Wrap a psycopg2 connection, intercepting cursor() and close().

    - cursor() returns a TracedCursor
    - close() records a DB_CONN_RELEASE span (once per connection)
    - other methods are delegated to the underlying connection via __getattr__
    """

    def __init__(self, conn: Any) -> None:
        # object.__setattr__ avoids recursing through our own __getattr__.
        object.__setattr__(self, "_conn", conn)
        # Guard so close() records the release span only once.
        object.__setattr__(self, "_closed_traced", False)

    def cursor(self, *args: Any, **kwargs: Any) -> TracedCursor:
        """Return the underlying cursor wrapped in a TracedCursor."""
        raw_cursor = self._conn.cursor(*args, **kwargs)
        return TracedCursor(raw_cursor)

    def close(self) -> None:
        """Close the connection, recording a DB_CONN_RELEASE span once."""
        if not self._closed_traced:
            object.__setattr__(self, "_closed_traced", True)
            ctx = get_current_trace()
            if ctx is not None:
                ctx.add_span(TraceSpan(
                    span_type=SpanType.DB_CONN_RELEASE,
                    module="trace.db_wrapper",
                    function="TracedConnection.close",
                    description_zh="释放数据库连接",
                    description_en="Released database connection",
                    params={},
                    result_summary="closed",
                    duration_ms=0.0,
                    timestamp=datetime.now().isoformat(),
                ))
        self._conn.close()

    def __getattr__(self, name: str) -> Any:
        """Delegate anything not wrapped here to the underlying connection."""
        return getattr(self._conn, name)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
|
||||
|
||||
|
||||
def traced_connection(conn: Any) -> TracedConnection:
    """Wrap a psycopg2 connection in a TracedConnection.

    Intended to be called only when tracing is enabled and a TraceContext
    is active for the current request.
    """
    return TracedConnection(conn)
|
||||
192
apps/backend/app/trace/decorators.py
Normal file
192
apps/backend/app/trace/decorators.py
Normal file
@@ -0,0 +1,192 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Trace 装饰器模块
|
||||
|
||||
提供 trace_service 装饰器,自动为 Service 层函数记录 SERVICE span。
|
||||
当无活跃 TraceContext 时零开销直接调用原函数。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import functools
|
||||
import inspect
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable
|
||||
|
||||
from app.trace.config import get_trace_config
|
||||
from app.trace.context import SpanType, TraceSpan, get_current_trace
|
||||
|
||||
|
||||
# 参数值被隐藏时的占位符
|
||||
_REDACTED = "[redacted]"
|
||||
|
||||
# 返回值摘要最大长度
|
||||
_RESULT_SUMMARY_MAX_LEN = 120
|
||||
|
||||
|
||||
def _build_params_dict(func: Callable, args: tuple, kwargs: dict, *, redact: bool) -> dict[str, Any]:
    """Build a parameter dict from the function signature and the actual call.

    With redact=True only the parameter names are kept and every value is
    replaced with "[redacted]".
    """
    bound = inspect.signature(func).bind(*args, **kwargs)
    bound.apply_defaults()

    if redact:
        return dict.fromkeys(bound.arguments, _REDACTED)
    return {name: _safe_repr(value) for name, value in bound.arguments.items()}
|
||||
|
||||
|
||||
def _safe_repr(value: Any) -> Any:
    """Convert a parameter value into a safely JSON-serializable representation.

    Scalars pass through; small containers are converted recursively, large
    ones are summarized; anything else becomes a truncated repr().
    """
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, (list, tuple)):
        if len(value) > 10:
            return f"[{len(value)} items]"
        return [_safe_repr(item) for item in value]
    if isinstance(value, dict):
        if len(value) > 10:
            return f"{{{len(value)} keys}}"
        return {str(key): _safe_repr(item) for key, item in value.items()}
    # Fall back to repr(), truncated to 100 characters.
    text = repr(value)
    return text if len(text) <= 100 else text[:97] + "..."
|
||||
|
||||
|
||||
def _make_result_summary(result: Any) -> str:
    """Produce a short, bounded summary string for a return value."""
    if result is None:
        return "None"
    if isinstance(result, (list, tuple)):
        return f"[{len(result)} items]"
    if isinstance(result, dict):
        return f"{{{len(result)} keys}}"
    # Scalars stringify directly; anything else falls back to repr().
    text = str(result) if isinstance(result, (str, int, float, bool)) else repr(result)
    if len(text) > _RESULT_SUMMARY_MAX_LEN:
        return text[:_RESULT_SUMMARY_MAX_LEN - 3] + "..."
    return text
|
||||
|
||||
|
||||
def trace_service(description_zh: str, description_en: str) -> Callable:
    """Decorator for service-layer functions that records a SERVICE span.

    - Zero overhead when no TraceContext is active: the wrapped function is
      called directly.
    - When DEV_TRACE_LOG_PARAMS=false, parameter values are replaced with
      "[redacted]".
    - Supports both sync and async functions.

    Args:
        description_zh: Chinese description written into the span.
        description_en: English description written into the span.
    """

    def decorator(func: Callable) -> Callable:
        module_name = func.__module__ or ""
        func_name = func.__qualname__ or func.__name__

        def _record(ctx: Any, params: dict, result_summary: str, elapsed_ms: float) -> None:
            # Single place that builds the SERVICE span — shared by the
            # success and exception paths of both wrappers (previously this
            # block was duplicated four times).
            ctx.add_span(TraceSpan(
                span_type=SpanType.SERVICE,
                module=module_name,
                function=func_name,
                description_zh=description_zh,
                description_en=description_en,
                params=params,
                result_summary=result_summary,
                duration_ms=elapsed_ms,
                timestamp=datetime.now().isoformat(),
            ))

        if asyncio.iscoroutinefunction(func):
            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                ctx = get_current_trace()
                if ctx is None:
                    # No active trace: zero-overhead passthrough.
                    return await func(*args, **kwargs)

                redact = not get_trace_config().log_params
                params = _build_params_dict(func, args, kwargs, redact=redact)

                start = time.perf_counter()
                try:
                    result = await func(*args, **kwargs)
                except Exception:
                    _record(ctx, params, "exception", (time.perf_counter() - start) * 1000)
                    raise

                _record(ctx, params, _make_result_summary(result), (time.perf_counter() - start) * 1000)
                return result

            return async_wrapper

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
            ctx = get_current_trace()
            if ctx is None:
                # No active trace: zero-overhead passthrough.
                return func(*args, **kwargs)

            redact = not get_trace_config().log_params
            params = _build_params_dict(func, args, kwargs, redact=redact)

            start = time.perf_counter()
            try:
                result = func(*args, **kwargs)
            except Exception:
                _record(ctx, params, "exception", (time.perf_counter() - start) * 1000)
                raise

            _record(ctx, params, _make_result_summary(result), (time.perf_counter() - start) * 1000)
            return result

        return sync_wrapper

    return decorator
|
||||
|
||||
|
||||
def truncate_token(token: str) -> str:
    """Shorten a token to its first 16 characters plus an "..." suffix.

    Tokens of 16 characters or fewer are returned unchanged.
    """
    return token if len(token) <= 16 else token[:16] + "..."
|
||||
117
apps/backend/app/trace/error_handler.py
Normal file
117
apps/backend/app/trace/error_handler.py
Normal file
@@ -0,0 +1,117 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
异常/错误全链路追踪 — ERROR span 记录辅助函数
|
||||
|
||||
集成到全局异常处理器(http_exception_handler / unhandled_exception_handler),
|
||||
为 HTTPException 和未捕获异常分别记录 ERROR span。
|
||||
|
||||
- HTTPException → ERROR span(异常类型、status_code、detail、发生层级)
|
||||
- 未捕获异常 → ERROR span(异常类型、消息、堆栈摘要前 5 行)
|
||||
- 数据库异常(psycopg2.Error)→ DB_ERROR span(错误码、消息、触发 SQL)
|
||||
|
||||
所有函数均为安全调用:trace 未激活时静默跳过,不影响请求处理。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
|
||||
from starlette.exceptions import HTTPException
|
||||
|
||||
from app.trace.context import SpanType, TraceSpan, get_current_trace
|
||||
|
||||
|
||||
def record_http_exception(exc: HTTPException) -> None:
    """Record an ERROR span for an HTTPException.

    Captures the exception type, status_code, detail and the layer where it
    was handled (exception_handler). Silently no-ops when no trace is active.
    """
    if (ctx := get_current_trace()) is None:
        return

    extra = {
        "exception_type": type(exc).__name__,
        "status_code": exc.status_code,
        "detail": str(exc.detail) if exc.detail else "",
        "layer": "exception_handler",
    }
    ctx.add_span(TraceSpan(
        span_type=SpanType.ERROR,
        module="trace.error_handler",
        function="record_http_exception",
        description_zh=f"HTTP 异常: {exc.status_code} {exc.detail}",
        description_en=f"HTTP exception: {exc.status_code} {exc.detail}",
        params={},
        result_summary=f"{exc.status_code}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
        extra=extra,
    ))
|
||||
|
||||
|
||||
def record_unhandled_exception(exc: Exception) -> None:
    """Record an ERROR span for an uncaught exception.

    Captures the exception type, message and a stack summary (last 5 lines
    of the formatted traceback). Silently no-ops when no trace is active.
    """
    if (ctx := get_current_trace()) is None:
        return

    # Keep the span small: only the tail of the formatted traceback.
    # (Slicing already handles tracebacks shorter than 5 lines.)
    tb_lines = traceback.format_exception(type(exc), exc, exc.__traceback__)
    stack_summary = "".join(tb_lines[-5:])

    extra = {
        "exception_type": type(exc).__name__,
        "message": str(exc),
        "stack_summary": stack_summary,
        "layer": "exception_handler",
    }
    ctx.add_span(TraceSpan(
        span_type=SpanType.ERROR,
        module="trace.error_handler",
        function="record_unhandled_exception",
        description_zh=f"未捕获异常: {type(exc).__name__}: {exc}",
        description_en=f"Unhandled exception: {type(exc).__name__}: {exc}",
        params={},
        result_summary=type(exc).__name__,
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
        extra=extra,
    ))
|
||||
|
||||
|
||||
def record_db_exception(exc: Exception, sql: str = "") -> None:
    """Record a DB_ERROR span for a database exception.

    Captures the PostgreSQL error code (when available via `pgcode`), the
    message and the triggering SQL. Silently no-ops when no trace is active.
    """
    if (ctx := get_current_trace()) is None:
        return

    # psycopg2 exceptions expose `pgcode`; default to "" for other types.
    pgcode = getattr(exc, "pgcode", None) or ""

    extra = {
        "exception_type": type(exc).__name__,
        "message": str(exc),
        "pgcode": pgcode,
        "sql": sql,
        "layer": "database",
    }
    ctx.add_span(TraceSpan(
        span_type=SpanType.DB_ERROR,
        module="trace.error_handler",
        function="record_db_exception",
        description_zh=f"数据库异常: {type(exc).__name__}: {exc}",
        description_en=f"Database exception: {type(exc).__name__}: {exc}",
        params={},
        result_summary=type(exc).__name__,
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
        extra=extra,
    ))
|
||||
127
apps/backend/app/trace/job_wrapper.py
Normal file
127
apps/backend/app/trace/job_wrapper.py
Normal file
@@ -0,0 +1,127 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
后台 Job 执行追踪包装器
|
||||
|
||||
提供 trace_job() 包装函数,在 lifespan 注册的 job handler 外层包裹,
|
||||
追踪后台任务执行全过程(JOB_START → 内部 span → JOB_END / JOB_ERROR)。
|
||||
|
||||
Job trace 使用独立的 request_id(job_ 前缀),写入同一日志文件。
|
||||
内部的 SERVICE / DB_QUERY span 通过 contextvars 自动关联到 job trace。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import functools
|
||||
import time
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable
|
||||
|
||||
from app.trace.config import get_trace_config
|
||||
from app.trace.context import (
|
||||
SpanType,
|
||||
TraceSpan,
|
||||
create_job_trace,
|
||||
set_current_trace,
|
||||
trace_context_var,
|
||||
)
|
||||
from app.trace.writer import TraceWriter
|
||||
|
||||
|
||||
def trace_job(job_name: str) -> Callable:
    """Decorator: wrap a job handler and trace the whole background-job run.

    Records JOB_START → (inner spans) → JOB_END / JOB_ERROR under a dedicated
    job trace context, then synchronously flushes the trace to the log file.

    Usage::

        @trace_job("task_generator")
        def my_handler(**kwargs):
            ...

    or when registering::

        register_job("task_generator", trace_job("task_generator")(lambda **_kw: task_generator.run()))
    """

    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            cfg = get_trace_config()
            if not cfg.enabled:
                # Tracing disabled: call through with zero overhead.
                return func(*args, **kwargs)

            # Create a dedicated job trace context (request_id has a job_ prefix).
            ctx = create_job_trace(job_name)
            token = set_current_trace(ctx)

            # JOB_START span
            ctx.add_span(TraceSpan(
                span_type=SpanType.JOB_START,
                module="trace.job_wrapper",
                function=f"trace_job({job_name})",
                description_zh=f"后台任务开始: {job_name}",
                description_en=f"Background job started: {job_name}",
                params={"job_name": job_name},
                result_summary="",
                duration_ms=0.0,
                timestamp=datetime.now().isoformat(),
            ))

            t0 = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                duration_ms = (time.perf_counter() - t0) * 1000

                # JOB_END span
                ctx.add_span(TraceSpan(
                    span_type=SpanType.JOB_END,
                    module="trace.job_wrapper",
                    function=f"trace_job({job_name})",
                    description_zh=f"后台任务完成: {job_name}, 耗时={duration_ms:.0f}ms",
                    description_en=f"Background job completed: {job_name}, duration={duration_ms:.0f}ms",
                    params={"job_name": job_name},
                    result_summary=f"completed in {duration_ms:.0f}ms",
                    duration_ms=duration_ms,
                    timestamp=datetime.now().isoformat(),
                ))
                return result

            except Exception as exc:
                duration_ms = (time.perf_counter() - t0) * 1000
                tb_lines = traceback.format_exception(type(exc), exc, exc.__traceback__)
                stack_summary = "".join(tb_lines[-5:])

                # JOB_ERROR span
                ctx.add_span(TraceSpan(
                    span_type=SpanType.JOB_ERROR,
                    module="trace.job_wrapper",
                    function=f"trace_job({job_name})",
                    description_zh=f"后台任务异常: {job_name}, {type(exc).__name__}: {exc}",
                    description_en=f"Background job error: {job_name}, {type(exc).__name__}: {exc}",
                    params={"job_name": job_name, "error_type": type(exc).__name__},
                    result_summary=f"{type(exc).__name__}: {exc}",
                    duration_ms=duration_ms,
                    timestamp=datetime.now().isoformat(),
                    extra={"stack_summary": stack_summary},
                ))
                raise
            finally:
                # Flush synchronously — job handlers run in a sync context.
                try:
                    import json
                    from app.trace.writer import serialize_trace, get_log_file

                    writer = TraceWriter()
                    data = serialize_trace(ctx)
                    line = json.dumps(data, ensure_ascii=False, default=str)
                    dt = ctx.start_time if isinstance(ctx.start_time, datetime) else datetime.now()
                    filepath = get_log_file(dt, base_dir=writer.base_dir)
                    filepath.parent.mkdir(parents=True, exist_ok=True)
                    filepath = writer._rotate_if_needed(filepath)
                    writer._sync_write(filepath, line)
                except Exception:
                    pass  # a logging failure must never affect the job itself
                # FIX: restore the *previous* context via the token instead of
                # clearing with set(None) — consistent with TraceMiddleware and
                # safe when a job runs inside an already-traced context.
                trace_context_var.reset(token)

        return wrapper
    return decorator
|
||||
211
apps/backend/app/trace/middleware.py
Normal file
211
apps/backend/app/trace/middleware.py
Normal file
@@ -0,0 +1,211 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
TraceMiddleware — ASGI 中间件
|
||||
|
||||
拦截 xcx_* 路由前缀(/api/xcx/)的请求,创建 TraceContext 并记录全链路 span。
|
||||
非 xcx 路由直接跳过,不创建 TraceContext。
|
||||
DEV_TRACE_ENABLED 关闭时跳过所有采集。
|
||||
|
||||
记录的 span 类型:
|
||||
- HTTP_IN: 请求进入(method, path, query_params, body_preview)
|
||||
- HTTP_OUT: 请求结束(status_code, duration, body_size)
|
||||
- MIDDLEWARE: ResponseWrapperMiddleware 执行耗时
|
||||
- MIDDLEWARE_ERROR: 响应包装失败时记录
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime
|
||||
from urllib.parse import unquote
|
||||
|
||||
from starlette.types import ASGIApp, Message, Receive, Scope, Send
|
||||
|
||||
from app.trace.config import get_trace_config
|
||||
from app.trace.context import (
|
||||
SpanType,
|
||||
TraceContext,
|
||||
TraceSpan,
|
||||
create_http_trace,
|
||||
get_current_trace,
|
||||
set_current_trace,
|
||||
trace_context_var,
|
||||
)
|
||||
from app.trace.writer import get_trace_writer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# xcx 路由前缀——仅匹配此前缀的请求才采集 trace
|
||||
XCX_PATH_PREFIX = "/api/xcx/"
|
||||
|
||||
|
||||
def _should_trace(path: str) -> bool:
    """Return True when the request path falls under the xcx_* route prefix and should be traced."""
    return path.startswith(XCX_PATH_PREFIX)
|
||||
|
||||
|
||||
class TraceMiddleware:
    """ASGI middleware: full-chain request tracing.

    Execution order (outermost, runs first):
    TraceMiddleware → CORSMiddleware → ResponseWrapperMiddleware → route handler

    Responsibilities:
    1. Check the DEV_TRACE_ENABLED switch (checked per-request, so it can be
       toggled at runtime)
    2. Only intercept requests under the /api/xcx/ prefix
    3. Create a TraceContext and store it in contextvars
    4. Record HTTP_IN / HTTP_OUT / MIDDLEWARE / MIDDLEWARE_ERROR spans
    5. Write X-Request-ID, X-Process-Time, X-DB-Queries, X-DB-Time response headers
    6. Call the TraceWriter to persist the complete trace
    """

    def __init__(self, app: ASGIApp) -> None:
        # The next ASGI application in the chain.
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        # Only handle HTTP requests (websocket/lifespan pass through untouched).
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return

        path = scope.get("path", "")
        config = get_trace_config()

        # Tracing disabled, or not an xcx route → pass straight through.
        if not config.enabled or not _should_trace(path):
            await self.app(scope, receive, send)
            return

        # ── Create the TraceContext ──
        method = scope.get("method", "UNKNOWN")
        # latin-1 with errors="replace" cannot fail, whatever bytes arrive.
        query_string = scope.get("query_string", b"").decode("latin-1", errors="replace")
        ctx = create_http_trace(method, path)
        token = set_current_trace(ctx)

        # Record the HTTP_IN span.
        query_params = {}
        if query_string:
            # Minimal query-string parse into a dict.
            # NOTE(review): repeated keys collapse to the last value — acceptable
            # for trace display purposes.
            for pair in query_string.split("&"):
                if "=" in pair:
                    k, v = pair.split("=", 1)
                    query_params[unquote(k)] = unquote(v)

        ctx.add_span(TraceSpan(
            span_type=SpanType.HTTP_IN,
            module="trace.middleware",
            function="TraceMiddleware.__call__",
            description_zh=f"接收请求 {method} {path}",
            description_en=f"Received request {method} {path}",
            params={"query": query_params},
            result_summary="",
            duration_ms=0.0,
            timestamp=datetime.now().isoformat(),
        ))

        start_time = time.perf_counter()

        # ── Intercept the response: inject trace headers, collect HTTP_OUT data ──
        status_code = 200
        response_body_parts: list[bytes] = []
        middleware_start = time.perf_counter()
        middleware_error: str | None = None

        async def send_wrapper(message: Message) -> None:
            # Wraps the downstream `send`: augments the response-start message
            # with trace headers and accumulates body size for HTTP_OUT.
            nonlocal status_code, middleware_error

            if message["type"] == "http.response.start":
                status_code = message.get("status", 200)

                # Compute trace statistics at header-send time.
                elapsed_ms = (time.perf_counter() - start_time) * 1000
                current_ctx = get_current_trace()
                db_queries = 0
                db_time_ms = 0.0
                if current_ctx:
                    for s in current_ctx.spans:
                        if s.span_type == SpanType.DB_QUERY:
                            db_queries += 1
                            db_time_ms += s.duration_ms

                # Inject the trace response headers.
                headers = list(message.get("headers", []))
                headers.append([b"x-request-id", ctx.request_id.encode()])
                headers.append([b"x-process-time", f"{elapsed_ms:.1f}ms".encode()])
                headers.append([b"x-db-queries", str(db_queries).encode()])
                headers.append([b"x-db-time", f"{db_time_ms:.1f}ms".encode()])
                message = {**message, "headers": headers}

                await send(message)
                return

            if message["type"] == "http.response.body":
                body = message.get("body", b"")
                response_body_parts.append(body)
                await send(message)
                return

            await send(message)

        try:
            await self.app(scope, receive, send_wrapper)
        except Exception as exc:
            # Record the trace even when the inner app raised.
            middleware_error = f"{type(exc).__name__}: {exc}"
            raise
        finally:
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            middleware_elapsed_ms = (time.perf_counter() - middleware_start) * 1000
            body_size = sum(len(p) for p in response_body_parts)

            # Record the MIDDLEWARE span (ResponseWrapperMiddleware timing).
            if middleware_error:
                ctx.add_span(TraceSpan(
                    span_type=SpanType.MIDDLEWARE_ERROR,
                    module="middleware.response_wrapper",
                    function="ResponseWrapperMiddleware.__call__",
                    description_zh=f"响应包装失败: {middleware_error}",
                    description_en=f"Response wrapping failed: {middleware_error}",
                    params={},
                    result_summary=middleware_error,
                    duration_ms=middleware_elapsed_ms,
                    timestamp=datetime.now().isoformat(),
                ))
            else:
                ctx.add_span(TraceSpan(
                    span_type=SpanType.MIDDLEWARE,
                    module="middleware.response_wrapper",
                    function="ResponseWrapperMiddleware.__call__",
                    description_zh="响应包装中间件执行完成",
                    description_en="Response wrapper middleware completed",
                    params={},
                    result_summary=f"body_size={body_size}",
                    duration_ms=middleware_elapsed_ms,
                    timestamp=datetime.now().isoformat(),
                    extra={"body_size": body_size},
                ))

            # Record the HTTP_OUT span.
            ctx.add_span(TraceSpan(
                span_type=SpanType.HTTP_OUT,
                module="trace.middleware",
                function="TraceMiddleware.__call__",
                description_zh=f"响应返回 {status_code},耗时 {elapsed_ms:.0f}ms",
                description_en=f"Response sent {status_code}, took {elapsed_ms:.0f}ms",
                params={},
                result_summary=f"{status_code}, {body_size}B body",
                duration_ms=elapsed_ms,
                timestamp=datetime.now().isoformat(),
                extra={"status_code": status_code, "body_size": body_size},
            ))

            # Persist the trace; failures must not affect the response.
            try:
                writer = get_trace_writer()
                await writer.write_trace(ctx)
            except Exception:
                logger.warning("Trace 日志写入失败", exc_info=True)

            # Restore the previous contextvars state.
            trace_context_var.reset(token)
|
||||
149
apps/backend/app/trace/sse_wrapper.py
Normal file
149
apps/backend/app/trace/sse_wrapper.py
Normal file
@@ -0,0 +1,149 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
SSE 流式响应追踪辅助函数
|
||||
|
||||
提供一组轻量函数,在 SSE event_generator 内部调用,
|
||||
追踪流式响应全过程(SSE_START → AI_CALL → SSE_EVENT → SSE_END / AI_ERROR)。
|
||||
|
||||
所有函数在无活跃 TraceContext 时静默跳过,不影响业务逻辑。
|
||||
SSE_EVENT span 每 10 个 token 记录一次,避免 span 爆炸。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from app.trace.context import (
|
||||
SpanType,
|
||||
TraceSpan,
|
||||
TraceType,
|
||||
get_current_trace,
|
||||
)
|
||||
|
||||
|
||||
def record_sse_start(
    endpoint: str,
    user_id: int | None = None,
    chat_id: str = "",
) -> None:
    """Record an SSE_START span marking the beginning of a stream.

    Also switches the active trace's type to "sse". No-op when there is no
    active TraceContext.
    """
    if (ctx := get_current_trace()) is None:
        return

    # Mark this trace as a streaming one.
    ctx.trace_type = TraceType.SSE

    span = TraceSpan(
        span_type=SpanType.SSE_START,
        module="trace.sse_wrapper",
        function="record_sse_start",
        description_zh=f"SSE 流开始: endpoint={endpoint}, chat_id={chat_id}",
        description_en=f"SSE stream started: endpoint={endpoint}, chat_id={chat_id}",
        params={"endpoint": endpoint, "user_id": user_id, "chat_id": chat_id},
        result_summary="",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    )
    ctx.add_span(span)
|
||||
|
||||
|
||||
def record_ai_call(
    app_id: str,
    prompt_length: int,
    session_id: str = "",
) -> None:
    """Record an AI_CALL span for a DashScope API invocation.

    No-op when there is no active TraceContext.
    """
    if (ctx := get_current_trace()) is None:
        return

    params = {
        "app_id": app_id,
        "prompt_length": prompt_length,
        "session_id": session_id,
    }
    ctx.add_span(TraceSpan(
        span_type=SpanType.AI_CALL,
        module="trace.sse_wrapper",
        function="record_ai_call",
        description_zh=f"AI 调用: app_id={app_id}, prompt 长度={prompt_length}",
        description_en=f"AI call: app_id={app_id}, prompt_length={prompt_length}",
        params=params,
        result_summary="",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
|
||||
|
||||
|
||||
def record_sse_token(token_count: int, total_tokens: int) -> None:
    """Record an SSE_EVENT span, but only once every 10 tokens.

    A span is emitted only when total_tokens is a multiple of 10, which keeps
    the span count bounded for long streams. No-op without an active trace.
    """
    ctx = get_current_trace()
    if ctx is None or total_tokens % 10 != 0:
        return

    ctx.add_span(TraceSpan(
        span_type=SpanType.SSE_EVENT,
        module="trace.sse_wrapper",
        function="record_sse_token",
        description_zh=f"SSE token 流: 本批 {token_count} token, 累计 {total_tokens}",
        description_en=f"SSE token stream: batch {token_count}, cumulative {total_tokens}",
        params={"token_count": token_count, "total_tokens": total_tokens},
        result_summary=f"cumulative={total_tokens}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
|
||||
|
||||
|
||||
def record_sse_end(
    total_tokens: int,
    total_duration_ms: float,
    completed: bool = True,
) -> None:
    """Record an SSE_END span marking the end of a stream.

    No-op when there is no active TraceContext.
    """
    if (ctx := get_current_trace()) is None:
        return

    params = {
        "total_tokens": total_tokens,
        "total_duration_ms": total_duration_ms,
        "completed": completed,
    }
    ctx.add_span(TraceSpan(
        span_type=SpanType.SSE_END,
        module="trace.sse_wrapper",
        function="record_sse_end",
        description_zh=f"SSE 流结束: 总 token={total_tokens}, 耗时={total_duration_ms:.0f}ms, 完成={completed}",
        description_en=f"SSE stream ended: total_tokens={total_tokens}, duration={total_duration_ms:.0f}ms, completed={completed}",
        params=params,
        result_summary=f"tokens={total_tokens}, completed={completed}",
        duration_ms=total_duration_ms,
        timestamp=datetime.now().isoformat(),
    ))
|
||||
|
||||
|
||||
def record_ai_error(error_type: str, message: str) -> None:
    """Record an AI_ERROR span for a failed AI call.

    No-op when there is no active TraceContext.
    """
    if (ctx := get_current_trace()) is None:
        return

    ctx.add_span(TraceSpan(
        span_type=SpanType.AI_ERROR,
        module="trace.sse_wrapper",
        function="record_ai_error",
        description_zh=f"AI 调用失败: {error_type} - {message}",
        description_en=f"AI call failed: {error_type} - {message}",
        params={"error_type": error_type, "message": message},
        result_summary=f"{error_type}: {message}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
|
||||
301
apps/backend/app/trace/writer.py
Normal file
301
apps/backend/app/trace/writer.py
Normal file
@@ -0,0 +1,301 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
JSON Lines 日志写入器
|
||||
|
||||
负责将 TraceContext 序列化为 JSON 并追加写入 .jsonl 文件。
|
||||
按日期分目录(YYYY-MM-DD/),按小时分文件(trace_YYYY-MM-DD_HH.jsonl),
|
||||
单文件超过 10MB 自动轮转(trace_YYYY-MM-DD_HH_NNN.jsonl)。
|
||||
维护 _index.json 索引文件(文件列表、记录数、文件大小)。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.trace.config import get_trace_config
|
||||
from app.trace.context import TraceContext, TraceSpan, SpanType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# 单文件大小上限(10MB)
|
||||
MAX_FILE_SIZE = 10 * 1024 * 1024
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# 序列化函数
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def serialize_span(span: TraceSpan) -> dict[str, Any]:
    """Convert a TraceSpan into a JSON-serializable dict.

    Includes every required field: span_type, module, function,
    description_zh, description_en, params, result_summary, duration_ms,
    timestamp, extra.
    """
    field_names = (
        "span_type",
        "module",
        "function",
        "description_zh",
        "description_en",
        "params",
        "result_summary",
        "duration_ms",
        "timestamp",
        "extra",
    )
    return {name: getattr(span, name) for name in field_names}
|
||||
|
||||
|
||||
def serialize_trace(ctx: TraceContext) -> dict[str, Any]:
    """Convert a TraceContext into a JSON-serializable dict.

    Includes: request_id, trace_type, timestamp, method, path, status_code,
    total_duration_ms, user_id, site_id, db_query_count, db_total_ms,
    error, spans.

    Derived fields are computed from the span list:
    - status_code: from the HTTP_OUT span's extra, falling back to parsing
      the leading number of its result_summary
    - total_duration_ms: from the HTTP_OUT span's duration_ms
    - db_query_count: number of DB_QUERY spans
    - db_total_ms: sum of DB_QUERY span durations
    - error: taken from the first ERROR/DB_ERROR span
    """
    spans = ctx.spans

    # Derived: status_code / total_duration_ms — from the first HTTP_OUT span.
    status_code: int | None = None
    total_duration_ms: float = 0.0
    for s in spans:
        if s.span_type == SpanType.HTTP_OUT:
            # Prefer the structured value in extra.
            status_code = s.extra.get("status_code")
            if status_code is None:
                # Fallback: parse the leading number of result_summary.
                # FIX: strip trailing punctuation — the middleware writes
                # summaries like "200, 1234B body", whose first token is
                # "200," and would fail a bare isdigit() check.
                tokens = (s.result_summary or "").split()
                if tokens:
                    code_text = tokens[0].rstrip(",;:")
                    if code_text.isdigit():
                        status_code = int(code_text)
            total_duration_ms = s.duration_ms
            break

    # Derived: db_query_count, db_total_ms.
    db_query_count = 0
    db_total_ms = 0.0
    for s in spans:
        if s.span_type == SpanType.DB_QUERY:
            db_query_count += 1
            db_total_ms += s.duration_ms

    # Derived: error — from the first ERROR/DB_ERROR span.
    error: str | None = None
    for s in spans:
        if s.span_type in (SpanType.ERROR, SpanType.DB_ERROR):
            # Prefer result_summary, then the zh/en descriptions.
            error = s.result_summary or s.description_zh or s.description_en
            break

    return {
        "request_id": ctx.request_id,
        "trace_type": ctx.trace_type,
        "timestamp": ctx.start_time.isoformat() if isinstance(ctx.start_time, datetime) else str(ctx.start_time),
        "method": ctx.method,
        "path": ctx.path,
        "status_code": status_code,
        "total_duration_ms": total_duration_ms,
        "user_id": ctx.user_id,
        "site_id": ctx.site_id,
        "db_query_count": db_query_count,
        "db_total_ms": round(db_total_ms, 2),
        "error": error,
        "spans": [serialize_span(s) for s in spans],
    }
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# 路径生成函数
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def get_log_dir(dt: datetime, base_dir: str | None = None) -> Path:
    """Return the per-date directory path: {base_dir}/{YYYY-MM-DD}/.

    Args:
        dt: Timestamp that determines the date directory name.
        base_dir: Log root directory; defaults to the TraceConfig value.
    """
    root = base_dir if base_dir is not None else get_trace_config().log_dir
    return Path(root) / dt.strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def get_log_file(dt: datetime, base_dir: str | None = None) -> Path:
    """Return the current hour's log file path.

    Shape: {base_dir}/{YYYY-MM-DD}/trace_YYYY-MM-DD_HH.jsonl

    Args:
        dt: Timestamp that determines both the date directory and hour file.
        base_dir: Log root directory; defaults to the TraceConfig value.
    """
    return get_log_dir(dt, base_dir=base_dir) / f"trace_{dt.strftime('%Y-%m-%d_%H')}.jsonl"
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# TraceWriter 类
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
class TraceWriter:
|
||||
"""JSON Lines 日志写入器。
|
||||
|
||||
负责将 TraceContext 序列化为 JSON 并追加写入 .jsonl 文件。
|
||||
支持按日期分目录、按小时分文件、超 10MB 自动轮转、索引维护。
|
||||
"""
|
||||
|
||||
def __init__(self, base_dir: str | None = None):
    """Initialize the writer.

    Args:
        base_dir: Log root directory override; when None, the directory is
            resolved lazily from TraceConfig via the base_dir property.
    """
    self._base_dir = base_dir
|
||||
|
||||
@property
def base_dir(self) -> str:
    """Log root directory: the explicit override if set, else TraceConfig.log_dir."""
    if self._base_dir is not None:
        return self._base_dir
    return get_trace_config().log_dir
|
||||
|
||||
def _rotate_if_needed(self, filepath: Path) -> Path:
    """Check the file size and return a rotated path when necessary.

    A missing file, or one under the 10MB limit, is returned unchanged.
    Otherwise a numbered sibling (trace_YYYY-MM-DD_HH_NNN.jsonl) that still
    has room is returned.
    """
    if not filepath.exists() or filepath.stat().st_size < MAX_FILE_SIZE:
        return filepath

    # Rotation needed. If the current name already carries a 3-digit
    # rotation suffix (e.g. trace_2026-03-22_14_001), strip it back to the
    # base name first.
    parts = filepath.stem.rsplit("_", 1)
    has_seq_suffix = len(parts) == 2 and parts[1].isdigit() and len(parts[1]) == 3
    base_stem = parts[0] if has_seq_suffix else filepath.stem

    # Walk the sequence numbers until we find a file that is absent or
    # still under the size limit.
    seq = 1
    while True:
        candidate = filepath.parent / f"{base_stem}_{seq:03d}.jsonl"
        if not candidate.exists() or candidate.stat().st_size < MAX_FILE_SIZE:
            return candidate
        seq += 1
|
||||
|
||||
def _update_index(self, date_dir: Path, filename: str, record_count_delta: int) -> None:
|
||||
"""更新索引文件 _index.json。
|
||||
|
||||
索引结构:
|
||||
{
|
||||
"files": {
|
||||
"trace_2026-03-22_14.jsonl": {
|
||||
"record_count": 42,
|
||||
"file_size": 12345
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
index_path = date_dir / "_index.json"
|
||||
|
||||
# 读取现有索引
|
||||
index: dict[str, Any] = {"files": {}}
|
||||
if index_path.exists():
|
||||
try:
|
||||
index = json.loads(index_path.read_text(encoding="utf-8"))
|
||||
except (json.JSONDecodeError, OSError):
|
||||
index = {"files": {}}
|
||||
|
||||
if "files" not in index:
|
||||
index["files"] = {}
|
||||
|
||||
# 更新文件条目
|
||||
file_entry = index["files"].get(filename, {"record_count": 0, "file_size": 0})
|
||||
file_entry["record_count"] = file_entry.get("record_count", 0) + record_count_delta
|
||||
|
||||
# 更新实际文件大小
|
||||
file_path = date_dir / filename
|
||||
if file_path.exists():
|
||||
file_entry["file_size"] = file_path.stat().st_size
|
||||
else:
|
||||
file_entry["file_size"] = 0
|
||||
|
||||
index["files"][filename] = file_entry
|
||||
|
||||
# 写回索引
|
||||
index_path.write_text(
|
||||
json.dumps(index, ensure_ascii=False, indent=2),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
async def write_trace(self, ctx: TraceContext) -> None:
|
||||
"""异步写入 TraceContext 到 JSON Lines 文件。
|
||||
|
||||
写入失败时静默处理(仅记录日志),不影响请求处理。
|
||||
"""
|
||||
try:
|
||||
# 序列化
|
||||
data = serialize_trace(ctx)
|
||||
line = json.dumps(data, ensure_ascii=False, default=str)
|
||||
|
||||
# 确定写入路径
|
||||
dt = ctx.start_time if isinstance(ctx.start_time, datetime) else datetime.now()
|
||||
filepath = get_log_file(dt, base_dir=self.base_dir)
|
||||
|
||||
# 确保目录存在
|
||||
filepath.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 检查是否需要轮转
|
||||
filepath = self._rotate_if_needed(filepath)
|
||||
|
||||
# 异步写入(在线程池中执行 IO 操作)
|
||||
await asyncio.to_thread(self._sync_write, filepath, line)
|
||||
|
||||
except Exception:
|
||||
# 写入失败不影响请求处理,仅记录日志
|
||||
logger.warning("Trace 日志写入失败", exc_info=True)
|
||||
|
||||
def _sync_write(self, filepath: Path, line: str) -> None:
|
||||
"""同步写入一行 JSON 到文件,并更新索引。"""
|
||||
# 追加写入
|
||||
with open(filepath, "a", encoding="utf-8") as f:
|
||||
f.write(line + "\n")
|
||||
|
||||
# 更新索引
|
||||
date_dir = filepath.parent
|
||||
filename = filepath.name
|
||||
self._update_index(date_dir, filename, record_count_delta=1)
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
# Singleton access
# ──────────────────────────────────────────────
|
||||
|
||||
_writer_instance: TraceWriter | None = None


def get_trace_writer() -> TraceWriter:
    """Return the process-wide TraceWriter singleton, creating it lazily."""
    global _writer_instance
    if _writer_instance is not None:
        return _writer_instance
    _writer_instance = TraceWriter()
    return _writer_instance
|
||||
129
apps/backend/app/trace/ws_wrapper.py
Normal file
129
apps/backend/app/trace/ws_wrapper.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# -*- coding: utf-8 -*-
"""
WebSocket connection tracing helpers.

A set of lightweight functions invoked inside WebSocket endpoints to trace
the full connection lifecycle (WS_CONNECT → WS_MESSAGE → WS_DISCONNECT).

All functions silently no-op when there is no active TraceContext, so they
never interfere with business logic. WS_MESSAGE spans are recorded once
every 5 messages to avoid span explosion.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from app.trace.context import (
|
||||
SpanType,
|
||||
TraceContext,
|
||||
TraceSpan,
|
||||
TraceType,
|
||||
create_ws_trace,
|
||||
get_current_trace,
|
||||
set_current_trace,
|
||||
)
|
||||
from app.trace.config import get_trace_config
|
||||
from app.trace.writer import TraceWriter
|
||||
|
||||
|
||||
def ws_trace_connect(
    execution_id: str,
    client_info: str = "",
) -> TraceContext | None:
    """Create a WS TraceContext and record a WS_CONNECT span.

    Returns the created TraceContext (for subsequent spans to attach to),
    or None when tracing is disabled.
    """
    if not get_trace_config().enabled:
        return None

    trace_ctx = create_ws_trace()
    set_current_trace(trace_ctx)

    connect_span = TraceSpan(
        span_type=SpanType.WS_CONNECT,
        module="trace.ws_wrapper",
        function="ws_trace_connect",
        description_zh=f"WebSocket 连接建立: execution_id={execution_id}",
        description_en=f"WebSocket connected: execution_id={execution_id}",
        params={"execution_id": execution_id, "client_info": client_info},
        result_summary="connected",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    )
    trace_ctx.add_span(connect_span)
    return trace_ctx
|
||||
|
||||
|
||||
def ws_trace_message(
    message_count: int,
    total_bytes: int,
) -> None:
    """Record a WS_MESSAGE span, sampled once every 5 messages.

    Only emits a span when message_count is a multiple of 5, keeping the
    span count bounded on chatty connections.
    """
    trace_ctx = get_current_trace()
    if trace_ctx is None:
        return

    # Sample: one span per 5 messages.
    if message_count % 5:
        return

    sampled_span = TraceSpan(
        span_type=SpanType.WS_MESSAGE,
        module="trace.ws_wrapper",
        function="ws_trace_message",
        description_zh=f"WS 消息推送: 累计 {message_count} 条, {total_bytes} 字节",
        description_en=f"WS message push: cumulative {message_count} msgs, {total_bytes} bytes",
        params={"message_count": message_count, "total_bytes": total_bytes},
        result_summary=f"msgs={message_count}, bytes={total_bytes}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    )
    trace_ctx.add_span(sampled_span)
|
||||
|
||||
|
||||
def ws_trace_disconnect(
    reason: str,
    total_messages: int,
    total_duration_ms: float,
) -> None:
    """Record a WS_DISCONNECT span and flush the trace to the log file.

    Args:
        reason: Human-readable disconnect reason.
        total_messages: Total number of messages pushed on this connection.
        total_duration_ms: Total connection lifetime in milliseconds.

    Silently no-ops when there is no active TraceContext; log-write failures
    are swallowed so they never affect business logic.
    """
    ctx = get_current_trace()
    if ctx is None:
        return

    ctx.add_span(TraceSpan(
        span_type=SpanType.WS_DISCONNECT,
        module="trace.ws_wrapper",
        function="ws_trace_disconnect",
        description_zh=f"WebSocket 断开: 原因={reason}, 总消息={total_messages}, 耗时={total_duration_ms:.0f}ms",
        description_en=f"WebSocket disconnected: reason={reason}, total_msgs={total_messages}, duration={total_duration_ms:.0f}ms",
        params={
            "reason": reason,
            "total_messages": total_messages,
            "total_duration_ms": total_duration_ms,
        },
        result_summary=f"msgs={total_messages}, reason={reason}",
        duration_ms=total_duration_ms,
        timestamp=datetime.now().isoformat(),
    ))

    # Flush synchronously: ws_trace_disconnect may be called from a
    # synchronous context where awaiting TraceWriter.write_trace() is not
    # possible, so the writer's internal helpers are reused directly.
    try:
        import json

        from app.trace.writer import get_log_file, serialize_trace

        writer = TraceWriter()
        payload = json.dumps(serialize_trace(ctx), ensure_ascii=False, default=str)
        dt = ctx.start_time if isinstance(ctx.start_time, datetime) else datetime.now()
        filepath = get_log_file(dt, base_dir=writer.base_dir)
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath = writer._rotate_if_needed(filepath)
        writer._sync_write(filepath, payload)
    except Exception:
        # Best-effort by design: a failed trace write must never break the
        # business flow, so all errors are deliberately swallowed here.
        pass
|
||||
Reference in New Issue
Block a user