feat: 累积功能变更 — 聊天集成、租户管理、小程序更新、ETL 增强、迁移脚本

包含多个会话的累积代码变更:
- backend: AI 聊天服务、触发器调度、认证增强、WebSocket、调度器最小间隔
- admin-web: ETL 状态页、任务管理、调度配置、登录优化
- miniprogram: 看板页面、聊天集成、UI 组件、导航更新
- etl: DWS 新任务(finance_area_daily/board_cache)、连接器增强
- tenant-admin: 项目初始化
- db: 19 个迁移脚本(etl_feiqiu 11 + zqyy_app 8)
- packages/shared: 枚举和工具函数更新
- tools: 数据库工具、报表生成、健康检查
- docs: PRD/架构/部署/合约文档更新

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Neo
2026-04-06 00:03:48 +08:00
parent 70324d8542
commit 6f8f12314f
515 changed files with 76604 additions and 7456 deletions

View File

@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
"""
Dev Trace Log — 全链路请求追踪模块
仅在开发/测试环境启用,生产环境通过 DEV_TRACE_ENABLED=false 关闭。
"""
from app.trace.config import TraceConfig, get_trace_config
__all__ = ["TraceConfig", "get_trace_config"]

View File

@@ -0,0 +1,277 @@
# -*- coding: utf-8 -*-
"""
日志自动清理模块
提供同步清理函数和定时任务调度器:
- cleanup_old_logs: 保留最新 N 个日期目录(按记录日期排序,非日历天数),
超出总量上限时额外清理
- cleanup_date_range: 按日期范围清理指定目录
- schedule_daily_cleanup: 返回 async 函数,每天凌晨 2:00 执行清理
"""
from __future__ import annotations
import asyncio
import json
import logging
import shutil
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any
from app.trace.config import get_trace_config
# Module-level logger for cleanup diagnostics.
logger = logging.getLogger(__name__)
# Naming format of the per-day log directories (e.g. "2026-03-21").
_DATE_FORMAT = "%Y-%m-%d"
def _is_date_dir(name: str) -> bool:
    """Return True if *name* parses as a YYYY-MM-DD date-directory name."""
    try:
        datetime.strptime(name, _DATE_FORMAT)
    except ValueError:
        return False
    return True
def _parse_date(name: str) -> datetime:
    """Parse a YYYY-MM-DD directory name into a ``datetime``."""
    return datetime.strptime(name, _DATE_FORMAT)
def _dir_size(path: Path) -> int:
    """Return the combined size in bytes of every file under *path*, recursively.

    Best effort: if the traversal raises OSError, the partial total
    accumulated so far is returned.
    """
    size_bytes = 0
    try:
        for entry in path.rglob("*"):
            if not entry.is_file():
                continue
            size_bytes += entry.stat().st_size
    except OSError:
        # Keep whatever was accumulated before the failure.
        pass
    return size_bytes
def _update_root_index(base_dir: Path, deleted_dirs: list[str]) -> None:
    """Remove references to deleted date directories from the root ``_index.json``.

    Two index layouts are handled:

    * nested — ``{"dates": {"2026-03-20": {...}, "2026-03-21": {...}}}``
    * flat   — date names used directly as top-level keys

    Missing or unreadable index files are ignored (best effort).
    """
    index_file = base_dir / "_index.json"
    if not index_file.exists():
        return
    try:
        index_data: dict[str, Any] = json.loads(index_file.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        return
    dirty = False
    # Nested layout: prune entries inside the "dates" mapping.
    dates_map = index_data.get("dates")
    if isinstance(dates_map, dict):
        for name in deleted_dirs:
            if name in dates_map:
                del dates_map[name]
                dirty = True
    # Flat layout compatibility: dates stored directly as top-level keys.
    for name in deleted_dirs:
        if name != "dates" and name in index_data:
            del index_data[name]
            dirty = True
    if dirty:
        index_file.write_text(
            json.dumps(index_data, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
def cleanup_old_logs(
    base_dir: str | None = None,
    retention_days: int | None = None,
    max_total_mb: int = 200,
) -> dict[str, Any]:
    """Keep the newest N date directories and enforce a total-size cap.

    Strategy: date directories are sorted by name (i.e. by date, not by
    calendar age), the newest ``retention_days`` of them are kept regardless
    of gaps, and everything older is removed. If the survivors still exceed
    ``max_total_mb``, additional directories are deleted oldest-first until
    the total drops below the cap.

    Args:
        base_dir: Log root directory; defaults to ``TraceConfig.log_dir``.
        retention_days: Number of date directories to keep; defaults to
            ``TraceConfig.retention_days``.
        max_total_mb: Total-size cap in MB; exceeding it triggers extra
            oldest-first cleanup.

    Returns:
        Result dict::

            {
                "deleted_dirs": ["2026-03-15", ...],
                "deleted_count": 2,
                "freed_bytes": 12345
            }
    """
    # Fetch the global config only when a default is actually needed, so the
    # function has no config dependency when called with explicit arguments.
    if base_dir is None or retention_days is None:
        cfg = get_trace_config()
        if base_dir is None:
            base_dir = cfg.log_dir
        if retention_days is None:
            retention_days = cfg.retention_days
    base_path = Path(base_dir)
    if not base_path.exists():
        return {"deleted_dirs": [], "deleted_count": 0, "freed_bytes": 0}
    # Collect date directories; sorting by name equals ascending date order.
    date_dirs = sorted(
        [d for d in base_path.iterdir() if d.is_dir() and _is_date_dir(d.name)],
        key=lambda d: d.name,
    )
    deleted_dirs: list[str] = []
    freed_bytes = 0
    # Step 1: keep the newest ``retention_days`` directories, drop the rest.
    if len(date_dirs) > retention_days:
        to_delete = date_dirs[: len(date_dirs) - retention_days]
        for entry in to_delete:
            size = _dir_size(entry)
            try:
                shutil.rmtree(entry)
                deleted_dirs.append(entry.name)
                freed_bytes += size
            except OSError:
                logger.warning("清理日期目录失败: %s", entry, exc_info=True)
        # Trim to the surviving tail.
        date_dirs = date_dirs[len(date_dirs) - retention_days :]
    # Step 2: total-size cap. Compute each directory's size once and cache it
    # so survivors are not walked a second time inside the deletion loop.
    max_bytes = max_total_mb * 1024 * 1024
    sizes = {d: _dir_size(d) for d in date_dirs}
    total_size = sum(sizes.values())
    if total_size > max_bytes:
        logger.warning(
            "日志总量 %.1f MB 超过上限 %d MB启动额外清理",
            total_size / (1024 * 1024),
            max_total_mb,
        )
        for entry in list(date_dirs):
            if total_size <= max_bytes:
                break
            size = sizes[entry]
            try:
                shutil.rmtree(entry)
                deleted_dirs.append(entry.name)
                freed_bytes += size
                total_size -= size
                date_dirs.remove(entry)
            except OSError:
                logger.warning("清理日期目录失败: %s", entry, exc_info=True)
    # Keep the root _index.json consistent with what was removed.
    if deleted_dirs:
        _update_root_index(base_path, deleted_dirs)
    return {
        "deleted_dirs": deleted_dirs,
        "deleted_count": len(deleted_dirs),
        "freed_bytes": freed_bytes,
    }
def cleanup_date_range(
    start_date: str,
    end_date: str,
    base_dir: str | None = None,
) -> dict[str, Any]:
    """Delete every date directory whose date falls in [start_date, end_date].

    Args:
        start_date: Inclusive range start, formatted YYYY-MM-DD.
        end_date: Inclusive range end, formatted YYYY-MM-DD.
        base_dir: Log root directory; defaults to ``TraceConfig.log_dir``.

    Returns:
        Result dict with the same shape as ``cleanup_old_logs``.
    """
    root = Path(base_dir if base_dir is not None else get_trace_config().log_dir)
    if not root.exists():
        return {"deleted_dirs": [], "deleted_count": 0, "freed_bytes": 0}
    range_start = datetime.strptime(start_date, _DATE_FORMAT)
    range_end = datetime.strptime(end_date, _DATE_FORMAT)
    removed: list[str] = []
    reclaimed = 0
    candidates = [
        entry
        for entry in sorted(root.iterdir())
        if entry.is_dir() and _is_date_dir(entry.name)
    ]
    for entry in candidates:
        if not (range_start <= _parse_date(entry.name) <= range_end):
            continue
        size = _dir_size(entry)
        try:
            shutil.rmtree(entry)
            removed.append(entry.name)
            reclaimed += size
        except OSError:
            logger.warning("清理日期目录失败: %s", entry, exc_info=True)
    if removed:
        _update_root_index(root, removed)
    return {
        "deleted_dirs": removed,
        "deleted_count": len(removed),
        "freed_bytes": reclaimed,
    }
def schedule_daily_cleanup():
    """Return an async function suitable for registration in the app lifespan.

    The returned coroutine function runs ``cleanup_old_logs`` every day at
    02:00 local time, using an ``asyncio.sleep`` loop for scheduling.
    """
    async def _daily_cleanup_loop() -> None:
        """Scheduling loop: sleep until the next 02:00, run cleanup, repeat."""
        while True:
            now = datetime.now()
            # Next 02:00 local time; if today's slot already passed, use tomorrow's.
            target = now.replace(hour=2, minute=0, second=0, microsecond=0)
            if now >= target:
                target += timedelta(days=1)
            wait_seconds = (target - now).total_seconds()
            # Fix: the original format string had an unbalanced "(" — closed here.
            logger.info("日志清理定时任务:将在 %.0f 秒后执行(%s)", wait_seconds, target.isoformat())
            await asyncio.sleep(wait_seconds)
            try:
                result = cleanup_old_logs()
                if result["deleted_count"] > 0:
                    logger.info(
                        "日志自动清理完成:删除 %d 个目录,释放 %d 字节",
                        result["deleted_count"],
                        result["freed_bytes"],
                    )
                else:
                    logger.debug("日志自动清理:无过期目录需要清理")
            except Exception:
                # Never let a cleanup failure kill the scheduling loop.
                logger.warning("日志自动清理失败", exc_info=True)
    return _daily_cleanup_loop

View File

@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
"""
Dev Trace Log 配置模块
从环境变量读取 trace 相关配置,支持运行时动态修改(内存状态)。
重启后回退到 .env 配置值。使用单例模式,全局唯一实例。
环境变量缺失时使用合理默认值:
- DEV_TRACE_ENABLED: false默认关闭显式开启才采集
- DEV_TRACE_LOG_DIR: export/dev-trace-logs
- DEV_TRACE_LOG_RETENTION_DAYS: 7
- DEV_TRACE_LOG_SQL: true
- DEV_TRACE_LOG_PARAMS: true
"""
from __future__ import annotations
import os
import threading
from typing import Any
def _parse_bool(value: str, default: bool) -> bool:
    """Parse a boolean env var; true/1/yes (case-insensitive) mean True.

    Empty values fall back to *default*.
    """
    if value:
        return value.strip().lower() in {"true", "1", "yes"}
    return default
class TraceConfig:
    """Full-chain trace logging configuration (process-wide singleton).

    Attributes are initialised from environment variables and may be changed
    at runtime via update_settings(). A process restart reverts everything to
    the .env values because _init_from_env() re-reads the environment.
    """
    _instance: TraceConfig | None = None
    _lock: threading.Lock = threading.Lock()

    def __new__(cls) -> TraceConfig:
        # Double-checked locking: thread-safe creation without taking the
        # lock on every access once the singleton exists.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    obj = super().__new__(cls)
                    obj._init_from_env()
                    cls._instance = obj
        return cls._instance

    def _init_from_env(self) -> None:
        """Read the environment; values serve as baseline and runtime state."""
        env = os.environ
        # Baseline (.env) values; reset_to_env() restores these.
        self._env_enabled: bool = _parse_bool(
            env.get("DEV_TRACE_ENABLED", ""), default=False
        )
        log_dir_value: str = env.get("DEV_TRACE_LOG_DIR", "export/dev-trace-logs")
        # CHANGE 2026-03-22 | trace log path fix:
        # resolve relative paths against the project root so the backend's
        # cwd (apps/backend/) does not affect where logs land.
        if not os.path.isabs(log_dir_value):
            project_root = env.get("NEOZQYY_ROOT", "")
            if project_root:
                log_dir_value = os.path.join(project_root, log_dir_value)
        self._env_log_dir: str = log_dir_value
        self._env_retention_days: int = int(
            env.get("DEV_TRACE_LOG_RETENTION_DAYS", "7")
        )
        self._env_log_sql: bool = _parse_bool(
            env.get("DEV_TRACE_LOG_SQL", ""), default=True
        )
        self._env_log_params: bool = _parse_bool(
            env.get("DEV_TRACE_LOG_PARAMS", ""), default=True
        )
        # Runtime values start at the baseline; the API may mutate them later.
        self.reset_to_env()

    # ------------------------------------------------------------------
    # Runtime switches
    # ------------------------------------------------------------------
    def update_settings(
        self,
        *,
        enabled: bool | None = None,
        retention_days: int | None = None,
        log_sql: bool | None = None,
        log_params: bool | None = None,
    ) -> None:
        """Update in-memory runtime settings (.env untouched; lost on restart)."""
        if enabled is not None:
            self.enabled = enabled
        if retention_days is not None:
            self.retention_days = retention_days
        if log_sql is not None:
            self.log_sql = log_sql
        if log_params is not None:
            self.log_params = log_params

    def get_settings(self) -> dict[str, Any]:
        """Return the current runtime settings as a plain dict."""
        return {
            "enabled": self.enabled,
            "log_dir": self.log_dir,
            "retention_days": self.retention_days,
            "log_sql": self.log_sql,
            "log_params": self.log_params,
        }

    def reset_to_env(self) -> None:
        """Restore all runtime values to the .env baseline."""
        self.enabled = self._env_enabled
        self.log_dir = self._env_log_dir
        self.retention_days = self._env_retention_days
        self.log_sql = self._env_log_sql
        self.log_params = self._env_log_params

    # ------------------------------------------------------------------
    # Test helper: discard the singleton (tests only)
    # ------------------------------------------------------------------
    @classmethod
    def _reset_singleton(cls) -> None:
        """Drop the singleton; the next access re-reads the environment."""
        with cls._lock:
            cls._instance = None
def get_trace_config() -> TraceConfig:
    """Return the global TraceConfig singleton."""
    return TraceConfig()

View File

@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""
TraceContext 与 TraceSpan 数据模型
基于 contextvars 的请求级追踪上下文,存储 request_id 和有序 span 列表。
支持 HTTP / SSE / WebSocket / 后台 Job 四种 trace 类型,
每种类型使用不同的 request_id 前缀以便区分。
"""
from __future__ import annotations
import contextvars
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any
# ──────────────────────────────────────────────
# Span 类型常量(字符串常量类,不用 Enum
# ──────────────────────────────────────────────
class SpanType:
    """String constants for every supported span type (plain class, not an Enum)."""
    # Request handling layers
    HTTP_IN = "HTTP_IN"
    AUTH = "AUTH"
    ROUTE = "ROUTE"
    SERVICE = "SERVICE"
    # Database lifecycle
    DB_QUERY = "DB_QUERY"
    DB_CONN = "DB_CONN"
    DB_CONN_RELEASE = "DB_CONN_RELEASE"
    # Outbound HTTP
    HTTP_OUT = "HTTP_OUT"
    # Errors
    ERROR = "ERROR"
    DB_ERROR = "DB_ERROR"
    # Middleware
    MIDDLEWARE = "MIDDLEWARE"
    MIDDLEWARE_ERROR = "MIDDLEWARE_ERROR"
    # Server-sent events
    SSE_START = "SSE_START"
    SSE_EVENT = "SSE_EVENT"
    SSE_END = "SSE_END"
    # AI model calls
    AI_CALL = "AI_CALL"
    AI_STREAM = "AI_STREAM"
    AI_ERROR = "AI_ERROR"
    # WebSocket lifecycle
    WS_CONNECT = "WS_CONNECT"
    WS_MESSAGE = "WS_MESSAGE"
    WS_DISCONNECT = "WS_DISCONNECT"
    # Background jobs
    JOB_START = "JOB_START"
    JOB_END = "JOB_END"
    JOB_ERROR = "JOB_ERROR"
# ──────────────────────────────────────────────
# Trace 类型常量
# ──────────────────────────────────────────────
class TraceType:
    """Trace categories: HTTP / SSE / WebSocket / background job."""
    HTTP = "http"
    SSE = "sse"
    WS = "ws"
    JOB = "job"
# ──────────────────────────────────────────────
# 数据模型
# ──────────────────────────────────────────────
@dataclass
class TraceSpan:
    """A single trace node recording one call at some layer of the stack."""
    span_type: str  # one of the SpanType constants
    module: str  # module path, e.g. "xcx_tasks"
    function: str  # function name, e.g. "get_task_list"
    description_zh: str  # Chinese description
    description_en: str  # English description
    params: dict[str, Any]  # call arguments
    result_summary: str  # short summary of the result
    duration_ms: float  # elapsed time in milliseconds
    timestamp: str  # ISO-format timestamp
    extra: dict[str, Any] = field(default_factory=dict)  # extra info (SQL text, etc.)
@dataclass
class TraceContext:
    """Per-request trace context: a request_id plus an ordered span list."""
    request_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    trace_type: str = TraceType.HTTP  # http / sse / ws / job
    start_time: datetime = field(default_factory=datetime.now)
    method: str = ""
    path: str = ""
    user_id: int | None = None
    site_id: int | None = None
    spans: list[TraceSpan] = field(default_factory=list)
    def add_span(self, span: TraceSpan) -> None:
        """Append a span to the list, preserving insertion order."""
        self.spans.append(span)
# ──────────────────────────────────────────────
# contextvars 存储
# ──────────────────────────────────────────────
# Context variable holding the active TraceContext for the current task or
# request; None when no trace is active.
trace_context_var: contextvars.ContextVar[TraceContext | None] = contextvars.ContextVar(
    "trace_context", default=None
)
def get_current_trace() -> TraceContext | None:
    """Return the current request's TraceContext, or None if none is active."""
    return trace_context_var.get()
def set_current_trace(ctx: TraceContext) -> contextvars.Token:
    """Install *ctx* as the current TraceContext; the Token allows later restore."""
    return trace_context_var.set(ctx)
# ──────────────────────────────────────────────
# 工厂函数:创建不同类型的 TraceContext
# ──────────────────────────────────────────────
def create_http_trace(method: str, path: str) -> TraceContext:
    """Create a TraceContext for an HTTP request (request_id = 12-char uuid hex)."""
    # The dataclass default factory already produces uuid4().hex[:12].
    ctx = TraceContext(trace_type=TraceType.HTTP, method=method, path=path)
    return ctx
def create_ws_trace() -> TraceContext:
    """Create a TraceContext for a WebSocket connection (request_id prefix "ws_")."""
    ws_id = "ws_" + uuid.uuid4().hex[:12]
    return TraceContext(request_id=ws_id, trace_type=TraceType.WS)
def create_job_trace(job_name: str) -> TraceContext:
    """Create a TraceContext for a background job (request_id prefix "job_")."""
    job_id = "job_" + uuid.uuid4().hex[:12]
    return TraceContext(request_id=job_id, trace_type=TraceType.JOB, path=job_name)

View File

@@ -0,0 +1,240 @@
# -*- coding: utf-8 -*-
"""
Trace 覆盖率扫描器
通过文件内容搜索grep 式)检测 trace 覆盖情况,不使用 AST 解析。
扫描维度路由覆盖、Service 覆盖、Job 覆盖、SSE/WS 覆盖。
扫描结果缓存在内存中,支持手动刷新。
"""
from __future__ import annotations
import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
# Backend "app" directory (resolved relative to this file, i.e. the project tree).
_APP_DIR = Path(__file__).resolve().parent.parent  # apps/backend/app/
# ── Module-level cache ───────────────────────────────────
# Last scan result; None until run_coverage_scan() has been called.
_cached_result: dict[str, Any] | None = None
def get_cached_coverage() -> dict[str, Any] | None:
    """Return the cached coverage scan result, or None if no scan has run yet."""
    return _cached_result
def run_coverage_scan() -> dict[str, Any]:
    """Run a full coverage scan, refresh the module cache, and return the result."""
    global _cached_result
    scan: dict[str, Any] = {"scan_time": datetime.now().isoformat()}
    scan["routes"] = _scan_routes()
    scan["services"] = _scan_services()
    scan["jobs"] = _scan_jobs()
    scan["sse_endpoints"] = _scan_sse()
    scan["ws_endpoints"] = _scan_ws()
    _cached_result = scan
    return scan
# ── Route coverage scan ──────────────────────────────────
# Matches @router.get/post/put/delete/patch decorators.
_ROUTE_DECORATOR_RE = re.compile(r"@router\.(get|post|put|delete|patch)\(")
# Matches the async def / def line following a decorator, capturing the name.
_FUNC_DEF_RE = re.compile(r"(?:async\s+)?def\s+(\w+)\s*\(")
def _scan_routes() -> dict[str, Any]:
    """Scan xcx_*.py router files and count route handler functions.

    Coverage rule: every route function in an xcx_* router file counts as
    covered by TraceMiddleware, because the middleware intercepts the
    /api/xcx/ prefix.
    """
    routers_dir = _APP_DIR / "routers"
    if not routers_dir.exists():
        return {"total": 0, "covered": 0, "uncovered": []}
    all_routes: list[str] = []
    covered: list[str] = []
    uncovered: list[str] = []
    for source_file in sorted(routers_dir.glob("xcx_*.py")):
        module = source_file.stem
        source_lines = source_file.read_text(encoding="utf-8").splitlines()
        for idx, text in enumerate(source_lines):
            if not _ROUTE_DECORATOR_RE.search(text):
                continue
            # The def may sit a few lines below (stacked decorators).
            for candidate in source_lines[idx : idx + 5]:
                match = _FUNC_DEF_RE.search(candidate)
                if match:
                    route_id = f"{module}.{match.group(1)}"
                    all_routes.append(route_id)
                    # xcx_* routes are covered by TraceMiddleware by default.
                    covered.append(route_id)
                    break
    return {
        "total": len(all_routes),
        "covered": len(covered),
        "uncovered": uncovered,
    }
# ── Service 覆盖扫描 ─────────────────────────────────────
def _scan_services() -> dict[str, Any]:
    """Scan public functions in services/*.py for the @trace_service decorator."""
    services_dir = _APP_DIR / "services"
    if not services_dir.exists():
        return {"total": 0, "covered": 0, "uncovered": []}
    all_funcs: list[str] = []
    covered: list[str] = []
    uncovered: list[str] = []
    # Module-level public function: no leading whitespace, name not _-prefixed.
    func_def_re = re.compile(r"^(?:async\s+)?def\s+([a-zA-Z]\w*)\s*\(")
    for source_file in sorted(services_dir.rglob("*.py")):
        if source_file.name.startswith("_"):
            continue
        module = source_file.stem
        source_text = source_file.read_text(encoding="utf-8")
        source_lines = source_text.splitlines()
        mentions_trace = "trace_service" in source_text
        for idx, raw in enumerate(source_lines):
            match = func_def_re.match(raw.rstrip())
            if not match:
                continue
            func_id = f"{module}.{match.group(1)}"
            all_funcs.append(func_id)
            # Covered iff one of the 5 preceding lines carries @trace_service.
            is_covered = mentions_trace and any(
                "@trace_service" in prev
                for prev in source_lines[max(0, idx - 5) : idx]
            )
            (covered if is_covered else uncovered).append(func_id)
    return {
        "total": len(all_funcs),
        "covered": len(covered),
        "uncovered": uncovered,
    }
# ── Job coverage scan ────────────────────────────────────
# The 4 known background job handlers registered in main.py.
_KNOWN_JOBS = [
    "task_generator",
    "task_expiry_check",
    "recall_completion_check",
    "note_reclassify_backfill",
]
def _scan_jobs() -> dict[str, Any]:
    """Check whether each known job handler in main.py is wrapped by trace_job."""
    main_py = _APP_DIR / "main.py"
    if not main_py.exists():
        return {"total": len(_KNOWN_JOBS), "covered": 0, "uncovered": _KNOWN_JOBS[:]}
    source_text = main_py.read_text(encoding="utf-8")
    # A job counts as covered when a literal trace_job("<name>") call exists.
    covered = [name for name in _KNOWN_JOBS if f'trace_job("{name}")' in source_text]
    uncovered = [name for name in _KNOWN_JOBS if f'trace_job("{name}")' not in source_text]
    return {
        "total": len(_KNOWN_JOBS),
        "covered": len(covered),
        "uncovered": uncovered,
    }
# ── SSE 覆盖扫描 ─────────────────────────────────────────
def _scan_sse() -> dict[str, Any]:
    """Check whether the SSE endpoint (xcx_chat.py) integrates sse_wrapper.

    Returns:
        Coverage dict: {"total": 1, "covered": 0|1, "uncovered": [...]}.
    """
    chat_file = _APP_DIR / "routers" / "xcx_chat.py"
    total = 1  # xcx_chat is currently the only SSE endpoint
    covered: list[str] = []
    uncovered: list[str] = []
    if not chat_file.exists():
        return {"total": total, "covered": 0, "uncovered": ["xcx_chat.chat_stream"]}
    content = chat_file.read_text(encoding="utf-8")
    # A plain "sse_wrapper" substring test subsumes the former extra check for
    # "from app.trace.sse_wrapper" (which contains it), so one test suffices.
    if "sse_wrapper" in content:
        covered.append("xcx_chat.chat_stream")
    else:
        uncovered.append("xcx_chat.chat_stream")
    return {
        "total": total,
        "covered": len(covered),
        "uncovered": uncovered,
    }
# ── WebSocket 覆盖扫描 ───────────────────────────────────
def _scan_ws() -> dict[str, Any]:
    """Check whether the WebSocket endpoint (ws/logs.py) integrates ws_wrapper.

    Returns:
        Coverage dict: {"total": 1, "covered": 0|1, "uncovered": [...]}.
    """
    ws_file = _APP_DIR / "ws" / "logs.py"
    total = 1  # ws_logs is currently the only WS endpoint
    covered: list[str] = []
    uncovered: list[str] = []
    if not ws_file.exists():
        return {"total": total, "covered": 0, "uncovered": ["ws_logs.ws_logs"]}
    content = ws_file.read_text(encoding="utf-8")
    # "ws_wrapper" as a substring already matches the longer
    # "from app.trace.ws_wrapper" form, so the former second check was dead.
    if "ws_wrapper" in content:
        covered.append("ws_logs.ws_logs")
    else:
        uncovered.append("ws_logs.ws_logs")
    return {
        "total": total,
        "covered": len(covered),
        "uncovered": uncovered,
    }

View File

@@ -0,0 +1,222 @@
# -*- coding: utf-8 -*-
"""
数据库连接包装器 — 拦截 cursor.execute() 和连接生命周期
包装 psycopg2 连接对象,自动记录以下 span
- DB_QUERY: 每条 SQL 的语句、参数、行数、耗时、调用来源
- DB_CONN: 连接获取耗时(在 database.py 中记录)
- DB_CONN_RELEASE: 连接释放
- DB_ERROR: 数据库异常PostgreSQL 错误码、消息、触发 SQL
当 DEV_TRACE_LOG_SQL=false 时SQL 语句替换为 "[SQL hidden]"
当无活跃 TraceContext 时零开销直接委托原始对象。
"""
from __future__ import annotations
import inspect
import time
from datetime import datetime
from typing import Any
import psycopg2
from app.trace.config import get_trace_config
from app.trace.context import (
SpanType,
TraceSpan,
get_current_trace,
)
# Placeholder recorded in place of the SQL text when SQL logging is disabled.
_SQL_HIDDEN = "[SQL hidden]"
def _get_caller_info() -> tuple[str, str]:
    """Return (module, function) of the code that triggered the DB call.

    Walks up the stack to skip the wrapper's own frames.
    NOTE(review): three ``f_back`` hops from here land at the *caller's
    caller* — _get_caller_info → execute is only two hops — confirm which
    frame is actually intended before changing the hop count.
    """
    frame = inspect.currentframe()
    try:
        # Walk 3 frames up: _get_caller_info → execute → call site.
        caller = frame
        for _ in range(3):
            if caller is not None:
                caller = caller.f_back
        if caller is not None:
            module = caller.f_globals.get("__name__", "unknown")
            func_name = caller.f_code.co_name
            return module, func_name
    finally:
        # Break the local frame reference to avoid a reference cycle.
        del frame
    return "unknown", "unknown"
def _safe_sql_repr(sql: Any) -> str:
    """Best-effort conversion of a SQL statement object to text."""
    if isinstance(sql, bytes):
        return sql.decode("utf-8", errors="replace")
    return "" if sql is None else str(sql)
def _safe_params_repr(params: Any) -> Any:
    """Convert SQL parameters into a JSON-serialisable representation."""
    if params is None:
        return None
    if isinstance(params, dict):
        return {str(key): _safe_param_value(val) for key, val in params.items()}
    if isinstance(params, (list, tuple)):
        return [_safe_param_value(item) for item in params]
    return str(params)
def _safe_param_value(value: Any) -> Any:
    """Pass primitives through unchanged; stringify everything else."""
    primitive_types = (str, int, float, bool, type(None))
    return value if isinstance(value, primitive_types) else str(value)
class TracedCursor:
    """Wrap a psycopg2 cursor; intercept execute() to record DB_QUERY / DB_ERROR spans.

    Methods not implemented here are delegated to the underlying cursor via
    __getattr__.
    NOTE(review): executemany()/callproc() are delegated without tracing —
    confirm that is acceptable for current callers.
    """
    def __init__(self, cursor: Any) -> None:
        # object.__setattr__ avoids triggering __getattr__ delegation.
        object.__setattr__(self, "_cursor", cursor)
    def execute(self, sql: Any, params: Any = None) -> None:
        """Intercept execute: record a DB_QUERY span, or a DB_ERROR span on failure."""
        ctx = get_current_trace()
        if ctx is None:
            # No active trace: execute directly (zero-overhead path).
            self._cursor.execute(sql, params)
            return
        config = get_trace_config()
        caller_module, caller_func = _get_caller_info()
        sql_str = _safe_sql_repr(sql)
        display_sql = sql_str if config.log_sql else _SQL_HIDDEN
        start = time.perf_counter()
        try:
            self._cursor.execute(sql, params)
        except psycopg2.Error as exc:
            elapsed = (time.perf_counter() - start) * 1000
            # Record a DB_ERROR span carrying the PostgreSQL error details.
            ctx.add_span(TraceSpan(
                span_type=SpanType.DB_ERROR,
                module=caller_module,
                function=caller_func,
                description_zh=f"数据库异常: {type(exc).__name__}",
                description_en=f"Database error: {type(exc).__name__}",
                params={},
                result_summary=str(exc).strip()[:200],
                duration_ms=elapsed,
                timestamp=datetime.now().isoformat(),
                extra={
                    "pgcode": getattr(exc, "pgcode", None),
                    "pgerror": (getattr(exc, "pgerror", None) or "")[:500],
                    "sql": display_sql,
                },
            ))
            raise
        elapsed = (time.perf_counter() - start) * 1000
        # rowcount is -1 for statements with no result set; normalise to 0.
        row_count = self._cursor.rowcount if self._cursor.rowcount >= 0 else 0
        # NOTE(review): parameter visibility is gated by config.log_sql here,
        # not config.log_params — confirm which flag should apply.
        ctx.add_span(TraceSpan(
            span_type=SpanType.DB_QUERY,
            module=caller_module,
            function=caller_func,
            description_zh=f"执行 SQL 查询,返回 {row_count}",
            description_en=f"Executed SQL query, returned {row_count} rows",
            params={"params": _safe_params_repr(params)} if config.log_sql else {},
            result_summary=f"{row_count}",
            duration_ms=elapsed,
            timestamp=datetime.now().isoformat(),
            extra={
                "sql": display_sql,
                "params": _safe_params_repr(params) if config.log_sql else None,
                "row_count": row_count,
                "caller": f"{caller_module}.{caller_func}",
            },
        ))
    def __getattr__(self, name: str) -> Any:
        """Delegate anything not implemented here to the wrapped cursor."""
        return getattr(self._cursor, name)
    def __iter__(self):
        # Iteration delegates straight to the underlying cursor.
        return iter(self._cursor)
    def __next__(self):
        return next(self._cursor)
    def __enter__(self):
        return self
    def __exit__(self, *args):
        # Close the wrapped cursor when the context-manager block exits.
        self._cursor.close()
class TracedConnection:
    """Wrap a psycopg2 connection; intercept cursor() and close().

    - cursor() returns a TracedCursor
    - close() records a DB_CONN_RELEASE span (at most once)
    - everything else is delegated to the wrapped connection via __getattr__
    NOTE(review): __exit__ closes the connection, whereas a raw psycopg2
    connection's __exit__ commits/rolls back without closing — confirm that
    callers expect close-on-exit behaviour.
    """
    def __init__(self, conn: Any) -> None:
        # object.__setattr__ avoids triggering __getattr__ delegation.
        object.__setattr__(self, "_conn", conn)
        object.__setattr__(self, "_closed_traced", False)
    def cursor(self, *args: Any, **kwargs: Any) -> TracedCursor:
        """Return the underlying cursor wrapped in a TracedCursor."""
        raw_cursor = self._conn.cursor(*args, **kwargs)
        return TracedCursor(raw_cursor)
    def close(self) -> None:
        """Close the connection and record a DB_CONN_RELEASE span (span once only)."""
        if not self._closed_traced:
            # Set the flag first so repeated close() calls emit only one span.
            object.__setattr__(self, "_closed_traced", True)
            ctx = get_current_trace()
            if ctx is not None:
                ctx.add_span(TraceSpan(
                    span_type=SpanType.DB_CONN_RELEASE,
                    module="trace.db_wrapper",
                    function="TracedConnection.close",
                    description_zh="释放数据库连接",
                    description_en="Released database connection",
                    params={},
                    result_summary="closed",
                    duration_ms=0.0,
                    timestamp=datetime.now().isoformat(),
                ))
        self._conn.close()
    def __getattr__(self, name: str) -> Any:
        """Delegate anything not implemented here to the wrapped connection."""
        return getattr(self._conn, name)
    def __enter__(self):
        return self
    def __exit__(self, *args):
        # Unlike raw psycopg2, leaving the context closes the connection.
        self.close()
def traced_connection(conn: Any) -> TracedConnection:
    """Wrap a psycopg2 connection in a TracedConnection.

    Intended to be called only when tracing is enabled and a TraceContext is
    active.
    """
    return TracedConnection(conn)

View File

@@ -0,0 +1,192 @@
# -*- coding: utf-8 -*-
"""
Trace 装饰器模块
提供 trace_service 装饰器,自动为 Service 层函数记录 SERVICE span。
当无活跃 TraceContext 时零开销直接调用原函数。
"""
from __future__ import annotations
import asyncio
import functools
import inspect
import time
from datetime import datetime
from typing import Any, Callable
from app.trace.config import get_trace_config
from app.trace.context import SpanType, TraceSpan, get_current_trace
# Placeholder used for argument values when parameter logging is disabled.
_REDACTED = "[redacted]"
# Maximum length of a result-summary string.
_RESULT_SUMMARY_MAX_LEN = 120
def _build_params_dict(func: Callable, args: tuple, kwargs: dict, *, redact: bool) -> dict[str, Any]:
    """Build an argument-name → value dict from the call signature.

    With redact=True only argument names are kept; every value becomes
    "[redacted]".
    """
    bound = inspect.signature(func).bind(*args, **kwargs)
    bound.apply_defaults()
    if redact:
        return dict.fromkeys(bound.arguments, _REDACTED)
    return {name: _safe_repr(value) for name, value in bound.arguments.items()}
def _safe_repr(value: Any) -> Any:
    """Convert an argument value into a JSON-serialisable representation.

    Primitives pass through; small containers are converted element-wise;
    large containers and arbitrary objects collapse to short summaries.
    """
    if isinstance(value, (str, int, float, bool, type(None))):
        return value
    if isinstance(value, (list, tuple)):
        if len(value) > 10:
            return f"[{len(value)} items]"
        return [_safe_repr(item) for item in value]
    if isinstance(value, dict):
        if len(value) > 10:
            return f"{{{len(value)} keys}}"
        return {str(key): _safe_repr(item) for key, item in value.items()}
    # Fall back to a truncated repr for anything else.
    text = repr(value)
    return text if len(text) <= 100 else text[:97] + "..."
def _make_result_summary(result: Any) -> str:
    """Produce a short human-readable summary of a return value."""
    if result is None:
        return "None"
    if isinstance(result, (list, tuple)):
        return f"[{len(result)} items]"
    if isinstance(result, dict):
        return f"{{{len(result)} keys}}"
    # Primitives stringify directly; everything else falls back to repr.
    text = str(result) if isinstance(result, (str, int, float, bool)) else repr(result)
    if len(text) > _RESULT_SUMMARY_MAX_LEN:
        return text[: _RESULT_SUMMARY_MAX_LEN - 3] + "..."
    return text
def trace_service(description_zh: str, description_en: str) -> Callable:
    """Decorator for service-layer functions that records a SERVICE span.

    - Zero overhead when no TraceContext is active: the original function is
      called directly.
    - When DEV_TRACE_LOG_PARAMS=false, argument values are replaced with
      "[redacted]".
    - Supports both sync and async functions.

    Args:
        description_zh: Chinese description stored on the span.
        description_en: English description stored on the span.

    Returns:
        The decorator to apply to a service function.
    """
    def decorator(func: Callable) -> Callable:
        module_name = func.__module__ or ""
        func_name = func.__qualname__ or func.__name__

        def _record(ctx: Any, params: dict[str, Any], summary: str, elapsed: float) -> None:
            # Shared span construction for sync/async, success and error paths
            # (previously duplicated four times).
            ctx.add_span(TraceSpan(
                span_type=SpanType.SERVICE,
                module=module_name,
                function=func_name,
                description_zh=description_zh,
                description_en=description_en,
                params=params,
                result_summary=summary,
                duration_ms=elapsed,
                timestamp=datetime.now().isoformat(),
            ))

        if asyncio.iscoroutinefunction(func):
            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                ctx = get_current_trace()
                if ctx is None:
                    return await func(*args, **kwargs)
                redact = not get_trace_config().log_params
                params = _build_params_dict(func, args, kwargs, redact=redact)
                start = time.perf_counter()
                try:
                    result = await func(*args, **kwargs)
                except Exception:
                    _record(ctx, params, "exception", (time.perf_counter() - start) * 1000)
                    raise
                _record(ctx, params, _make_result_summary(result), (time.perf_counter() - start) * 1000)
                return result
            return async_wrapper

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
            ctx = get_current_trace()
            if ctx is None:
                return func(*args, **kwargs)
            redact = not get_trace_config().log_params
            params = _build_params_dict(func, args, kwargs, redact=redact)
            start = time.perf_counter()
            try:
                result = func(*args, **kwargs)
            except Exception:
                _record(ctx, params, "exception", (time.perf_counter() - start) * 1000)
                raise
            _record(ctx, params, _make_result_summary(result), (time.perf_counter() - start) * 1000)
            return result
        return sync_wrapper
    return decorator
def truncate_token(token: str) -> str:
    """Shorten a token to its first 16 characters plus an ellipsis.

    Tokens of 16 characters or fewer are returned unchanged.
    """
    return token if len(token) <= 16 else f"{token[:16]}..."

View File

@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""
异常/错误全链路追踪 — ERROR span 记录辅助函数
集成到全局异常处理器http_exception_handler / unhandled_exception_handler
为 HTTPException 和未捕获异常分别记录 ERROR span。
- HTTPException → ERROR span异常类型、status_code、detail、发生层级
- 未捕获异常 → ERROR span异常类型、消息、堆栈摘要前 5 行)
- 数据库异常psycopg2.Error→ DB_ERROR span错误码、消息、触发 SQL
所有函数均为安全调用trace 未激活时静默跳过,不影响请求处理。
"""
from __future__ import annotations
import traceback
from datetime import datetime
from starlette.exceptions import HTTPException
from app.trace.context import SpanType, TraceSpan, get_current_trace
def record_http_exception(exc: HTTPException) -> None:
    """Record an ERROR span for an HTTPException.

    Captures the exception type, status_code, detail, and the layer where it
    surfaced (exception_handler). Silently does nothing when no trace is
    active.
    """
    ctx = get_current_trace()
    if ctx is None:
        return
    detail_text = str(exc.detail) if exc.detail else ""
    span = TraceSpan(
        span_type=SpanType.ERROR,
        module="trace.error_handler",
        function="record_http_exception",
        description_zh=f"HTTP 异常: {exc.status_code} {exc.detail}",
        description_en=f"HTTP exception: {exc.status_code} {exc.detail}",
        params={},
        result_summary=f"{exc.status_code}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
        extra={
            "exception_type": type(exc).__name__,
            "status_code": exc.status_code,
            "detail": detail_text,
            "layer": "exception_handler",
        },
    )
    ctx.add_span(span)
def record_unhandled_exception(exc: Exception) -> None:
    """Record an ERROR span for an uncaught exception.

    Captures the exception type, message, and a stack summary (the last 5
    formatted traceback lines, to keep the span small). Silently does nothing
    when no trace is active.
    """
    ctx = get_current_trace()
    if ctx is None:
        return
    tb_lines = traceback.format_exception(type(exc), exc, exc.__traceback__)
    # A [-5:] slice already yields the whole list when it is shorter than 5,
    # so the previous explicit length check was redundant.
    stack_summary = "".join(tb_lines[-5:])
    ctx.add_span(TraceSpan(
        span_type=SpanType.ERROR,
        module="trace.error_handler",
        function="record_unhandled_exception",
        description_zh=f"未捕获异常: {type(exc).__name__}: {exc}",
        description_en=f"Unhandled exception: {type(exc).__name__}: {exc}",
        params={},
        result_summary=type(exc).__name__,
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
        extra={
            "exception_type": type(exc).__name__,
            "message": str(exc),
            "stack_summary": stack_summary,
            "layer": "exception_handler",
        },
    ))
def record_db_exception(exc: Exception, sql: str = "") -> None:
    """Record a DB_ERROR span for a database exception.

    Captures the PostgreSQL error code (when present), the message, and the
    SQL that triggered it. Silently does nothing when no trace is active.
    """
    ctx = get_current_trace()
    if ctx is None:
        return
    # psycopg2 errors expose pgcode; anything else yields an empty string.
    error_code = getattr(exc, "pgcode", None) or ""
    extra_info = {
        "exception_type": type(exc).__name__,
        "message": str(exc),
        "pgcode": error_code,
        "sql": sql,
        "layer": "database",
    }
    ctx.add_span(TraceSpan(
        span_type=SpanType.DB_ERROR,
        module="trace.error_handler",
        function="record_db_exception",
        description_zh=f"数据库异常: {type(exc).__name__}: {exc}",
        description_en=f"Database exception: {type(exc).__name__}: {exc}",
        params={},
        result_summary=type(exc).__name__,
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
        extra=extra_info,
    ))

View File

@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
"""
后台 Job 执行追踪包装器
提供 trace_job() 包装函数,在 lifespan 注册的 job handler 外层包裹,
追踪后台任务执行全过程JOB_START → 内部 span → JOB_END / JOB_ERROR
Job trace 使用独立的 request_idjob_ 前缀),写入同一日志文件。
内部的 SERVICE / DB_QUERY span 通过 contextvars 自动关联到 job trace。
"""
from __future__ import annotations
import functools
import time
import traceback
from datetime import datetime
from typing import Any, Callable
from app.trace.config import get_trace_config
from app.trace.context import (
SpanType,
TraceSpan,
create_job_trace,
set_current_trace,
trace_context_var,
)
from app.trace.writer import TraceWriter
def trace_job(job_name: str) -> Callable:
    """Decorator: wrap a job handler and trace background-job execution.

    Usage::

        @trace_job("task_generator")
        def my_handler(**kwargs):
            ...

    or wrapped when registering::

        register_job("task_generator", trace_job("task_generator")(lambda **_kw: task_generator.run()))

    Args:
        job_name: Logical job name, used in span descriptions and as part
            of the job trace's request_id (``job_`` prefix).

    Emits JOB_START on entry, then JOB_END on success or JOB_ERROR (with
    a 5-line stack summary) on failure.  The trace is written to the log
    file synchronously in the ``finally`` block, and the contextvar is
    restored afterwards.  When tracing is disabled the handler runs
    completely untouched.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            cfg = get_trace_config()
            if not cfg.enabled:
                return func(*args, **kwargs)
            # Create a dedicated job trace context (independent request_id).
            ctx = create_job_trace(job_name)
            token = set_current_trace(ctx)
            # JOB_START span
            ctx.add_span(TraceSpan(
                span_type=SpanType.JOB_START,
                module="trace.job_wrapper",
                function=f"trace_job({job_name})",
                description_zh=f"后台任务开始: {job_name}",
                description_en=f"Background job started: {job_name}",
                params={"job_name": job_name},
                result_summary="",
                duration_ms=0.0,
                timestamp=datetime.now().isoformat(),
            ))
            t0 = time.perf_counter()
            try:
                result = func(*args, **kwargs)
                duration_ms = (time.perf_counter() - t0) * 1000
                # JOB_END span
                ctx.add_span(TraceSpan(
                    span_type=SpanType.JOB_END,
                    module="trace.job_wrapper",
                    function=f"trace_job({job_name})",
                    description_zh=f"后台任务完成: {job_name}, 耗时={duration_ms:.0f}ms",
                    description_en=f"Background job completed: {job_name}, duration={duration_ms:.0f}ms",
                    params={"job_name": job_name},
                    result_summary=f"completed in {duration_ms:.0f}ms",
                    duration_ms=duration_ms,
                    timestamp=datetime.now().isoformat(),
                ))
                return result
            except Exception as exc:
                duration_ms = (time.perf_counter() - t0) * 1000
                tb_lines = traceback.format_exception(type(exc), exc, exc.__traceback__)
                stack_summary = "".join(tb_lines[-5:])
                # JOB_ERROR span
                ctx.add_span(TraceSpan(
                    span_type=SpanType.JOB_ERROR,
                    module="trace.job_wrapper",
                    function=f"trace_job({job_name})",
                    description_zh=f"后台任务异常: {job_name}, {type(exc).__name__}: {exc}",
                    description_en=f"Background job error: {job_name}, {type(exc).__name__}: {exc}",
                    params={"job_name": job_name, "error_type": type(exc).__name__},
                    result_summary=f"{type(exc).__name__}: {exc}",
                    duration_ms=duration_ms,
                    timestamp=datetime.now().isoformat(),
                    extra={"stack_summary": stack_summary},
                ))
                raise
            finally:
                # Synchronous write: job handlers run in a sync context.
                try:
                    import json
                    from app.trace.writer import serialize_trace, get_log_file
                    writer = TraceWriter()
                    data = serialize_trace(ctx)
                    line = json.dumps(data, ensure_ascii=False, default=str)
                    dt = ctx.start_time if isinstance(ctx.start_time, datetime) else datetime.now()
                    filepath = get_log_file(dt, base_dir=writer.base_dir)
                    filepath.parent.mkdir(parents=True, exist_ok=True)
                    filepath = writer._rotate_if_needed(filepath)
                    writer._sync_write(filepath, line)
                except Exception:
                    pass  # A failed trace write must never break the job itself.
                # Fix: restore the previous contextvar value instead of
                # unconditionally setting None — set(None) clobbers any outer
                # trace (e.g. nested invocation) and was inconsistent with
                # TraceMiddleware, which uses reset(token).
                trace_context_var.reset(token)
        return wrapper
    return decorator

View File

@@ -0,0 +1,211 @@
# -*- coding: utf-8 -*-
"""
TraceMiddleware — ASGI 中间件
拦截 xcx_* 路由前缀(/api/xcx/)的请求,创建 TraceContext 并记录全链路 span。
非 xcx 路由直接跳过,不创建 TraceContext。
DEV_TRACE_ENABLED 关闭时跳过所有采集。
记录的 span 类型:
- HTTP_IN: 请求进入method, path, query_params, body_preview
- HTTP_OUT: 请求结束status_code, duration, body_size
- MIDDLEWARE: ResponseWrapperMiddleware 执行耗时
- MIDDLEWARE_ERROR: 响应包装失败时记录
"""
from __future__ import annotations
import logging
import time
from datetime import datetime
from urllib.parse import unquote
from starlette.types import ASGIApp, Message, Receive, Scope, Send
from app.trace.config import get_trace_config
from app.trace.context import (
SpanType,
TraceContext,
TraceSpan,
create_http_trace,
get_current_trace,
set_current_trace,
trace_context_var,
)
from app.trace.writer import get_trace_writer
logger = logging.getLogger(__name__)
# xcx 路由前缀——仅匹配此前缀的请求才采集 trace
XCX_PATH_PREFIX = "/api/xcx/"
def _should_trace(path: str) -> bool:
"""判断请求路径是否属于 xcx_* 路由前缀,需要采集 trace。"""
return path.startswith(XCX_PATH_PREFIX)
class TraceMiddleware:
    """ASGI middleware: full request-chain tracing.

    Execution order (outermost layer, runs first):
        TraceMiddleware → CORSMiddleware → ResponseWrapperMiddleware → route handler

    Responsibilities:
        1. Check the DEV_TRACE_ENABLED switch (read per request, so it can be
           toggled at runtime)
        2. Intercept only requests under the /api/xcx/ prefix
        3. Create a TraceContext and store it in contextvars
        4. Record HTTP_IN / HTTP_OUT / MIDDLEWARE / MIDDLEWARE_ERROR spans
        5. Inject X-Request-ID, X-Process-Time, X-DB-Queries, X-DB-Time
           response headers
        6. Hand the completed trace to TraceWriter for persistence
    """
    def __init__(self, app: ASGIApp) -> None:
        self.app = app
    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        # Trace HTTP requests only; websocket/lifespan scopes pass straight through.
        if scope["type"] != "http":
            await self.app(scope, receive, send)
            return
        path = scope.get("path", "")
        config = get_trace_config()
        # Tracing disabled, or not an xcx route → pass through untraced.
        if not config.enabled or not _should_trace(path):
            await self.app(scope, receive, send)
            return
        # ── Create the TraceContext ──
        method = scope.get("method", "UNKNOWN")
        query_string = scope.get("query_string", b"").decode("latin-1", errors="replace")
        ctx = create_http_trace(method, path)
        token = set_current_trace(ctx)
        # Record the HTTP_IN span.
        query_params = {}
        if query_string:
            # Minimal query-string parsing into a dict.
            # NOTE(review): '+' is not decoded to a space and duplicate keys
            # collapse to the last value — confirm this is acceptable for a
            # dev-only trace (urllib.parse.parse_qsl would handle both).
            for pair in query_string.split("&"):
                if "=" in pair:
                    k, v = pair.split("=", 1)
                    query_params[unquote(k)] = unquote(v)
        ctx.add_span(TraceSpan(
            span_type=SpanType.HTTP_IN,
            module="trace.middleware",
            function="TraceMiddleware.__call__",
            description_zh=f"接收请求 {method} {path}",
            description_en=f"Received request {method} {path}",
            params={"query": query_params},
            result_summary="",
            duration_ms=0.0,
            timestamp=datetime.now().isoformat(),
        ))
        start_time = time.perf_counter()
        # ── Wrap `send` to inject trace headers and collect HTTP_OUT data ──
        status_code = 200
        response_body_parts: list[bytes] = []
        middleware_start = time.perf_counter()
        middleware_error: str | None = None
        async def send_wrapper(message: Message) -> None:
            nonlocal status_code, middleware_error
            if message["type"] == "http.response.start":
                status_code = message.get("status", 200)
                # Compute the trace statistics accumulated so far.
                elapsed_ms = (time.perf_counter() - start_time) * 1000
                current_ctx = get_current_trace()
                db_queries = 0
                db_time_ms = 0.0
                if current_ctx:
                    for s in current_ctx.spans:
                        if s.span_type == SpanType.DB_QUERY:
                            db_queries += 1
                            db_time_ms += s.duration_ms
                # Inject the diagnostic response headers.
                headers = list(message.get("headers", []))
                headers.append([b"x-request-id", ctx.request_id.encode()])
                headers.append([b"x-process-time", f"{elapsed_ms:.1f}ms".encode()])
                headers.append([b"x-db-queries", str(db_queries).encode()])
                headers.append([b"x-db-time", f"{db_time_ms:.1f}ms".encode()])
                message = {**message, "headers": headers}
                await send(message)
                return
            if message["type"] == "http.response.body":
                body = message.get("body", b"")
                response_body_parts.append(body)
                await send(message)
                return
            await send(message)
        try:
            await self.app(scope, receive, send_wrapper)
        except Exception as exc:
            # Remember the failure so the trace is still recorded, then re-raise.
            middleware_error = f"{type(exc).__name__}: {exc}"
            raise
        finally:
            elapsed_ms = (time.perf_counter() - start_time) * 1000
            middleware_elapsed_ms = (time.perf_counter() - middleware_start) * 1000
            body_size = sum(len(p) for p in response_body_parts)
            # Record the MIDDLEWARE span (ResponseWrapperMiddleware execution time).
            # NOTE(review): middleware_start is captured in THIS middleware, so
            # middleware_elapsed_ms covers the entire downstream call (inner
            # middlewares + route handler), not only the response wrapper —
            # confirm whether that is the intent.
            if middleware_error:
                ctx.add_span(TraceSpan(
                    span_type=SpanType.MIDDLEWARE_ERROR,
                    module="middleware.response_wrapper",
                    function="ResponseWrapperMiddleware.__call__",
                    description_zh=f"响应包装失败: {middleware_error}",
                    description_en=f"Response wrapping failed: {middleware_error}",
                    params={},
                    result_summary=middleware_error,
                    duration_ms=middleware_elapsed_ms,
                    timestamp=datetime.now().isoformat(),
                ))
            else:
                ctx.add_span(TraceSpan(
                    span_type=SpanType.MIDDLEWARE,
                    module="middleware.response_wrapper",
                    function="ResponseWrapperMiddleware.__call__",
                    description_zh="响应包装中间件执行完成",
                    description_en="Response wrapper middleware completed",
                    params={},
                    result_summary=f"body_size={body_size}",
                    duration_ms=middleware_elapsed_ms,
                    timestamp=datetime.now().isoformat(),
                    extra={"body_size": body_size},
                ))
            # Record the HTTP_OUT span.
            ctx.add_span(TraceSpan(
                span_type=SpanType.HTTP_OUT,
                module="trace.middleware",
                function="TraceMiddleware.__call__",
                description_zh=f"响应返回 {status_code},耗时 {elapsed_ms:.0f}ms",
                description_en=f"Response sent {status_code}, took {elapsed_ms:.0f}ms",
                params={},
                result_summary=f"{status_code}, {body_size}B body",
                duration_ms=elapsed_ms,
                timestamp=datetime.now().isoformat(),
                extra={"status_code": status_code, "body_size": body_size},
            ))
            # Persist the trace log (failures are logged, never raised).
            try:
                writer = get_trace_writer()
                await writer.write_trace(ctx)
            except Exception:
                logger.warning("Trace 日志写入失败", exc_info=True)
            # Restore the previous contextvars state.
            trace_context_var.reset(token)

View File

@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""
SSE 流式响应追踪辅助函数
提供一组轻量函数,在 SSE event_generator 内部调用,
追踪流式响应全过程SSE_START → AI_CALL → SSE_EVENT → SSE_END / AI_ERROR
所有函数在无活跃 TraceContext 时静默跳过,不影响业务逻辑。
SSE_EVENT span 每 10 个 token 记录一次,避免 span 爆炸。
"""
from __future__ import annotations
from datetime import datetime
from app.trace.context import (
SpanType,
TraceSpan,
TraceType,
get_current_trace,
)
def record_sse_start(
    endpoint: str,
    user_id: int | None = None,
    chat_id: str = "",
) -> None:
    """Record an SSE_START span (stream opened).

    Also flips the trace's trace_type to "sse".  Silently does nothing
    when no trace is active.
    """
    trace = get_current_trace()
    if trace is None:
        return
    # The whole trace is now an SSE trace.
    trace.trace_type = TraceType.SSE
    stream_meta = {"endpoint": endpoint, "user_id": user_id, "chat_id": chat_id}
    trace.add_span(TraceSpan(
        span_type=SpanType.SSE_START,
        module="trace.sse_wrapper",
        function="record_sse_start",
        description_zh=f"SSE 流开始: endpoint={endpoint}, chat_id={chat_id}",
        description_en=f"SSE stream started: endpoint={endpoint}, chat_id={chat_id}",
        params=stream_meta,
        result_summary="",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
def record_ai_call(
    app_id: str,
    prompt_length: int,
    session_id: str = "",
) -> None:
    """Record an AI_CALL span (DashScope API invocation)."""
    trace = get_current_trace()
    if trace is None:
        return
    call_params = {
        "app_id": app_id,
        "prompt_length": prompt_length,
        "session_id": session_id,
    }
    trace.add_span(TraceSpan(
        span_type=SpanType.AI_CALL,
        module="trace.sse_wrapper",
        function="record_ai_call",
        description_zh=f"AI 调用: app_id={app_id}, prompt 长度={prompt_length}",
        description_en=f"AI call: app_id={app_id}, prompt_length={prompt_length}",
        params=call_params,
        result_summary="",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
def record_sse_token(token_count: int, total_tokens: int) -> None:
    """Record an SSE_EVENT span, sampled once every 10 tokens.

    A span is emitted only when total_tokens is a multiple of 10, so the
    span list does not explode on long streams.
    """
    trace = get_current_trace()
    # Skip when no trace is active or this is not a sampling point.
    if trace is None or total_tokens % 10 != 0:
        return
    trace.add_span(TraceSpan(
        span_type=SpanType.SSE_EVENT,
        module="trace.sse_wrapper",
        function="record_sse_token",
        description_zh=f"SSE token 流: 本批 {token_count} token, 累计 {total_tokens}",
        description_en=f"SSE token stream: batch {token_count}, cumulative {total_tokens}",
        params={"token_count": token_count, "total_tokens": total_tokens},
        result_summary=f"cumulative={total_tokens}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
def record_sse_end(
    total_tokens: int,
    total_duration_ms: float,
    completed: bool = True,
) -> None:
    """Record an SSE_END span (stream finished)."""
    trace = get_current_trace()
    if trace is None:
        return
    stream_stats = {
        "total_tokens": total_tokens,
        "total_duration_ms": total_duration_ms,
        "completed": completed,
    }
    trace.add_span(TraceSpan(
        span_type=SpanType.SSE_END,
        module="trace.sse_wrapper",
        function="record_sse_end",
        description_zh=f"SSE 流结束: 总 token={total_tokens}, 耗时={total_duration_ms:.0f}ms, 完成={completed}",
        description_en=f"SSE stream ended: total_tokens={total_tokens}, duration={total_duration_ms:.0f}ms, completed={completed}",
        params=stream_stats,
        result_summary=f"tokens={total_tokens}, completed={completed}",
        duration_ms=total_duration_ms,
        timestamp=datetime.now().isoformat(),
    ))
def record_ai_error(error_type: str, message: str) -> None:
    """Record an AI_ERROR span (AI call failed)."""
    trace = get_current_trace()
    if trace is None:
        return
    trace.add_span(TraceSpan(
        span_type=SpanType.AI_ERROR,
        module="trace.sse_wrapper",
        function="record_ai_error",
        description_zh=f"AI 调用失败: {error_type} - {message}",
        description_en=f"AI call failed: {error_type} - {message}",
        params={"error_type": error_type, "message": message},
        result_summary=f"{error_type}: {message}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))

View File

@@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
"""
JSON Lines 日志写入器
负责将 TraceContext 序列化为 JSON 并追加写入 .jsonl 文件。
按日期分目录YYYY-MM-DD/按小时分文件trace_YYYY-MM-DD_HH.jsonl
单文件超过 10MB 自动轮转trace_YYYY-MM-DD_HH_NNN.jsonl
维护 _index.json 索引文件(文件列表、记录数、文件大小)。
"""
from __future__ import annotations
import asyncio
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Any
from app.trace.config import get_trace_config
from app.trace.context import TraceContext, TraceSpan, SpanType
logger = logging.getLogger(__name__)
# 单文件大小上限10MB
MAX_FILE_SIZE = 10 * 1024 * 1024
# ──────────────────────────────────────────────
# 序列化函数
# ──────────────────────────────────────────────
def serialize_span(span: TraceSpan) -> dict[str, Any]:
    """Convert a TraceSpan into a JSON-serializable dict.

    Emits every required field in a fixed order: span_type, module,
    function, description_zh, description_en, params, result_summary,
    duration_ms, timestamp, extra.
    """
    field_names = (
        "span_type",
        "module",
        "function",
        "description_zh",
        "description_en",
        "params",
        "result_summary",
        "duration_ms",
        "timestamp",
        "extra",
    )
    return {name: getattr(span, name) for name in field_names}
def serialize_trace(ctx: TraceContext) -> dict[str, Any]:
    """Convert a TraceContext into a JSON-serializable dict.

    Emits the full record: request_id, trace_type, timestamp, method, path,
    status_code, total_duration_ms, user_id, site_id, db_query_count,
    db_total_ms, error, spans.

    Derived fields are computed in one pass over the spans:
    - status_code / total_duration_ms: from the first HTTP_OUT span
      (extra["status_code"] preferred, else parsed from result_summary)
    - db_query_count / db_total_ms: aggregated over all DB_QUERY spans
    - error: from the first ERROR / DB_ERROR span
    """
    spans = ctx.spans
    status_code: int | None = None
    total_duration_ms: float = 0.0
    db_query_count = 0
    db_total_ms = 0.0
    error: str | None = None
    seen_http_out = False
    for span in spans:
        kind = span.span_type
        if kind == SpanType.HTTP_OUT:
            if seen_http_out:
                continue
            seen_http_out = True
            status_code = span.extra.get("status_code")
            if status_code is None:
                # Fall back to parsing result_summary, e.g. "200 OK, 3.2KB body".
                words = (span.result_summary or "").split()
                if words and words[0].isdigit():
                    status_code = int(words[0])
            total_duration_ms = span.duration_ms
        elif kind == SpanType.DB_QUERY:
            db_query_count += 1
            db_total_ms += span.duration_ms
        elif kind in (SpanType.ERROR, SpanType.DB_ERROR) and error is None:
            # Prefer result_summary, then the zh/en descriptions.
            error = span.result_summary or span.description_zh or span.description_en
    started = ctx.start_time
    timestamp = started.isoformat() if isinstance(started, datetime) else str(started)
    return {
        "request_id": ctx.request_id,
        "trace_type": ctx.trace_type,
        "timestamp": timestamp,
        "method": ctx.method,
        "path": ctx.path,
        "status_code": status_code,
        "total_duration_ms": total_duration_ms,
        "user_id": ctx.user_id,
        "site_id": ctx.site_id,
        "db_query_count": db_query_count,
        "db_total_ms": round(db_total_ms, 2),
        "error": error,
        "spans": [serialize_span(s) for s in spans],
    }
# ──────────────────────────────────────────────
# 路径生成函数
# ──────────────────────────────────────────────
def get_log_dir(dt: datetime, base_dir: str | None = None) -> Path:
    """Return the per-date directory path: {base_dir}/{YYYY-MM-DD}/.

    Args:
        dt: Timestamp that determines the date-directory name.
        base_dir: Log root directory; defaults to TraceConfig's log_dir.
    """
    root = get_trace_config().log_dir if base_dir is None else base_dir
    return Path(root) / dt.strftime("%Y-%m-%d")


def get_log_file(dt: datetime, base_dir: str | None = None) -> Path:
    """Return the current-hour log file path:
    {base_dir}/{YYYY-MM-DD}/trace_YYYY-MM-DD_HH.jsonl.

    Args:
        dt: Timestamp that determines the date directory and hour file name.
        base_dir: Log root directory; defaults to TraceConfig's log_dir.
    """
    return get_log_dir(dt, base_dir=base_dir) / f"trace_{dt.strftime('%Y-%m-%d_%H')}.jsonl"
# ──────────────────────────────────────────────
# TraceWriter 类
# ──────────────────────────────────────────────
class TraceWriter:
    """JSON Lines log writer.

    Serializes TraceContext objects to JSON and appends them to .jsonl
    files: one directory per date, one file per hour, automatic rotation
    when a file exceeds 10MB, and a per-directory _index.json index.
    """
    def __init__(self, base_dir: str | None = None):
        """Initialize the writer.

        Args:
            base_dir: Log root directory; when None, TraceConfig's log_dir
                is read lazily on each access (see the base_dir property).
        """
        self._base_dir = base_dir
    @property
    def base_dir(self) -> str:
        """Log root directory (constructor override wins over TraceConfig)."""
        if self._base_dir is not None:
            return self._base_dir
        return get_trace_config().log_dir
    def _rotate_if_needed(self, filepath: Path) -> Path:
        """Check the file size and return a rotated path when necessary.

        Returns the original path when the file does not exist or is still
        under 10MB.  Otherwise returns a numbered sibling path
        (trace_YYYY-MM-DD_HH_001.jsonl, _002, ...) — the first one that is
        missing or still under the cap.
        """
        if not filepath.exists():
            return filepath
        if filepath.stat().st_size < MAX_FILE_SIZE:
            return filepath
        # Rotation needed: find the next usable sequence number.
        stem = filepath.stem  # trace_YYYY-MM-DD_HH
        parent = filepath.parent
        # If the current name already carries a 3-digit sequence suffix
        # (e.g. trace_2026-03-22_14_001), strip it to recover the base stem.
        parts = stem.rsplit("_", 1)
        if len(parts) == 2 and parts[1].isdigit() and len(parts[1]) == 3:
            base_stem = parts[0]
        else:
            base_stem = stem
        # Walk existing rotated files until one is absent or under the cap.
        seq = 1
        while True:
            candidate = parent / f"{base_stem}_{seq:03d}.jsonl"
            if not candidate.exists() or candidate.stat().st_size < MAX_FILE_SIZE:
                return candidate
            seq += 1
    def _update_index(self, date_dir: Path, filename: str, record_count_delta: int) -> None:
        """Update the _index.json index file.

        Index structure:
        {
            "files": {
                "trace_2026-03-22_14.jsonl": {
                    "record_count": 42,
                    "file_size": 12345
                }
            }
        }

        NOTE(review): the index is re-read and fully rewritten for every
        record — O(index size) per write and not safe against concurrent
        writers. Presumably acceptable for a dev-only trace log; confirm.
        """
        index_path = date_dir / "_index.json"
        # Load the existing index, tolerating a corrupt or unreadable file.
        index: dict[str, Any] = {"files": {}}
        if index_path.exists():
            try:
                index = json.loads(index_path.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, OSError):
                index = {"files": {}}
        if "files" not in index:
            index["files"] = {}
        # Update this file's entry (created on first sight).
        file_entry = index["files"].get(filename, {"record_count": 0, "file_size": 0})
        file_entry["record_count"] = file_entry.get("record_count", 0) + record_count_delta
        # Refresh the actual on-disk size.
        file_path = date_dir / filename
        if file_path.exists():
            file_entry["file_size"] = file_path.stat().st_size
        else:
            file_entry["file_size"] = 0
        index["files"][filename] = file_entry
        # Write the index back.
        index_path.write_text(
            json.dumps(index, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    async def write_trace(self, ctx: TraceContext) -> None:
        """Asynchronously append a TraceContext to the JSON Lines file.

        Write failures are swallowed (logged only) so tracing never
        breaks request handling.
        """
        try:
            # Serialize to a single JSON line.
            data = serialize_trace(ctx)
            line = json.dumps(data, ensure_ascii=False, default=str)
            # Pick the target path from the trace's start time.
            dt = ctx.start_time if isinstance(ctx.start_time, datetime) else datetime.now()
            filepath = get_log_file(dt, base_dir=self.base_dir)
            # Ensure the date directory exists.
            filepath.parent.mkdir(parents=True, exist_ok=True)
            # Rotate when the current file is over the size cap.
            filepath = self._rotate_if_needed(filepath)
            # Run the blocking file IO in a worker thread.
            await asyncio.to_thread(self._sync_write, filepath, line)
        except Exception:
            # A failed write must not affect the request; log and move on.
            logger.warning("Trace 日志写入失败", exc_info=True)
    def _sync_write(self, filepath: Path, line: str) -> None:
        """Synchronously append one JSON line and update the index."""
        # Append the record.
        with open(filepath, "a", encoding="utf-8") as f:
            f.write(line + "\n")
        # Keep the per-directory index in sync.
        date_dir = filepath.parent
        filename = filepath.name
        self._update_index(date_dir, filename, record_count_delta=1)
# ──────────────────────────────────────────────
# 单例访问
# ──────────────────────────────────────────────
# Process-wide writer singleton, created lazily on first use.
_writer_instance: TraceWriter | None = None


def get_trace_writer() -> TraceWriter:
    """Return the global TraceWriter singleton, creating it on first call."""
    global _writer_instance
    if _writer_instance is not None:
        return _writer_instance
    _writer_instance = TraceWriter()
    return _writer_instance

View File

@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
"""
WebSocket 连接追踪辅助函数
提供一组轻量函数,在 WebSocket 端点内部调用,
追踪连接全生命周期WS_CONNECT → WS_MESSAGE → WS_DISCONNECT
所有函数在无活跃 TraceContext 时静默跳过,不影响业务逻辑。
WS_MESSAGE span 每 5 条消息记录一次,避免 span 爆炸。
"""
from __future__ import annotations
from datetime import datetime
from app.trace.context import (
SpanType,
TraceContext,
TraceSpan,
TraceType,
create_ws_trace,
get_current_trace,
set_current_trace,
)
from app.trace.config import get_trace_config
from app.trace.writer import TraceWriter
def ws_trace_connect(
    execution_id: str,
    client_info: str = "",
) -> TraceContext | None:
    """Create a WS TraceContext and record the WS_CONNECT span.

    Returns the new TraceContext (for subsequent spans), or None when
    tracing is disabled.
    """
    if not get_trace_config().enabled:
        return None
    trace = create_ws_trace()
    set_current_trace(trace)
    trace.add_span(TraceSpan(
        span_type=SpanType.WS_CONNECT,
        module="trace.ws_wrapper",
        function="ws_trace_connect",
        description_zh=f"WebSocket 连接建立: execution_id={execution_id}",
        description_en=f"WebSocket connected: execution_id={execution_id}",
        params={"execution_id": execution_id, "client_info": client_info},
        result_summary="connected",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
    return trace
def ws_trace_message(
    message_count: int,
    total_bytes: int,
) -> None:
    """Record a WS_MESSAGE span, sampled once every 5 messages.

    A span is emitted only when message_count is a multiple of 5, so the
    span list does not explode on chatty connections.
    """
    trace = get_current_trace()
    # Skip when no trace is active or this is not a sampling point.
    if trace is None or message_count % 5 != 0:
        return
    trace.add_span(TraceSpan(
        span_type=SpanType.WS_MESSAGE,
        module="trace.ws_wrapper",
        function="ws_trace_message",
        description_zh=f"WS 消息推送: 累计 {message_count} 条, {total_bytes} 字节",
        description_en=f"WS message push: cumulative {message_count} msgs, {total_bytes} bytes",
        params={"message_count": message_count, "total_bytes": total_bytes},
        result_summary=f"msgs={message_count}, bytes={total_bytes}",
        duration_ms=0.0,
        timestamp=datetime.now().isoformat(),
    ))
def ws_trace_disconnect(
    reason: str,
    total_messages: int,
    total_duration_ms: float,
) -> None:
    """Record the WS_DISCONNECT span and flush the trace to the log file.

    Args:
        reason: Human-readable disconnect reason.
        total_messages: Total messages pushed over the connection.
        total_duration_ms: Connection lifetime in milliseconds.

    Silently does nothing when no trace is active; a failed log write is
    swallowed so it never affects the business flow.
    """
    ctx = get_current_trace()
    if ctx is None:
        return
    ctx.add_span(TraceSpan(
        span_type=SpanType.WS_DISCONNECT,
        module="trace.ws_wrapper",
        function="ws_trace_disconnect",
        description_zh=f"WebSocket 断开: 原因={reason}, 总消息={total_messages}, 耗时={total_duration_ms:.0f}ms",
        description_en=f"WebSocket disconnected: reason={reason}, total_msgs={total_messages}, duration={total_duration_ms:.0f}ms",
        params={
            "reason": reason,
            "total_messages": total_messages,
            "total_duration_ms": total_duration_ms,
        },
        result_summary=f"msgs={total_messages}, reason={reason}",
        duration_ms=total_duration_ms,
        timestamp=datetime.now().isoformat(),
    ))
    # Synchronous write: ws_trace_disconnect may be called from a sync context.
    # (Removed an unused `from pathlib import Path` that was imported here
    # but never referenced.)
    try:
        import json
        from app.trace.writer import serialize_trace, get_log_file
        writer = TraceWriter()
        data = serialize_trace(ctx)
        line = json.dumps(data, ensure_ascii=False, default=str)
        dt = ctx.start_time if isinstance(ctx.start_time, datetime) else datetime.now()
        filepath = get_log_file(dt, base_dir=writer.base_dir)
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath = writer._rotate_if_needed(filepath)
        writer._sync_write(filepath, line)
    except Exception:
        pass  # A failed trace write must never break the business flow.
    # NOTE(review): the contextvar set by ws_trace_connect is left in place
    # after disconnect — confirm whether it should be cleared here.