This commit is contained in:
Neo
2026-03-15 10:15:02 +08:00
parent 2dd217522c
commit 72bb11b34f
916 changed files with 65306 additions and 16102803 deletions

View File

@@ -140,6 +140,20 @@ def trunc(s, n=3000) -> str:
return str(s)
return s if len(s) <= n else s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
import re as _re

# Matches any single UTF-16 surrogate code point (U+D800-U+DFFF). A str that
# contains one cannot be encoded with the strict UTF-8 codec.
_SURROGATE_RE = _re.compile(r'[\ud800-\udfff]')


def _sanitize_surrogates(obj):
    """Return a copy of ``obj`` with surrogate code points replaced by U+FFFD.

    Strings are cleaned directly. Dicts (both keys and values), lists and
    tuples are rebuilt recursively. Any other value (numbers, bools, None,
    arbitrary objects) is returned unchanged.

    Note: if two distinct dict keys become equal after cleaning, the entry
    that comes later in iteration order overwrites the earlier one.
    """
    if isinstance(obj, str):
        return _SURROGATE_RE.sub('\ufffd', obj)
    if isinstance(obj, dict):
        return {
            _sanitize_surrogates(key): _sanitize_surrogates(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [_sanitize_surrogates(item) for item in obj]
    if isinstance(obj, tuple):
        # json.dumps serialises tuples as arrays, so they must be cleaned too;
        # previously they were passed through with surrogates intact.
        cleaned = [_sanitize_surrogates(item) for item in obj]
        # namedtuple classes provide _make(); use it to keep the original type.
        make = getattr(type(obj), "_make", None)
        return make(cleaned) if make is not None else tuple(cleaned)
    return obj
def safe_json(obj, n=5000) -> str:
try:
s = json.dumps(obj, ensure_ascii=False, indent=2)
@@ -1624,8 +1638,11 @@ def extract_single_execution(
return None
try:
with open(log_path, "r", encoding="utf-8") as f:
log = json.load(f)
with open(log_path, "rb") as f:
raw = f.read()
# 清洗 surrogate 字符(Kiro log 中 emoji 可能被存为 surrogate pair)
text = raw.decode("utf-8", errors="surrogatepass").encode("utf-8", errors="replace").decode("utf-8")
log = _sanitize_surrogates(json.loads(text))
except Exception:
return None
@@ -1860,6 +1877,9 @@ def extract_latest(global_storage: Optional[str] = None, workspace_path: Optiona
force=True, # 强制覆盖
)
if result:
# 逐条持久化,避免中途超时导致下次重复处理
save_index(index)
save_full_index(full_index)
print(f"[session-extract] completed partial: {result}")
# ── 步骤 2:提取未索引的终态 execution ──
@@ -1888,9 +1908,6 @@ def extract_latest(global_storage: Optional[str] = None, workspace_path: Optiona
ready.append(r_exec)
if not ready:
if partial_eids:
save_index(index)
save_full_index(full_index)
return
# agent_on_stop 场景下限制单次处理量,避免处理数千个历史 execution
@@ -1912,39 +1929,66 @@ def extract_latest(global_storage: Optional[str] = None, workspace_path: Optiona
for cid in chat_groups:
chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
lock = threading.Lock()
extracted_count = 0
tombstone_count = 0
for cid, group_execs in chat_groups.items():
def _extract_group(group_execs: list[dict]) -> tuple[list[str], dict, dict]:
"""串行提取同一 chatSession 的所有 execution返回 (results, idx_entries, full_entries)。"""
local_index: dict = {"version": 2, "entries": {}}
local_full: dict = {"version": 2, "entries": {}}
results: list[str] = []
for execution in group_execs:
eid = execution.get("executionId", "")
result = extract_single_execution(
agent_dir=agent_dir,
hex_dir=execution["_hex_dir"],
execution=execution,
session_dir=session_dir,
index=index,
full_index=full_index,
)
if result:
# 如果是 partial 提取,在索引中标记 status
if execution.get("_is_partial"):
if eid in index.get("entries", {}):
index["entries"][eid]["status"] = "partial"
if full_index and eid in full_index.get("entries", {}):
full_index["entries"][eid]["status"] = "partial"
extracted_count += 1
print(f"[session-extract] extracted: {result}")
elif eid and index.get("entries", {}).get(eid, {}).get("no_log"):
# tombstone 被写入,需要保存索引
tombstone_count += 1
try:
result = extract_single_execution(
agent_dir=agent_dir,
hex_dir=execution["_hex_dir"],
execution=execution,
session_dir=session_dir,
index=local_index,
full_index=local_full,
)
if result:
if execution.get("_is_partial"):
if eid in local_index.get("entries", {}):
local_index["entries"][eid]["status"] = "partial"
if eid in local_full.get("entries", {}):
local_full["entries"][eid]["status"] = "partial"
results.append(result)
except Exception as exc:
print(f"[session-extract] ✗ {eid[:8]}: {exc}")
return results, local_index.get("entries", {}), local_full.get("entries", {})
if extracted_count > 0 or partial_eids or tombstone_count > 0:
save_index(index)
save_full_index(full_index)
if extracted_count > 1:
print(f"[session-extract] total: {extracted_count} executions")
if tombstone_count > 0:
print(f"[session-extract] tombstoned: {tombstone_count} (no log found)")
workers = min(4, len(chat_groups))
with ThreadPoolExecutor(max_workers=workers) as pool:
futures = {pool.submit(_extract_group, execs): cid
for cid, execs in chat_groups.items()}
for future in as_completed(futures):
results, idx_entries, full_entries = future.result()
with lock:
# 合并到主索引
index.setdefault("entries", {}).update(idx_entries)
if full_index is not None:
full_index.setdefault("entries", {}).update(full_entries)
extracted_count += len(results)
tombstone_count += sum(
1 for ent in idx_entries.values() if ent.get("no_log")
)
# 逐组持久化,避免中途超时导致下次重复处理
if idx_entries:
save_index(index)
save_full_index(full_index)
for r in results:
print(f"[session-extract] extracted: {r}")
if extracted_count > 1:
print(f"[session-extract] total: {extracted_count} executions")
if tombstone_count > 0:
print(f"[session-extract] tombstoned: {tombstone_count} (no log found)")
def extract_all_unindexed(