1
This commit is contained in:
@@ -140,6 +140,20 @@ def trunc(s, n=3000) -> str:
|
||||
return str(s)
|
||||
return s if len(s) <= n else s[:n] + f"\n... [截断,原文共 {len(s)} 字符]"
|
||||
|
||||
import re as _re
|
||||
# Matches one code point in the range U+D800..U+DFFF (the UTF-16 surrogate
# range). Compiled once at module level so callers can reuse the pattern.
_SURROGATE_RE = _re.compile(r'[\ud800-\udfff]')
|
||||
|
||||
def _sanitize_surrogates(obj):
    """Recursively replace surrogate code points in *obj* with U+FFFD.

    Handling by type:
      * ``str``  -- every match of ``_SURROGATE_RE`` is replaced by U+FFFD.
      * ``dict`` -- a new dict is built with both keys and values sanitized.
      * ``list`` -- a new list is built with every element sanitized.
      * anything else is returned unchanged.
    """
    if isinstance(obj, list):
        return list(map(_sanitize_surrogates, obj))

    if isinstance(obj, dict):
        cleaned = {}
        for raw_key, raw_value in obj.items():
            # Sanitize the key before the value, item by item.
            clean_key = _sanitize_surrogates(raw_key)
            cleaned[clean_key] = _sanitize_surrogates(raw_value)
        return cleaned

    if isinstance(obj, str):
        return _SURROGATE_RE.sub('\ufffd', obj)

    # Numbers, booleans, None and other types carry no text to clean.
    return obj
|
||||
|
||||
|
||||
def safe_json(obj, n=5000) -> str:
|
||||
try:
|
||||
s = json.dumps(obj, ensure_ascii=False, indent=2)
|
||||
@@ -1624,8 +1638,11 @@ def extract_single_execution(
|
||||
return None
|
||||
|
||||
try:
|
||||
with open(log_path, "r", encoding="utf-8") as f:
|
||||
log = json.load(f)
|
||||
with open(log_path, "rb") as f:
|
||||
raw = f.read()
|
||||
# 清洗 surrogate 字符(Kiro log 中 emoji 可能被存为 surrogate pair)
|
||||
text = raw.decode("utf-8", errors="surrogatepass").encode("utf-8", errors="replace").decode("utf-8")
|
||||
log = _sanitize_surrogates(json.loads(text))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@@ -1860,6 +1877,9 @@ def extract_latest(global_storage: Optional[str] = None, workspace_path: Optiona
|
||||
force=True, # 强制覆盖
|
||||
)
|
||||
if result:
|
||||
# 逐条持久化,避免中途超时导致下次重复处理
|
||||
save_index(index)
|
||||
save_full_index(full_index)
|
||||
print(f"[session-extract] completed partial: {result}")
|
||||
|
||||
# ── 步骤 2:提取未索引的终态 execution ──
|
||||
@@ -1888,9 +1908,6 @@ def extract_latest(global_storage: Optional[str] = None, workspace_path: Optiona
|
||||
ready.append(r_exec)
|
||||
|
||||
if not ready:
|
||||
if partial_eids:
|
||||
save_index(index)
|
||||
save_full_index(full_index)
|
||||
return
|
||||
|
||||
# agent_on_stop 场景下限制单次处理量,避免处理数千个历史 execution
|
||||
@@ -1912,39 +1929,66 @@ def extract_latest(global_storage: Optional[str] = None, workspace_path: Optiona
|
||||
for cid in chat_groups:
|
||||
chat_groups[cid].sort(key=lambda x: x.get("startTime", 0))
|
||||
|
||||
import threading
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
lock = threading.Lock()
|
||||
extracted_count = 0
|
||||
tombstone_count = 0
|
||||
for cid, group_execs in chat_groups.items():
|
||||
|
||||
def _extract_group(group_execs: list[dict]) -> tuple[list[str], dict, dict]:
|
||||
"""串行提取同一 chatSession 的所有 execution,返回 (results, idx_entries, full_entries)。"""
|
||||
local_index: dict = {"version": 2, "entries": {}}
|
||||
local_full: dict = {"version": 2, "entries": {}}
|
||||
results: list[str] = []
|
||||
for execution in group_execs:
|
||||
eid = execution.get("executionId", "")
|
||||
result = extract_single_execution(
|
||||
agent_dir=agent_dir,
|
||||
hex_dir=execution["_hex_dir"],
|
||||
execution=execution,
|
||||
session_dir=session_dir,
|
||||
index=index,
|
||||
full_index=full_index,
|
||||
)
|
||||
if result:
|
||||
# 如果是 partial 提取,在索引中标记 status
|
||||
if execution.get("_is_partial"):
|
||||
if eid in index.get("entries", {}):
|
||||
index["entries"][eid]["status"] = "partial"
|
||||
if full_index and eid in full_index.get("entries", {}):
|
||||
full_index["entries"][eid]["status"] = "partial"
|
||||
extracted_count += 1
|
||||
print(f"[session-extract] extracted: {result}")
|
||||
elif eid and index.get("entries", {}).get(eid, {}).get("no_log"):
|
||||
# tombstone 被写入,需要保存索引
|
||||
tombstone_count += 1
|
||||
try:
|
||||
result = extract_single_execution(
|
||||
agent_dir=agent_dir,
|
||||
hex_dir=execution["_hex_dir"],
|
||||
execution=execution,
|
||||
session_dir=session_dir,
|
||||
index=local_index,
|
||||
full_index=local_full,
|
||||
)
|
||||
if result:
|
||||
if execution.get("_is_partial"):
|
||||
if eid in local_index.get("entries", {}):
|
||||
local_index["entries"][eid]["status"] = "partial"
|
||||
if eid in local_full.get("entries", {}):
|
||||
local_full["entries"][eid]["status"] = "partial"
|
||||
results.append(result)
|
||||
except Exception as exc:
|
||||
print(f"[session-extract] ✗ {eid[:8]}: {exc}")
|
||||
return results, local_index.get("entries", {}), local_full.get("entries", {})
|
||||
|
||||
if extracted_count > 0 or partial_eids or tombstone_count > 0:
|
||||
save_index(index)
|
||||
save_full_index(full_index)
|
||||
if extracted_count > 1:
|
||||
print(f"[session-extract] total: {extracted_count} executions")
|
||||
if tombstone_count > 0:
|
||||
print(f"[session-extract] tombstoned: {tombstone_count} (no log found)")
|
||||
workers = min(4, len(chat_groups))
|
||||
with ThreadPoolExecutor(max_workers=workers) as pool:
|
||||
futures = {pool.submit(_extract_group, execs): cid
|
||||
for cid, execs in chat_groups.items()}
|
||||
for future in as_completed(futures):
|
||||
results, idx_entries, full_entries = future.result()
|
||||
with lock:
|
||||
# 合并到主索引
|
||||
index.setdefault("entries", {}).update(idx_entries)
|
||||
if full_index is not None:
|
||||
full_index.setdefault("entries", {}).update(full_entries)
|
||||
extracted_count += len(results)
|
||||
tombstone_count += sum(
|
||||
1 for ent in idx_entries.values() if ent.get("no_log")
|
||||
)
|
||||
# 逐组持久化,避免中途超时导致下次重复处理
|
||||
if idx_entries:
|
||||
save_index(index)
|
||||
save_full_index(full_index)
|
||||
for r in results:
|
||||
print(f"[session-extract] extracted: {r}")
|
||||
|
||||
if extracted_count > 1:
|
||||
print(f"[session-extract] total: {extracted_count} executions")
|
||||
if tombstone_count > 0:
|
||||
print(f"[session-extract] tombstoned: {tombstone_count} (no log found)")
|
||||
|
||||
|
||||
def extract_all_unindexed(
|
||||
|
||||
Reference in New Issue
Block a user