# Embeddings_Lib.py
#
from __future__ import annotations
#
"""Adaptive embedding factory for desktop / TUI apps – asynchronous‑ready, logging‑aware.

This version is a thread-safe, robust implementation that addresses race conditions
and resource management issues found in previous iterations.

Key Features & Fixes in this Version
────────────────────────────────────
• **Thread-Safe & Race-Condition-Free**: The core embedding logic is now fully
  thread-safe, preventing race conditions where a model could be evicted by one
  thread while being used by another.
• **Correct Resource Management**: Eviction now occurs *before* loading a new
  model, ensuring the `max_cached` limit is never exceeded and preventing
  potential out-of-memory errors.
• **Improved Typing**: The internal cache and Pydantic configurations use
  `TypedDict` and discriminated unions for better static analysis and maintainability.
• **Async Facade**: `await factory.async_embed(...)` uses `asyncio.to_thread`
  so the UI never blocks.
• **Structured Logging**: Provides insight into cache hits, loads, and evictions.
• **Pluggable & High-Quality Pooling**: Defaults to a proper masked mean pooling
  strategy with L2 normalization for superior embedding quality.
• **Prefetch / Warm‑up**: `factory.prefetch(...)` downloads and loads weights
  ahead of time, e.g. at application start-up.

A minimal, illustrative usage sketch is included at the end of this file
(under `if __name__ == "__main__"`).
"""
#
# Imports
import asyncio
import logging
import random
import threading
import time
from collections import OrderedDict
from typing import (Any, Annotated, Callable, Dict, List, Literal, Optional,
                    Protocol, TypedDict, Union)
#
# Third-Party Libraries
import numpy as np
import requests
import torch
from pydantic import BaseModel, Field, field_validator
from torch import Tensor
from torch.nn.functional import normalize
from transformers import AutoModel, AutoTokenizer
#
# Local Imports
#
########################################################################################################################
#
__all__ = ["EmbeddingFactory", "EmbeddingConfigSchema"]
#
LOGGER = logging.getLogger("embeddings_lib")
LOGGER.addHandler(logging.NullHandler())
#
###############################################################################
# Configuration schema (with Discriminated Union)
###############################################################################

PoolingFn = Callable[[Tensor, Tensor], Tensor]


def _masked_mean(last_hidden: Tensor, attn: Tensor) -> Tensor:
    """Default pooling: mean of vectors where attention_mask is 1."""
    mask = attn.unsqueeze(-1).type_as(last_hidden)
    summed = (last_hidden * mask).sum(dim=1)
    lengths = mask.sum(dim=1).clamp(min=1e-9)
    avg = summed / lengths
    return normalize(avg, p=2, dim=1)


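# The pooling hook is pluggable: any callable matching ``PoolingFn`` can be supplied
# via ``HFModelCfg(pooling=...)``. As an illustrative alternative (a sketch, not a
# built-in default), CLS-token pooling simply takes the first token's hidden state:
def _cls_pool(last_hidden: Tensor, attn: Tensor) -> Tensor:
    """Example pooling: use the [CLS] / first-token vector, L2-normalised.

    The attention mask is accepted only to satisfy the ``PoolingFn`` signature.
    """
    return normalize(last_hidden[:, 0], p=2, dim=1)

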
class HFModelCfg(BaseModel):
    provider: Literal["huggingface"] = "huggingface"
    model_name_or_path: str
    trust_remote_code: bool = False
    max_length: int = 512
    device: Optional[str] = None
    batch_size: int = 32
    pooling: Optional[PoolingFn] = None  # default: masked mean
    dimension: Optional[int] = None


class OpenAICfg(BaseModel):
    provider: Literal["openai"] = "openai"
    model_name_or_path: str = "text-embedding-3-small"
    api_key: Optional[str] = Field(default=None, repr=False)
    dimension: Optional[int] = None

    @field_validator("api_key", mode="before")
    @classmethod
    def _default_api_key(cls, v: str | None) -> str:
        if v:
            return v
        from os import getenv

        env = getenv("OPENAI_API_KEY")
        if not env:
            raise ValueError("OPENAI_API_KEY env-var missing and api_key not set")
        return env


# A discriminated union lets Pydantic and type checkers infer the correct config type
# from the `provider` field.
ModelCfg = Annotated[
    Union[HFModelCfg, OpenAICfg],
    Field(discriminator="provider")
]


class EmbeddingConfigSchema(BaseModel):
    """Top-level configuration: a map of model ids to provider-specific configs."""

    default_model_id: Optional[str] = None
    models: Dict[str, ModelCfg]


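# Example (illustrative only; the ids and model names below are placeholders):
#
#   cfg = EmbeddingConfigSchema(
#       default_model_id="local-minilm",
#       models={
#           "local-minilm": {"provider": "huggingface",
#                            "model_name_or_path": "sentence-transformers/all-MiniLM-L6-v2"},
#           "openai-small": {"provider": "openai",
#                            "model_name_or_path": "text-embedding-3-small"},
#       },
#   )
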
###############################################################################
# Provider helpers
###############################################################################

class EmbedFn(Protocol):
    """A protocol defining a callable for embedding texts.

    This ensures that any function used as an embedding function adheres to this
    specific signature, including the keyword-only `as_list` argument.
    """
    def __call__(
        self, texts: List[str], *, as_list: bool = False
    ) -> Union[np.ndarray, List[List[float]]]:
        ...


class CacheRecord(TypedDict):
    """Strongly-typed structure for a cache entry."""
    embed: EmbedFn
    close: Optional[Callable[[], None]]
    last: float


class _HuggingFaceEmbedder:
    """Wraps HF model/tokenizer; exposes poolable, dtype/device-aware embedding."""

    def __init__(self, cfg: HFModelCfg):
        self._device = torch.device(
            cfg.device if cfg.device is not None else ("cuda" if torch.cuda.is_available() else "cpu")
        )
        # Use fp16 only when the model will actually run on a CUDA device.
        dtype = torch.float16 if self._device.type == "cuda" else torch.float32
        try:
            self._tok = AutoTokenizer.from_pretrained(
                cfg.model_name_or_path, trust_remote_code=cfg.trust_remote_code
            )
            self._model = AutoModel.from_pretrained(
                cfg.model_name_or_path,
                torch_dtype=dtype,
                trust_remote_code=cfg.trust_remote_code,
            )
        # --- [FIX] Added robust error handling for model loading ---
        except (OSError, requests.exceptions.RequestException) as e:
            raise IOError(
                f"Failed to download or load model '{cfg.model_name_or_path}'. "
                "Check the model name and your network connection."
            ) from e

        self._model.to(self._device).eval()
        self._max_len = cfg.max_length
        self._batch_size = cfg.batch_size
        self._pool = cfg.pooling or _masked_mean

    @torch.inference_mode()
    def _forward(self, inp: Dict[str, Tensor]) -> Tensor:
        out = self._model(**inp).last_hidden_state
        return self._pool(out, inp["attention_mask"])

    def embed(self, texts: List[str], *, as_list: bool = False) -> np.ndarray | List[List[float]]:
        vecs: List[Tensor] = []
        for i in range(0, len(texts), self._batch_size):
            batch = texts[i: i + self._batch_size]
            tok = self._tok(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self._max_len,
            )
            tok = {k: v.to(self._device) for k, v in tok.items()}
            # --- [FIX] Performance: keep tensors on GPU during loop ---
            vecs.append(self._forward(tok))

        # --- [FIX] Performance: concatenate on GPU, then move to CPU once ---
        joined = torch.cat(vecs, dim=0).float().cpu().numpy()
        return joined.tolist() if as_list else joined

    def close(self) -> None:
        del self._model, self._tok
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# --------------------------------------------------------------------------

_BACKOFF: tuple[float, ...] = (1, 2, 4, 8)


def _openai_embedder(model: str, api_key: str) -> EmbedFn:
    session = requests.Session()
    session.headers.update(
        {"Authorization": f"Bearer {api_key}", "User-Agent": "EmbeddingsLib/4.0"}
    )

    def _embed(texts: List[str], *, as_list: bool = False) -> np.ndarray | List[List[float]]:
        payload = {"input": texts, "model": model}
        for attempt, wait in enumerate(_BACKOFF, 1):
            t0 = time.perf_counter()
            try:
                resp = session.post("https://api.openai.com/v1/embeddings", json=payload, timeout=30)
                resp.raise_for_status()
                data = resp.json()["data"]
                arr = np.asarray([d["embedding"] for d in data], dtype=np.float32)
                latency = time.perf_counter() - t0
                LOGGER.debug("openai_embed[%s] %d texts in %.3fs", model, len(texts), latency)
                return arr.tolist() if as_list else arr
            except requests.RequestException as exc:
                if attempt == len(_BACKOFF):
                    LOGGER.error("openai_embed failed after %d retries: %s", attempt, exc)
                    raise
                LOGGER.warning("openai_embed retry %d/%d after %s: %s", attempt, len(_BACKOFF), wait, exc)
                time.sleep(wait + random.random())
        raise RuntimeError("Exhausted retries in OpenAI embedder")  # Should be unreachable

    return _embed


###############################################################################
# Factory
###############################################################################

class EmbeddingFactory:
    """Thread‑safe LRU/idle cache with sync & async embedding methods."""

    def __init__(
        self,
        cfg: Dict[str, Any] | EmbeddingConfigSchema,
        *,
        max_cached: int = 2,
        idle_seconds: int = 900,
        allow_dynamic_hf: bool = True,
    ) -> None:
        self._cfg = cfg if isinstance(cfg, EmbeddingConfigSchema) else EmbeddingConfigSchema(**cfg)
        self._max_cached = max_cached
        self._idle = idle_seconds
        self._allow_dynamic_hf = allow_dynamic_hf
        self._cache: "OrderedDict[str, CacheRecord]" = OrderedDict()
        self._lock = threading.Lock()
        LOGGER.debug("factory initialised max_cached=%d idle=%ds", max_cached, idle_seconds)

    def _get_spec(self, model_id: str) -> ModelCfg:
        try:
            return self._cfg.models[model_id]
        except KeyError:
            if self._allow_dynamic_hf:
                LOGGER.info("dynamic HF model %s", model_id)
                return HFModelCfg(model_name_or_path=model_id, trust_remote_code=False)
            raise

    @property
    def config(self) -> EmbeddingConfigSchema:
        return self._cfg

    def _build(self, model_id: str) -> CacheRecord:
        spec = self._get_spec(model_id)
        t0 = time.perf_counter()
        if spec.provider == "huggingface":
            hf = _HuggingFaceEmbedder(spec)
            rec = CacheRecord(embed=hf.embed, close=hf.close, last=time.monotonic())
        elif spec.provider == "openai":
            fn = _openai_embedder(spec.model_name_or_path, spec.api_key)
            rec = CacheRecord(embed=fn, close=None, last=time.monotonic())
        else:
            raise ValueError(f"Unsupported provider: {spec.provider}")
        LOGGER.debug("load %s in %.2fs", model_id, time.perf_counter() - t0)
        return rec

    def embed(
        self, texts: List[str], *, model_id: Optional[str] = None, as_list: bool = False
    ) -> Union[np.ndarray, List[List[float]]]:
        model_id_to_use = model_id or self._cfg.default_model_id
        if not model_id_to_use:
            raise ValueError("No model_id provided and no default_model_id is set.")

        if not texts:
            return [] if as_list else np.empty((0, 0), dtype=np.float32)

        # --- The lock must be held during the embedding call to prevent use-after-free ---
        with self._lock:
            # First, check for idle models to evict.
            now = time.monotonic()
            for mid, rec in list(self._cache.items()):
                if now - rec["last"] > self._idle:
                    LOGGER.debug("idle evict %s", mid)
                    self._cache.pop(mid)
                    if rec["close"]:
                        rec["close"]()

            # Now, get the model, building it if it doesn't exist.
            rec = self._cache.get(model_id_to_use)
            if rec is None:
                # If we need to load a model, make space for it *first*.
                while len(self._cache) >= self._max_cached:
                    lru_mid, lru_rec = self._cache.popitem(last=False)
                    LOGGER.debug("LRU evict %s to make space", lru_mid)
                    if lru_rec["close"]:
                        lru_rec["close"]()
                rec = self._build(model_id_to_use)
                self._cache[model_id_to_use] = rec

            # Mark as most recently used and get the function to call.
            rec["last"] = time.monotonic()
            self._cache.move_to_end(model_id_to_use)
            embed_fn = rec["embed"]

            # The embedding call itself is inside the lock. This serializes
            # all embedding calls, but guarantees that the model cannot be
            # evicted by another thread while it is in use.
            t0 = time.perf_counter()
            result = embed_fn(texts, as_list=as_list)
            LOGGER.debug("embed %s %d texts in %.3fs", model_id_to_use, len(texts), time.perf_counter() - t0)
            return result
        # The lock is released only after the work is complete.

    async def async_embed(
        self, texts: List[str], *, model_id: Optional[str] = None, as_list: bool = False
    ) -> Union[np.ndarray, List[List[float]]]:
        """Non-blocking version of `embed` for async contexts (uses `asyncio.to_thread`, Python 3.9+)."""
        return await asyncio.to_thread(self.embed, texts, model_id=model_id, as_list=as_list)

    def embed_one(
        self, text: str, *, model_id: Optional[str] = None, as_list: bool = False
    ) -> Union[np.ndarray, List[float]]:
        vecs = self.embed([text], model_id=model_id, as_list=as_list)
        return vecs[0] if as_list else vecs.squeeze(0)

    async def async_embed_one(
        self, text: str, *, model_id: Optional[str] = None, as_list: bool = False
    ) -> Union[np.ndarray, List[float]]:
        vecs = await self.async_embed([text], model_id=model_id, as_list=as_list)
        return vecs[0] if as_list else vecs.squeeze(0)

    def prefetch(self, model_ids: List[str]) -> None:
        """Download / load the given model ids in advance (bypasses LRU eviction)."""
        for mid in model_ids:
            with self._lock:
                if mid in self._cache:
                    continue
                # Note: This can temporarily exceed max_cached, which is acceptable for a startup operation.
                self._cache[mid] = self._build(mid)
                LOGGER.info("prefetched %s", mid)

    def close(self) -> None:
        """Close all models and clear the cache."""
        with self._lock:
            for mid, rec in self._cache.items():
                if rec["close"]:
                    rec["close"]()
            self._cache.clear()
            LOGGER.debug("factory closed")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

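
# ----------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). The model id and name below are
# placeholders, not project defaults; running this will download model weights.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    demo_cfg = {
        "default_model_id": "minilm",
        "models": {
            "minilm": {
                "provider": "huggingface",
                "model_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
            },
        },
    }

    with EmbeddingFactory(demo_cfg, max_cached=1) as factory:
        # Optional warm-up so the first embed() call doesn't pay the load cost.
        factory.prefetch(["minilm"])

        # Synchronous call: returns an (n_texts, dim) float32 numpy array.
        vecs = factory.embed(["hello world", "embeddings are fun"])
        print(vecs.shape)

        # Async facade: the same work, dispatched to a worker thread.
        async def _demo() -> None:
            one = await factory.async_embed_one("hello again")
            print(one.shape)

        asyncio.run(_demo())
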
#
# End of Embeddings_Lib.py
########################################################################################################################