Source code for src.ingestor.embedder

from typing import List, Optional, Dict, Any
from abc import ABC, abstractmethod
from langchain_core.embeddings import Embeddings
from langchain_huggingface import HuggingFaceEmbeddings


class BaseEmbedder(ABC):
    """Abstract interface that every embedder implementation must provide.

    Concrete subclasses supply document embedding, query embedding, and
    access to the LangChain ``Embeddings`` object they wrap.
    """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of document strings and return their vectors."""

    @abstractmethod
    def embed_query(self, text: str) -> List[float]:
        """Embed one query string and return its vector."""

    @abstractmethod
    def get_embeddings(self) -> Embeddings:
        """Return the underlying LangChain ``Embeddings`` object."""
class DocumentEmbedder(BaseEmbedder):
    """Embedder backed by LangChain's ``HuggingFaceEmbeddings``."""

    def __init__(
        self,
        model_name: str = "BAAI/bge-large-en-v1.5",
        model_kwargs: Optional[Dict[str, Any]] = None,
        encode_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Build the wrapped ``HuggingFaceEmbeddings`` instance.

        Args:
            model_name: HuggingFace model ID for embeddings. Defaults to
                ``"BAAI/bge-large-en-v1.5"``.
            model_kwargs: Passed to ``HuggingFaceEmbeddings`` as
                ``model_kwargs`` (e.g., ``{"device": "cuda"}``). When
                omitted (``None``), ``{"device": "cpu"}`` is used.
            encode_kwargs: Passed to ``HuggingFaceEmbeddings`` as
                ``encode_kwargs`` (e.g., ``{"normalize_embeddings": True}``).
                When omitted (``None``), ``{"normalize_embeddings": True}``
                is used.
        """
        self.model_name = model_name
        # Compare against None rather than relying on truthiness: an
        # explicitly passed empty dict is a deliberate "no extra options"
        # choice and must not be replaced by the defaults.
        self.model_kwargs = (
            {'device': 'cpu'} if model_kwargs is None else model_kwargs
        )
        self.encode_kwargs = (
            {'normalize_embeddings': True}
            if encode_kwargs is None
            else encode_kwargs
        )
        self.embeddings = HuggingFaceEmbeddings(
            model_name=self.model_name,
            model_kwargs=self.model_kwargs,
            encode_kwargs=self.encode_kwargs,
        )

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of document strings and return their dense vectors.

        Delegates to the wrapped object's ``embed_documents``.
        """
        return self.embeddings.embed_documents(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its dense vector.

        Delegates to the wrapped object's ``embed_query``.
        """
        return self.embeddings.embed_query(text)

    def get_embeddings(self) -> Embeddings:
        """Return the underlying LangChain ``Embeddings`` object (e.g., for use with FAISS)."""
        return self.embeddings