Source code for src.ingestor.embedder
from typing import List, Optional, Dict, Any
from abc import ABC, abstractmethod
from langchain_core.embeddings import Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
[docs]
class BaseEmbedder(ABC):
    """Abstract interface that every embedder implementation must satisfy.

    Concrete subclasses have to override all three abstract methods below
    before they can be instantiated.
    """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of document strings.

        Args:
            texts: The document strings to embed.

        Returns:
            A list of float vectors produced for ``texts``.
        """

    @abstractmethod
    def embed_query(self, text: str) -> List[float]:
        """Embed one query string.

        Args:
            text: The query string to embed.

        Returns:
            The float vector produced for ``text``.
        """

    @abstractmethod
    def get_embeddings(self) -> Embeddings:
        """Return the underlying LangChain ``Embeddings`` object."""
[docs]
class DocumentEmbedder(BaseEmbedder):
    """Embedder backed by LangChain's ``HuggingFaceEmbeddings``.

    The wrapped model is built once in ``__init__`` and every embedding call
    is delegated to it.
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-large-en-v1.5",
        model_kwargs: Optional[Dict[str, Any]] = None,
        encode_kwargs: Optional[Dict[str, Any]] = None
    ):
        """Store the configuration and build the HuggingFace embeddings object.

        Args:
            model_name: HuggingFace model ID used for the embeddings.
                Defaults to ``"BAAI/bge-large-en-v1.5"``.
            model_kwargs: Forwarded to ``HuggingFaceEmbeddings`` as
                ``model_kwargs`` (e.g., ``{"device": "cuda"}``). When omitted,
                ``{'device': 'cpu'}`` is used.
            encode_kwargs: Forwarded to ``HuggingFaceEmbeddings`` as
                ``encode_kwargs``. When omitted,
                ``{'normalize_embeddings': True}`` is used.
        """
        self.model_name = model_name

        # The fallback is truthiness-based: both ``None`` and an empty dict
        # select the default settings.
        self.model_kwargs = model_kwargs if model_kwargs else {'device': 'cpu'}
        self.encode_kwargs = (
            encode_kwargs if encode_kwargs else {'normalize_embeddings': True}
        )

        hf_settings = {
            "model_name": self.model_name,
            "model_kwargs": self.model_kwargs,
            "encode_kwargs": self.encode_kwargs,
        }
        self.embeddings = HuggingFaceEmbeddings(**hf_settings)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of document strings via the wrapped embeddings object.

        Args:
            texts: The document strings to embed.

        Returns:
            Whatever the wrapped object's ``embed_documents`` returns for
            ``texts``.
        """
        vectors = self.embeddings.embed_documents(texts)
        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed one query string via the wrapped embeddings object.

        Args:
            text: The query string to embed.

        Returns:
            Whatever the wrapped object's ``embed_query`` returns for ``text``.
        """
        vector = self.embeddings.embed_query(text)
        return vector

    def get_embeddings(self) -> Embeddings:
        """Expose the wrapped LangChain ``Embeddings`` object.

        Useful for APIs that take an ``Embeddings`` instance directly
        (e.g., a FAISS vector store).
        """
        wrapped = self.embeddings
        return wrapped