Source code for src.ingestor.document_ingestor

from typing import List, Optional, Union
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from .base import BaseIngestor



[docs]
class DocumentIngestor(BaseIngestor):
    """Ingests documents (PDF, DOCX) for RAG pipeline"""

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 100,
        separators: Optional[List[str]] = None,
    ):
        """
        Validate the chunking parameters and build the text splitter.

        Args:
            chunk_size: Maximum character length per chunk. Must be >= 100. Defaults to 500.
            chunk_overlap: Character overlap between consecutive chunks. Must be
                >= 0 and < ``chunk_size``. Defaults to 100.
            separators: Ordered list of separators tried by ``RecursiveCharacterTextSplitter``.
                Defaults to ``["\\n\\n", "\\n", ".", " ", ""]``.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``,
                if ``chunk_size`` is below 100, or if ``chunk_overlap`` is negative.
        """
        if chunk_overlap >= chunk_size:
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})"
            )

        if chunk_size < 100:
            raise ValueError(f"chunk_size ({chunk_size}) should be at least 100")

        # Checked last so that inputs rejected by the two checks above keep
        # reporting the same message as before; a negative overlap is
        # meaningless and is rejected here rather than left to the splitter.
        if chunk_overlap < 0:
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must be non-negative"
            )

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # A falsy value (None or an empty list) selects the default separators.
        self.separators = separators or ["\n\n", "\n", ".", " ", ""]
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=self.separators,
        )

    def _load_document(self, file_path: str) -> List[Document]:
        """
        Load one file with the loader that matches its extension.

        Args:
            file_path: Path to a ``.pdf``, ``.docx`` or ``.doc`` file.

        Returns:
            Whatever the selected loader's ``load()`` call returns.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the (case-insensitive) extension is not supported.
        """
        source = Path(file_path)
        if not source.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        suffix = source.suffix.lower()
        if suffix not in (".pdf", ".docx", ".doc"):
            raise ValueError(f"Unsupported file type: {suffix}")

        # PDFs use the PDF loader; both Word suffixes share the docx2txt loader.
        loader_cls = PyPDFLoader if suffix == ".pdf" else Docx2txtLoader
        return loader_cls(file_path).load()


[docs]
    def ingest(
        self, file_paths: Union[str, Path, List[Union[str, Path]]]
    ) -> List[Document]:
        """
        Load and chunk document(s).

        Each file is loaded, split with ``self.splitter``, and every resulting
        chunk gets two extra metadata keys: ``doc_title`` (the file name
        without its extension) and ``chunk_index`` (position of the chunk
        within that file, restarting at 0 for each file).

        Ingestion is best-effort: a file that fails to load or split is
        reported on stdout and skipped, and the remaining files are still
        processed.

        Args:
            file_paths: A single file path (``str`` or ``pathlib.Path``) or a
                list of file paths.

        Returns:
            List of chunked Document objects with enriched metadata, in the
            order the files were given. Empty if no file could be ingested.
        """
        # Wrap a lone path so the loop below handles one shape only. Path is
        # included because iterating a Path object raises TypeError.
        if isinstance(file_paths, (str, Path)):
            file_paths = [file_paths]

        all_chunks = []

        for file_path in file_paths:
            try:
                # Normalize to str so the loaders always receive a plain path string.
                documents = self._load_document(str(file_path))
                print("Loaded docs:", len(documents))
                chunks = self.splitter.split_documents(documents)
                print("Chunks:", len(chunks))

                # Enrich chunk metadata with doc_title and chunk_index
                doc_title = Path(file_path).stem  # filename without extension
                for chunk_idx, chunk in enumerate(chunks):
                    chunk.metadata["doc_title"] = doc_title
                    chunk.metadata["chunk_index"] = chunk_idx

                all_chunks.extend(chunks)
            except Exception as e:
                # Deliberately broad: one bad file must not abort the batch.
                print(f"Failed to ingest {file_path}: {e}")

        return all_chunks