Source code for src.ingestor.document_ingestor

import logging
import os
from pathlib import Path
from typing import List, Optional, Union

from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from .base import BaseIngestor


logger = logging.getLogger(__name__)


class DocumentIngestor(BaseIngestor):
    """Ingests documents (PDF, DOCX) for RAG pipeline.

    Each file is loaded with a loader chosen from its extension and then
    split into chunks with a ``RecursiveCharacterTextSplitter`` configured
    from the constructor arguments.
    """

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 100,
        separators: Optional[List[str]] = None,
    ):
        """
        Args:
            chunk_size: Maximum character length per chunk. Must be >= 100.
                Defaults to 500.
            chunk_overlap: Character overlap between consecutive chunks.
                Must be >= 0 and < ``chunk_size``. Defaults to 100.
            separators: Ordered list of separators tried by
                ``RecursiveCharacterTextSplitter``. ``None`` or an empty
                list selects ``["\\n\\n", "\\n", ".", " ", ""]``.

        Raises:
            ValueError: If ``chunk_overlap`` is not smaller than
                ``chunk_size``, if ``chunk_size`` is below 100, or if
                ``chunk_overlap`` is negative.
        """
        # The first two checks keep their original order and messages; the
        # negative-overlap check comes last so inputs that were already
        # rejected still raise exactly the same error.
        if chunk_overlap >= chunk_size:
            raise ValueError(
                f"chunk_overlap ({chunk_overlap}) must be less than chunk_size ({chunk_size})"
            )
        if chunk_size < 100:
            raise ValueError(f"chunk_size ({chunk_size}) should be at least 100")
        if chunk_overlap < 0:
            raise ValueError(f"chunk_overlap ({chunk_overlap}) must not be negative")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Copy the caller's list so later mutation on their side cannot
        # change this instance's configuration.
        self.separators = list(separators) if separators else ["\n\n", "\n", ".", " ", ""]
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=self.separators,
        )

    def _load_document(self, file_path: str) -> List[Document]:
        """Load a single document based on file extension.

        Args:
            file_path: Path of the file to load.

        Returns:
            Whatever the selected loader's ``load()`` returns.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the (lower-cased) extension is not ``.pdf``,
                ``.docx`` or ``.doc``.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = path.suffix.lower()
        if file_ext == ".pdf":
            loader = PyPDFLoader(file_path)
        elif file_ext in (".docx", ".doc"):
            # NOTE(review): legacy binary ``.doc`` files are routed to the
            # DOCX loader here; presumably that loader only understands the
            # zip-based ``.docx`` format -- confirm and drop ``.doc`` if so.
            loader = Docx2txtLoader(file_path)
        else:
            raise ValueError(f"Unsupported file type: {file_ext}")
        return loader.load()

    def ingest(
        self, file_paths: Union[str, Path, List[Union[str, Path]]]
    ) -> List[Document]:
        """
        Load and chunk document(s).

        Ingestion is best-effort per file: a file that fails to load or
        split is logged with its traceback and skipped, and the remaining
        files are still processed.

        Args:
            file_paths: Single file path (``str`` or path-like object) or a
                list of such paths.

        Returns:
            List of chunked Document objects. Each chunk's metadata gains
            ``doc_title`` (the file name without extension) and
            ``chunk_index`` (position of the chunk within its file,
            starting at 0).
        """
        # A lone path -- string or path-like -- is wrapped so the loop below
        # handles both call styles. Without the PathLike check a single
        # ``Path`` would fail with "object is not iterable".
        if isinstance(file_paths, (str, os.PathLike)):
            paths = [file_paths]
        else:
            paths = file_paths

        all_chunks: List[Document] = []
        for file_path in paths:
            try:
                path_str = os.fspath(file_path)
                documents = self._load_document(path_str)
                logger.info("Loaded docs: %d (%s)", len(documents), path_str)
                chunks = self.splitter.split_documents(documents)
                logger.info("Chunks: %d (%s)", len(chunks), path_str)

                # Enrich chunk metadata with doc_title and chunk_index
                doc_title = Path(path_str).stem  # filename without extension
                for chunk_idx, chunk in enumerate(chunks):
                    chunk.metadata["doc_title"] = doc_title
                    chunk.metadata["chunk_index"] = chunk_idx
                all_chunks.extend(chunks)
            except Exception:
                # Deliberately broad: one bad file must not abort the batch.
                # ``logger.exception`` records the traceback for diagnosis.
                logger.exception("Failed to ingest %s", file_path)
        return all_chunks