ingest.base

Functionality for text extraction and document ingestion into a vector database for question answering and other tasks.


PDF2MarkdownLoader


def PDF2MarkdownLoader(
    file_path, password=None, mode='page', pages_delimiter:str='\n\x0c',
    extract_images:bool=False, images_parser=None, images_inner_format='text',
    extract_tables=None, headers=None, extract_tables_settings=None, **kwargs
)->None:

Custom PDF to Markdown Loader
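
A minimal usage sketch, assuming the loader follows the standard LangChain loader interface (a `.load()` method returning `Document` objects) and that the import path matches this page's module name; the file name is hypothetical:

```python
from onprem.ingest.base import PDF2MarkdownLoader  # import path assumed from this page's module

# "report.pdf" is a hypothetical file; the parameters are those documented above
loader = PDF2MarkdownLoader("report.pdf", mode="page", extract_images=False)
docs = loader.load()  # assumes the standard LangChain loader interface
for doc in docs:
    print(doc.metadata)
    print(doc.page_content[:200])  # Markdown-formatted text
```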



MyUnstructuredPDFLoader


def MyUnstructuredPDFLoader(
    file_path, mode:str='single', **unstructured_kwargs
):

Custom PDF Loader
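
A sketch under the assumption that `mode` follows LangChain's unstructured loader convention ("single" for one `Document` per file, "elements" for one per element) and that extra keyword arguments are passed through to unstructured:

```python
from onprem.ingest.base import MyUnstructuredPDFLoader  # import path assumed

# "scanned.pdf" is hypothetical; strategy="hi_res" is an unstructured kwarg
# that enables higher-quality layout analysis (and OCR where needed)
loader = MyUnstructuredPDFLoader("scanned.pdf", mode="single", strategy="hi_res")
docs = loader.load()  # assumes the standard LangChain loader interface
print(len(docs), docs[0].page_content[:200])
```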



MyElmLoader


def MyElmLoader(
    file_path, mode:str='single', **unstructured_kwargs
):

Wrapper that falls back to text/plain extraction when the default loader fails



load_spreadsheet_documents


def load_spreadsheet_documents(
    file_path, text_column, metadata_columns=None, sheet_name=None
):

Load documents from a spreadsheet where each row becomes a document.

Args:
  file_path: Path to the spreadsheet file (.xlsx, .xls, .csv)
  text_column: Name of the column containing the text content
  metadata_columns: List of column names to include as metadata (default: all other columns)
  sheet_name: For Excel files, name of the sheet to read (default: first sheet)

Returns:
  List of Document objects, one per row
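
A runnable sketch, assuming the import path below; the CSV contents and column names are made up for illustration:

```python
import csv
from onprem.ingest.base import load_spreadsheet_documents  # import path assumed

# Build a small CSV where each row will become one Document
with open("faq.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["question", "answer", "topic"])
    writer.writerow(["What does ingest.base do?", "Text extraction and ingestion.", "general"])
    writer.writerow(["Are PDFs OCR'd?", "Yes, when necessary.", "pdf"])

docs = load_spreadsheet_documents("faq.csv", text_column="answer",
                                  metadata_columns=["question", "topic"])
print(len(docs))         # one Document per row
print(docs[0].metadata)  # includes the "question" and "topic" columns
```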



load_web_document


def load_web_document(
    url, username=None, password=None
):

Download and extract text from a web document using load_single_document.

Args:
  url: The URL to download from
  username: Optional username for authentication (e.g., for SharePoint)
  password: Optional password for authentication (e.g., for SharePoint)

Returns:
  List of Document objects
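
A sketch, assuming the import path; the URL is illustrative:

```python
from onprem.ingest.base import load_web_document  # import path assumed

# username/password are only needed for authenticated sources such as SharePoint
docs = load_web_document("https://example.com/whitepaper.pdf")
print(docs[0].metadata)
print(docs[0].page_content[:200])
```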



batchify_chunks


def batchify_chunks(
    texts, batch_size:int=41000
):

Split texts into batches, sized specifically for Chroma
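
Chroma caps how many records can be added in a single call, which is why chunk lists are batched before insertion. A sketch of the intended use; the assumption that the function yields successive sublists is mine, not stated above:

```python
from onprem.ingest.base import batchify_chunks  # import path assumed

texts = [f"chunk {i}" for i in range(100_000)]

# Assumption: batchify_chunks yields sublists of at most batch_size items,
# each small enough to pass to Chroma's add methods in one call
for batch in batchify_chunks(texts, batch_size=41000):
    print(len(batch))
```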



does_vectorstore_exist


def does_vectorstore_exist(
    db
)->bool:

Checks if vectorstore exists
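
A sketch of the typical check before deciding whether to create or append; `db` is assumed to be a Chroma vectorstore handle, and the Chroma import path varies by LangChain version:

```python
from onprem.ingest.base import does_vectorstore_exist  # import path assumed
from langchain_community.vectorstores import Chroma    # path varies by LangChain version

db = Chroma(persist_directory="vectordb")
if does_vectorstore_exist(db):
    print("Existing vectorstore found; new documents will be appended.")
else:
    print("No vectorstore yet; one will be created on first ingest.")
```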



chunk_documents


def chunk_documents(
    documents:list, # list of LangChain Documents or list of text strings
    chunk_size:int=1000, # text is split to this many characters by `langchain.text_splitter.RecursiveCharacterTextSplitter`
    chunk_overlap:int=100, # character overlap between chunks in `langchain.text_splitter.RecursiveCharacterTextSplitter`
    infer_table_structure:bool=False, # This should be set to True if `documents` may contain tables (i.e., `doc.metadata['table']=True`).
    preserve_paragraphs:bool=False, # If True, strictly chunk by paragraph and only split if a paragraph exceeds `chunk_size`. If False, small paragraphs are accumulated into a single chunk until `chunk_size` is exceeded.
    keep_full_document:bool=False, # If True, skip chunking and return documents as-is
    **kwargs
)->List:

Process a list of Documents or text strings by splitting them into chunks. If text strings are provided, they are converted to Document objects internally. If `keep_full_document=True`, documents are returned as-is without chunking.
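
A sketch, assuming the import path; the input strings are made up:

```python
from onprem.ingest.base import chunk_documents  # import path assumed

# Plain strings are accepted and converted to Document objects internally
texts = ["A long report. " * 500, "A short note."]
chunks = chunk_documents(texts, chunk_size=1000, chunk_overlap=100)
print(len(chunks), chunks[0].page_content[:80])

# keep_full_document=True returns the documents unchunked
full = chunk_documents(texts, keep_full_document=True)
print(len(full))  # one Document per input string
```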



process_folder


def process_folder(
    source_directory:str, # path to folder containing document store
    chunk_size:int=1000, # text is split to this many characters by `langchain.text_splitter.RecursiveCharacterTextSplitter`
    chunk_overlap:int=100, # character overlap between chunks in `langchain.text_splitter.RecursiveCharacterTextSplitter`
    ignored_files:List=[], # list of files to ignore
    ignore_fn=None, # Callable that accepts the file path (including file name) as input; the file is skipped if it returns True
    batch_size:int=41000, # batch size used when processing documents
    **kwargs
)->List:

Load documents from a folder, extract their text, and split it into chunks. Extra kwargs are forwarded to `ingest.load_documents` and `ingest.load_single_document`.
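
A sketch, assuming the import path; the folder name is hypothetical:

```python
from onprem.ingest.base import process_folder  # import path assumed

# Extra kwargs (e.g., pdf_markdown=True) are forwarded to
# load_documents and load_single_document, as noted above
chunks = process_folder(
    "my_documents",                                # hypothetical folder
    chunk_size=500,
    chunk_overlap=50,
    ignore_fn=lambda path: path.endswith(".tmp"),  # skip temp files
)
print(f"{len(chunks)} chunks ready for ingestion")
```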



load_documents


def load_documents(
    source_dir:str, # path to folder containing documents
    ignored_files:List=[], # list of filepaths to ignore
    ignore_fn=None, # callable that accepts the file path and returns True for ignored files
    caption_tables:bool=False, # If True, augment table text with summaries of tables if `infer_table_structure` is True
    extract_document_titles:bool=False, # If True, infer document title and attach to individual chunks
    llm=None, # a reference to the LLM (used by `caption_tables` and `extract_document_titles`)
    n_proc=None, # number of CPU cores to use for text extraction. If None, use the system maximum.
    verbose:bool=True, # verbosity
    preserve_paragraphs:bool=False, # This is not used here and is only included to mask it from being forwarded to [`load_single_document`](https://amaiya.github.io/onprem/ingest.base.html#load_single_document).
    **kwargs
)->List:

Loads all documents from the source directory, ignoring specified files. Extra kwargs are forwarded to `ingest.load_single_document`.

Returns a generator over documents.
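
Because a generator is returned, large folders can be processed without holding every document in memory. A sketch, assuming the import path; the folder name is hypothetical:

```python
from onprem.ingest.base import load_documents  # import path assumed

for doc in load_documents("my_documents", verbose=False):
    # "source" is the conventional LangChain metadata key for the file path
    print(doc.metadata.get("source"))
```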



load_single_document


def load_single_document(
    file_path:str, # path to file
    pdf_unstructured:bool=False, # use unstructured for PDF extraction if True (will also OCR if necessary)
    pdf_markdown:bool=False, # Convert PDFs to Markdown instead of plain text if True
    store_md5:bool=False, # Extract and store MD5 of document in metadata
    store_mimetype:bool=False, # Guess and store MIME type of document in metadata
    store_file_dates:bool=False, # Extract and store file dates in metadata
    keep_full_document:bool=False, # If True, concatenate multi-page documents into single documents and disable chunking
    max_words=None, # If provided, truncate document content to first N words (applied after concatenation)
    file_callables=None, # optional dict mapping metadata keys to functions called with the file path as argument; results are stored as metadata
    text_callables=None, # optional dict mapping metadata keys to functions called with the file text as argument; results are stored as metadata
    **kwargs
)->List:

Extract text from a single document. Will attempt to OCR PDFs, if necessary.

Note that extra kwargs can be supplied to configure the behavior of PDF loaders. For instance, supplying `infer_table_structure=True` will cause `load_single_document` to try to infer and extract tables from PDFs. When `pdf_unstructured=True` and `infer_table_structure=True`, tables are represented as HTML within the main body of extracted text. In all other cases, inferred tables are represented as Markdown and appended to the end of the extracted text when `infer_table_structure=True`.

The `keep_full_document` option combines multi-page documents into single documents with page breaks and disables chunking downstream. The `max_words` option truncates documents to the specified number of words (applied after concatenation). When truncation occurs, metadata is updated to include the original word count and truncation information.
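
A sketch combining the options described above; the file name is hypothetical, and the exact metadata keys written by the storage options are not specified on this page:

```python
from onprem.ingest.base import load_single_document  # import path assumed

docs = load_single_document(
    "manual.pdf",               # hypothetical file
    store_md5=True,             # MD5 recorded in metadata
    store_mimetype=True,        # guessed MIME type recorded in metadata
    infer_table_structure=True, # forwarded kwarg: tables appended as Markdown here
    keep_full_document=True,    # one Document for the whole file; chunking disabled
    max_words=5000,             # truncate after concatenation
)
print(docs[0].metadata)  # includes hash/type info and any truncation details
```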