ingest.base
PDF2MarkdownLoader
def PDF2MarkdownLoader(
file_path:Union, password:Optional=None, mode:Literal='page', pages_delimiter:str='\n\x0c',
extract_images:bool=False, images_parser:Optional=None, images_inner_format:Literal='text',
extract_tables:Optional=None, headers:Optional=None, extract_tables_settings:Optional=None, **kwargs
)->None:
Custom PDF to Markdown Loader
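A minimal usage sketch, assuming `PDF2MarkdownLoader` is importable from `onprem.ingest.base` (per the module heading) and follows the standard LangChain loader interface of instantiate-then-`.load()`; the file path is hypothetical:

```python
from onprem.ingest.base import PDF2MarkdownLoader

# hypothetical input file; mode='page' yields one Document per page
loader = PDF2MarkdownLoader("/path/to/report.pdf", mode="page")
docs = loader.load()
print(docs[0].page_content[:200])  # Markdown-formatted text of the first page
```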
MyUnstructuredPDFLoader
def MyUnstructuredPDFLoader(
file_path:Union, mode:str='single', **unstructured_kwargs
):
Custom PDF Loader
MyElmLoader
def MyElmLoader(
file_path:Union, mode:str='single', **unstructured_kwargs
):
Wrapper that falls back to text/plain extraction when the default loader fails
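Both wrapper loaders are used the same way; a sketch assuming the standard LangChain `.load()` contract (file paths are hypothetical):

```python
from onprem.ingest.base import MyUnstructuredPDFLoader, MyElmLoader

# unstructured-based PDF extraction (OCRs scanned pages if necessary)
pdf_docs = MyUnstructuredPDFLoader("/path/to/scan.pdf", mode="single").load()

# MyElmLoader retries as text/plain when the default parse raises an error
other_docs = MyElmLoader("/path/to/message.eml").load()
```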
load_spreadsheet_documents
def load_spreadsheet_documents(
file_path, text_column, metadata_columns:Optional=None, sheet_name:Optional=None
):
Load documents from a spreadsheet where each row becomes a document.
Args:
- file_path: Path to the spreadsheet file (.xlsx, .xls, .csv)
- text_column: Name of the column containing the text content
- metadata_columns: List of column names to include as metadata (default: all other columns)
- sheet_name: For Excel files, name of the sheet to read (default: first sheet)
Returns: List of Document objects, one per row
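A sketch of row-wise loading (the file name and column names are hypothetical):

```python
from onprem.ingest.base import load_spreadsheet_documents

docs = load_spreadsheet_documents(
    "tickets.xlsx",                       # hypothetical spreadsheet
    text_column="description",            # column holding the document text
    metadata_columns=["id", "priority"],  # copied into doc.metadata
)
print(len(docs), docs[0].metadata)
```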
load_web_document
def load_web_document(
url, username:Optional=None, password:Optional=None
):
Download and extract text from a web document using load_single_document.
Args:
- url: The URL to download from
- username: Optional username for authentication (e.g., for SharePoint)
- password: Optional password for authentication (e.g., for SharePoint)
Returns: List of Document objects
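For example (the URL is a placeholder; credentials are only needed for authenticated sites such as SharePoint):

```python
from onprem.ingest.base import load_web_document

docs = load_web_document("https://example.com/whitepaper.pdf")
print(docs[0].metadata.get("source"))
```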
batchify_chunks
def batchify_chunks(
texts, batch_size:int=41000
):
Split texts into batches sized for adding to a Chroma vectorstore
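The docstring does not spell out the return shape; this sketch assumes the result can be iterated to yield one batch at a time (an assumption, not a confirmed API detail):

```python
from onprem.ingest.base import batchify_chunks

chunks = [f"chunk {i}" for i in range(100_000)]
# assumption: iterating yields groups of at most `batch_size` items
for batch in batchify_chunks(chunks, batch_size=41000):
    pass  # e.g., add `batch` to a Chroma vectorstore
```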
does_vectorstore_exist
def does_vectorstore_exist(
db
)->bool:
Checks if vectorstore exists
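A typical guard during ingestion; `db` is assumed to be a vectorstore handle (e.g., a Chroma instance) supplied by the caller, and `ensure_index` is a hypothetical helper:

```python
from onprem.ingest.base import does_vectorstore_exist

def ensure_index(db):
    # db: an existing vectorstore handle created elsewhere
    if not does_vectorstore_exist(db):
        raise RuntimeError("No existing vectorstore; ingest documents first.")
```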
chunk_documents
def chunk_documents(
documents:list, # list of LangChain Documents or list of text strings
chunk_size:int=1000, # text is split to this many characters by `langchain.text_splitter.RecursiveCharacterTextSplitter`
chunk_overlap:int=100, # character overlap between chunks in `langchain.text_splitter.RecursiveCharacterTextSplitter`
infer_table_structure:bool=False, # This should be set to True if `documents` may contain tables (i.e., `doc.metadata['table']=True`).
preserve_paragraphs:bool=False, # If True, strictly chunk by paragraph and only split if paragraph exceeds `chunk_size`. If False, small paragraphs will be accumulated into a single chunk until `chunk_size` is exceeded.
keep_full_document:bool=False, # If True, skip chunking and return documents as-is
**kwargs
)->List:
Process list of Documents or text strings by splitting into chunks. If text strings are provided, they will be converted to Document objects internally. If keep_full_document=True, documents are returned as-is without chunking.
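A sketch showing the plain-string input path; per the docstring, strings are wrapped as Document objects internally (the sample texts are hypothetical):

```python
from onprem.ingest.base import chunk_documents

texts = ["First paragraph...\n\nSecond paragraph...", "Another document."]
chunks = chunk_documents(texts, chunk_size=500, chunk_overlap=50)
for chunk in chunks:
    print(len(chunk.page_content), chunk.metadata)
```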
process_folder
def process_folder(
source_directory:str, # path to folder containing document store
chunk_size:int=1000, # text is split to this many characters by `langchain.text_splitter.RecursiveCharacterTextSplitter`
chunk_overlap:int=100, # character overlap between chunks in `langchain.text_splitter.RecursiveCharacterTextSplitter`
ignored_files:List=[], # list of files to ignore
ignore_fn:Optional=None, # Callable that accepts the file path (including file name) as input and ignores if returns True
batch_size:int=41000, # batch size used when processing documents
**kwargs
)->List:
Load documents from a folder, extract text from them, and split the texts into chunks. Extra kwargs are fed to ingest.load_documents and ingest.load_single_document.
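Putting it together on a hypothetical folder; the `ignore_fn` illustrates the callable contract (return True for files to skip):

```python
from onprem.ingest.base import process_folder

chunks = process_folder(
    "/path/to/document_store",                     # hypothetical folder
    chunk_size=1000,
    chunk_overlap=100,
    ignore_fn=lambda path: path.endswith(".tmp"),  # skip temp files
)
print(f"{len(chunks)} chunks ready for indexing")
```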
load_documents
def load_documents(
source_dir:str, # path to folder containing documents
ignored_files:List=[], # list of filepaths to ignore
ignore_fn:Optional=None, # callable that accepts file path and returns True for ignored files
caption_tables:bool=False, # If True, augment table text with summaries of tables if infer_table_structure is True.
extract_document_titles:bool=False, # If True, infer document title and attach to individual chunks
llm:Optional=None, # a reference to the LLM (used by `caption_tables` and `extract_document_titles`)
n_proc:Optional=None, # number of CPU cores to use for text extraction. If None, use maximum for system.
verbose:bool=True, # verbosity
preserve_paragraphs:bool=False, # This is not used here and is only included to mask it from being forwarded to [`load_single_document`](https://amaiya.github.io/onprem/ingest.base.html#load_single_document).
**kwargs
)->List:
Loads all documents from the source documents directory, ignoring specified files. Extra kwargs are fed to ingest.load_single_document.
Returns a generator over documents.
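Because a generator is returned, documents can be consumed lazily; a sketch with a hypothetical source folder:

```python
from onprem.ingest.base import load_documents

for doc in load_documents("/path/to/document_store", verbose=False):
    print(doc.metadata.get("source"))
```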
load_single_document
def load_single_document(
file_path:str, # path to file
pdf_unstructured:bool=False, # use unstructured for PDF extraction if True (will also OCR if necessary)
pdf_markdown:bool=False, # Convert PDFs to Markdown instead of plain text if True.
store_md5:bool=False, # Extract and store MD5 of document in metadata
store_mimetype:bool=False, # Guess and store mime type of document in metadata
store_file_dates:bool=False, # Extract and store file dates in metadata
keep_full_document:bool=False, # If True, concatenate multi-page documents into single documents and disable chunking
max_words:Optional=None, # If provided, truncate document content to first N words (applied after concatenation)
file_callables:Optional=None, # optional dict mapping metadata keys to functions called with the file path as argument; results are stored as metadata under those keys.
text_callables:Optional=None, # optional dict mapping metadata keys to functions called with the extracted file text as argument; results are stored as metadata under those keys.
**kwargs
)->List:
Extract text from a single document. Will attempt to OCR PDFs, if necessary.
Note that extra kwargs can be supplied to configure the behavior of PDF loaders. For instance, supplying infer_table_structure=True will cause load_single_document to try to infer and extract tables from PDFs. When pdf_unstructured=True and infer_table_structure=True, tables are represented as HTML within the main body of extracted text. In all other cases, inferred tables are represented as Markdown and appended to the end of the extracted text when infer_table_structure=True.
The keep_full_document option will combine multi-page documents into single documents with page breaks and disable chunking downstream. The max_words option will truncate documents to the specified number of words (applied after concatenation). When truncation occurs, metadata is updated to include original word count and truncation information.
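A final sketch tying these options together (the file path is hypothetical; infer_table_structure is an extra kwarg forwarded to the PDF loader as described above):

```python
from onprem.ingest.base import load_single_document

docs = load_single_document(
    "/path/to/contract.pdf",     # hypothetical file
    pdf_markdown=True,           # convert the PDF to Markdown instead of plain text
    store_md5=True,              # record the file's MD5 hash in metadata
    infer_table_structure=True,  # inferred tables appended as Markdown (pdf_unstructured=False)
)
for doc in docs:
    print(doc.metadata)
```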