ingest.helpers

helper utilities for ingesting documents

source

md5sum


def md5sum(
    filepath
):

Perform an MD5 hash of a file.


source

date2iso


def date2iso(
    d
):

source

iso2date


def iso2date(
    s
):

source

extract_file_dates


def extract_file_dates(
    filepath
):

Takes a file path and returns ISO datetime strings for the file's creation date and last-modified date.

Returns a tuple of the form (create-date, last-modify-date).


source

extract_extension


def extract_extension(
    filepath:str, include_dot:bool=False
):

Extracts the file extension from a file path (the leading dot is included only when include_dot is True)


source

extract_files


def extract_files(
    source_dir:str, follow_links:bool=False, extensions:Union=None
):

source

extract_tables


def extract_tables(
    filepath:Optional=None, docs:Optional=[]
)->List:

Extract tables from PDF and append to end of supplied Document list. Accepts either a filepath or a list of LangChain Document objects all from a single file. If filepath is empty, the file path of interest is extracted from docs.

Returns an updated list of Document objects appended with extracted tables.


source

includes_caption


def includes_caption(
    d:Document
):

Returns True if content of supplied Document includes a table caption


source

is_random_plaintext


def is_random_plaintext(
    extension, mimetype
):

Check mimetype for plain text


source

extract_mimetype


def extract_mimetype(
    filepath
):

Extract mimetype. Returns a tuple with extracted mimetype, type, subtype.


source

get_mimetype


def get_mimetype(
    filepath
):

source

clean_text


def clean_text(
    text_s_or_b
):

Convert to string and strip.


source

ParagraphTextSplitter


def ParagraphTextSplitter(
    chunk_size:int=5000, chunk_overlap:int=0
):

Interface for splitting text into chunks.


source

extract_file_metadata


def extract_file_metadata(
    file_path:str, store_md5:bool=True, store_mimetype:bool=True, store_file_dates:bool=True, file_callables:dict={}
):

Extract file metadata


source

set_metadata_defaults


def set_metadata_defaults(
    docs:List, extra_keys:list=[]
):

Sets Document metadata defaults


source

create_document


def create_document(
    page_content:str, only_required_metadata:bool=True, kwargs:VAR_KEYWORD
):

Create document with required metadata keys from METADATA.


source

dict_from_doc


def dict_from_doc(
    doc, content_field:str='page_content'
):

Create dictionary from LangChain Document


source

doc_from_dict


def doc_from_dict(
    d:dict, content_field:str='page_content'
):

Create LangChain Document from dictionary