ingest.helpers
md5sum
def md5sum(
filepath
):
Perform an MD5 hash of a file.
date2iso
def date2iso(
d
):
iso2date
def iso2date(
s
):
extract_file_dates
def extract_file_dates(
filepath
):
Takes a file path and returns ISO datetime strings for the file's create date and last-modified date.
Returns tuple of the form (create-date, last-modify-date)
extract_extension
def extract_extension(
filepath:str, include_dot:bool=False
):
Extracts file extension (optionally including the dot, see include_dot) from file path
extract_files
def extract_files(
source_dir:str, follow_links:bool=False, extensions:Union=None
):
extract_tables
def extract_tables(
filepath:Optional=None, docs:Optional=[]
)->List:
Extract tables from PDF and append to end of supplied Document list. Accepts either a filepath or a list of LangChain Document objects all from a single file. If filepath is empty, the file path of interest is extracted from docs.
Returns an updated list of Document objects appended with extracted tables.
includes_caption
def includes_caption(
d:Document
):
Returns True if content of supplied Document includes a table caption
is_random_plaintext
def is_random_plaintext(
extension, mimetype
):
Check mimetype for plain text
extract_mimetype
def extract_mimetype(
filepath
):
Extract mimetype. Returns a tuple with extracted mimetype, type, subtype.
get_mimetype
def get_mimetype(
filepath
):
clean_text
def clean_text(
text_s_or_b
):
Convert to string and strip.
ParagraphTextSplitter
def ParagraphTextSplitter(
chunk_size:int=5000, chunk_overlap:int=0
):
Interface for splitting text into chunks.
extract_file_metadata
def extract_file_metadata(
file_path:str, store_md5:bool=True, store_mimetype:bool=True, store_file_dates:bool=True, file_callables:dict={}
):
Extract file metadata
set_metadata_defaults
def set_metadata_defaults(
docs:List, extra_keys:list=[]
):
Sets Document metadata defaults
create_document
def create_document(
page_content:str, only_required_metadata:bool=True, kwargs:VAR_KEYWORD
):
Create document with required metadata keys from METADATA.
dict_from_doc
def dict_from_doc(
doc, content_field:str='page_content'
):
Create dictionary from LangChain Document
doc_from_dict
def doc_from_dict(
d:dict, content_field:str='page_content'
):
Create LangChain Document from dictionary