utils

Some utility functions

source

get_webapp_dir


def get_webapp_dir():

Get the webapp directory path


source

get_models_dir


def get_models_dir():

Get the models directory path


source

get_datadir


def get_datadir(subfolder:NoneType=None):

Get the data directory path, optionally with a subfolder. Creates the main data dir and any requested subfolder if they don’t exist.

Args: subfolder: Optional subfolder name to append to the data directory path

Returns: Path to the data directory or subfolder
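
A minimal usage sketch (the exact return type of these directory helpers, str or Path, is not specified above, and the subfolder name here is hypothetical):

    data_dir = get_datadir()                 # create (if needed) and return the main data directory
    vectordb_dir = get_datadir('vectordb')   # 'vectordb' is a hypothetical subfolder; created if missing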


source

download


def download(url, filename, verify:bool=False):
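
No docstring is shown for download; a hedged sketch based on the signature alone, assuming url is fetched and written to filename and that verify controls SSL certificate verification:

    # hypothetical URL and destination path
    download('https://example.com/model.bin', '/tmp/model.bin', verify=True)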

source

df_to_md


def df_to_md(df, caption:NoneType=None):

Converts a pd.DataFrame to Markdown


source

html_to_df


def html_to_df(html_str:str)->Any:

Convert HTML to dataframe.


source

md_to_df


def md_to_df(md_str:str)->Any:

Convert Markdown to dataframe.
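
A round-trip sketch for these converters (a toy example; exact column handling, index behavior, and whether the caption survives the round trip are not specified above):

    import pandas as pd

    df = pd.DataFrame({'name': ['a', 'b'], 'score': [1, 2]})   # toy frame
    md = df_to_md(df, caption='Scores')   # DataFrame -> Markdown table (caption is optional)
    df2 = md_to_df(md)                    # Markdown table -> DataFrame
    # html_to_df(html_str) works analogously for HTML table markup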


source

extract_noun_phrases


def extract_noun_phrases(text:str):

Extracts noun phrases from text, including coordinated phrases like "generative AI and live fire testing", and removes subphrases like "AI" if "generative AI" is also found.

Example:

    text = ("Natural language processing (NLP) is a field of computer science, artificial intelligence, "
            "and computational linguistics concerned with the interactions between computers and human "
            "(natural) languages.")
    extract_noun_phrases(text)
    ['Natural language processing', 'NLP', 'field', 'computer science', 'artificial intelligence',
     'computational linguistics', 'interactions', 'computers', 'languages', 'human']


source

contains_sentence


def contains_sentence(sentence, text):

Returns True if sentence is contained in text, ignoring whether tokens are delimited by spaces, newlines, or tabs.
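
For example (illustrative strings; per the description, whitespace differences between the two texts should not matter):

    text = "machine\nlearning   is\tfun"
    contains_sentence("machine learning is fun", text)   # expected: True
    contains_sentence("deep learning", text)             # expected: False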


source

remove_sentence


def remove_sentence(sentence, text, remove_follow:bool=False, flags:RegexFlag=re.IGNORECASE):

Removes a sentence or phrase from text, ignoring whether tokens are delimited by spaces, newlines, or tabs.

If remove_follow=True, then subsequent text until the first newline is also removed.
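
A hedged sketch, assuming the function returns the modified text rather than mutating it in place:

    text = "DISCLAIMER: for internal   use only. Some trailing note.\nActual content starts here."
    cleaned = remove_sentence("DISCLAIMER: for internal use only.", text, remove_follow=True)
    # with remove_follow=True, the text after the match up to the first newline is presumably also
    # dropped, leaving roughly: "Actual content starts here."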


source

segment


def segment(text:str, unit:str='paragraph', maxchars:int=2048):

Segments text into a list of paragraphs or sentences, depending on the value of unit (one of {'paragraph', 'sentence'}). The maxchars parameter is the maximum size of any unit of text.
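
For example (illustrative text):

    text = "First paragraph.\n\nSecond paragraph with two sentences. Here is the second one."
    paragraphs = segment(text, unit='paragraph')               # list of paragraph strings
    sentences = segment(text, unit='sentence', maxchars=512)   # list of sentence strings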


source

filtered_generator


def filtered_generator(generator, criteria:list=[]):

Filters a generator based on a list of predicate functions.

Args: generator: The generator to filter. criteria: List of functions that take an element from the generator and return True if the element should be included, False otherwise.

Yields: Elements from the original generator that satisfy the criteria.
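
A small sketch; it is assumed here that an element must satisfy every function in criteria to be yielded:

    gen = (n for n in range(10))
    evens_above_four = filtered_generator(gen, criteria=[lambda n: n % 2 == 0,
                                                         lambda n: n > 4])
    list(evens_above_four)   # expected: [6, 8] under the all-criteria assumption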


source

batch_generator


def batch_generator(iterable, batch_size):

Yields batched results from a generator


source

batch_list


def batch_list(input_list, batch_size):

Split a list into chunks of at most batch_size items
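
Both batching helpers can be sketched together (batch_generator presumably yields batches lazily, while batch_list returns a list of chunks; the exact type of each batch is an assumption):

    items = list(range(7))
    list(batch_generator(iter(items), 3))   # expected: [[0, 1, 2], [3, 4, 5], [6]]
    batch_list(items, 3)                    # expected: [[0, 1, 2], [3, 4, 5], [6]]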


source

get_template_vars


def get_template_vars(template_str:str)->List:

Get template variables from a template string.


source

format_string


def format_string(string_to_format:str, kwargs:str)->str:

Format a string with kwargs
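
The two template helpers are naturally used together; a sketch, assuming {curly-brace} placeholders and that kwargs is a mapping of placeholder names to values despite the str annotation in the signature:

    template = "Answer the question using only {context}. Question: {question}"
    get_template_vars(template)   # expected: ['context', 'question'] (order may vary)

    format_string(template, {'context': 'the provided documents',
                             'question': 'What is the capital of France?'})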


source

SafeFormatter


def SafeFormatter(format_dict:Optional=None):

Safe string formatter that does not raise a KeyError if a key is missing. Adapted from llama_index.
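
A hedged sketch of SafeFormatter usage; the format method name and the exact handling of missing keys are assumptions based on the description above, not confirmed by the signature:

    formatter = SafeFormatter(format_dict={'name': 'Alice'})
    # a plain str.format call would raise KeyError for {id}; SafeFormatter should not
    # (what it substitutes for the missing key is assumed, not documented here)
    formatter.format('Hello {name}, your id is {id}')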