utils
get_webapp_dir
def get_webapp_dir(
):
Get the webapp directory path
get_models_dir
def get_models_dir(
):
Get the models directory path
get_datadir
def get_datadir(
subfolder:NoneType=None
):
Get the data directory path, optionally with a subfolder. Creates the main data dir and any requested subfolder if they don’t exist.
Args: subfolder: Optional subfolder name to append to the data directory path
Returns: Path to the data directory or subfolder
download
def download(
url, filename, verify:bool=False
):
df_to_md
def df_to_md(
df, caption:NoneType=None
):
Converts a pd.DataFrame to Markdown.
html_to_df
def html_to_df(
html_str:str
)->Any:
Convert HTML to dataframe.
md_to_df
def md_to_df(
md_str:str
)->Any:
Convert Markdown to dataframe.
extract_noun_phrases
def extract_noun_phrases(
text:str
):
Extracts noun phrases from text, including coordinated phrases like “generative AI and live fire testing”, and removes subphrases like “AI” if “generative AI” is also found. Example: given text = “Natural language processing (NLP) is a field of computer science, artificial intelligence, and computational linguistics concerned with the interactions between computers and human (natural) languages.”, the call extract_noun_phrases(text) returns [‘Natural language processing’, ‘NLP’, ‘field’, ‘computer science’, ‘artificial intelligence’, ‘computational linguistics’, ‘interactions’, ‘computers’, ‘languages’, ‘human’].
contains_sentence
def contains_sentence(
sentence, text
):
Returns True if sentence is contained in text, ignoring whether tokens are delimited by spaces, newlines, or tabs.
remove_sentence
def remove_sentence(
sentence, text, remove_follow:bool=False, flags:RegexFlag=re.IGNORECASE
):
Removes a sentence or phrase from text ignoring whether tokens are delimited by spaces or newlines or tabs.
If remove_follow=True, then subsequent text until the first newline is also removed.
segment
def segment(
text:str, unit:str='paragraph', maxchars:int=2048
):
Segments text into a list of paragraphs or sentences depending on the value of unit (one of {'paragraph', 'sentence'}). The maxchars parameter is the maximum size of any unit of text.
filtered_generator
def filtered_generator(
generator, criteria:list=[]
):
Filters a generator based on a list of criteria (predicate functions).
Args: generator: The generator to filter. criteria: List of functions that take an element from the generator and return True if the element should be included, False otherwise.
Yields: Elements from the original generator that satisfy the criteria.
batch_generator
def batch_generator(
iterable, batch_size
):
Batched results from generator
batch_list
def batch_list(
input_list, batch_size
):
Split list into chunks
get_template_vars
def get_template_vars(
template_str:str
)->List:
Get template variables from a template string.
format_string
def format_string(
string_to_format:str, kwargs:str
)->str:
Format a string with kwargs
SafeFormatter
def SafeFormatter(
format_dict:Optional=None
):
Safe string formatter that does not raise KeyError if key is missing. Adapted from llama_index.