pipelines.extractor

Pipeline for information extraction

source

Extractor


def Extractor(
    llm, # An `onprem.LLM` object
    prompt_template:Optional=None, # A model-specific prompt template with a single placeholder named "{prompt}". If supplied, overrides the `prompt_template` supplied to the [`LLM`](https://amaiya.github.io/onprem/llm.base.html#llm) constructor.
    kwargs:VAR_KEYWORD
):

Extractor applies a given prompt to each sentence or paragraph in a document and returns the results.


source

Extractor.apply


def apply(
    ex_prompt_template:str, # A prompt to apply to each `unit` in document. Should have a single variable, `{text}`
    fpath:Optional=None, # A path to a single file of interest (e.g., a PDF or MS Word document). Mutually exclusive with `content`.
    content:Optional=None, # Text content of a document of interest. Mutually exclusive with `fpath`.
    unit:str='paragraph', # One of {'sentence', 'paragraph'}.
    pydantic_model:NoneType=None, # If a Pydantic model is supplied, [`LLM.pydantic_prompt`](https://amaiya.github.io/onprem/llm.base.html#llm.pydantic_prompt) is used instead of [`LLM.prompt`](https://amaiya.github.io/onprem/llm.base.html#llm.prompt).
    attempt_fix:bool=False, # If True and `pydantic_model` is supplied, attempt to fix malformed/incomplete outputs.
    fix_llm:NoneType=None, # LLM used to attempt fix when `attempt_fix=True`. If None, then use `self.llm.llm`.
    preproc_fn:Optional=None, # A function that accepts a text string and returns a preprocessed version of it.
    filter_fn:Optional=None, # A function that accepts a sentence or paragraph and returns `True` if the prompt should be applied to it.
    clean_fn:Optional=None, # A function that accepts a sentence or paragraph and returns a "cleaned" version of the text (applied after `filter_fn`).
    pdf_pages:List=[], # If `fpath` is a PDF document, only apply the prompt to text on the page numbers listed in `pdf_pages` (numbering starts at 1).
    maxchars:int=2048, # Units (i.e., paragraphs or sentences) larger than `maxchars` are split.
    stop:list=[], # A list of strings that trigger the LLM to stop generating.
    kwargs:VAR_KEYWORD
):

Apply the prompt to each unit (where a “unit” is either a paragraph or a sentence), optionally filtered by `filter_fn`. Extra kwargs are fed directly to `load_single_document`. Results are returned in a `pandas.DataFrame`.
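
For example, given an `Extractor` instance and a prompt like those constructed in the example below, a file-based call might look like the following sketch (the file path, page numbers, and filter logic are illustrative assumptions, not part of the notebook):

# a minimal sketch: the path, pages, and filter below are hypothetical
df = extractor.apply(
    prompt,
    fpath='/path/to/paper.pdf',       # hypothetical path; mutually exclusive with `content`
    pdf_pages=[1, 2],                 # only consider text on the first two pages
    unit='sentence',                  # apply the prompt sentence by sentence
    filter_fn=lambda s: 'et al' in s, # hypothetical filter: skip units unlikely to contain citations
)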

from onprem import LLM
from onprem.pipelines import Extractor
prompt_template = "<s>[INST] {prompt} [/INST]" # prompt template for Mistral
llm = LLM(model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf', 
          n_gpu_layers=33,  # change based on your system
          verbose=False, mute_stream=True, 
          prompt_template=prompt_template)
extractor = Extractor(llm)
prompt = """Extract citations from the following sentences. Return #NA# if there are no citations in the text. Here are some examples:

[SENTENCE]:pretrained BERT text classifier (Devlin et al., 2018), models for sequence tagging (Lample et al., 2016)
[CITATIONS]:(Devlin et al., 2018), (Lample et al., 2016)
[SENTENCE]:Machine learning (ML) is a powerful tool.
[CITATIONS]:#NA#
[SENTENCE]:Following inspiration from a blog post by Rachel Thomas of fast.ai (Howard and Gugger, 2020), we refer to this as Augmented Machine Learning or AugML
[CITATIONS]:(Howard and Gugger, 2020)
[SENTENCE]:{text}
[CITATIONS]:"""
content = """
For instance, the fit_onecycle method employs a 1cycle policy (Smith, 2018). 
"""
df = extractor.apply(prompt, content=content, stop=['\n'])
assert df['Extractions'][0].strip().startswith('(Smith, 2018)')
content ="""In the case of text, this may involve language-specific preprocessing (e.g., tokenization)."""
df = extractor.apply(prompt, content=content, stop=['\n'])
assert df['Extractions'][0].strip().startswith('#NA#')
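
When `pydantic_model` is supplied, each unit is instead routed through [`LLM.pydantic_prompt`](https://amaiya.github.io/onprem/llm.base.html#llm.pydantic_prompt), so outputs can be parsed into structured objects. A minimal sketch, assuming a hypothetical `Citations` schema (the model name, field, and prompt wording here are illustrative, not from the notebook):

from typing import List
from pydantic import BaseModel, Field

class Citations(BaseModel):
    # hypothetical schema: one field listing citations found in the text
    citations: List[str] = Field(description='citations appearing in the text, e.g., "(Smith, 2018)"')

df = extractor.apply(
    'Extract citations from the following text:\n\n{text}',  # still requires the {text} placeholder
    content=content,
    pydantic_model=Citations,  # parse each unit's output with this schema
    attempt_fix=True,          # try to repair malformed/incomplete outputs
)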