pipelines.extractor.base

Pipeline for information extraction

Extractor

def Extractor(
    llm, # An `onprem.LLM` object
    prompt_template:Optional=None, # A model specific prompt_template with a single placeholder named "{prompt}". If supplied, overrides the `prompt_template` supplied to the `LLM` constructor.
    **kwargs
):

Extractor applies a given prompt to each sentence or paragraph in a document and returns the results.

source

Extractor.apply

def apply(
    ex_prompt_template:str, # A prompt to apply to each `unit` in document. Should have a single variable, `{text}`
    fpath:Optional=None, # A path to a single file of interest (e.g., a PDF or MS Word document). Mutually-exclusive with `content`.
    content:Optional=None, # Text content of a document of interest.  Mutually-exclusive with `fpath`.
    unit:str='paragraph', # One of {'sentence', 'paragraph'}.
    pydantic_model:NoneType=None, # If a Pydantic model is supplied, `LLM.pydantic_prompt` is used instead of `LLM.prompt`.
    attempt_fix:bool=False, # If True and `pydantic_model` is supplied, attempt to fix malformed/incomplete outputs.
    fix_llm:NoneType=None, # LLM used to attempt fix when `attempt_fix=True`. If None, then use `self.llm.llm`.
    preproc_fn:Optional=None, # Function should accept a text string and returns a new preprocessed input.
    filter_fn:Optional=None, # A function that accepts a sentence or paragraph and returns `True` if prompt should be applied to it.
    clean_fn:Optional=None, # A function that accepts a sentence or paragraph and returns "cleaned" version of the text. (applied after `filter_fn`)
    pdf_pages:List=[], # If `fpath` is a PDF document, only apply prompt to text on page numbers listed in `pdf_pages` (starts at 1).
    maxchars:int=2048, # Units (i.e., paragraphs or sentences) larger than `maxchars` are split.
    stop:list=[], # list of characters to trigger the LLM to stop generating.
    **kwargs
):

Apply the prompt to each unit (where a “unit” is either a paragraph or sentence), optionally filtered by filter_fn. Extra kwargs are fed directly to load_single_document. Results are stored in a pandas.DataFrame.

Example: Information Extraction With Extractor.apply

from onprem import LLM
from onprem.pipelines import Extractor

prompt_template = "<s>[INST] {prompt} [/INST]" # prompt template for Mistral
llm = LLM(model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf', 
          n_gpu_layers=33,  # change based on your system
          verbose=False, mute_stream=True, 
          prompt_template=prompt_template)
extractor = Extractor(llm)

/home/amaiya/mambaforge/envs/llm/lib/python3.9/site-packages/langchain_core/language_models/llms.py:239: DeprecationWarning: callback_manager is deprecated. Please use callbacks instead.
  warnings.warn(

prompt = """Extract citations from the following sentences. Return #NA# if there are no citations in the text. Here are some examples:

[SENTENCE]:pretrained BERT text classifier (Devlin et al., 2018), models for sequence tagging (Lample et al., 2016)
[CITATIONS]:(Devlin et al., 2018), (Lample et al., 2016)
[SENTENCE]:Machine learning (ML) is a powerful tool.
[CITATIONS]:#NA#
[SENTENCE]:Following inspiration from a blog post by Rachel Thomas of fast.ai (Howard and Gugger, 2020), we refer to this as Augmented Machine Learning or AugML
[CITATIONS]:(Howard and Gugger, 2020)
[SENTENCE]:{text}
[CITATIONS]:"""

content = """
For instance, the fit_onecycle method employs a 1cycle policy (Smith, 2018). 
"""
df = extractor.apply(prompt, content=content, stop=['\n'])
assert df['Extractions'][0].strip().startswith('(Smith, 2018)')

content ="""In the case of text, this may involve language-specific preprocessing (e.g., tokenization)."""
df = extractor.apply(prompt, content=content, stop=['\n'])
assert df['Extractions'][0].strip().startswith('#NA#')

source

Extractor.extract_structured

def extract_structured(
    prompt:str, # Prompt for structured extraction (should include {text} placeholder if needed)
    pydantic_model, # Pydantic model defining the output structure
    fpath:Optional=None, # Path to a single file. Mutually-exclusive with `content`.
    content:Optional=None, # Text content. Mutually-exclusive with `fpath`.
    max_length:Optional=None, # Maximum characters to process from document (None = no limit)
    preproc_fn:Optional=None, # Preprocessing function for document text
    pdf_pages:List=[], # If `fpath` is a PDF, only use text from these page numbers
    return_as:str='model', # One of {'model', 'dict', 'json'}. Format to return results.
    stop:list=[], # List of stop sequences for LLM
    filter_fn:Optional=None, # Filter function for extracted items (applied after extraction)
    postproc_fn:Optional=None, # Post-processing function (applied after filtering)
    filter_field:Optional=None, # Field name to filter (auto-detect if None)
    use_pydantic_fallback:bool=False, # If True, always use pydantic_prompt instead of native structured output
    **kwargs
)->Union: # Extra kwargs fed to `load_single_document`

Perform document-level structured extraction using Pydantic models.

This method processes the entire document (or a large section) in a single LLM call, using a Pydantic model to enforce structured output. This is more efficient than chunk-by-chunk processing when you need to extract structured information that may span multiple sections of the document.

Unlike apply(), which processes each paragraph/sentence separately, this method:

- Makes a single LLM call for the entire document
- Maintains full document context for better relationship detection
- Enforces schema compliance via Pydantic models
- Returns structured data in your desired format

Example use cases:

- Extract all system parameters with values and units
- Extract key performance parameters with thresholds and objectives
- Extract entities, relationships, or structured specifications
- Any extraction requiring cross-document context

Args:

- prompt: Prompt template for extraction. Use the {text} placeholder if you want the document text inserted (recommended for clarity).
- pydantic_model: Pydantic BaseModel class defining the output structure.
- fpath: Path to the document file (PDF, Word, text, etc.).
- content: Raw text content (alternative to fpath).
- max_length: Truncate the document to this many characters (None = use full document).
- preproc_fn: Optional function to preprocess document text before extraction.
- pdf_pages: For PDFs, extract only from these page numbers (1-indexed).
- return_as: Output format: ‘model’ (Pydantic model), ‘dict’, or ‘json’ string.
- stop: Stop sequences for LLM generation.
- use_pydantic_fallback: If True, always use the pydantic_prompt method instead of native structured output. Set this to True if your LLM provider doesn’t support native structured outputs (e.g., custom gateways, older models).
- **kwargs: Additional arguments passed to load_single_document.

Returns: Extracted data in the format specified by the return_as parameter:

- ‘model’: Pydantic model instance
- ‘dict’: Python dictionary
- ‘json’: JSON string

Example:

```python
from pydantic import BaseModel, Field
from typing import List

class Parameter(BaseModel):
    name: str = Field(description="Parameter name")
    value: float = Field(description="Numeric value")
    unit: str = Field(description="Unit of measurement")

class ParameterList(BaseModel):
    parameters: List[Parameter] = Field(default=[], description="All parameters found")

extractor = Extractor(llm)
result = extractor.extract_structured(
    prompt="Extract all technical parameters from: {text}",
    pydantic_model=ParameterList,
    fpath="specs.pdf",
    return_as='dict'
)
print(result['parameters'])
```

Example: Structured Information Extraction With extract_structured

class Parameter(BaseModel):
    name: str = Field(description="Parameter name")
    value: float = Field(description="Numeric value")
    unit: str = Field(description="Unit of measurement")

class ParameterList(BaseModel):
    parameters: List[Parameter] = Field(default=[], description="All parameters found")

extractor = Extractor(llm)
result = extractor.extract_structured(
    prompt="Extract all measured quantities from: {text}",
    pydantic_model=ParameterList,
    content="A person 6ft tall was driving 90 mph.",
    return_as='dict'
)
print(result['parameters'])

# [{'name': 'height', 'value': 6.0, 'unit': 'ft'}, {'name': 'speed', 'value': 90.0, 'unit': 'mph'}]