from onprem import LLM
from onprem.pipelines import Extractor
pipelines.extractor.base
Extractor
def Extractor(
llm, # An `onprem.LLM` object
prompt_template:Optional=None, # A model specific prompt_template with a single placeholder named "{prompt}". If supplied, overrides the `prompt_template` supplied to the [`LLM`](https://amaiya.github.io/onprem/llm.base.html#llm) constructor.
kwargs:VAR_KEYWORD
):
Extractor applies a given prompt to each sentence or paragraph in a document and returns the results.
Extractor.apply
def apply(
ex_prompt_template:str, # A prompt to apply to each `unit` in document. Should have a single variable, `{text}`
fpath:Optional=None, # A path to a single file of interest (e.g., a PDF or MS Word document). Mutually-exclusive with `content`.
content:Optional=None, # Text content of a document of interest. Mutually-exclusive with `fpath`.
unit:str='paragraph', # One of {'sentence', 'paragraph'}.
pydantic_model:NoneType=None, # If a Pydantic model is supplied, [`LLM.pydantic_prompt`](https://amaiya.github.io/onprem/llm.base.html#llm.pydantic_prompt) is used instead of [`LLM.prompt`](https://amaiya.github.io/onprem/llm.base.html#llm.prompt).
attempt_fix:bool=False, # If True and `pydantic_model` is supplied, attempt to fix malformed/incomplete outputs.
fix_llm:NoneType=None, # LLM used to attempt fix when `attempt_fix=True`. If None, then use `self.llm.llm`.
preproc_fn:Optional=None, # A function that accepts a text string and returns a new preprocessed input.
filter_fn:Optional=None, # A function that accepts a sentence or paragraph and returns `True` if prompt should be applied to it.
clean_fn:Optional=None, # A function that accepts a sentence or paragraph and returns "cleaned" version of the text. (applied after `filter_fn`)
pdf_pages:List=[], # If `fpath` is a PDF document, only apply prompt to text on page numbers listed in `pdf_pages` (starts at 1).
maxchars:int=2048, # Units (i.e., paragraphs or sentences) larger than `maxchars` are split.
stop:list=[], # list of characters to trigger the LLM to stop generating.
kwargs:VAR_KEYWORD
):
Apply the prompt to each unit (where a “unit” is either a paragraph or sentence), optionally filtered by filter_fn. Extra kwargs are fed directly to load_single_document. Results are stored in a pandas.DataFrame.
Example: Information Extraction With Extractor.apply
prompt_template = "<s>[INST] {prompt} [/INST]" # prompt template for Mistral
llm = LLM(model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
n_gpu_layers=33, # change based on your system
verbose=False, mute_stream=True,
prompt_template=prompt_template)
extractor = Extractor(llm)
/home/amaiya/mambaforge/envs/llm/lib/python3.9/site-packages/langchain_core/language_models/llms.py:239: DeprecationWarning: callback_manager is deprecated. Please use callbacks instead.
warnings.warn(
prompt = """Extract citations from the following sentences. Return #NA# if there are no citations in the text. Here are some examples:
[SENTENCE]:pretrained BERT text classifier (Devlin et al., 2018), models for sequence tagging (Lample et al., 2016)
[CITATIONS]:(Devlin et al., 2018), (Lample et al., 2016)
[SENTENCE]:Machine learning (ML) is a powerful tool.
[CITATIONS]:#NA#
[SENTENCE]:Following inspiration from a blog post by Rachel Thomas of fast.ai (Howard and Gugger, 2020), we refer to this as Augmented Machine Learning or AugML
[CITATIONS]:(Howard and Gugger, 2020)
[SENTENCE]:{text}
[CITATIONS]:"""
content = """
For instance, the fit_onecycle method employs a 1cycle policy (Smith, 2018).
"""
df = extractor.apply(prompt, content=content, stop=['\n'])
assert df['Extractions'][0].strip().startswith('(Smith, 2018)')
content ="""In the case of text, this may involve language-specific preprocessing (e.g., tokenization)."""
df = extractor.apply(prompt, content=content, stop=['\n'])
assert df['Extractions'][0].strip().startswith('#NA#')
Extractor.extract_structured
def extract_structured(
prompt:str, # Prompt for structured extraction (should include {text} placeholder if needed)
pydantic_model, # Pydantic model defining the output structure
fpath:Optional=None, # Path to a single file. Mutually-exclusive with `content`.
content:Optional=None, # Text content. Mutually-exclusive with `fpath`.
max_length:Optional=None, # Maximum characters to process from document (None = no limit)
preproc_fn:Optional=None, # Preprocessing function for document text
pdf_pages:List=[], # If `fpath` is a PDF, only use text from these page numbers
return_as:str='model', # One of {'model', 'dict', 'json'}. Format to return results.
stop:list=[], # List of stop sequences for LLM
filter_fn:Optional=None, # Filter function for extracted items (applied after extraction)
postproc_fn:Optional=None, # Post-processing function (applied after filtering)
filter_field:Optional=None, # Field name to filter (auto-detect if None)
kwargs:VAR_KEYWORD
)->Union: # Extra kwargs fed to [`load_single_document`](https://amaiya.github.io/onprem/ingest.base.html#load_single_document)
Perform document-level structured extraction using Pydantic models.
This method processes the entire document (or a large section) in a single LLM call, using a Pydantic model to enforce structured output. This is more efficient than chunk-by-chunk processing when you need to extract structured information that may span multiple sections of the document.
Unlike apply(), which processes each paragraph/sentence separately, this method:
- Makes a single LLM call for the entire document
- Maintains full document context for better relationship detection
- Enforces schema compliance via Pydantic models
- Returns structured data in your desired format
Example use cases:
- Extract all system parameters with values and units
- Extract key performance parameters with thresholds and objectives
- Extract entities, relationships, or structured specifications
- Any extraction requiring cross-document context
Args:
    prompt: Prompt template for extraction. Use {text} placeholder if you want the document text inserted (recommended for clarity).
    pydantic_model: Pydantic BaseModel class defining output structure
    fpath: Path to document file (PDF, Word, text, etc.)
    content: Raw text content (alternative to fpath)
    max_length: Truncate document to this many characters (None = use full document)
    preproc_fn: Optional function to preprocess document text before extraction
    pdf_pages: For PDFs, extract only from these page numbers (1-indexed)
    return_as: Output format - 'model' (Pydantic model), 'dict', or 'json' string
    stop: Stop sequences for LLM generation
    **kwargs: Additional arguments passed to load_single_document
Returns:
    Extracted data in format specified by return_as parameter:
    - 'model': Pydantic model instance
    - 'dict': Python dictionary
    - 'json': JSON string
Example:
```python
from pydantic import BaseModel, Field
from typing import List

class Parameter(BaseModel):
    name: str = Field(description="Parameter name")
    value: float = Field(description="Numeric value")
    unit: str = Field(description="Unit of measurement")

class ParameterList(BaseModel):
    parameters: List[Parameter] = Field(default=[], description="All parameters found")

extractor = Extractor(llm)
result = extractor.extract_structured(
    prompt="Extract all technical parameters from: {text}",
    pydantic_model=ParameterList,
    fpath="specs.pdf",
    return_as='dict'
)
print(result['parameters'])
```
Example: Structured Information Extraction With extract_structured
from typing import List

from pydantic import BaseModel, Field


# Schema for one extracted quantity. The Field descriptions are part of the
# schema handed to the LLM, so they are kept exactly as written.
class Parameter(BaseModel):
    name: str = Field(description="Parameter name")
    value: float = Field(description="Numeric value")
    unit: str = Field(description="Unit of measurement")


# Top-level schema passed as `pydantic_model`: a list of Parameter items.
class ParameterList(BaseModel):
    parameters: List[Parameter] = Field(default=[], description="All parameters found")


# `llm` is the LLM instance constructed in the earlier Extractor.apply example.
extractor = Extractor(llm)
result = extractor.extract_structured(
    prompt="Extract all measured quantities from: {text}",
    pydantic_model=ParameterList,
    content="A person 6ft tall was driving 90 mph.",
    return_as='dict'
)
print(result['parameters'])
# [{'name': 'height', 'value': 6.0, 'unit': 'ft'}, {'name': 'speed', 'value': 90.0, 'unit': 'mph'}]