Module ktrain.text.ner.anago.tagger

Model API.

Expand source code
"""
Model API.
"""

from ....imports import *
from .. import metrics


class Tagger(object):
    """A model API that tags an input sentence.

    Attributes:
        model: Model. Its `predict` method is called on the preprocessed batch.
        preprocessor: Transformer. Preprocessing data for feature extraction;
            its `transform` and `inverse_transform` methods are used.
        tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.
    """

    def __init__(self, model, preprocessor, tokenizer=str.split):
        self.model = model
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

    def predict_proba(self, text):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Args:
            text : string, the input text.

        Returns:
            y : array-like, shape = [num_words, num_classes]
            Returns the probability of the word for each class in the model.

        Raises:
            TypeError: if `text` is not a string.
        """
        # Validate explicitly: an `assert` is stripped under `python -O`,
        # which would let non-string input reach the tokenizer unchecked.
        if not isinstance(text, str):
            raise TypeError(
                "text must be a str, got {}".format(type(text).__name__)
            )

        words = self.tokenizer(text)
        # The preprocessor and the model are fed a one-sentence batch.
        X = self.preprocessor.transform([words])
        y = self.model.predict(X)

        return y[0]  # reduce batch dimension.

    def _get_prob(self, pred):
        """Return the maximum of `pred` along its last axis."""
        return np.max(pred, -1)

    def _get_tags(self, pred):
        """Convert model output for one sentence back into tags.

        `pred` is wrapped in a one-element batch for
        `preprocessor.inverse_transform`, and the single result is unwrapped.
        """
        tags = self.preprocessor.inverse_transform([pred])

        return tags[0]  # reduce batch dimension

    def _build_response(self, sent, tags, prob):
        """Assemble the dict returned by `analyze`.

        Args:
            sent: string, the input text (tokenized again here).
            tags: sequence of tags, passed to `metrics.get_entities`.
            prob: per-word scores, indexable by word position.

        Returns:
            dict with "words" (the tokens) and "entities" (one dict per
            chunk reported by `metrics.get_entities`).
        """
        words = self.tokenizer(sent)
        res = {"words": words, "entities": []}
        chunks = metrics.get_entities(tags)

        for chunk_type, chunk_start, chunk_end in chunks:
            # The reported end index is treated as inclusive; shift it by one
            # so it works as a slice bound. "endOffset" is thus exclusive.
            chunk_end += 1
            entity = {
                "text": " ".join(words[chunk_start:chunk_end]),
                "type": chunk_type,
                # Entity score is the mean of its words' scores.
                "score": float(np.average(prob[chunk_start:chunk_end])),
                "beginOffset": chunk_start,
                "endOffset": chunk_end,
            }
            res["entities"].append(entity)

        return res

    def analyze(self, text):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.

        Returns:
            res: dict with keys "words" and "entities". Each entity has
            "text", "type", "score" (a float), "beginOffset" and
            "endOffset" (word indices; "endOffset" is exclusive).

        Examples:
            Illustrative output; actual entities and scores depend on the
            model.

            >>> text = 'President Obama is speaking at the White House.'
            >>> model.analyze(text)
            {
                "words": [
                    "President",
                    "Obama",
                    "is",
                    "speaking",
                    "at",
                    "the",
                    "White",
                    "House."
                ],
                "entities": [
                    {
                        "beginOffset": 1,
                        "endOffset": 2,
                        "score": 1.0,
                        "text": "Obama",
                        "type": "PER"
                    },
                    {
                        "beginOffset": 6,
                        "endOffset": 8,
                        "score": 1.0,
                        "text": "White House.",
                        "type": "ORG"
                    }
                ]
            }
        """
        pred = self.predict_proba(text)
        tags = self._get_tags(pred)
        prob = self._get_prob(pred)

        return self._build_response(text, tags, prob)

    def predict(self, text):
        """Predict using the model.

        Args:
            text: string, the input text.

        Returns:
            tags: list, shape = (num_words,)
            Returns predicted values.
        """
        pred = self.predict_proba(text)

        return self._get_tags(pred)

Classes

class Tagger (model, preprocessor, tokenizer=<method 'split' of 'str' objects>)

A model API that tags an input sentence.

Attributes

model
Model.
preprocessor
Transformer. Preprocessing data for feature extraction.
tokenizer
Tokenize input sentence. Default tokenizer is str.split.
Expand source code
class Tagger(object):
    """A model API that tags an input sentence.

    Attributes:
        model: Model. Its `predict` method is called on the preprocessed batch.
        preprocessor: Transformer. Preprocessing data for feature extraction;
            its `transform` and `inverse_transform` methods are used.
        tokenizer: Tokenize input sentence. Default tokenizer is `str.split`.
    """

    def __init__(self, model, preprocessor, tokenizer=str.split):
        self.model = model
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer

    def predict_proba(self, text):
        """Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Args:
            text : string, the input text.

        Returns:
            y : array-like, shape = [num_words, num_classes]
            Returns the probability of the word for each class in the model.

        Raises:
            TypeError: if `text` is not a string.
        """
        # Raise instead of asserting so the check survives `python -O`,
        # where assert statements are removed.
        if not isinstance(text, str):
            raise TypeError(
                "text must be a str, got {}".format(type(text).__name__)
            )

        words = self.tokenizer(text)
        # Wrap the sentence in a list: transform/predict receive a batch of one.
        X = self.preprocessor.transform([words])
        y = self.model.predict(X)
        y = y[0]  # reduce batch dimension.

        return y

    def _get_prob(self, pred):
        """Return the maximum of `pred` along its last axis."""
        prob = np.max(pred, -1)

        return prob

    def _get_tags(self, pred):
        """Map one sentence's model output to tags.

        Calls `preprocessor.inverse_transform` on a one-element batch and
        returns its first (only) result.
        """
        tags = self.preprocessor.inverse_transform([pred])
        tags = tags[0]  # reduce batch dimension

        return tags

    def _build_response(self, sent, tags, prob):
        """Build the response dict for `analyze`.

        Args:
            sent: string, the input text; it is tokenized again here.
            tags: sequence of tags, passed to `metrics.get_entities`.
            prob: per-word scores, sliced by word position.

        Returns:
            dict with the tokens under "words" and one entry per detected
            chunk under "entities".
        """
        words = self.tokenizer(sent)
        res = {"words": words, "entities": []}
        chunks = metrics.get_entities(tags)

        for chunk_type, chunk_start, chunk_end in chunks:
            # Make the end index usable as an exclusive slice bound; the
            # same exclusive value is reported as "endOffset".
            chunk_end += 1
            entity = {
                "text": " ".join(words[chunk_start:chunk_end]),
                "type": chunk_type,
                # Average the word-level scores over the entity span.
                "score": float(np.average(prob[chunk_start:chunk_end])),
                "beginOffset": chunk_start,
                "endOffset": chunk_end,
            }
            res["entities"].append(entity)

        return res

    def analyze(self, text):
        """Analyze text and return pretty format.

        Args:
            text: string, the input text.

        Returns:
            res: dict with keys "words" and "entities". Each entity carries
            "text", "type", "score" (a float), "beginOffset" and
            "endOffset" (word indices; "endOffset" is exclusive).

        Examples:
            Illustrative output; actual entities and scores depend on the
            model.

            >>> text = 'President Obama is speaking at the White House.'
            >>> model.analyze(text)
            {
                "words": [
                    "President",
                    "Obama",
                    "is",
                    "speaking",
                    "at",
                    "the",
                    "White",
                    "House."
                ],
                "entities": [
                    {
                        "beginOffset": 1,
                        "endOffset": 2,
                        "score": 1.0,
                        "text": "Obama",
                        "type": "PER"
                    },
                    {
                        "beginOffset": 6,
                        "endOffset": 8,
                        "score": 1.0,
                        "text": "White House.",
                        "type": "ORG"
                    }
                ]
            }
        """
        pred = self.predict_proba(text)
        tags = self._get_tags(pred)
        prob = self._get_prob(pred)
        res = self._build_response(text, tags, prob)

        return res

    def predict(self, text):
        """Predict using the model.

        Args:
            text: string, the input text.

        Returns:
            tags: list, shape = (num_words,)
            Returns predicted values.
        """
        pred = self.predict_proba(text)
        tags = self._get_tags(pred)

        return tags

Methods

def analyze(self, text)

Analyze text and return pretty format.

Args

text
string, the input text.

Returns

res
dict.

Examples

>>> text = 'President Obama is speaking at the White House.'
>>> model.analyze(text)
{
    "words": [
        "President",
        "Obama",
        "is",
        "speaking",
        "at",
        "the",
        "White",
        "House."
    ],
    "entities": [
        {
            "beginOffset": 1,
            "endOffset": 2,
            "score": 1,
            "text": "Obama",
            "type": "PER"
        },
        {
            "beginOffset": 6,
            "endOffset": 8,
            "score": 1,
            "text": "White House.",
            "type": "ORG"
        }
    ]
}
Expand source code
def analyze(self, text):
    """Run the tagger on ``text`` and package the result as a dict.

    The class scores from `predict_proba` are turned into tags
    (`_get_tags`) and per-word scores (`_get_prob`); `_build_response`
    then combines them with the text into the returned dict.

    Args:
        text: string, the input text.

    Returns:
        res: dict, as produced by `_build_response`.

    Examples:
        >>> text = 'President Obama is speaking at the White House.'
        >>> model.analyze(text)
        {
            "words": [
                "President",
                "Obama",
                "is",
                "speaking",
                "at",
                "the",
                "White",
                "House."
            ],
            "entities": [
                {
                    "beginOffset": 1,
                    "endOffset": 2,
                    "score": 1,
                    "text": "Obama",
                    "type": "PER"
                },
                {
                    "beginOffset": 6,
                    "endOffset": 8,
                    "score": 1,
                    "text": "White House.",
                    "type": "ORG"
                }
            ]
        }
    """
    class_scores = self.predict_proba(text)
    labels = self._get_tags(class_scores)
    word_scores = self._get_prob(class_scores)
    return self._build_response(text, labels, word_scores)
def predict(self, text)

Predict using the model.

Args

text
string, the input text.

Returns

tags
list, shape = (num_words,)

Returns predicted values.

Expand source code
def predict(self, text):
    """Tag ``text`` with the model.

    Feeds the output of `predict_proba` straight into `_get_tags`.

    Args:
        text: string, the input text.

    Returns:
        tags: list, shape = (num_words,)
        The predicted values returned by `_get_tags`.
    """
    return self._get_tags(self.predict_proba(text))
def predict_proba(self, text)

Probability estimates.

The returned estimates for all classes are ordered by the label of classes.

Args

text : string, the input text.

Returns

y
array-like, shape = [num_words, num_classes]

Returns the probability of the word for each class in the model.

Expand source code
def predict_proba(self, text):
    """Probability estimates.

    The returned estimates for all classes are ordered by the
    label of classes.

    Args:
        text : string, the input text.

    Returns:
        y : array-like, shape = [num_words, num_classes]
        Returns the probability of the word for each class in the model.

    Raises:
        TypeError: if `text` is not a string.
    """
    # Validate explicitly: an `assert` is stripped under `python -O`,
    # which would let non-string input reach the tokenizer unchecked.
    if not isinstance(text, str):
        raise TypeError(
            "text must be a str, got {}".format(type(text).__name__)
        )

    words = self.tokenizer(text)
    # The preprocessor and the model are fed a one-sentence batch.
    X = self.preprocessor.transform([words])
    y = self.model.predict(X)

    return y[0]  # reduce batch dimension.