Module ktrain.text.ner.anago.utils

Utility functions.

Source code
"""
Utility functions.
"""

from ....imports import *


def download(url):
    """Download a trained weights, config and preprocessor.

    Args:
        url (str): target url.
    """
    filepath = keras.utils.get_file(fname="tmp.zip", origin=url, extract=True)
    base_dir = os.path.dirname(filepath)
    weights_file = os.path.join(base_dir, "weights.h5")
    params_file = os.path.join(base_dir, "params.json")
    preprocessor_file = os.path.join(base_dir, "preprocessor.pickle")

    return weights_file, params_file, preprocessor_file


def load_data_and_labels(filename, encoding="utf-8"):
    """Loads data and label from a file.

    Args:
        filename (str): path to the file.
        encoding (str): file encoding format.

        The file format is tab-separated values.
        A blank line is required at the end of a sentence.

        For example:
        ```
        EU      B-ORG
        rejects O
        German  B-MISC
        call    O
        to      O
        boycott O
        British B-MISC
        lamb    O
        .       O

        Peter   B-PER
        Blackburn       I-PER
        ...
        ```

    Returns:
        tuple(list, list): sentences and their label sequences.

    Example:
        >>> filename = 'conll2003/en/ner/train.txt'
        >>> data, labels = load_data_and_labels(filename)
    """
    sents, labels = [], []
    words, tags = [], []
    with open(filename, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, tag = line.split("\t")
                words.append(word)
                tags.append(tag)
            else:
                sents.append(words)
                labels.append(tags)
                words, tags = [], []

    return sents, labels


class AnagoNERSequence(keras.utils.Sequence):
    def __init__(self, x, y, batch_size=1, preprocess=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.preprocess = preprocess

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size : (idx + 1) * self.batch_size]

        if self.preprocess is None:
            # no preprocess function supplied; return the raw batch
            return batch_x, batch_y
        return self.preprocess(batch_x, batch_y)

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)


class Vocabulary(object):
    """A vocabulary that maps tokens to ints (storing a vocabulary).

    Attributes:
        _token_count: A collections.Counter object holding the frequencies of tokens
            in the data used to build the Vocabulary.
        _token2id: A dict mapping token strings to numerical identifiers.
        _id2token: A list of token strings indexed by their numerical identifiers.
    """

    def __init__(self, max_size=None, lower=True, unk_token=True, specials=("<pad>",)):
        """Create a Vocabulary object.

        Args:
            max_size: The maximum size of the vocabulary, or None for no
                maximum. Default: None.
            lower: boolean. Whether to convert the texts to lowercase.
            unk_token: boolean. Whether to add unknown token.
            specials: The list of special tokens (e.g., padding or eos) that
                will be prepended to the vocabulary. Default: ('<pad>',)
        """
        self._max_size = max_size
        self._lower = lower
        self._unk = unk_token
        self._token2id = {token: i for i, token in enumerate(specials)}
        self._id2token = list(specials)
        self._token_count = Counter()

    def __len__(self):
        return len(self._token2id)

    def add_token(self, token):
        """Add token to vocabulary.

        Args:
            token (str): token to add.
        """
        token = self.process_token(token)
        self._token_count.update([token])

    def add_documents(self, docs):
        """Update dictionary from a collection of documents. Each document is a list
        of tokens.

        Args:
            docs (list): documents to add.
        """
        for sent in docs:
            sent = map(self.process_token, sent)
            self._token_count.update(sent)

    def doc2id(self, doc):
        """Get the list of token_id given doc.

        Args:
            doc (list): document.

        Returns:
            list: token ids of the document.
        """
        doc = map(self.process_token, doc)
        return [self.token_to_id(token) for token in doc]

    def id2doc(self, ids):
        """Get the token list.

        Args:
            ids (list): token ids.

        Returns:
            list: token list.
        """
        return [self.id_to_token(idx) for idx in ids]

    def build(self):
        """
        Build vocabulary.
        """
        token_freq = self._token_count.most_common(self._max_size)
        idx = len(self.vocab)
        for token, _ in token_freq:
            self._token2id[token] = idx
            self._id2token.append(token)
            idx += 1
        if self._unk:
            unk = "<unk>"
            self._token2id[unk] = idx
            self._id2token.append(unk)

    def process_token(self, token):
        """Process token before following methods:
        * add_token
        * add_documents
        * doc2id
        * token_to_id

        Args:
            token (str): token to process.

        Returns:
            str: processed token string.
        """
        if self._lower:
            token = token.lower()

        return token

    def token_to_id(self, token):
        """Get the token_id of given token.

        Args:
            token (str): token from vocabulary.

        Returns:
            int: int id of token.
        """
        token = self.process_token(token)
        # unknown tokens fall back to the last id, which is <unk> when enabled
        return self._token2id.get(token, len(self._token2id) - 1)

    def id_to_token(self, idx):
        """token-id to token (string).

        Args:
            idx (int): token id.

        Returns:
            str: string of given token id.
        """
        return self._id2token[idx]

    @property
    def vocab(self):
        """Return the vocabulary.

        Returns:
            dict: the token-to-id mapping.
        """
        return self._token2id

    @property
    def reverse_vocab(self):
        """Return the vocabulary as a reversed dict object.

        Returns:
            dict: reversed vocabulary object.
        """
        return self._id2token


def filter_embeddings(embeddings, vocab, dim):
    """Loads word vectors in numpy array.

    Args:
        embeddings (dict): a dictionary of numpy array.
        vocab (dict): word_index lookup table.

    Returns:
        numpy array: an array of word embeddings.
    """
    if not isinstance(embeddings, dict):
        return
    _embeddings = np.zeros([len(vocab), dim])
    for word in vocab:
        if word in embeddings:
            word_idx = vocab[word]
            _embeddings[word_idx] = embeddings[word]

    return _embeddings


def load_glove(file):
    """Loads GloVe vectors in numpy array.

    Args:
        file (str): a path to a glove file.

    Return:
        dict: a dict of numpy arrays.
    """
    model = {}
    with open(file, encoding="utf-8") as f:  # GloVe files are UTF-8 encoded
        for line in f:
            line = line.split(" ")
            word = line[0]
            vector = np.array([float(val) for val in line[1:]])
            model[word] = vector

    return model

Functions

def download(url)

Download trained weights, config, and preprocessor files.

Args

url : str
target url.
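A minimal usage sketch; the URL below is a hypothetical placeholder for a zip archive containing weights.h5, params.json, and preprocessor.pickle:

url = "https://example.com/models/ner_model.zip"  # hypothetical URL
weights_file, params_file, preprocessor_file = download(url)
# by default, keras.utils.get_file caches and extracts under ~/.keras/datasets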
def filter_embeddings(embeddings, vocab, dim)

Builds an embedding matrix (a numpy array) for the given vocabulary.

Args

embeddings : dict
a dictionary mapping words to numpy arrays.
vocab : dict
word-to-index lookup table.
dim : int
dimension of the word vectors.

Returns

numpy array
an array of word embeddings.
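A toy sketch showing how filter_embeddings lines up embedding rows with vocabulary ids (the vectors below are made up):

import numpy as np

embeddings = {
    "hello": np.array([0.1, 0.2, 0.3]),
    "world": np.array([0.4, 0.5, 0.6]),
}
vocab = {"<pad>": 0, "hello": 1, "world": 2, "unseen": 3}

matrix = filter_embeddings(embeddings, vocab, dim=3)
# matrix.shape == (4, 3); row 1 holds the "hello" vector and row 3 stays
# all zeros because "unseen" has no pretrained vector.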
def load_data_and_labels(filename, encoding='utf-8')

Loads data and labels from a file.

Args

filename : str
path to the file.
encoding : str
file encoding format.

The file format is tab-separated values. A blank line is required at the end of a sentence.

For example:

EU      B-ORG
rejects O
German  B-MISC
call    O
to      O
boycott O
British B-MISC
lamb    O
.       O

Peter   B-PER
Blackburn       I-PER
...

Returns

tuple(list, list): sentences and their label sequences.

Example

>>> filename = 'conll2003/en/ner/train.txt'
>>> data, labels = load_data_and_labels(filename)
def load_glove(file)

Loads GloVe vectors into a dict of numpy arrays.

Args

file : str
a path to a glove file.

Returns

dict
a dict mapping words to numpy arrays.

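load_glove and filter_embeddings are typically used together: load the pretrained vectors, then keep only the rows needed for your vocabulary. The path below assumes the standard glove.6B.100d.txt release has been downloaded and unzipped, and that vocab is a word-to-index dict:

glove = load_glove("glove.6B.100d.txt")   # path assumed; ~400K words, 100 dims
embedding_matrix = filter_embeddings(glove, vocab, dim=100)
# embedding_matrix can then seed a keras Embedding layer, e.g.
# keras.layers.Embedding(len(vocab), 100, weights=[embedding_matrix])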

Classes

class AnagoNERSequence (x, y, batch_size=1, preprocess=None)

Base object for fitting to a sequence of data, such as a dataset.

Every Sequence must implement the __getitem__ and the __len__ methods. If you want to modify your dataset between epochs you may implement on_epoch_end. The method __getitem__ should return a complete batch.

Notes:

Sequences are a safer way to do multiprocessing. This structure guarantees that the network will train on each sample only once per epoch, which is not the case with generators.

Examples:

from skimage.io import imread
from skimage.transform import resize
import tensorflow as tf
import numpy as np
import math

# Here, `x_set` is a list of paths to the images
# and `y_set` are the associated classes.

class CIFAR10Sequence(tf.keras.utils.Sequence):

    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

        return np.array([
            resize(imread(file_name), (200, 200))
            for file_name in batch_x]), np.array(batch_y)

Ancestors

  • keras.utils.data_utils.Sequence
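AnagoNERSequence itself only slices x and y into batches; all padding and vectorization is delegated to the preprocess callable. A minimal sketch with a stand-in preprocess (a real one would come from the anago preprocessor):

x = [["EU", "rejects"], ["Peter", "Blackburn"]]
y = [["B-ORG", "O"], ["B-PER", "I-PER"]]

def passthrough(batch_x, batch_y):
    # stand-in: a real preprocess would pad and encode the batch
    return batch_x, batch_y

seq = AnagoNERSequence(x, y, batch_size=1, preprocess=passthrough)
len(seq)   # 2 batches
seq[0]     # ([['EU', 'rejects']], [['B-ORG', 'O']])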
class Vocabulary (max_size=None, lower=True, unk_token=True, specials=('<pad>',))

A vocabulary that maps tokens to integer ids.

Attributes

_token_count
A collections.Counter object holding the frequencies of tokens in the data used to build the Vocabulary.
_token2id
A dict mapping token strings to numerical identifiers.
_id2token
A list of token strings indexed by their numerical identifiers.

Create a Vocabulary object.

Args

max_size
The maximum size of the vocabulary, or None for no maximum. Default: None.
lower
boolean. Whether to convert the texts to lowercase.
unk_token
boolean. Whether to add unknown token.
specials
The list of special tokens (e.g., padding or eos) that will be prepended to the vocabulary. Default: ('<pad>',)
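A short end-to-end sketch: add documents, build, then map tokens to ids and back (tokens are lowercased by default, and unseen tokens resolve to <unk>):

vocab = Vocabulary()
vocab.add_documents([["EU", "rejects"], ["Peter", "Blackburn"]])
vocab.build()
ids = vocab.doc2id(["EU", "unseen-word"])   # e.g., [1, 5]
tokens = vocab.id2doc(ids)                  # ['eu', '<unk>']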

Instance variables

var reverse_vocab

Return the reverse mapping: a list of tokens indexed by id.

Returns

list
the id-to-token list.
var vocab

Return the vocabulary.

Returns

dict
the token-to-id mapping.

Methods

def add_documents(self, docs)

Update dictionary from a collection of documents. Each document is a list of tokens.

Args

docs : list
documents to add.
def add_token(self, token)

Add token to vocabulary.

Args

token : str
token to add.
def build(self)

Build the vocabulary: ids are assigned to special tokens first, then to the remaining tokens in frequency order; <unk> is appended last if enabled.

def doc2id(self, doc)

Get the list of token ids for the given document.

Args

doc : list
document.

Returns

list
token ids of the document.
def id2doc(self, ids)

Convert token ids back to their tokens.

Args

ids : list
token ids.

Returns

list
token list.
def id_to_token(self, idx)

Convert a token id to its token string.

Args

idx : int
token id.

Returns

str
string of given token id.
def process_token(self, token)

Process a token before it is used by the following methods:

  • add_token
  • add_documents
  • doc2id
  • token_to_id

Args

token : str
token to process.

Returns

str
processed token string.
def token_to_id(self, token)

Get the token id of the given token.

Args

token : str
token from vocabulary.

Returns

int
id of the token; unknown tokens map to the last id (the <unk> token, when enabled).