Module ktrain.text.ner.anago.utils
Utility functions.
Source code
"""
Utility functions.
"""
from ....imports import *
def download(url):
"""Download a trained weights, config and preprocessor.
Args:
url (str): target url.
"""
filepath = keras.utils.get_file(fname="tmp.zip", origin=url, extract=True)
base_dir = os.path.dirname(filepath)
weights_file = os.path.join(base_dir, "weights.h5")
params_file = os.path.join(base_dir, "params.json")
preprocessor_file = os.path.join(base_dir, "preprocessor.pickle")
return weights_file, params_file, preprocessor_file
def load_data_and_labels(filename, encoding="utf-8"):
    """Loads data and labels from a file.

    Args:
        filename (str): path to the file.
        encoding (str): file encoding format.

    The file format is tab-separated values.
    A blank line is required at the end of a sentence.

    For example:
    ```
    EU	B-ORG
    rejects	O
    German	B-MISC
    call	O
    to	O
    boycott	O
    British	B-MISC
    lamb	O
    .	O

    Peter	B-PER
    Blackburn	I-PER
    ...
    ```

    Returns:
        tuple(list, list): data and labels, each a list of lists of strings.

    Example:
        >>> filename = 'conll2003/en/ner/train.txt'
        >>> data, labels = load_data_and_labels(filename)
    """
    sents, labels = [], []
    words, tags = [], []
    with open(filename, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                # non-blank line: token and tag separated by a tab
                word, tag = line.split("\t")
                words.append(word)
                tags.append(tag)
            else:
                # blank line marks the end of a sentence
                sents.append(words)
                labels.append(tags)
                words, tags = [], []
    return sents, labels
# keras Sequence that applies a preprocessing function to each batch of NER data
class AnagoNERSequence(keras.utils.Sequence):
    def __init__(self, x, y, batch_size=1, preprocess=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.preprocess = preprocess

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size : (idx + 1) * self.batch_size]
        return self.preprocess(batch_x, batch_y)

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)
class Vocabulary(object):
    """A vocabulary that maps tokens to ints (storing a vocabulary).

    Attributes:
        _token_count: A collections.Counter object holding the frequencies of tokens
          in the data used to build the Vocabulary.
        _token2id: A dict mapping token strings to numerical identifiers.
        _id2token: A list of token strings indexed by their numerical identifiers.
    """

    def __init__(self, max_size=None, lower=True, unk_token=True, specials=("<pad>",)):
        """Create a Vocabulary object.

        Args:
            max_size: The maximum size of the vocabulary, or None for no
              maximum. Default: None.
            lower: boolean. Whether to convert the texts to lowercase.
            unk_token: boolean. Whether to add an unknown token.
            specials: The list of special tokens (e.g., padding or eos) that
              will be prepended to the vocabulary. Default: ('<pad>',)
        """
        self._max_size = max_size
        self._lower = lower
        self._unk = unk_token
        self._token2id = {token: i for i, token in enumerate(specials)}
        self._id2token = list(specials)
        self._token_count = Counter()

    def __len__(self):
        return len(self._token2id)
    def add_token(self, token):
        """Add token to vocabulary.

        Args:
            token (str): token to add.
        """
        token = self.process_token(token)
        self._token_count.update([token])

    def add_documents(self, docs):
        """Update dictionary from a collection of documents. Each document is a list
        of tokens.

        Args:
            docs (list): documents to add.
        """
        for sent in docs:
            sent = map(self.process_token, sent)
            self._token_count.update(sent)

    def doc2id(self, doc):
        """Get the list of token_id given doc.

        Args:
            doc (list): document.

        Returns:
            list: int id of doc.
        """
        doc = map(self.process_token, doc)
        return [self.token_to_id(token) for token in doc]

    def id2doc(self, ids):
        """Get the token list.

        Args:
            ids (list): token ids.

        Returns:
            list: token list.
        """
        return [self.id_to_token(idx) for idx in ids]
    def build(self):
        """
        Build vocabulary.
        """
        token_freq = self._token_count.most_common(self._max_size)
        idx = len(self.vocab)  # start numbering after the special tokens
        for token, _ in token_freq:
            self._token2id[token] = idx
            self._id2token.append(token)
            idx += 1
        if self._unk:
            unk = "<unk>"
            self._token2id[unk] = idx
            self._id2token.append(unk)
    def process_token(self, token):
        """Process a token before it is used by the following methods:

        * add_token
        * add_documents
        * doc2id
        * token_to_id

        Args:
            token (str): token to process.

        Returns:
            str: processed token string.
        """
        if self._lower:
            token = token.lower()
        return token

    def token_to_id(self, token):
        """Get the token_id of given token.

        Args:
            token (str): token from vocabulary.

        Returns:
            int: int id of token.
        """
        token = self.process_token(token)
        # unknown tokens fall back to the last id (the <unk> id when unk_token=True)
        return self._token2id.get(token, len(self._token2id) - 1)

    def id_to_token(self, idx):
        """token-id to token (string).

        Args:
            idx (int): token id.

        Returns:
            str: string of given token id.
        """
        return self._id2token[idx]

    @property
    def vocab(self):
        """Return the vocabulary.

        Returns:
            dict: the token-to-id dict of the vocabulary.
        """
        return self._token2id

    @property
    def reverse_vocab(self):
        """Return the reverse vocabulary (id-to-token mapping).

        Returns:
            list: a list of tokens indexed by id.
        """
        return self._id2token
def filter_embeddings(embeddings, vocab, dim):
    """Builds a numpy array of word vectors for the words in a vocabulary.

    Args:
        embeddings (dict): a dictionary mapping words to numpy arrays.
        vocab (dict): word-to-index lookup table.
        dim (int): dimension of the embeddings.

    Returns:
        numpy array: an array of word embeddings.
    """
    if not isinstance(embeddings, dict):
        return
    _embeddings = np.zeros([len(vocab), dim])
    for word in vocab:
        if word in embeddings:
            word_idx = vocab[word]
            _embeddings[word_idx] = embeddings[word]
    return _embeddings


def load_glove(file):
    """Loads GloVe vectors into a dict of numpy arrays.

    Args:
        file (str): a path to a GloVe file.

    Returns:
        dict: a dict mapping words to numpy arrays.
    """
    model = {}
    with open(file) as f:
        for line in f:
            line = line.split(" ")
            word = line[0]
            vector = np.array([float(val) for val in line[1:]])
            model[word] = vector
    return model
Functions
def download(url)
Download trained weights, config, and preprocessor.
Args
    url (str): target url.
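A minimal usage sketch (the URL is hypothetical; the archive is assumed to contain weights.h5, params.json, and preprocessor.pickle):
```
from ktrain.text.ner.anago.utils import download

# hypothetical URL pointing at a zip archive of a trained model
url = "https://example.com/ner_model.zip"
weights_file, params_file, preprocessor_file = download(url)
print(weights_file)       # .../weights.h5
print(params_file)        # .../params.json
print(preprocessor_file)  # .../preprocessor.pickle
```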
def filter_embeddings(embeddings, vocab, dim)
Builds a numpy array of word vectors for the words in a vocabulary.
Args
    embeddings (dict): a dictionary mapping words to numpy arrays.
    vocab (dict): word-to-index lookup table.
    dim (int): dimension of the embeddings.
Returns
    numpy array: an array of word embeddings.
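A minimal sketch with toy, made-up vectors showing how the embedding matrix is assembled; rows for words without a pretrained vector stay zero:
```
import numpy as np
from ktrain.text.ner.anago.utils import filter_embeddings

# toy word vectors and a word-to-index vocabulary (illustrative values)
embeddings = {
    "cat": np.array([0.1, 0.2, 0.3]),
    "dog": np.array([0.4, 0.5, 0.6]),
}
vocab = {"<pad>": 0, "cat": 1, "dog": 2, "fish": 3}

matrix = filter_embeddings(embeddings, vocab, dim=3)
print(matrix.shape)  # (4, 3)
print(matrix[1])     # [0.1 0.2 0.3]  -> row for "cat"
print(matrix[3])     # [0. 0. 0.]     -> "fish" has no pretrained vector
```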
def load_data_and_labels(filename, encoding='utf-8')
Loads data and labels from a file.
Args
    filename (str): path to the file.
    encoding (str): file encoding format.
The file format is tab-separated values. A blank line is required at the end of a sentence.
For example:
```
EU	B-ORG
rejects	O
German	B-MISC
call	O
to	O
boycott	O
British	B-MISC
lamb	O
.	O

Peter	B-PER
Blackburn	I-PER
...
```
Returns
    tuple(list, list): data and labels, each a list of lists of strings.
Example
    >>> filename = 'conll2003/en/ner/train.txt'
    >>> data, labels = load_data_and_labels(filename)
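A self-contained sketch that writes a tiny tab-separated file in the format above and loads it back (file name and contents are illustrative):
```
from ktrain.text.ner.anago.utils import load_data_and_labels

# two sentences, tab-separated, each terminated by a blank line
sample = (
    "EU\tB-ORG\n"
    "rejects\tO\n"
    ".\tO\n"
    "\n"
    "Peter\tB-PER\n"
    "Blackburn\tI-PER\n"
    "\n"
)
with open("tiny_ner.txt", "w", encoding="utf-8") as f:
    f.write(sample)

data, labels = load_data_and_labels("tiny_ner.txt")
print(data[0])    # ['EU', 'rejects', '.']
print(labels[1])  # ['B-PER', 'I-PER']
```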
def load_glove(file)
Loads GloVe vectors into a dict of numpy arrays.
Args
    file (str): a path to a GloVe file.
Returns
    dict: a dict mapping words to numpy arrays.
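A minimal sketch of the expected file format and a typical call chain into filter_embeddings (the file path is hypothetical):
```
from ktrain.text.ner.anago.utils import load_glove, filter_embeddings

# each line of a GloVe file is "<word> <val_1> ... <val_dim>", e.g.
#   the 0.418 0.24968 -0.41242 ...
glove = load_glove("glove.6B.100d.txt")  # hypothetical path
print(glove["the"].shape)                # (100,)

# build an embedding matrix aligned with a word-to-index vocabulary
embedding_matrix = filter_embeddings(glove, vocab={"<pad>": 0, "the": 1}, dim=100)
```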
Classes
class AnagoNERSequence (x, y, batch_size=1, preprocess=None)
Base object for fitting to a sequence of data, such as a dataset.
Every Sequence must implement the __getitem__ and the __len__ methods. If you want to modify your dataset between epochs you may implement on_epoch_end. The method __getitem__ should return a complete batch.
Notes:
Sequences are a safer way to do multiprocessing. This structure guarantees that the network will only train once on each sample per epoch, which is not the case with generators.
Examples:
    from skimage.io import imread
    from skimage.transform import resize
    import numpy as np
    import math

    # Here, `x_set` is list of path to the images
    # and `y_set` are the associated classes.

    class CIFAR10Sequence(tf.keras.utils.Sequence):

        def __init__(self, x_set, y_set, batch_size):
            self.x, self.y = x_set, y_set
            self.batch_size = batch_size

        def __len__(self):
            return math.ceil(len(self.x) / self.batch_size)

        def __getitem__(self, idx):
            batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
            batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

            return np.array([
                resize(imread(file_name), (200, 200))
                for file_name in batch_x]), np.array(batch_y)
Ancestors
- keras.utils.data_utils.Sequence
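A minimal usage sketch; the `fake_preprocess` callable below is a stand-in for the real preprocessor's batch transform:
```
from ktrain.text.ner.anago.utils import AnagoNERSequence

x = [["EU", "rejects", "."], ["Peter", "Blackburn"]]
y = [["B-ORG", "O", "O"], ["B-PER", "I-PER"]]

def fake_preprocess(batch_x, batch_y):
    # stand-in: a real preprocessor would convert the batch to model-ready arrays
    return batch_x, batch_y

seq = AnagoNERSequence(x, y, batch_size=1, preprocess=fake_preprocess)
print(len(seq))   # 2 batches
bx, by = seq[0]   # first (already "preprocessed") batch
```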
class Vocabulary (max_size=None, lower=True, unk_token=True, specials=('<pad>',))
A vocabulary that maps tokens to ints (storing a vocabulary).
Attributes
    _token_count: A collections.Counter object holding the frequencies of tokens in the data used to build the Vocabulary.
    _token2id: A dict mapping token strings to numerical identifiers.
    _id2token: A list of token strings indexed by their numerical identifiers.
Create a Vocabulary object.
Args
    max_size: The maximum size of the vocabulary, or None for no maximum. Default: None.
    lower: boolean. Whether to convert the texts to lowercase.
    unk_token: boolean. Whether to add an unknown token.
    specials: The list of special tokens (e.g., padding or eos) that will be prepended to the vocabulary. Default: ('<pad>',)
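A minimal usage sketch with toy documents (ids shown in comments assume this exact data):
```
from ktrain.text.ner.anago.utils import Vocabulary

docs = [["EU", "rejects", "German", "call"], ["Peter", "Blackburn"]]

v = Vocabulary(lower=True, unk_token=True)  # defaults shown for clarity
v.add_documents(docs)
v.build()

print(len(v))                                 # 8: '<pad>' + 6 unique tokens + '<unk>'
print(v.doc2id(["EU", "call"]))               # e.g. [1, 4]; ids depend on token order
print(v.id2doc(v.doc2id(["Peter", "call"])))  # ['peter', 'call'] (lowercased)
```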
Instance variables
var reverse_vocab
Return the reverse vocabulary (id-to-token mapping).
Returns
    list: a list of tokens indexed by id.
var vocab
Return the vocabulary.
Returns
    dict: the token-to-id dict of the vocabulary.
Methods
def add_documents(self, docs)
Update dictionary from a collection of documents. Each document is a list of tokens.
Args
    docs (list): documents to add.
def add_token(self, token)
Add token to vocabulary.
Args
    token (str): token to add.
def build(self)
Build vocabulary.
def doc2id(self, doc)
Get the list of token_id given doc.
Args
    doc (list): document.
Returns
    list: int id of doc.
def id2doc(self, ids)
Get the token list.
Args
    ids (list): token ids.
Returns
    list: token list.
def id_to_token(self, idx)
token-id to token (string).
Args
    idx (int): token id.
Returns
    str: string of given token id.
def process_token(self, token)
Process a token before it is used by the following methods:
* add_token
* add_documents
* doc2id
* token_to_id
Args
    token (str): token to process.
Returns
    str: processed token string.
def token_to_id(self, token)
Get the token_id of given token.
Args
    token (str): token from vocabulary.
Returns
    int: int id of token.
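A small sketch of the out-of-vocabulary fallback: a token that was never added maps to the last id, which is the '<unk>' id when unk_token=True (toy data, illustrative only):
```
from ktrain.text.ner.anago.utils import Vocabulary

v = Vocabulary(unk_token=True)
v.add_documents([["hello", "world"]])
v.build()

unk_id = v.token_to_id("neverseen")  # not in the vocabulary
print(v.id_to_token(unk_id))         # '<unk>'
print(v.token_to_id("Hello"))        # lowercased first, so resolves to a known id
```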