Module ktrain.text.ner.dataset
Source code
from ...dataset import SequenceDataset
from ...imports import *


class NERSequence(SequenceDataset):
    def __init__(self, x, y, batch_size=1, p=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.p = p
        self.prepare_called = False

    def prepare(self):
        # one-time tokenization fixup via the preprocessor; subsequent calls are no-ops
        if self.p is not None and not self.prepare_called:
            self.x, self.y = self.p.fix_tokenization(self.x, self.y)
        self.prepare_called = True
        return

    def __getitem__(self, idx):
        # slice out the idx-th batch and let the preprocessor vectorize it
        batch_x = self.x[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size : (idx + 1) * self.batch_size]
        return self.p.transform(batch_x, batch_y)

    def __len__(self):
        # number of batches, counting a final partial batch
        return math.ceil(len(self.x) / self.batch_size)

    def get_lengths(self, idx):
        # true (unpadded) length of each sequence in batch idx, found by
        # locating the first padding label (id 0) in the one-hot targets
        x_true, y_true = self[idx]
        lengths = []
        for y in np.argmax(y_true, -1):
            try:
                i = list(y).index(0)
            except ValueError:
                i = len(y)  # no padding: sequence spans the full width
            lengths.append(i)
        return lengths

    def nsamples(self):
        return len(self.x)

    def get_y(self):
        return self.y

    def xshape(self):
        return (len(self.x), self[0][0][0].shape[1])

    def nclasses(self):
        return len(self.p._label_vocab._id2token)
Classes
class NERSequence (x, y, batch_size=1, p=None)
Base class for custom datasets in ktrain. If a subclass of Dataset implements a to_tfdataset method that converts the data to a tf.Dataset, this method will be invoked by Learner instances just prior to training, so that fit() trains using a tf.Dataset representation of your data. Sequence methods such as __getitem__ and __len__ must still be implemented. The signature of to_tfdataset is as follows:

def to_tfdataset(self, training=True)

See ktrain.text.preprocess.TransformerDataset as an example.
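To make the protocol concrete, here is a minimal sketch of a to_tfdataset implementation. This is a hypothetical illustration, not ktrain's actual implementation: the class name, the generator-based conversion, and the output_signature shapes/dtypes are all assumptions for illustration. See ktrain.text.preprocess.TransformerDataset for a real example.

import tensorflow as tf
from ktrain.dataset import SequenceDataset


class MyCustomDataset(SequenceDataset):
    # ... __getitem__ and __len__ implemented as usual ...

    def to_tfdataset(self, training=True):
        # Hypothetical sketch: wrap this Sequence in a tf.data.Dataset by
        # yielding batches from __getitem__. Shapes/dtypes are assumptions.
        def gen():
            for i in range(len(self)):
                yield self[i]  # (batch_x, batch_y)

        tfd = tf.data.Dataset.from_generator(
            gen,
            output_signature=(
                tf.TensorSpec(shape=(None, None), dtype=tf.int32),
                tf.TensorSpec(shape=(None, None, None), dtype=tf.float32),
            ),
        )
        # repeat during training; Learner-style loops typically bound the
        # number of steps per epoch themselves
        return tfd.repeat() if training else tfd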
Source code

class NERSequence(SequenceDataset):
    def __init__(self, x, y, batch_size=1, p=None):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.p = p
        self.prepare_called = False

    def prepare(self):
        if self.p is not None and not self.prepare_called:
            self.x, self.y = self.p.fix_tokenization(self.x, self.y)
        self.prepare_called = True
        return

    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size : (idx + 1) * self.batch_size]
        return self.p.transform(batch_x, batch_y)

    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)

    def get_lengths(self, idx):
        x_true, y_true = self[idx]
        lengths = []
        for y in np.argmax(y_true, -1):
            try:
                i = list(y).index(0)
            except ValueError:
                i = len(y)
            lengths.append(i)
        return lengths

    def nsamples(self):
        return len(self.x)

    def get_y(self):
        return self.y

    def xshape(self):
        return (len(self.x), self[0][0][0].shape[1])

    def nclasses(self):
        return len(self.p._label_vocab._id2token)
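As a usage sketch: the toy x/y data and the preprocessor variable below are hypothetical. In practice these objects are constructed for you by ktrain's entities_from_* loaders (e.g., text.entities_from_txt) rather than built by hand.

x = [["Hello", "world"], ["ktrain", "is", "easy"]]  # token lists
y = [["O", "O"], ["B-MISC", "O", "O"]]              # aligned tag lists

seq = NERSequence(x, y, batch_size=2, p=preprocessor)  # `preprocessor` is an assumed NERPreprocessor-like object
seq.prepare()              # one-time tokenization fixup via p.fix_tokenization
print(len(seq))            # number of batches: ceil(2 / 2) == 1
batch_x, batch_y = seq[0]  # p.transform vectorizes the raw tokens/tags into arrays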
Ancestors
- SequenceDataset
- Dataset
- keras.utils.data_utils.Sequence
Methods
def get_lengths(self, idx)
Source code

def get_lengths(self, idx):
    # true (unpadded) length of each sequence in batch idx, found by
    # locating the first padding label (id 0) in the one-hot targets
    x_true, y_true = self[idx]
    lengths = []
    for y in np.argmax(y_true, -1):
        try:
            i = list(y).index(0)
        except ValueError:
            i = len(y)
        lengths.append(i)
    return lengths
def get_y(self)
Source code

def get_y(self):
    return self.y
def nsamples(self)
Source code

def nsamples(self):
    return len(self.x)
def prepare(self)
Source code

def prepare(self):
    # one-time tokenization fixup; subsequent calls are no-ops
    if self.p is not None and not self.prepare_called:
        self.x, self.y = self.p.fix_tokenization(self.x, self.y)
    self.prepare_called = True
    return
Inherited members