Module ktrain.dataset
Expand source code
from .imports import *
class Dataset:
    """
    ```
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        raise NotImplementedError

    # required: used by ktrain.core.Learner instances
    def get_y(self):
        raise NotImplementedError

    # optional: to modify dataset between epochs (e.g., shuffle)
    def on_epoch_end(self):
        pass

    # optional
    def ondisk(self):
        """
        ```
        Is the data being read from disk (e.g., with a DirectoryIterator)?
        ```
        """
        return False

    # optional: used only if invoking *_classifier functions
    def xshape(self):
        """
        ```
        Shape of X.

        Examples:
            for images: input_shape
            for text: (n_examples, sequence_length)
        ```
        """
        raise NotImplementedError

    # optional: used only if invoking *_classifier functions
    def nclasses(self):
        """
        ```
        Number of classes.

        For classification problems, this is the number of labels.
        Not used for regression problems.
        ```
        """
        raise NotImplementedError
class TFDataset(Dataset):
    """
    ```
    Wrapper for tf.data.Datasets
    ```
    """

    def __init__(self, tfdataset, n, y):
        """
        ```
        Args:
          tfdataset(tf.data.Dataset): a tf.data.Dataset instance
          n(int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
          y(np.ndarray): y values for each example - should be in the format expected by your model (e.g., one-hot-encoded)
        ```
        """
        if not isinstance(tfdataset, tf.data.Dataset):
            raise ValueError(
                "tfdataset must be a fully-configured tf.data.Dataset with batch_size, etc. set appropriately"
            )
        self.tfdataset = tfdataset
        # extract batch_size from the first batch of the tf.data.Dataset
        self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0]
        self.n = n
        self.y = y

    @property
    def batch_size(self):
        return self.bs

    @batch_size.setter
    def batch_size(self, value):
        if value != self.bs:
            warnings.warn(
                "batch_size parameter is ignored, as pre-configured batch_size of tf.data.Dataset is used"
            )

    def nsamples(self):
        return self.n

    def get_y(self):
        return self.y

    def to_tfdataset(self, train=True):
        return self.tfdataset
class SequenceDataset(Dataset, keras.utils.Sequence):
    """
    ```
    Base class for custom datasets in ktrain that are also Keras Sequences.

    If a subclass implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    def __init__(self, batch_size=32):
        self.batch_size = batch_size

    # required by keras.utils.Sequence instances
    def __len__(self):
        raise NotImplementedError

    # required by keras.utils.Sequence instances
    def __getitem__(self, idx):
        raise NotImplementedError
class MultiArrayDataset(SequenceDataset):
    def __init__(self, x, y, batch_size=32, shuffle=True):
        # error checks
        err = False
        if type(x) == np.ndarray:
            if len(x.shape) != 2:
                err = True
        elif type(x) == list:
            for d in x:
                if type(d) != np.ndarray or len(d.shape) != 2:
                    err = True
                    break
        else:
            err = True
        if err:
            raise ValueError("x must be a 2d numpy array or a list of 2d numpy arrays")
        if type(y) != np.ndarray:
            raise ValueError("y must be a numpy array")
        # normalize single array to a one-element list of inputs
        if type(x) == np.ndarray:
            x = [x]

        # set variables
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y
        self.indices = np.arange(self.x[0].shape[0])
        self.n_inputs = len(x)
        self.shuffle = shuffle

    def __len__(self):
        return math.ceil(self.x[0].shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_x = []
        for i in range(self.n_inputs):
            batch_x.append(self.x[i][inds])
        batch_y = self.y[inds]
        return tuple(batch_x), batch_y

    def nsamples(self):
        return self.x[0].shape[0]

    def get_y(self):
        return self.y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def xshape(self):
        return self.x[0].shape

    def nclasses(self):
        return self.y.shape[1]

    def ondisk(self):
        return False
Classes
class Dataset
-
Base class for custom datasets in ktrain. If a subclass of Dataset implements a to_tfdataset method that converts the data to a tf.data.Dataset, then it will be invoked by Learner instances just prior to training, so fit() will train using a tf.data.Dataset representation of your data. Sequence methods such as __getitem__ and __len__ must still be implemented. The signature of to_tfdataset is as follows: def to_tfdataset(self, train=True) See ktrain.text.preprocess.TransformerDataset as an example.
Subclasses
- TFDataset
- SequenceDataset
Methods
def get_y(self)
-
def nclasses(self)
-
Number of classes. For classification problems, this is the number of labels; not used for regression problems.
def nsamples(self)
-
def on_epoch_end(self)
-
def ondisk(self)
-
Is the data being read from disk (e.g., with a DirectoryIterator)?
def xshape(self)
-
Shape of X. Examples: for images, input_shape; for text, (n_examples, sequence_length).
class MultiArrayDataset (x, y, batch_size=32, shuffle=True)
-
Base class for custom datasets in ktrain. If a subclass of Dataset implements a to_tfdataset method that converts the data to a tf.data.Dataset, then it will be invoked by Learner instances just prior to training, so fit() will train using a tf.data.Dataset representation of your data. Sequence methods such as __getitem__ and __len__ must still be implemented. The signature of to_tfdataset is as follows: def to_tfdataset(self, train=True) See ktrain.text.preprocess.TransformerDataset as an example.
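For instance, a model with two 2-D input arrays could be wrapped as follows. The shapes and one-hot labels here are illustrative, not prescribed by the class:

```
import numpy as np
from ktrain.dataset import MultiArrayDataset

# two 2-D input arrays with 100 examples each (illustrative shapes)
x1 = np.random.rand(100, 16)
x2 = np.random.rand(100, 8)
y = np.eye(3)[np.random.randint(0, 3, size=100)]  # one-hot labels, 3 classes

train_data = MultiArrayDataset([x1, x2], y, batch_size=32, shuffle=True)
batch_x, batch_y = train_data[0]  # ((32, 16), (32, 8)) inputs, (32, 3) labels
print(train_data.nsamples(), train_data.nclasses())  # 100 3
```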
Ancestors
- SequenceDataset
- Dataset
- keras.utils.data_utils.Sequence
Methods
def get_y(self)
-
def nsamples(self)
-
def on_epoch_end(self)
-
Method called at the end of every epoch.
Inherited members
class SequenceDataset (batch_size=32)
-
Base class for custom datasets in ktrain that are also Keras Sequences. If a subclass implements a to_tfdataset method that converts the data to a tf.data.Dataset, then it will be invoked by Learner instances just prior to training, so fit() will train using a tf.data.Dataset representation of your data. Sequence methods such as __getitem__ and __len__ must still be implemented. The signature of to_tfdataset is as follows: def to_tfdataset(self, train=True) See ktrain.text.preprocess.TransformerDataset as an example.
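A minimal subclass only needs __len__ and __getitem__ (required by keras.utils.Sequence) plus nsamples and get_y from Dataset. ArraySequence below is a hypothetical sketch, not part of ktrain:

```
import math
import numpy as np
from ktrain.dataset import SequenceDataset

class ArraySequence(SequenceDataset):  # hypothetical, not part of ktrain
    def __init__(self, x, y, batch_size=32, shuffle=True):
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y
        self.shuffle = shuffle
        self.indices = np.arange(x.shape[0])

    # required by keras.utils.Sequence: number of batches per epoch
    def __len__(self):
        return math.ceil(self.x.shape[0] / self.batch_size)

    # required by keras.utils.Sequence: one batch of data
    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
        return self.x[inds], self.y[inds]

    def nsamples(self):
        return self.x.shape[0]

    def get_y(self):
        return self.y

    # optional: reshuffle between epochs
    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)
```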
Ancestors
- Dataset
- keras.utils.data_utils.Sequence
Subclasses
- MultiArrayDataset
- LinkSequenceWrapper
- NodeSequenceWrapper
- TabularDataset
- TransformerDataset
- NERSequence
Inherited members
class TFDataset (tfdataset, n, y)
-
Wrapper for tf.data.Datasets
Args:
- tfdataset(tf.data.Dataset): a tf.data.Dataset instance
- n(int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
- y(np.ndarray): y values for each example - should be in the format expected by your model (e.g., one-hot-encoded)
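Wrapping a pre-batched tf.data.Dataset might look like this (illustrative shapes and values; the dataset must already be batched, since TFDataset reads the batch size from its first batch):

```
import numpy as np
import tensorflow as tf
from ktrain.dataset import TFDataset

x = np.random.rand(100, 16).astype("float32")
y = np.eye(2)[np.random.randint(0, 2, size=100)]  # one-hot labels

# batch_size must be set on the tf.data.Dataset itself
tfds = tf.data.Dataset.from_tensor_slices((x, y)).batch(32)
train_data = TFDataset(tfds, n=100, y=y)
print(train_data.batch_size, train_data.nsamples())  # 32 100
```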
Ancestors
- Dataset
Instance variables
var batch_size
-
Methods
def get_y(self)
-
def nsamples(self)
-
def to_tfdataset(self, train=True)
-
Inherited members