Module ktrain.dataset

from .imports import *


class Dataset:
    """
    ```
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        raise NotImplementedError

    # required: used by ktrain.core.Learner instances
    def get_y(self):
        raise NotImplementedError

    # optional: to modify dataset between epochs (e.g., shuffle)
    def on_epoch_end(self):
        pass

    # optional
    def ondisk(self):
        """
        ```
        Is the data being read from disk (e.g., with a DirectoryIterator)?
        ```
        """
        return False

    # optional: used only if invoking *_classifier functions
    def xshape(self):
        """
        ```
        shape of X
        Examples:
            for images: input_shape
            for text: (n_examples, sequence_length)
        ```
        """
        raise NotImplementedError

    # optional: used only if invoking *_classifier functions
    def nclasses(self):
        """
        ```
        Number of classes
        For classification problems: this is the number of labels
        Not used for regression problems
        ```
        """
        raise NotImplementedError


class TFDataset(Dataset):
    """
    ```
    Wrapper for tf.data.Datasets
    ```
    """

    def __init__(self, tfdataset, n, y):
        """
        ```
        Args:
          tfdataset(tf.data.Dataset):  a tf.data.Dataset instance
          n(int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
          y(np.ndarray): y values for each example - should be in the format expected by your model (e.g., one-hot-encoded)
        ```
        """
        if not isinstance(tfdataset, tf.data.Dataset):
            raise ValueError(
                "tfdataset must be a fully-configured tf.data.Dataset with batch_size, etc. set appropriately"
            )
        self.tfdataset = tfdataset
        # infer batch_size from the last component (y) of the first batch
        self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0]
        self.n = n
        self.y = y

    @property
    def batch_size(self):
        return self.bs

    @batch_size.setter
    def batch_size(self, value):
        if value != self.bs:
            warnings.warn(
                "batch_size parameter is ignored, as pre-configured batch_size of tf.data.Dataset is used"
            )

    def nsamples(self):
        return self.n

    def get_y(self):
        return self.y

    def to_tfdataset(self, train=True):
        return self.tfdataset


class SequenceDataset(Dataset, keras.utils.Sequence):
    """
    ```
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    def __init__(self, batch_size=32):
        self.batch_size = batch_size

    # required by keras.utils.Sequence instances
    def __len__(self):
        raise NotImplementedError

    # required by keras.utils.Sequence instances
    def __getitem__(self, idx):
        raise NotImplementedError


class MultiArrayDataset(SequenceDataset):
    def __init__(self, x, y, batch_size=32, shuffle=True):
        # error checks
        err = False
        if type(x) == np.ndarray:
            if len(x.shape) != 2:
                err = True
        elif type(x) == list:
            for d in x:
                if type(d) != np.ndarray or len(d.shape) != 2:
                    err = True
                    break
        else:
            err = True
        if err:
            raise ValueError("x must be a 2d numpy array or a list of 2d numpy arrays")
        if type(y) != np.ndarray:
            raise ValueError("y must be a numpy array")
        if type(x) == np.ndarray:
            x = [x]

        # set variables
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y
        self.indices = np.arange(self.x[0].shape[0])
        self.n_inputs = len(x)
        self.shuffle = shuffle

    def __len__(self):
        return math.ceil(self.x[0].shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_x = []
        for i in range(self.n_inputs):
            batch_x.append(self.x[i][inds])
        batch_y = self.y[inds]
        return tuple(batch_x), batch_y

    def nsamples(self):
        return self.x[0].shape[0]

    def get_y(self):
        return self.y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def xshape(self):
        return self.x[0].shape

    def nclasses(self):
        return self.y.shape[1]

    def ondisk(self):
        return False
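
For orientation, here is a minimal usage sketch (not taken from the ktrain docs) showing how a dataset from this module plugs into a Learner. The compiled Keras model named model, with inputs matching the two 2-D blocks below, is an assumption of the example:

import numpy as np
import ktrain
from ktrain.dataset import MultiArrayDataset

# toy data: two 2-D input blocks and one-hot labels for 3 classes
x1 = np.random.rand(1000, 10).astype("float32")
x2 = np.random.rand(1000, 5).astype("float32")
y = np.eye(3)[np.random.randint(0, 3, 1000)]

trn = MultiArrayDataset([x1, x2], y, batch_size=32, shuffle=True)

# model is an assumed, pre-built and compiled Keras model
learner = ktrain.get_learner(model, train_data=trn)
learner.fit_onecycle(1e-3, 3)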

Classes

class Dataset
Base class for custom datasets in ktrain.

If a subclass of Dataset implements a to_tfdataset method
that converts the data to a tf.data.Dataset, then it will be
invoked by Learner instances just prior to training, so
fit() will train using a tf.data.Dataset representation of your data.
Sequence methods such as __getitem__ and __len__
must still be implemented.

The signature of to_tfdataset is as follows:

def to_tfdataset(self, train=True)

See ktrain.text.preprocess.TransformerDataset as an example.
class Dataset:
    """
    ```
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        raise NotImplementedError

    # required: used by ktrain.core.Learner instances
    def get_y(self):
        raise NotImplementedError

    # optional: to modify dataset between epochs (e.g., shuffle)
    def on_epoch_end(self):
        pass

    # optional
    def ondisk(self):
        """
        ```
        Is the data being read from disk (e.g., with a DirectoryIterator)?
        ```
        """
        return False

    # optional: used only if invoking *_classifier functions
    def xshape(self):
        """
        ```
        shape of X
        Examples:
            for images: input_shape
            for text: (n_examples, sequence_length)
        ```
        """
        raise NotImplementedError

    # optional: used only if invoking *_classifier functions
    def nclasses(self):
        """
        ```
        Number of classes
        For classification problems: this is the number of labels
        Not used for regression problems
        ```
        """
        raise NotImplementedError
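
Putting this contract together, here is a hedged sketch of a subclass (the class name and data handling are illustrative, not part of ktrain). It implements the required accessors and the optional to_tfdataset hook; the Sequence methods __len__ and __getitem__ are omitted for brevity (see SequenceDataset below):

import tensorflow as tf
from ktrain.dataset import Dataset

class ArrayDataset(Dataset):  # hypothetical name
    def __init__(self, x, y, batch_size=32):
        self.x, self.y, self.batch_size = x, y, batch_size

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        return self.x.shape[0]

    def get_y(self):
        return self.y

    # optional: lets Learner.fit() train on a tf.data.Dataset
    def to_tfdataset(self, train=True):
        ds = tf.data.Dataset.from_tensor_slices((self.x, self.y))
        if train:
            ds = ds.shuffle(self.x.shape[0])
        return ds.batch(self.batch_size)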

Subclasses

  • SequenceDataset
  • TFDataset

Methods

def get_y(self)
def get_y(self):
    raise NotImplementedError
def nclasses(self)
Number of classes
For classification problems: this is the number of labels
Not used for regression problems
def nclasses(self):
    """
    ```
    Number of classes
    For classification problems: this is the number of labels
    Not used for regression problems
    ```
    """
    raise NotImplementedError
def nsamples(self)
def nsamples(self):
    raise NotImplementedError
def on_epoch_end(self)
def on_epoch_end(self):
    pass
def ondisk(self)
Is the data being read from disk (e.g., with a DirectoryIterator)?
def ondisk(self):
    """
    ```
    Is the data being read from disk (e.g., with a DirectoryIterator)?
    ```
    """
    return False
def xshape(self)
shape of X
Examples:
    for images: input_shape
    for text: (n_examples, sequence_length)
def xshape(self):
    """
    ```
    shape of X
    Examples:
        for images: input_shape
        for text: (n_examples, sequence_length)
    ```
    """
    raise NotImplementedError
class MultiArrayDataset (x, y, batch_size=32, shuffle=True)
Base class for custom datasets in ktrain.

If a subclass of Dataset implements a to_tfdataset method
that converts the data to a tf.data.Dataset, then it will be
invoked by Learner instances just prior to training, so
fit() will train using a tf.data.Dataset representation of your data.
Sequence methods such as __getitem__ and __len__
must still be implemented.

The signature of to_tfdataset is as follows:

def to_tfdataset(self, train=True)

See ktrain.text.preprocess.TransformerDataset as an example.
class MultiArrayDataset(SequenceDataset):
    def __init__(self, x, y, batch_size=32, shuffle=True):
        # error checks
        err = False
        if type(x) == np.ndarray:
            if len(x.shape) != 2:
                err = True
        elif type(x) == list:
            for d in x:
                if type(d) != np.ndarray or len(d.shape) != 2:
                    err = True
                    break
        else:
            err = True
        if err:
            raise ValueError("x must be a 2d numpy array or a list of 2d numpy arrays")
        if type(y) != np.ndarray:
            raise ValueError("y must be a numpy array")
        if type(x) == np.ndarray:
            x = [x]

        # set variables
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y
        self.indices = np.arange(self.x[0].shape[0])
        self.n_inputs = len(x)
        self.shuffle = shuffle

    def __len__(self):
        return math.ceil(self.x[0].shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_x = []
        for i in range(self.n_inputs):
            batch_x.append(self.x[i][inds])
        batch_y = self.y[inds]
        return tuple(batch_x), batch_y

    def nsamples(self):
        return self.x[0].shape[0]

    def get_y(self):
        return self.y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def xshape(self):
        return self.x[0].shape

    def nclasses(self):
        return self.y.shape[1]

    def ondisk(self):
        return False
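
A small construction example with toy data (illustrative, not from the ktrain docs):

import numpy as np
from ktrain.dataset import MultiArrayDataset

x = np.arange(20, dtype="float32").reshape(10, 2)  # 10 examples, 2 features
y = np.eye(2)[np.random.randint(0, 2, 10)]         # one-hot labels, 2 classes
ds = MultiArrayDataset(x, y, batch_size=4)

print(len(ds))            # 3 batches: ceil(10 / 4)
batch_x, batch_y = ds[0]  # batch_x is a tuple of arrays, one per input
print(batch_x[0].shape)   # (4, 2)
print(ds.nsamples(), ds.nclasses(), ds.xshape())  # 10 2 (10, 2)
ds.on_epoch_end()         # reshuffles the indices when shuffle=True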

Ancestors

  • SequenceDataset
  • Dataset
  • keras.utils.data_utils.Sequence

Methods

def get_y(self)
def get_y(self):
    return self.y
def nsamples(self)
def nsamples(self):
    return self.x[0].shape[0]
def on_epoch_end(self)

Method called at the end of every epoch.

def on_epoch_end(self):
    if self.shuffle:
        np.random.shuffle(self.indices)

class SequenceDataset (batch_size=32)
Base class for custom datasets in ktrain.

If a subclass of Dataset implements a to_tfdataset method
that converts the data to a tf.data.Dataset, then it will be
invoked by Learner instances just prior to training, so
fit() will train using a tf.data.Dataset representation of your data.
Sequence methods such as __getitem__ and __len__
must still be implemented.

The signature of to_tfdataset is as follows:

def to_tfdataset(self, train=True)

See ktrain.text.preprocess.TransformerDataset as an example.
class SequenceDataset(Dataset, keras.utils.Sequence):
    """
    ```
    Base class for custom datasets in ktrain.

    If a subclass of Dataset implements a to_tfdataset method
    that converts the data to a tf.data.Dataset, then it will be
    invoked by Learner instances just prior to training, so
    fit() will train using a tf.data.Dataset representation of your data.
    Sequence methods such as __getitem__ and __len__
    must still be implemented.

    The signature of to_tfdataset is as follows:

    def to_tfdataset(self, train=True)

    See ktrain.text.preprocess.TransformerDataset as an example.
    ```
    """

    def __init__(self, batch_size=32):
        self.batch_size = batch_size

    # required by keras.utils.Sequence instances
    def __len__(self):
        raise NotImplementedError

    # required by keras.utils.Sequence instances
    def __getitem__(self, idx):
        raise NotImplementedError
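
A minimal, hypothetical subclass (the name and in-memory arrays are illustrative) showing what a concrete implementation must supply:

import math
from ktrain.dataset import SequenceDataset

class ArraySequence(SequenceDataset):  # hypothetical name
    def __init__(self, x, y, batch_size=32):
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y

    # required by keras.utils.Sequence: number of batches per epoch
    def __len__(self):
        return math.ceil(len(self.x) / self.batch_size)

    # required by keras.utils.Sequence: the idx-th batch
    def __getitem__(self, idx):
        sl = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.x[sl], self.y[sl]

    # required: used by ktrain.core.Learner instances
    def nsamples(self):
        return len(self.x)

    def get_y(self):
        return self.y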

Ancestors

  • Dataset
  • keras.utils.data_utils.Sequence

Subclasses

  • MultiArrayDataset

class TFDataset (tfdataset, n, y)
Wrapper for tf.data.Datasets
Args:
  tfdataset(tf.data.Dataset):  a tf.data.Dataset instance
  n(int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
  y(np.ndarray): y values for each example - should be in the format expected by your model (e.g., one-hot-encoded)
class TFDataset(Dataset):
    """
    ```
    Wrapper for tf.data.Datasets
    ```
    """

    def __init__(self, tfdataset, n, y):
        """
        ```
        Args:
          tfdataset(tf.data.Dataset):  a tf.data.Dataset instance
          n(int): number of examples in dataset (cardinality, which can't reliably be extracted from tf.data.Datasets)
          y(np.ndarray): y values for each example - should be in the format expected by your model (e.g., one-hot-encoded)
        ```
        """
        if not isinstance(tfdataset, tf.data.Dataset):
            raise ValueError(
                "tfdataset must be a fully-configured tf.data.Dataset with batch_size, etc. set appropriately"
            )
        self.tfdataset = tfdataset
        # infer batch_size from the last component (y) of the first batch
        self.bs = next(tfdataset.as_numpy_iterator())[-1].shape[0]
        self.n = n
        self.y = y

    @property
    def batch_size(self):
        return self.bs

    @batch_size.setter
    def batch_size(self, value):
        if value != self.bs:
            warnings.warn(
                "batch_size parameter is ignored, as pre-configured batch_size of tf.data.Dataset is used"
            )

    def nsamples(self):
        return self.n

    def get_y(self):
        return self.y

    def to_tfdataset(self, train=True):
        return self.tfdataset
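
A minimal wrapping sketch with toy data (illustrative, not from the ktrain docs):

import numpy as np
import tensorflow as tf
from ktrain.dataset import TFDataset

x = np.random.rand(100, 8).astype("float32")
y = np.eye(2)[np.random.randint(0, 2, 100)]  # one-hot labels

# the tf.data.Dataset must arrive fully configured, batching included
tfds = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(100).batch(16)
ds = TFDataset(tfds, n=len(x), y=y)

print(ds.batch_size)  # 16, inferred from the first batch
print(ds.nsamples())  # 100, the cardinality passed as n

Note that assigning a different batch_size to the wrapper afterwards only emits a warning; the batch size baked into the underlying tf.data.Dataset is what gets used.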

Ancestors

  • Dataset

Instance variables

var batch_size
@property
def batch_size(self):
    return self.bs

Methods

def get_y(self)
def get_y(self):
    return self.y
def nsamples(self)
def nsamples(self):
    return self.n
def to_tfdataset(self, train=True)
def to_tfdataset(self, train=True):
    return self.tfdataset
