Module ktrain.text.ner.data

Expand source code
from ... import utils as U
from ...imports import *
from .. import textutils as TU
from . import preprocessor as pp
from .preprocessor import NERPreprocessor

# MAXLEN is not referenced in this module; presumably a default maximum
# sequence length (in tokens) used by NER models elsewhere — TODO confirm.
MAXLEN = 128
# Default DataFrame column names, shared with the preprocessor module so
# file parsers and DataFrame processors agree on the schema.
WORD_COL = pp.WORD_COL
TAG_COL = pp.TAG_COL
SENT_COL = pp.SENT_COL


def entities_from_gmb(
    train_filepath,
    val_filepath=None,
    use_char=False,
    word_column=WORD_COL,
    tag_column=TAG_COL,
    sentence_column=SENT_COL,
    encoding=None,
    val_pct=0.1,
    verbose=1,
):
    """
    Load sequence-labeled data from a text file in the Groningen
    Meaning Bank (GMB) format.

    Thin convenience wrapper around ``entities_from_txt`` with
    ``data_format='gmb'``; see that function for full parameter details.
    """
    # collect the pass-through arguments and delegate
    kwargs = dict(
        train_filepath=train_filepath,
        val_filepath=val_filepath,
        use_char=use_char,
        word_column=word_column,
        tag_column=tag_column,
        sentence_column=sentence_column,
        encoding=encoding,
        val_pct=val_pct,
        verbose=verbose,
    )
    return entities_from_txt(data_format="gmb", **kwargs)


def entities_from_conll2003(
    train_filepath,
    val_filepath=None,
    use_char=False,
    encoding=None,
    val_pct=0.1,
    verbose=1,
):
    """
    Load sequence-labeled data from a file in CoNLL2003 format.

    Thin convenience wrapper around ``entities_from_txt`` with
    ``data_format='conll2003'``; see that function for full parameter details.
    """
    # collect the pass-through arguments and delegate
    kwargs = dict(
        train_filepath=train_filepath,
        val_filepath=val_filepath,
        use_char=use_char,
        encoding=encoding,
        val_pct=val_pct,
        verbose=verbose,
    )
    return entities_from_txt(data_format="conll2003", **kwargs)


def entities_from_txt(
    train_filepath,
    val_filepath=None,
    use_char=False,
    word_column=WORD_COL,
    tag_column=TAG_COL,
    sentence_column=SENT_COL,
    data_format="conll2003",
    encoding=None,
    val_pct=0.1,
    verbose=1,
):
    """
    Load sequence-labeled data from a comma- or tab-delimited text file.
    The file must be in either the CoNLL2003 format or the Groningen
    Meaning Bank (GMB) format, selected via the data_format parameter.

    In both formats, each word appears on a separate line along with
    its associated tag (or label); the last item on each line should be
    the tag or label assigned to the word.

    In the CoNLL2003 format, an empty line follows each sentence.
    In the GMB format, sentences are delineated by an extra column
    holding the Sentence ID.

    More information on the CoNLL2003 format:
       https://www.aclweb.org/anthology/W03-0419

    CoNLL2003 example (columns typically separated by space or tab,
    no column headings):

       Paul     B-PER
       Newman   I-PER
       is       O
       a        O
       great    O
       actor    O
       !        O

    More information on the GMB format:
    Refer to ner_dataset.csv on Kaggle here:
       https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/version/2

    GMB example (columns separated by comma or tab, with column headings):

      SentenceID   Word     Tag
      1            Paul     B-PER
      1            Newman   I-PER
      1            is       O
      1            a        O
      1            great    O
      1            actor    O
      1            !        O

    Args:
        train_filepath(str): file path to training CSV
        val_filepath(str): optional file path to validation dataset
        use_char(bool): if True, data will be preprocessed to use character
                        embeddings in addition to word embeddings
        word_column(str): name of column containing the text
        tag_column(str): name of column containing the label
        sentence_column(str): name of column containing Sentence IDs
        data_format(str): one of 'conll2003' or 'gmb';
                          word_column, tag_column, and sentence_column
                          are ignored if 'conll2003'
        encoding(str): the encoding to use; if None, encoding is
                       discovered automatically
        val_pct(float): proportion of training data used for validation
                        when val_filepath is None
        verbose(bool): verbosity
    """
    # choose the file->DataFrame converter; any value other than 'gmb'
    # falls through to CoNLL2003 handling (original behavior)
    if data_format == "gmb":
        converter = pp.gmb_to_df
    else:
        converter = pp.conll2003_to_df
        # conll2003 parsing produces fixed column names, so override
        # whatever the caller passed in
        word_column, tag_column, sentence_column = WORD_COL, TAG_COL, SENT_COL

    # auto-detect encoding from the raw bytes of the training file;
    # the same encoding is reused for the validation file
    if encoding is None:
        with open(train_filepath, "rb") as fh:
            encoding = TU.detect_encoding(fh.read())
        U.vprint(
            "detected encoding: %s (if wrong, set manually)" % (encoding),
            verbose=verbose,
        )

    # build DataFrames and delegate the actual preprocessing
    train_df = converter(train_filepath, encoding=encoding)
    val_df = (
        None if val_filepath is None else converter(val_filepath, encoding=encoding)
    )
    return entities_from_df(
        train_df,
        val_df=val_df,
        word_column=word_column,
        tag_column=tag_column,
        sentence_column=sentence_column,
        use_char=use_char,
        val_pct=val_pct,
        verbose=verbose,
    )


def entities_from_df(
    train_df,
    val_df=None,
    word_column=WORD_COL,
    tag_column=TAG_COL,
    sentence_column=SENT_COL,
    use_char=False,
    val_pct=0.1,
    verbose=1,
):
    """
    Load entities from a pandas DataFrame.

    Args:
      train_df(pd.DataFrame): training data
      val_df(pd.DataFrame): optional validation data
      word_column(str): name of column containing the text
      tag_column(str): name of column containing the label
      sentence_column(str): name of column containing Sentence IDs
      use_char(bool): if True, data will be preprocessed to use character
                      embeddings in addition to word embeddings
      val_pct(float): proportion of training data held out for validation
                      when val_df is None
      verbose(bool): verbosity
    """
    from .anago.preprocessing import IndexTransformer
    from .dataset import NERSequence

    # convert the training DataFrame into parallel (sentences, labels) lists
    x_all, y_all = pp.process_df(
        train_df,
        word_column=word_column,
        tag_column=tag_column,
        sentence_column=sentence_column,
        verbose=verbose,
    )

    # use the supplied validation set, or carve one out of training data
    if val_df is not None:
        x_train, y_train = x_all, y_all
        x_valid, y_valid = pp.process_df(
            val_df,
            word_column=word_column,
            tag_column=tag_column,
            sentence_column=sentence_column,
            verbose=0,
        )
    else:
        x_train, x_valid, y_train, y_valid = train_test_split(
            x_all, y_all, test_size=val_pct
        )

    # fit the index transformer on training data and wrap everything
    # in sequence generators for training
    transformer = IndexTransformer(use_char=use_char)
    preproc = NERPreprocessor(transformer)
    preproc.fit(x_train, y_train)

    trn = NERSequence(x_train, y_train, batch_size=U.DEFAULT_BS, p=transformer)
    val = NERSequence(x_valid, y_valid, batch_size=U.DEFAULT_BS, p=transformer)
    return (trn, val, preproc)


def entities_from_array(
    x_train, y_train, x_test=None, y_test=None, use_char=False, val_pct=0.1, verbose=1
):
    """
    Load entities from arrays.

    Args:
      x_train(list): list of lists of entity tokens for training
                     Example: x_train = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']]
      y_train(list): list of lists of tokens representing entity labels
                     Example: y_train = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']]
      x_test(list): list of lists of entity tokens for validation
                    Example: x_test = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']]
      y_test(list): list of lists of tokens representing entity labels
                    Example: y_test = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']]
      use_char(bool): if True, data will be preprocessed to use character
                      embeddings in addition to word embeddings
      val_pct(float): percentage of training data used for validation
                      if no validation data is supplied
      verbose(bool): verbosity
    """
    # TODO: converting to DataFrames to reuse entities_from_df - needs refactoring
    train_df = pp.array_to_df(x_train, y_train)
    have_val = x_test is not None and y_test is not None
    val_df = pp.array_to_df(x_test, y_test) if have_val else None
    if verbose:
        print("training data sample:")
        print(train_df.head())
        if val_df is not None:
            print("validation data sample:")
            print(val_df.head())
    return entities_from_df(
        train_df, val_df=val_df, use_char=use_char, val_pct=val_pct, verbose=verbose
    )

Functions

def entities_from_array(x_train, y_train, x_test=None, y_test=None, use_char=False, val_pct=0.1, verbose=1)

Load entities from arrays

Args

x_train(list): list of list of entity tokens for training Example: x_train = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']] y_train(list): list of list of tokens representing entity labels Example: y_train = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']] x_test(list): list of list of entity tokens for validation Example: x_test = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']] y_test(list): list of list of tokens representing entity labels Example: y_test = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']] use_char(bool): If True, data will be preprocessed to use character embeddings in addition to word embeddings val_pct(float): percentage of training to use for validation if no validation data is supplied verbose (boolean): verbosity

Expand source code
def entities_from_array(
    x_train, y_train, x_test=None, y_test=None, use_char=False, val_pct=0.1, verbose=1
):
    """
    Load entities from arrays.

    Args:
      x_train(list): list of lists of entity tokens for training
                     Example: x_train = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']]
      y_train(list): list of lists of tokens representing entity labels
                     Example: y_train = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']]
      x_test(list): list of lists of entity tokens for validation
                    Example: x_test = [['Hello', 'world'], ['Hello', 'Cher'], ['I', 'love', 'Chicago']]
      y_test(list): list of lists of tokens representing entity labels
                    Example: y_test = [['O', 'O'], ['O', 'B-PER'], ['O', 'O', 'B-LOC']]
      use_char(bool): if True, data will be preprocessed to use character
                      embeddings in addition to word embeddings
      val_pct(float): percentage of training data used for validation
                      if no validation data is supplied
      verbose(bool): verbosity
    """
    # TODO: converting to DataFrames to reuse entities_from_df - needs refactoring
    train_df = pp.array_to_df(x_train, y_train)
    have_val = x_test is not None and y_test is not None
    val_df = pp.array_to_df(x_test, y_test) if have_val else None
    if verbose:
        print("training data sample:")
        print(train_df.head())
        if val_df is not None:
            print("validation data sample:")
            print(val_df.head())
    return entities_from_df(
        train_df, val_df=val_df, use_char=use_char, val_pct=val_pct, verbose=verbose
    )
def entities_from_conll2003(train_filepath, val_filepath=None, use_char=False, encoding=None, val_pct=0.1, verbose=1)

Loads sequence-labeled data from a file in CoNLL2003 format.

Expand source code
def entities_from_conll2003(
    train_filepath,
    val_filepath=None,
    use_char=False,
    encoding=None,
    val_pct=0.1,
    verbose=1,
):
    """
    Load sequence-labeled data from a file in CoNLL2003 format.

    Thin convenience wrapper around ``entities_from_txt`` with
    ``data_format='conll2003'``; see that function for full parameter details.
    """
    # collect the pass-through arguments and delegate
    kwargs = dict(
        train_filepath=train_filepath,
        val_filepath=val_filepath,
        use_char=use_char,
        encoding=encoding,
        val_pct=val_pct,
        verbose=verbose,
    )
    return entities_from_txt(data_format="conll2003", **kwargs)
def entities_from_df(train_df, val_df=None, word_column='Word', tag_column='Tag', sentence_column='SentenceID', use_char=False, val_pct=0.1, verbose=1)

Load entities from pandas DataFrame

Args

train_df(pd.DataFrame): training data
val_df(pdf.DataFrame): validation data
word_column(str): name of column containing the text
tag_column(str): name of column containing the label
sentence_column(str): name of column containing Sentence IDs
use_char(bool): If True, data will be preprocessed to use character embeddings in addition to word embeddings
verbose : boolean
verbosity
Expand source code
def entities_from_df(
    train_df,
    val_df=None,
    word_column=WORD_COL,
    tag_column=TAG_COL,
    sentence_column=SENT_COL,
    use_char=False,
    val_pct=0.1,
    verbose=1,
):
    """
    Load entities from a pandas DataFrame.

    Args:
      train_df(pd.DataFrame): training data
      val_df(pd.DataFrame): optional validation data
      word_column(str): name of column containing the text
      tag_column(str): name of column containing the label
      sentence_column(str): name of column containing Sentence IDs
      use_char(bool): if True, data will be preprocessed to use character
                      embeddings in addition to word embeddings
      val_pct(float): proportion of training data held out for validation
                      when val_df is None
      verbose(bool): verbosity
    """
    from .anago.preprocessing import IndexTransformer
    from .dataset import NERSequence

    # convert the training DataFrame into parallel (sentences, labels) lists
    x_all, y_all = pp.process_df(
        train_df,
        word_column=word_column,
        tag_column=tag_column,
        sentence_column=sentence_column,
        verbose=verbose,
    )

    # use the supplied validation set, or carve one out of training data
    if val_df is not None:
        x_train, y_train = x_all, y_all
        x_valid, y_valid = pp.process_df(
            val_df,
            word_column=word_column,
            tag_column=tag_column,
            sentence_column=sentence_column,
            verbose=0,
        )
    else:
        x_train, x_valid, y_train, y_valid = train_test_split(
            x_all, y_all, test_size=val_pct
        )

    # fit the index transformer on training data and wrap everything
    # in sequence generators for training
    transformer = IndexTransformer(use_char=use_char)
    preproc = NERPreprocessor(transformer)
    preproc.fit(x_train, y_train)

    trn = NERSequence(x_train, y_train, batch_size=U.DEFAULT_BS, p=transformer)
    val = NERSequence(x_valid, y_valid, batch_size=U.DEFAULT_BS, p=transformer)
    return (trn, val, preproc)
def entities_from_gmb(train_filepath, val_filepath=None, use_char=False, word_column='Word', tag_column='Tag', sentence_column='SentenceID', encoding=None, val_pct=0.1, verbose=1)

Loads sequence-labeled data from text file in the Groningen Meaning Bank (GMB) format.

Expand source code
def entities_from_gmb(
    train_filepath,
    val_filepath=None,
    use_char=False,
    word_column=WORD_COL,
    tag_column=TAG_COL,
    sentence_column=SENT_COL,
    encoding=None,
    val_pct=0.1,
    verbose=1,
):
    """
    Load sequence-labeled data from a text file in the Groningen
    Meaning Bank (GMB) format.

    Thin convenience wrapper around ``entities_from_txt`` with
    ``data_format='gmb'``; see that function for full parameter details.
    """
    # collect the pass-through arguments and delegate
    kwargs = dict(
        train_filepath=train_filepath,
        val_filepath=val_filepath,
        use_char=use_char,
        word_column=word_column,
        tag_column=tag_column,
        sentence_column=sentence_column,
        encoding=encoding,
        val_pct=val_pct,
        verbose=verbose,
    )
    return entities_from_txt(data_format="gmb", **kwargs)
def entities_from_txt(train_filepath, val_filepath=None, use_char=False, word_column='Word', tag_column='Tag', sentence_column='SentenceID', data_format='conll2003', encoding=None, val_pct=0.1, verbose=1)

Loads sequence-labeled data from a comma- or tab-delimited text file. Format of file is either the CoNLL2003 format or Groningen Meaning Bank (GMB) format - specified with data_format parameter.

In both formats, each word appears on a separate line along with its associated tag (or label). The last item on each line should be the tag or label assigned to the word.

In the CoNLL2003 format, there is an empty line after each sentence. In the GMB format, sentences are delineated with a third column denoting the Sentence ID.

More information on CoNLL2003 format: https://www.aclweb.org/anthology/W03-0419

CoNLL Example (each column is typically separated by space or tab) and no column headings:

Paul B-PER Newman I-PER is O a O great O actor O ! O

More information on GMB format: Refer to ner_dataset.csv on Kaggle here: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/version/2

GMB example (each column separated by comma or tab) with column headings:

SentenceID Word Tag 1 Paul B-PER 1 Newman I-PER 1 is O 1 a O 1 great O 1 actor O 1 ! O

Args

train_filepath(str): file path to training CSV
val_filepath : str
file path to validation dataset
use_char(bool): If True, data will be preprocessed to use character embeddings in addition to word embeddings
word_column(str): name of column containing the text
tag_column(str): name of column containing the label
sentence_column(str): name of column containing Sentence IDs
data_format(str): one of conll2003 or gmb
word_column, tag_column, and sentence_column
ignored if 'conll2003'
encoding(str): the encoding to use. If None, encoding is discovered automatically
val_pct(float): Proportion of training to use for validation.
verbose : boolean
verbosity
Expand source code
def entities_from_txt(
    train_filepath,
    val_filepath=None,
    use_char=False,
    word_column=WORD_COL,
    tag_column=TAG_COL,
    sentence_column=SENT_COL,
    data_format="conll2003",
    encoding=None,
    val_pct=0.1,
    verbose=1,
):
    """
    Load sequence-labeled data from a comma- or tab-delimited text file.
    The file must be in either the CoNLL2003 format or the Groningen
    Meaning Bank (GMB) format, selected via the data_format parameter.

    In both formats, each word appears on a separate line along with
    its associated tag (or label); the last item on each line should be
    the tag or label assigned to the word.

    In the CoNLL2003 format, an empty line follows each sentence.
    In the GMB format, sentences are delineated by an extra column
    holding the Sentence ID.

    More information on the CoNLL2003 format:
       https://www.aclweb.org/anthology/W03-0419

    CoNLL2003 example (columns typically separated by space or tab,
    no column headings):

       Paul     B-PER
       Newman   I-PER
       is       O
       a        O
       great    O
       actor    O
       !        O

    More information on the GMB format:
    Refer to ner_dataset.csv on Kaggle here:
       https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/version/2

    GMB example (columns separated by comma or tab, with column headings):

      SentenceID   Word     Tag
      1            Paul     B-PER
      1            Newman   I-PER
      1            is       O
      1            a        O
      1            great    O
      1            actor    O
      1            !        O

    Args:
        train_filepath(str): file path to training CSV
        val_filepath(str): optional file path to validation dataset
        use_char(bool): if True, data will be preprocessed to use character
                        embeddings in addition to word embeddings
        word_column(str): name of column containing the text
        tag_column(str): name of column containing the label
        sentence_column(str): name of column containing Sentence IDs
        data_format(str): one of 'conll2003' or 'gmb';
                          word_column, tag_column, and sentence_column
                          are ignored if 'conll2003'
        encoding(str): the encoding to use; if None, encoding is
                       discovered automatically
        val_pct(float): proportion of training data used for validation
                        when val_filepath is None
        verbose(bool): verbosity
    """
    # choose the file->DataFrame converter; any value other than 'gmb'
    # falls through to CoNLL2003 handling (original behavior)
    if data_format == "gmb":
        converter = pp.gmb_to_df
    else:
        converter = pp.conll2003_to_df
        # conll2003 parsing produces fixed column names, so override
        # whatever the caller passed in
        word_column, tag_column, sentence_column = WORD_COL, TAG_COL, SENT_COL

    # auto-detect encoding from the raw bytes of the training file;
    # the same encoding is reused for the validation file
    if encoding is None:
        with open(train_filepath, "rb") as fh:
            encoding = TU.detect_encoding(fh.read())
        U.vprint(
            "detected encoding: %s (if wrong, set manually)" % (encoding),
            verbose=verbose,
        )

    # build DataFrames and delegate the actual preprocessing
    train_df = converter(train_filepath, encoding=encoding)
    val_df = (
        None if val_filepath is None else converter(val_filepath, encoding=encoding)
    )
    return entities_from_df(
        train_df,
        val_df=val_df,
        word_column=word_column,
        tag_column=tag_column,
        sentence_column=sentence_column,
        use_char=use_char,
        val_pct=val_pct,
        verbose=verbose,
    )