Module ktrain.text.ner.models
Expand source code
from ... import utils as U
from ...imports import *
from . import preprocessor as pp
BILSTM_CRF = "bilstm-crf"
BILSTM = "bilstm"
BILSTM_ELMO = "bilstm-elmo"
BILSTM_CRF_ELMO = "bilstm-crf-elmo"
BILSTM_TRANSFORMER = "bilstm-transformer"
SEQUENCE_TAGGERS = {
BILSTM: "Bidirectional LSTM (https://arxiv.org/abs/1603.01360)",
BILSTM_TRANSFORMER: "Bidirectional LSTM w/ transformer embeddings (multlingual BERT is default)",
BILSTM_CRF: "Bidirectional LSTM-CRF (https://arxiv.org/abs/1603.01360)",
BILSTM_ELMO: "Bidirectional LSTM w/ Elmo embeddings [English only]",
BILSTM_CRF_ELMO: "Bidirectional LSTM-CRF w/ Elmo embeddings [English only]",
}
V1_ONLY_MODELS = [BILSTM_CRF, BILSTM_CRF_ELMO]
TRANSFORMER_MODELS = [BILSTM_TRANSFORMER]
ELMO_MODELS = [BILSTM_ELMO, BILSTM_CRF_ELMO]
def print_sequence_taggers():
for k, v in SEQUENCE_TAGGERS.items():
print("%s: %s" % (k, v))
def sequence_tagger(
name,
preproc,
wv_path_or_url=None,
transformer_model="bert-base-multilingual-cased",
transformer_layers_to_use=U.DEFAULT_TRANSFORMER_LAYERS,
bert_model=None,
word_embedding_dim=100,
char_embedding_dim=25,
word_lstm_size=100,
char_lstm_size=25,
fc_dim=100,
dropout=0.5,
verbose=1,
):
"""
Build and return a sequence tagger (i.e., named entity recognizer).
Args:
name (string): one of:
- 'bilstm-crf' for Bidirectional LSTM-CRF model
- 'bilstm' for Bidirectional LSTM (no CRF layer)
- 'bilstm-elmo' for Bidirectional LSTM with Elmo embeddings (English only)
- 'bilstm-crf-elmo' for Bidirectional LSTM-CRF with Elmo embeddings (English only)
- 'bilstm-transformer' for Bidirectional LSTM with transformer embeddings
preproc(NERPreprocessor): an instance of NERPreprocessor
wv_path_or_url(str): either a URL or file path to a fastText word vector file (.vec or .vec.zip or .vec.gz)
Example valid values for wv_path_or_url:
Randomly-initialized word embeddings:
set wv_path_or_url=None
English pretrained word vectors:
https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Chinese pretrained word vectors:
https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz
Russian pretrained word vectors:
https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz
Dutch pretrained word vectors:
https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz
See these two Web pages for a full list of URLs to word vector files for
different languages:
1. https://fasttext.cc/docs/en/english-vectors.html (for English)
2. https://fasttext.cc/docs/en/crawl-vectors.html (for non-English languages)
Default: None (randomly-initialized word embeddings are used)
transformer_model(str): the name of the transformer model. default: 'bert-base-multilingual-cased'
This parameter is only used when name='bilstm-transformer'.
The value should be the name of a pretrained transformer model from here:
https://huggingface.co/transformers/pretrained_models.html
or a community-uploaded BERT model from here:
https://huggingface.co/models
Example values:
bert-base-multilingual-cased: Multilingual BERT (104 languages) - this is the default
bert-base-cased: English BERT
bert-base-chinese: Chinese BERT
distilbert-base-german-cased: German DistilBERT
albert-base-v2: English ALBERT model
monologg/biobert_v1.1_pubmed: community uploaded BioBERT (pretrained on PubMed)
transformer_layers_to_use(list): indices of hidden layers to use. default: [-2] (the second-to-last layer)
To use the concatenation of the last 4 layers, use [-1, -2, -3, -4]
bert_model(str): alias for transformer_model
word_embedding_dim (int): word embedding dimensions.
char_embedding_dim (int): character embedding dimensions.
word_lstm_size (int): word LSTM output dimensions.
char_lstm_size (int): character LSTM feature extractor output dimensions.
fc_dim (int): output fully-connected layer size.
dropout (float): dropout rate.
verbose (boolean): verbosity of output
Return:
model (Model): A Keras Model instance
"""
# backwards compatibility
name = BILSTM_TRANSFORMER if name == "bilstm-bert" else name
if bert_model is not None:
transformer_model = bert_model
warnings.warn(
"The bert_model argument is deprecated - please use transformer_model instead.",
DeprecationWarning,
stacklevel=2,
)
if name not in SEQUENCE_TAGGERS:
raise ValueError(
f"Invalid model name {name}. {'Did you mean bilstm-transformer?' if name == 'bilstm-bert' else ''}"
)
# check BERT
if name in TRANSFORMER_MODELS and not transformer_model:
raise ValueError(
f"transformer_model is required for {BILSTM_TRANSFORMER} models"
)
if name in TRANSFORMER_MODELS and DISABLE_V2_BEHAVIOR:
raise ValueError(
"BERT and other transformer models cannot be used with DISABLE_v2_BEHAVIOR"
)
# check CRF
if not DISABLE_V2_BEHAVIOR and name in V1_ONLY_MODELS:
warnings.warn(
"Falling back to BiLSTM (no CRF) because DISABLE_V2_BEHAVIOR=False"
)
msg = (
"\nIMPORTANT NOTE: ktrain uses the CRF module from keras_contrib, which is not yet\n"
+ "fully compatible with TensorFlow 2. You can still use the BiLSTM-CRF model\n"
+ "in ktrain for sequence tagging with TensorFlow 2, but you must add the\n"
+ "following to the top of your script or notebook BEFORE you import ktrain:\n\n"
+ "import os\n"
+ "os.environ['DISABLE_V2_BEHAVIOR'] = '1'\n\n"
+ "For this run, a vanilla BiLSTM model (with no CRF layer) will be used.\n"
)
print(msg)
name = BILSTM if name == BILSTM_CRF else BILSTM_ELMO
# check for use_char=True
if not DISABLE_V2_BEHAVIOR and preproc.p._use_char:
# turn off masking due to open TF2 issue #33148: https://github.com/tensorflow/tensorflow/issues/33148
warnings.warn(
"Setting use_char=False: character embeddings cannot be used in TF2 due to open TensorFlow 2 bug (#33148).\n"
+ 'Add os.environ["DISABLE_V2_BEHAVIOR"] = "1" to the top of script if you really want to use it.'
)
preproc.p._use_char = False
if verbose:
emb_names = []
if wv_path_or_url is not None:
emb_names.append(
"word embeddings initialized with fasttext word vectors (%s)"
% (os.path.basename(wv_path_or_url))
)
else:
emb_names.append("word embeddings initialized randomly")
if name in TRANSFORMER_MODELS:
emb_names.append("transformer embeddings with " + transformer_model)
if name in ELMO_MODELS:
emb_names.append("Elmo embeddings for English")
if preproc.p._use_char:
emb_names.append("character embeddings")
if len(emb_names) > 1:
print("Embedding schemes employed (combined with concatenation):")
else:
print("embedding schemes employed:")
for emb_name in emb_names:
print("\t%s" % (emb_name))
print()
# setup embedding
if wv_path_or_url is not None:
wv_model, word_embedding_dim = preproc.get_wv_model(
wv_path_or_url, verbose=verbose
)
else:
wv_model = None
if name == BILSTM_CRF:
use_crf = False if not DISABLE_V2_BEHAVIOR else True # fallback to bilstm
elif name == BILSTM_CRF_ELMO:
use_crf = False if not DISABLE_V2_BEHAVIOR else True # fallback to bilstm
preproc.p.activate_elmo()
elif name == BILSTM:
use_crf = False
elif name == BILSTM_ELMO:
use_crf = False
preproc.p.activate_elmo()
elif name == BILSTM_TRANSFORMER:
use_crf = False
preproc.p.activate_transformer(
transformer_model, layers=transformer_layers_to_use, force=True
)
else:
raise ValueError("Unsupported model name")
from .anago.models import BiLSTMCRF
model = BiLSTMCRF(
char_embedding_dim=char_embedding_dim,
word_embedding_dim=word_embedding_dim,
char_lstm_size=char_lstm_size,
word_lstm_size=word_lstm_size,
fc_dim=fc_dim,
char_vocab_size=preproc.p.char_vocab_size,
word_vocab_size=preproc.p.word_vocab_size,
num_labels=preproc.p.label_size,
dropout=dropout,
use_crf=use_crf,
use_char=preproc.p._use_char,
embeddings=wv_model,
use_elmo=preproc.p.elmo_is_activated(),
use_transformer_with_dim=preproc.p.get_transformer_dim(),
)
model, loss = model.build()
model.compile(loss=loss, optimizer=U.DEFAULT_OPT)
return model
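As the fallback message in sequence_tagger above indicates, the CRF variants (bilstm-crf and bilstm-crf-elmo) rely on the CRF layer from keras_contrib and are only built when TF2 behavior is disabled before ktrain is imported; otherwise the model silently falls back to a plain BiLSTM. A minimal sketch of enabling them is below (the preproc object is assumed to come from one of ktrain's entities_from_* data-loading helpers):
# Must be set BEFORE importing ktrain, per the warning printed by sequence_tagger.
import os
os.environ["DISABLE_V2_BEHAVIOR"] = "1"

import ktrain
from ktrain import text as txt

# With V2 behavior disabled, 'bilstm-crf' keeps its CRF layer instead of
# falling back to a plain BiLSTM. `preproc` is an NERPreprocessor obtained
# from a ktrain entities_from_* helper (assumed to exist in your workflow).
# model = txt.sequence_tagger("bilstm-crf", preproc)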
Functions
def print_sequence_taggers()
Prints the names and descriptions of the available sequence tagger models.
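For reference, print_sequence_taggers() simply iterates over the SEQUENCE_TAGGERS dictionary defined above, so its output looks like this:
from ktrain.text.ner.models import print_sequence_taggers

print_sequence_taggers()
# bilstm: Bidirectional LSTM (https://arxiv.org/abs/1603.01360)
# bilstm-transformer: Bidirectional LSTM w/ transformer embeddings (multilingual BERT is default)
# bilstm-crf: Bidirectional LSTM-CRF (https://arxiv.org/abs/1603.01360)
# bilstm-elmo: Bidirectional LSTM w/ Elmo embeddings [English only]
# bilstm-crf-elmo: Bidirectional LSTM-CRF w/ Elmo embeddings [English only]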
def sequence_tagger(name, preproc, wv_path_or_url=None, transformer_model='bert-base-multilingual-cased', transformer_layers_to_use=[-2], bert_model=None, word_embedding_dim=100, char_embedding_dim=25, word_lstm_size=100, char_lstm_size=25, fc_dim=100, dropout=0.5, verbose=1)
Build and return a sequence tagger (i.e., named entity recognizer).
Args
name
:string
- one of: 'bilstm-crf' (Bidirectional LSTM-CRF), 'bilstm' (Bidirectional LSTM, no CRF layer), 'bilstm-elmo' (Bidirectional LSTM with Elmo embeddings, English only), 'bilstm-crf-elmo' (Bidirectional LSTM-CRF with Elmo embeddings, English only), or 'bilstm-transformer' (Bidirectional LSTM with transformer embeddings)
preproc
:NERPreprocessor
- an instance of NERPreprocessor
wv_path_or_url
:str
- either a URL or file path to a fastText word vector file (.vec, .vec.zip, or .vec.gz). Default: None (randomly-initialized word embeddings are used). Example pretrained vectors: English (https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip), Chinese (https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz), Russian (https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz), Dutch (https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz). Full lists of word vector URLs: https://fasttext.cc/docs/en/english-vectors.html (English) and https://fasttext.cc/docs/en/crawl-vectors.html (other languages)
transformer_model
:str
- the name of the transformer model. Default: 'bert-base-multilingual-cased'. Only used when name='bilstm-transformer'. The value should be a pretrained model listed at https://huggingface.co/transformers/pretrained_models.html or a community-uploaded model from https://huggingface.co/models. Example values: bert-base-multilingual-cased (Multilingual BERT, the default), bert-base-cased (English BERT), bert-base-chinese (Chinese BERT), distilbert-base-german-cased (German DistilBERT), albert-base-v2 (English ALBERT), monologg/biobert_v1.1_pubmed (community-uploaded BioBERT pretrained on PubMed)
transformer_layers_to_use
:list
- indices of hidden layers to use. Default: [-2] (the second-to-last layer). To concatenate the last 4 layers, use [-1, -2, -3, -4]
bert_model
:str
- deprecated alias for transformer_model
word_embedding_dim
:int
- word embedding dimensions.
char_embedding_dim
:int
- character embedding dimensions.
word_lstm_size
:int
- word LSTM output dimensions.
char_lstm_size
:int
- character LSTM feature extractor output dimensions.
fc_dim
:int
- output fully-connected layer size.
dropout
:float
- dropout rate.
verbose
:boolean
- verbosity of output
Return
model (Model): A Keras Model instance
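Putting it together, here is a minimal usage sketch. It assumes a CoNLL/GMB-style token-per-line training file and ktrain's top-level entities_from_txt and get_learner helpers; the batch size, learning rate, and epoch count are illustrative, not recommendations:
import ktrain
from ktrain import text as txt

# Load training/validation data and build an NERPreprocessor
# (entities_from_txt is assumed to accept a CoNLL-style file).
trn, val, preproc = txt.entities_from_txt("train.txt")

# BiLSTM with multilingual BERT embeddings (the default transformer_model).
# Pretrained fastText vectors could instead be supplied via wv_path_or_url, e.g.
# wv_path_or_url="https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz"
model = txt.sequence_tagger(
    "bilstm-transformer",
    preproc,
    transformer_model="bert-base-multilingual-cased",
    transformer_layers_to_use=[-1, -2, -3, -4],  # concatenate the last 4 hidden layers
)

learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)
learner.fit(0.01, 1)  # one epoch at lr=0.01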