Text analyzers to help create text-based covariates, treatments, or outcomes for causal analyses.
# Demonstrate zero-shot classification: a political complaint should score
# high on political labels and low on everything else.
zsl = ZeroShotClassifier()
labels = ['politics', 'elections', 'sports', 'films', 'television']
doc = 'I am extremely dissatisfied with the President and will definitely vote in 2020.'
# include_labels=True pairs each label with its predicted probability
preds = zsl.predict(doc, labels=labels, include_labels=True)
preds
d = dict(preds)
# on-topic labels should be near-certain ...
for on_topic in ('politics', 'elections'):
    assert d[on_topic] > 0.9
# ... while unrelated topics should be near zero
for off_topic in ('sports', 'films', 'television'):
    assert d[off_topic] < 0.1
te = TextEncoder()
e = te.encode('The moon is bright.')
assert e.shape[0] == 1
assert e.shape[1] == 1024
from sklearn.datasets import fetch_20newsgroups

# we only want to keep the body of the documents!
remove = ('headers', 'footers', 'quotes')
# fetch train and test data (downloads the corpus on first use)
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
# compile the texts
texts = newsgroups_train.data + newsgroups_test.data
# store the newsgroup category id associated with each document;
# the original identity comprehension added nothing, so concatenate directly
targets = list(newsgroups_train.target) + list(newsgroups_test.target)
# map category ids to human-readable names for use in visualizations
categories = [newsgroups_train.target_names[target] for target in targets]
# Demonstrate topic modeling over the newsgroups corpus.
tm = TopicModel(texts, n_features=10000)
tm.print_topics()
# build assigns a topic distribution to each document (tm.doc_topics)
tm.build(texts)
texts[1]
tm.doc_topics[1]
# the most probable topic for document 1
tm.topics[np.argmax(tm.doc_topics[1])]
# predict the topic distribution of an unseen document
spacex_doc = ('Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees '
              'the development and manufacturing of advanced rockets and spacecraft for missions '
              'to and beyond Earth orbit.')
tm.predict([spacex_doc])
# and show the words of its most probable topic
tm.topics[np.argmax(tm.predict([spacex_doc]))]