# Preprocesses the dataset.
import pandas as pd
# Load the tab-separated sample data.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` (pandas >= 1.3) reproduces what `error_bad_lines=False`
# did on its own: malformed rows are dropped and a warning is emitted for each.
df = pd.read_csv('sample_data/music_seed50.tsv', sep='\t', on_bad_lines='warn')

# Configure the preprocessor with the treatment, outcome, and text columns,
# plus the additional columns to include.
pp = DataframePreprocessor(
    treatment_col='T_ac',
    outcome_col='Y_sim',
    text_col='text',
    include_cols=['C_true', 'product'],
)

# Preprocess in training mode; the call returns the (possibly modified)
# dataframe together with X, Y, and T.
# NOTE(review): presumably training=True fits the preprocessor's internal
# state so later training=False calls can reuse it -- confirm against
# DataframePreprocessor.
df, X, Y, T = pp.preprocess(df, training=True)
X.head()
# Two-row frame that supplies every column `pp` was configured with
# (C_true, product, text, Y_sim, T_ac).
test_df = pd.DataFrame({
    'C_true': [0, 1],
    'product': ['vinyl', 'mp3 music'],
    'text': ['This record hurts my ears.', "The music of Yanni is beautiful and breath-taking."],
    'Y_sim': [0, 1],
    'T_ac': [0, 1],
})
test_df.head()

# Preprocess without training and keep only the feature matrix.
_, X_test, _, _ = pp.preprocess(test_df, training=False)

# The test-time feature columns must be exactly the training feature columns,
# in the same order. Comparing the whole column lists fails with a clear
# AssertionError when X_test has too few columns (index-by-index comparison
# would raise IndexError) and also rejects unexpected extra columns.
assert list(X_test.columns) == list(X.columns), (
    "test-time feature columns differ from the training feature columns"
)
# Same two rows as before, but without the 'C_true' column that `pp` was
# configured to include; preprocessing this frame is expected to fail.
test_df = pd.DataFrame(
    {
        'product': ['vinyl', 'mp3 music'],
        'text': ['This record hurts my ears.', "The music of Yanni is beautiful and breath-taking."],
        'Y_sim': [0, 1],
        'T_ac': [0, 1],
    }
)

# Record whether preprocessing raised ValueError. The unpacking stays inside
# the `try` so the set of statements guarded by the handler is unchanged.
try:
    _, X_test, _, _ = pp.preprocess(test_df, training=False)
except ValueError:
    error = True
else:
    error = False

assert error is True