Dataset information: https://nlp.stanford.edu/wiki/Software/Classifier/Sentiment
Download the data from http://www.cs.cornell.edu/People/pabo/movie-review-data/rt-polaritydata.tar.gz to the folder:
melime/experiments/data/
Unzip the data. You can use something like:
tar -xzf rt-polaritydata.tar.gz
import sys, os
sys.path.append('..')
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from melime.generators.word2vec_gen import Word2VecGen
from melime.explainers.explainer import Explainer
from melime.explainers.visualizations.plot_importance import ExplainGraph
from melime.explainers.visualizations.visualization import ExplainText
# Default location of the extracted rt-polarity dataset, relative to the
# current working directory.
path_ = './data/rt-polaritydata/'


def load_polarity(path=path_):
    """Load the movie-review polarity sentences and their labels.

    Reads ``rt-polarity.neg`` and ``rt-polarity.pos`` from ``path``, treating
    every line as one sentence.

    Args:
        path: Directory containing the two ``rt-polarity.*`` files.

    Returns:
        A ``(data, labels)`` tuple of equal-length lists: the stripped
        sentences, and ``0`` for lines from the ``.neg`` file / ``1`` for
        lines from the ``.pos`` file.
    """
    data = []
    labels = []
    f_names = ['rt-polarity.neg', 'rt-polarity.pos']
    for label, f_name in enumerate(f_names):
        # Read raw bytes and decode per line with errors='ignore' so that a
        # stray non-UTF-8 byte is dropped instead of aborting the load.
        # The context manager guarantees the handle is closed.
        with open(os.path.join(path, f_name), 'rb') as f_in:
            for line in f_in:
                data.append(line.decode('utf8', errors='ignore').strip())
                labels.append(label)
    return data, labels
# Load the corpus, hold out 20% for testing, then take 10% of the remainder
# as a validation set (both splits use the same seed).
x, y = load_polarity()
x_train_all, x_test, y_train_all, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_all, y_train_all, test_size=0.1, random_state=42
)
# Convert the label splits to NumPy arrays.
y_train, y_test, y_val = (
    np.array(split_labels) for split_labels in (y_train, y_test, y_val)
)
class VectorizeText:
    """Turn raw texts into term-frequency vectors (IDF weighting disabled).

    Chains a ``CountVectorizer`` with a ``TfidfTransformer`` created with
    ``use_idf=False``.
    """

    def __init__(self):
        self.count_vect = CountVectorizer()
        self.tf_transformer = TfidfTransformer(use_idf=False)

    def fit(self, x):
        """Fit both stages on the texts ``x``.

        Args:
            x: Iterable of raw text documents.

        Returns:
            ``self``, so the call can be chained the same way scikit-learn
            estimators are (e.g. ``VectorizeText().fit(texts)``).
        """
        counts = self.count_vect.fit_transform(x)
        self.tf_transformer.fit(counts)
        return self

    def transform(self, x):
        """Vectorize the texts ``x`` with the already fitted stages.

        Args:
            x: Iterable of raw text documents.

        Returns:
            Whatever ``TfidfTransformer.transform`` returns for the count
            matrix of ``x``.
        """
        counts = self.count_vect.transform(x)
        return self.tf_transformer.transform(counts)
# Fit the vectorizer on the training texts only, then vectorize them.
vect_text = VectorizeText()
vect_text.fit(x_train)
x_vec_train = vect_text.transform(x_train)

# Train Model
clf = MultinomialNB()
clf.fit(x_vec_train, y_train)
def MNB_predict(texts):
    """Return the classifier's predicted class for each text in ``texts``.

    The texts are vectorized with the module-level ``vect_text`` and passed
    to ``clf.predict``.
    """
    vectors = vect_text.transform(texts)
    return clf.predict(vectors)
def MNB_predict_prob(texts):
    """Return the classifier's class probabilities for each text in ``texts``.

    The texts are vectorized with the module-level ``vect_text`` and passed
    to ``clf.predict_proba``.
    """
    vectors = vect_text.transform(texts)
    return clf.predict_proba(vectors)
# Report accuracy on the held-out validation split.
preds = MNB_predict(x_val)
val_accuracy = metrics.accuracy_score(y_val, preds)
print('Val accuracy', val_accuracy)

# Word2Vec-based generator built from the training + validation texts.
generator = Word2VecGen(x_train_all)
Instance to be explained
# Index of the test instance to explain. It is used for both the text and its
# true label so the two cannot drift apart when the index is changed.
explain_index = 1
x_explain = [x_test[explain_index]]
print('x_explain:', x_explain)
print(x_explain[0])
print('Predicted class:', MNB_predict(x_explain)[0])
print('Predict probabilities:', MNB_predict_prob(x_explain))
print('True class:', y_test[explain_index])
x_explain = np.array(x_explain)
# One feature per whitespace-separated token. The feature names carry the
# token position so that repeated words remain distinguishable.
words = x_explain[0].split()
feature_names = [f'{e}: {word}' for e, word in enumerate(words)]
# Build the explainer around the model's probability output.
explainer = Explainer(
    model_predict=MNB_predict_prob,
    feature_names=feature_names,
    generator=generator,
)
# Explain class index 1 (the label assigned to 'rt-polarity.pos' lines).
explain_result = explainer.explain_instance(
    x_explain=x_explain,
    class_index=1,
    n_samples=50,
    tol_importance=0.001,
    include_x_explain_train=False,
)
explanation, con_fav_samples = explain_result
explain_dict = explanation.explain()
# Plot the per-feature mean importances and save the figure as a PDF.
fig, ax = plt.subplots(figsize=(8, 9))
ax.set_title('Importance', fontsize=25)
names = feature_names
mean_importances = explain_dict['importances']['mean']
importances = list(mean_importances.values())
ax = ExplainGraph.plot_feature_importance(
    ax=ax,
    names=feature_names,
    vals=importances,
    size_title=15,
)
plt.savefig('text_explanation.pdf', dpi=300)
Favorable sampled phrases:
# Print every favorable sample next to the value paired with it in y_fav.
# The label reads "Prob.:" (it was misspelled "Prop"), which also matches the
# label used when printing the contrary samples.
for phrase, prob in zip(con_fav_samples.samples_fav, con_fav_samples.y_fav):
    print(f'{phrase} - Prob.: {prob:5.3f}')
Contrary sampled phrases:
# Print every contrary sample next to the value paired with it in y_con.
for phrase, prob in zip(con_fav_samples.samples_con, con_fav_samples.y_con):
    # Use fixed-point with three decimals ('5.3f'), like the favorable-samples
    # listing. The previous '5.3' spec meant three significant digits, so the
    # number of printed decimals varied with the magnitude of the value.
    print(f'{phrase} - Prob.: {prob:5.3f}')
# Build the text visualization from the words and their importances.
obj = ExplainText.plot(words=words, importances=importances)
# Bare expression kept from the notebook version (presumably there to display
# the object in a cell); it has no effect when run as a script.
obj
# Save the object's `data` payload (presumably HTML markup) to disk.
with open("explain_text.html", "w") as html_out:
    html_out.write(obj.data)
Thank you!