from numpy import array, asarray, zeros
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
from keras.layers import LSTM
from keras.constraints import max_norm
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
import string
import csv
Source_File = "Emotion_Dataset2.csv"
Glov_Model_File = "glove.6B.100d.txt"
class Simple_Emotion_Classifier:

    # fit a tokenizer on the training lines - this portion of code taken from kaggle
    def create_tokenizer(self, lines):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer

    # integer-encode and pad a list of lines - this portion of code taken from kaggle
    def encode_text(self, tokenizer, lines, length):
        encoded = tokenizer.texts_to_sequences(lines)  # integer encode
        padded = pad_sequences(encoded, maxlen=length, padding='post')  # pad encoded sequences
        return padded
    def define_model(self, length, vocab_size, embedding_matrix1):
        # shared layers: frozen GloVe embedding followed by a bidirectional LSTM
        inputs1 = Input(shape=(length,))
        x = Embedding(vocab_size, 100, weights=[embedding_matrix1], trainable=False)(inputs1)
        x = Bidirectional(LSTM(64))(x)
        # head 1: four-way emotion classifier
        x1 = Dense(6, activation='relu')(x)
        x1 = Dense(4, activation='softmax', name='emotion_classifier')(x1)
        # head 2: scalar emotion-weight regressor
        x2 = Dense(6, activation='relu', kernel_constraint=max_norm(1.0))(x)
        x2 = Dense(1, kernel_initializer='normal', name='emotion_weight')(x2)
        model = Model(inputs=[inputs1], outputs=[x1, x2])
        # compile with one loss and one metric per output head
        losses = {
            "emotion_classifier": "mse",
            "emotion_weight": "mape",
        }
        lossWeights = {"emotion_classifier": 1.0, "emotion_weight": 1.0}
        metric = {
            "emotion_classifier": 'mse',
            "emotion_weight": "mape",
        }
        # model.compile(loss=['categorical_crossentropy', 'mse'], optimizer='adam', metrics=['accuracy', 'mse'])
        model.compile(loss=losses, loss_weights=lossWeights, optimizer='adam', metrics=metric)
        # summarize
        print(model.summary())
        return model
    def Train_Model(self, Source_File, Glov_Model_File, epoch_count1, batch_size1):
        # load training dataset
        List_Txt, List_Class, List_Weight = self.Read_Lines_Classes_Weights(Source_File)
        # encode class values as integers
        encoder = LabelEncoder()
        encoder.fit(List_Class)
        encoded_Y = encoder.transform(List_Class)
        # convert integers to dummy variables (i.e. one hot encoded)
        Train_Labels = np_utils.to_categorical(encoded_Y)
        print(List_Class)
        print(List_Weight)
        print("Text data size ", len(List_Txt))
        print("Class size ", len(encoded_Y))
        print("Class weight size ", len(List_Weight))
        # create tokenizer
        tokenizer = self.create_tokenizer(List_Txt)
        # fixed maximum document length (tokens per padded sequence)
        length = 30
        # calculate vocabulary size
        vocab_size = len(tokenizer.word_index) + 1
        # load the whole GloVe embedding into memory
        embeddings_index = dict()
        with open(Glov_Model_File, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        # create a weight matrix for words in training docs
        embedding_matrix1 = zeros((vocab_size, 100))
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix1[i] = embedding_vector
        # encode data
        trainX1 = self.encode_text(tokenizer, List_Txt, length)
        # define model
        model = self.define_model(length, vocab_size, embedding_matrix1)
        # fit model on both targets: one-hot emotion labels and intensity weights
        model.fit([trainX1], [array(Train_Labels), array(List_Weight)],
                  epochs=epoch_count1, batch_size=batch_size1, verbose=1)
        # save the model
        model.save("Multi_Tasking.h5")
        return tokenizer, length
    def Read_Lines_Classes_Weights(self, SourceFile):
        Lines = []  # cleaned, stemmed sentences
        Train_Labels = []
        Train_Label_Weight = []
        data = pd.read_csv(SourceFile)
        for line in data.values:
            # line[1] = text, line[2] = emotion class, line[3] = emotion weight
            line1 = str(line[1]).strip()
            if len(line1) >= 1:
                Train_Labels.append(line[2])  # add train label
                lab_weight1 = float(line[3])
                Train_Label_Weight.append(lab_weight1)  # add train label weight
                wds_list = line1.split()
                if len(wds_list) > 0:
                    txt1 = ' '.join(wds_list)
                    # split into words
                    tokens = word_tokenize(txt1)
                    # prepare regex for char filtering
                    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
                    # remove punctuation from each word
                    tokens = [re_punc.sub('', w) for w in tokens]
                    # remove remaining tokens that are not alphabetic
                    tokens = [word for word in tokens if word.isalpha()]
                    # remove stop words
                    stop_words = set(stopwords.words('english'))
                    tokens = [w for w in tokens if w not in stop_words]
                    # drop tokens emptied by the punctuation filter
                    tokens = [word for word in tokens if len(word) > 0]
                    # stemming of words removes noisy variants
                    porter = PorterStemmer()
                    stemmed = [porter.stem(word) for word in tokens]
                    txt2 = ' '.join(stemmed)
                    # fall back to a placeholder when cleaning removed everything
                    if len(txt2.strip()) == 0:
                        txt2 = "hello world"
                    Lines.append(txt2)
                else:
                    Lines.append("hello world")
        return Lines, Train_Labels, Train_Label_Weight
    def Emotion_Classifier_Test(self, Input_text, tokenizer1, length1, model1):
        cln_test_lines = []
        line1 = str(Input_text).strip().lower()
        tokens = word_tokenize(line1)
        # prepare regex for char filtering
        re_punc = re.compile('[%s]' % re.escape(string.punctuation))
        # remove punctuation from each word
        tokens = [re_punc.sub('', w) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # drop tokens emptied by the punctuation filter
        tokens = [word for word in tokens if len(word) > 0]
        # stemming of words removes noisy variants
        porter = PorterStemmer()
        stemmed = [porter.stem(word) for word in tokens]
        txt2 = ' '.join(stemmed)
        if len(txt2) > 0:
            cln_test_lines.append(txt2)
        else:
            cln_test_lines.append("Very Good")
        testX1 = self.encode_text(tokenizer1, cln_test_lines, length1)
        # predict with both heads: class probabilities and intensity
        pred = model1.predict([testX1])
        print(pred)
        pred_Anger = pred[0][0][0]
        print("Anger ", pred_Anger)
        pred_Fear = pred[0][0][1]
        print("Fear ", pred_Fear)
        pred_Joy = pred[0][0][2]
        print("Joy ", pred_Joy)
        pred_Sad = pred[0][0][3]
        print("Sadness ", pred_Sad)
        pred_Strength = pred[1][0][0]
        print("prediction_strength ", pred_Strength)
        pred1_array = pred[0]
        CATEGORIES = ["Anger", "Fear", "Joy", "Sadness"]
        pred_name = CATEGORIES[np.argmax(pred1_array)]
        print("Predicted Class => ", pred_name)
if __name__ == "__main__":
    print("Calling Simple Emotion Classifier Training and Test")
    S_EMO_CLS = Simple_Emotion_Classifier()
    # Train the model and keep the fitted tokenizer and padded sequence length
    tokenizer, length = S_EMO_CLS.Train_Model(Source_File, Glov_Model_File, epoch_count1=80, batch_size1=50)
    # Reload the saved multi-task model and classify a sample sentence
    model1 = load_model("Multi_Tasking.h5")
    input_text1 = "Jimmy Carr makes me want to cry and cry *shiver*"
    S_EMO_CLS.Emotion_Classifier_Test(input_text1, tokenizer, length, model1)
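
# A minimal reuse sketch (an assumption, not part of the original script): to call
# Emotion_Classifier_Test in a later run without retraining, the fitted tokenizer
# could be persisted alongside the saved model. "tokenizer.pkl" is a hypothetical
# file name; 30 is the padded sequence length used in Train_Model above.
#
#   import pickle
#   with open("tokenizer.pkl", "wb") as f:      # right after Train_Model returns
#       pickle.dump(tokenizer, f)
#
#   with open("tokenizer.pkl", "rb") as f:      # later, in a separate run
#       tokenizer = pickle.load(f)
#   model1 = load_model("Multi_Tasking.h5")
#   Simple_Emotion_Classifier().Emotion_Classifier_Test(
#       "I am so happy today", tokenizer, 30, model1)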
Details of the dataset:
- The dataset used in this code is available HERE. Download the README (for the dataset).
- Download glove.6B.100d.txt from HERE (or visit https://nlp.stanford.edu/projects/glove/).
- EmoInt-2017 dataset main page: https://competitions.codalab.org/competitions/16380#learn_the_details-datasets
- EmoBank dataset main page: https://github.com/JULIELab/EmoBank
- Please follow the copyright information distributed with each dataset and give full credit to the corresponding owners.
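
The exact column layout of Emotion_Dataset2.csv is not shown above; from the way Read_Lines_Classes_Weights indexes each row (line[1] for the text, line[2] for the emotion class, line[3] for the emotion weight), a compatible file could be produced as in the sketch below. The column names and sample rows are assumptions for illustration, not the actual dataset.

import pandas as pd

# Assumed layout: id, tweet text, emotion class, emotion intensity in [0, 1].
rows = [
    [0, "I am furious about the delay", "anger", 0.83],
    [1, "What a wonderful surprise", "joy", 0.71],
    [2, "That noise in the dark scared me", "fear", 0.66],
    [3, "I miss my old friends so much", "sadness", 0.58],
]
pd.DataFrame(rows, columns=["id", "text", "emotion", "intensity"]).to_csv(
    "Emotion_Dataset2.csv", index=False)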