By Dr. Niraj Kumar, in Deep Learning, Multi-task Deep Learning
A Basic Multi-Tasking System
We can divide the Multi-task learning into four layers. Here, the Shared layer jointly learns important features from the text input and plays a very important role. Finally, the Task layer uses these jointly learned features for different task-specific predictions. However, in complex Multi-Task learning, the Task layer can use additional features (in addition to those learned by the Shared layers) and additional shared networks. This is a basic code example demonstrating a hard multi-tasking system. To run it, download "Emotion_Dataset2.csv" and "glove.6B.100d.txt" from the links given below.
from numpy import array
from numpy import asarray
from numpy import zeros
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
from keras.layers.recurrent import LSTM
from keras.constraints import max_norm
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
import string
import csv
# Training CSV; the __main__ block passes it to Simple_Emotion_Classifier.Train_Model.
Source_File = "Emotion_Dataset2.csv"
# GloVe vector file; Train_Model builds a 100-column embedding matrix from it,
# so the file must hold 100-dimensional vectors.
Glov_Model_File = "glove.6B.100d.txt"
class Simple_Emotion_Classifier:
    """Hard-parameter-sharing multi-task model over emotion-labelled text.

    A frozen pre-trained embedding followed by a bidirectional LSTM is shared
    by two heads:

    * ``emotion_classifier`` -- 4-way softmax over the emotion classes.
    * ``emotion_weight`` -- single linear unit regressing the label weight.

    NOTE: the text cleaning relies on NLTK's tokenizer and English stopword
    list; the corresponding NLTK data packages must be installed beforehand
    (see the NLTK documentation for ``nltk.download``).
    """

    # Compiled once for the whole class instead of once per CSV row.
    _PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
    # Width of the embedding vectors; must match the GloVe file that is loaded.
    _EMBEDDING_DIM = 100
    # Every text is padded (post) to this many token ids.
    _MAX_LENGTH = 30
    # Display names, indexed by the classifier head's output position.
    # NOTE(review): assumes the dataset's four labels sort (LabelEncoder order)
    # as anger, fear, joy, sadness -- confirm against the CSV.
    _CATEGORIES = ("Anger", "Fear", "Joy", "Sadness")

    def create_tokenizer(self, lines):
        """Return a Keras ``Tokenizer`` fitted on *lines* (a list of strings)."""
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer

    def encode_text(self, tokenizer, lines, length):
        """Integer-encode *lines* with *tokenizer* and post-pad them to *length*."""
        encoded = tokenizer.texts_to_sequences(lines)
        return pad_sequences(encoded, maxlen=length, padding='post')

    def define_model(self, length, vocab_size, embedding_matrix1):
        """Build and compile the two-headed Keras model.

        Args:
            length: number of token ids per input sequence.
            vocab_size: number of rows of the embedding layer.
            embedding_matrix1: pre-trained weights, ``vocab_size`` rows by
                100 columns; the embedding layer is not trainable.

        Returns:
            The compiled ``Model`` with outputs
            ``[emotion_classifier, emotion_weight]``.
        """
        inputs1 = Input(shape=(length,))

        # Shared layers: every task head reads the same sentence encoding.
        shared = Embedding(vocab_size, self._EMBEDDING_DIM,
                           weights=[embedding_matrix1], trainable=False)(inputs1)
        shared = Bidirectional(LSTM(64))(shared)

        # Task head 1: emotion class probabilities.
        x1 = Dense(6, activation='relu')(shared)
        x1 = Dense(4, activation='softmax', name='emotion_classifier')(x1)

        # Task head 2: emotion strength (regression).
        x2 = Dense(6, activation='relu', kernel_constraint=max_norm(1.0))(shared)
        x2 = Dense(1, kernel_initializer='normal', name='emotion_weight')(x2)

        model = Model(inputs=[inputs1], outputs=[x1, x2])

        # NOTE(review): the softmax head is trained with MSE; categorical
        # cross-entropy is the more usual choice for a softmax output. Left
        # unchanged because switching alters training results -- confirm.
        losses = {
            "emotion_classifier": "mse",
            "emotion_weight": "mape",
        }
        lossWeights = {"emotion_classifier": 1.0, "emotion_weight": 1.0}
        metric = {
            "emotion_classifier": 'mse',
            "emotion_weight": "mape",
        }
        model.compile(loss=losses, loss_weights=lossWeights, optimizer='adam',
                      metrics=metric)
        # summary() prints by itself; wrapping it in print() emitted a stray "None".
        model.summary()
        return model

    def Train_Model(self, Source_File, Glov_Model_File, epoch_count1, batch_size1):
        """Train the multi-task model and save it to ``Multi_Tasking.h5``.

        Args:
            Source_File: path of the training CSV (see
                ``Read_Lines_Classes_Weights`` for the columns that are read).
            Glov_Model_File: path of a GloVe text file with 100-d vectors.
            epoch_count1: number of training epochs.
            batch_size1: mini-batch size.

        Returns:
            ``(tokenizer, length)`` -- the fitted tokenizer and the padded
            sequence length, both needed to encode text at prediction time.
        """
        List_Txt, List_Class, List_Weight = self.Read_Lines_Classes_Weights(Source_File)

        # Class names -> integer ids -> one-hot rows.
        encoder = LabelEncoder()
        encoder.fit(List_Class)
        encoded_Y = encoder.transform(List_Class)
        Train_Labels = np_utils.to_categorical(encoded_Y)

        print(List_Class)
        print(List_Weight)
        print("Text data size ", len(List_Txt))
        print("Class size ", len(encoded_Y))
        print("Class weight size ", len(List_Weight))

        tokenizer = self.create_tokenizer(List_Txt)
        length = self._MAX_LENGTH
        # +1 because Keras word indices start at 1; row 0 is the padding id.
        vocab_size = len(tokenizer.word_index) + 1

        embeddings_index = self._load_embeddings(Glov_Model_File)
        embedding_matrix1 = self._build_embedding_matrix(
            tokenizer.word_index, embeddings_index, self._EMBEDDING_DIM)

        trainX1 = self.encode_text(tokenizer, List_Txt, length)
        model = self.define_model(length, vocab_size, embedding_matrix1)
        model.fit([trainX1], [array(Train_Labels), array(List_Weight)],
                  epochs=epoch_count1, batch_size=batch_size1, verbose=1)
        model.save("Multi_Tasking.h5")
        return tokenizer, length

    def Read_Lines_Classes_Weights(self, SourceFile):
        """Read the training CSV and return cleaned texts, labels and weights.

        Columns are read by position: index 1 is the text, index 2 the class
        label and index 3 the label weight. Rows whose text is empty after
        stripping are skipped.

        Returns:
            ``(Lines, Train_Labels, Train_Label_Weight)`` -- three lists of
            equal length. A text that becomes empty after cleaning is replaced
            by the placeholder ``"hello world"`` so the row is still usable.

        Raises:
            ValueError: if a weight cell cannot be converted to ``float``.
        """
        data = pd.read_csv(SourceFile)

        # Loop-invariant helpers, built once rather than per row.
        stop_words = set(stopwords.words('english'))
        porter = PorterStemmer()

        Lines = []
        Train_Labels = []
        Train_Label_Weight = []
        for row in data.values:
            raw_text = str(row[1]).strip()
            if not raw_text:
                continue
            # Convert first so a bad weight cannot leave the lists misaligned.
            lab_weight1 = float(row[3])
            # Clean the whole text; the previous code indexed into the string
            # and therefore only ever kept its second character.
            cleaned = self._clean_text(raw_text, stop_words, porter)
            if not cleaned.strip():
                cleaned = "hello world"
            Lines.append(cleaned)
            Train_Labels.append(row[2])
            Train_Label_Weight.append(lab_weight1)
        return Lines, Train_Labels, Train_Label_Weight

    def Emotion_Classifier_Test(self, Input_text, tokenizer1, length1, model1):
        """Predict emotion class and strength for one text and print them.

        Args:
            Input_text: raw text to classify.
            tokenizer1: tokenizer returned by ``Train_Model``.
            length1: padded sequence length returned by ``Train_Model``.
            model1: trained two-output model.

        Returns:
            ``(pred_name, pred_Strength)`` -- the display name of the most
            probable class and the value of the ``emotion_weight`` head.
        """
        # Same cleaning as in training, so that both sides see identical tokens.
        stop_words = set(stopwords.words('english'))
        cleaned = self._clean_text(Input_text, stop_words, PorterStemmer())
        if not cleaned:
            cleaned = "Very Good"
        testX1 = self.encode_text(tokenizer1, [cleaned], length1)

        # pred[0] holds the class probabilities, pred[1] the strength output;
        # index [0] on each selects the single sample in the batch.
        pred = model1.predict([testX1])
        print(pred)
        class_probs = pred[0][0]
        for label, prob in zip(self._CATEGORIES, class_probs):
            print(label + " ", prob)
        pred_Strength = pred[1][0][0]
        print("prediction_strength", pred_Strength)
        pred_name = self._CATEGORIES[np.argmax(class_probs)]
        print("Predicted Class => ", pred_name)
        return pred_name, pred_Strength

    def _clean_text(self, text, stop_words, stemmer):
        """Lowercase, tokenize, drop punctuation/non-alphabetic/stop words, stem."""
        tokens = word_tokenize(str(text).strip().lower())
        tokens = [self._PUNCT_RE.sub('', tok) for tok in tokens]
        # isalpha() is False for '', so tokens emptied by the regex drop out too.
        tokens = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]
        return ' '.join(stemmer.stem(tok) for tok in tokens)

    def _load_embeddings(self, glove_file):
        """Parse a GloVe text file into ``{word: float32 vector}``."""
        embeddings_index = {}
        # 'with' guarantees the handle is closed even if a line fails to parse.
        with open(glove_file, 'r', encoding='utf-8') as handle:
            for line in handle:
                values = line.split()
                if not values:
                    continue  # tolerate blank lines
                embeddings_index[values[0]] = asarray(values[1:], dtype='float32')
        return embeddings_index

    def _build_embedding_matrix(self, word_index, embeddings_index, dim):
        """Return a ``(len(word_index) + 1, dim)`` matrix of known word vectors.

        Rows of words missing from *embeddings_index* (and row 0) stay zero.
        """
        matrix = zeros((len(word_index) + 1, dim))
        for word, i in word_index.items():
            vector = embeddings_index.get(word)
            if vector is not None:
                matrix[i] = vector
        return matrix
if __name__ == "__main__":
    # Demo run: train on the CSV, reload the saved model, classify one sentence.
    print("Calling Simple Emotion Classifier Training and Test")
    classifier = Simple_Emotion_Classifier()

    # Training returns what is needed to encode text at prediction time.
    fitted_tokenizer, max_len = classifier.Train_Model(
        Source_File,
        Glov_Model_File,
        epoch_count1=80,
        batch_size1=50,
    )

    # Load the model back from the file written by Train_Model.
    reloaded_model = load_model("Multi_Tasking.h5")

    sample_text = " Jimmy Carr makes me want to cry and cry *shiver*"
    classifier.Emotion_Classifier_Test(
        sample_text, fitted_tokenizer, max_len, reloaded_model
    )
Details of the dataset:
The dataset used in this code is available HERE. Download the README (for dataset)