from numpy import array, asarray, zeros
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
from keras.layers import LSTM
from keras.constraints import max_norm
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
import string
import csv
Source_File = "Emotion_Dataset2.csv"
Glov_Model_File = "glove.6B.100d.txt"
class Simple_Emotion_Classifier:

    # fit a tokenizer on the training lines - this portion of code taken from kaggle
    def create_tokenizer(self, lines):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer

    # integer-encode and pad a list of lines - this portion of code taken from kaggle
    def encode_text(self, tokenizer, lines, length):
        encoded = tokenizer.texts_to_sequences(lines)  # integer encode
        padded = pad_sequences(encoded, maxlen=length, padding='post')  # pad encoded sequences
        return padded
    def define_model(self, length, vocab_size, embedding_matrix1):
        # shared layers: frozen GloVe embedding followed by a bidirectional LSTM
        inputs1 = Input(shape=(length,))
        x = Embedding(vocab_size, 100, weights=[embedding_matrix1], trainable=False)(inputs1)
        x = Bidirectional(LSTM(64))(x)
        # head 1: four-way emotion classifier
        x1 = Dense(6, activation='relu')(x)
        x1 = Dense(4, activation='softmax', name='emotion_classifier')(x1)
        # head 2: scalar emotion-weight regressor
        x2 = Dense(6, activation='relu', kernel_constraint=max_norm(1.0))(x)
        x2 = Dense(1, kernel_initializer='normal', name='emotion_weight')(x2)
        model = Model(inputs=[inputs1], outputs=[x1, x2])
        # compile with one loss and one metric per output head
        losses = {
            "emotion_classifier": "mse",
            "emotion_weight": "mape",
        }
        lossWeights = {"emotion_classifier": 1.0, "emotion_weight": 1.0}
        metric = {
            "emotion_classifier": 'mse',
            "emotion_weight": "mape",
        }
        # model.compile(loss=['categorical_crossentropy', 'mse'], optimizer='adam', metrics=['accuracy', 'mse'])
        model.compile(loss=losses, loss_weights=lossWeights, optimizer='adam', metrics=metric)
        # summarize
        print(model.summary())
        return model
    def Train_Model(self, Source_File, Glov_Model_File, epoch_count1, batch_size1):
        # load training dataset
        List_Txt, List_Class, List_Weight = self.Read_Lines_Classes_Weights(Source_File)
        # encode class values as integers
        encoder = LabelEncoder()
        encoder.fit(List_Class)
        encoded_Y = encoder.transform(List_Class)
        # convert integers to dummy variables (i.e. one hot encoded)
        Train_Labels = np_utils.to_categorical(encoded_Y)
        print(List_Class)
        print(List_Weight)
        print("Text data size ", len(List_Txt))
        print("Class size ", len(encoded_Y))
        print("Class weight size ", len(List_Weight))
        # create tokenizer
        tokenizer = self.create_tokenizer(List_Txt)
        # fixed maximum document length (tokens per padded sequence)
        length = 30
        # calculate vocabulary size
        vocab_size = len(tokenizer.word_index) + 1
        # load the whole GloVe embedding into memory
        embeddings_index = dict()
        with open(Glov_Model_File, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        # create a weight matrix for words in training docs
        embedding_matrix1 = zeros((vocab_size, 100))
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix1[i] = embedding_vector
        # encode data
        trainX1 = self.encode_text(tokenizer, List_Txt, length)
        # define model
        model = self.define_model(length, vocab_size, embedding_matrix1)
        # fit model on both targets: one-hot emotion labels and intensity weights
        model.fit([trainX1], [array(Train_Labels), array(List_Weight)],
                  epochs=epoch_count1, batch_size=batch_size1, verbose=1)
        # save the model
        model.save("Multi_Tasking.h5")
        return tokenizer, length
    def Read_Lines_Classes_Weights(self, SourceFile):
        Lines = []  # cleaned, stemmed sentences
        Train_Labels = []
        Train_Label_Weight = []
        data = pd.read_csv(SourceFile)
        for line in data.values:
            # line[1] = text, line[2] = emotion class, line[3] = emotion weight
            line1 = str(line[1]).strip()
            if len(line1) >= 1:
                Train_Labels.append(line[2])  # add train label
                lab_weight1 = float(line[3])
                Train_Label_Weight.append(lab_weight1)  # add train label weight
                wds_list = line1.split()
                if len(wds_list) > 0:
                    txt1 = ' '.join(wds_list)
                    # split into words
                    tokens = word_tokenize(txt1)
                    # prepare regex for char filtering
                    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
                    # remove punctuation from each word
                    tokens = [re_punc.sub('', w) for w in tokens]
                    # remove remaining tokens that are not alphabetic
                    tokens = [word for word in tokens if word.isalpha()]
                    # remove stop words
                    stop_words = set(stopwords.words('english'))
                    tokens = [w for w in tokens if w not in stop_words]
                    # drop tokens emptied by the punctuation filter
                    tokens = [word for word in tokens if len(word) > 0]
                    # stemming of words removes noisy variants
                    porter = PorterStemmer()
                    stemmed = [porter.stem(word) for word in tokens]
                    txt2 = ' '.join(stemmed)
                    # fall back to a placeholder when cleaning removed everything
                    if len(txt2.strip()) == 0:
                        txt2 = "hello world"
                    Lines.append(txt2)
                else:
                    Lines.append("hello world")
        return Lines, Train_Labels, Train_Label_Weight
    def Emotion_Classifier_Test(self, Input_text, tokenizer1, length1, model1):
        cln_test_lines = []
        line1 = str(Input_text).strip().lower()
        tokens = word_tokenize(line1)
        # prepare regex for char filtering
        re_punc = re.compile('[%s]' % re.escape(string.punctuation))
        # remove punctuation from each word
        tokens = [re_punc.sub('', w) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # drop tokens emptied by the punctuation filter
        tokens = [word for word in tokens if len(word) > 0]
        # stemming of words removes noisy variants
        porter = PorterStemmer()
        stemmed = [porter.stem(word) for word in tokens]
        txt2 = ' '.join(stemmed)
        if len(txt2) > 0:
            cln_test_lines.append(txt2)
        else:
            cln_test_lines.append("Very Good")
        testX1 = self.encode_text(tokenizer1, cln_test_lines, length1)
        # predict with both heads: class probabilities and intensity
        pred = model1.predict([testX1])
        print(pred)
        pred_Anger = pred[0][0][0]
        print("Anger ", pred_Anger)
        pred_Fear = pred[0][0][1]
        print("Fear ", pred_Fear)
        pred_Joy = pred[0][0][2]
        print("Joy ", pred_Joy)
        pred_Sad = pred[0][0][3]
        print("Sadness ", pred_Sad)
        pred_Strength = pred[1][0][0]
        print("prediction_strength ", pred_Strength)
        pred1_array = pred[0]
        CATEGORIES = ["Anger", "Fear", "Joy", "Sadness"]
        pred_name = CATEGORIES[np.argmax(pred1_array)]
        print("Predicted Class => ", pred_name)
if __name__ == "__main__":
    print("Calling Simple Emotion Classifier Training and Test")
    S_EMO_CLS = Simple_Emotion_Classifier()
    # Train the model and keep the fitted tokenizer and padded sequence length
    tokenizer, length = S_EMO_CLS.Train_Model(Source_File, Glov_Model_File, epoch_count1=80, batch_size1=50)
    # Reload the saved multi-task model and classify a sample sentence
    model1 = load_model("Multi_Tasking.h5")
    input_text1 = "Jimmy Carr makes me want to cry and cry *shiver*"
    S_EMO_CLS.Emotion_Classifier_Test(input_text1, tokenizer, length, model1)
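
# A minimal reuse sketch (an assumption, not part of the original script): to call
# Emotion_Classifier_Test in a later run without retraining, the fitted tokenizer
# could be persisted alongside the saved model. "tokenizer.pkl" is a hypothetical
# file name; 30 is the padded sequence length used in Train_Model above.
#
#   import pickle
#   with open("tokenizer.pkl", "wb") as f:      # right after Train_Model returns
#       pickle.dump(tokenizer, f)
#
#   with open("tokenizer.pkl", "rb") as f:      # later, in a separate run
#       tokenizer = pickle.load(f)
#   model1 = load_model("Multi_Tasking.h5")
#   Simple_Emotion_Classifier().Emotion_Classifier_Test(
#       "I am so happy today", tokenizer, 30, model1)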
Details of the dataset:
- The dataset used in this code is available HERE. Download the README (for the dataset).
- Download glove.6B.100d.txt from HERE (or visit https://nlp.stanford.edu/projects/glove/).
- EmoInt-2017 dataset main page: https://competitions.codalab.org/competitions/16380#learn_the_details-datasets
- EmoBank dataset main page: https://github.com/JULIELab/EmoBank
- Please follow the copyright information distributed with each dataset and give full credit to the corresponding owners.
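
The exact column layout of Emotion_Dataset2.csv is not shown above; from the way Read_Lines_Classes_Weights indexes each row (line[1] for the text, line[2] for the emotion class, line[3] for the emotion weight), a compatible file could be produced as in the sketch below. The column names and sample rows are assumptions for illustration, not the actual dataset.

import pandas as pd

# Assumed layout: id, tweet text, emotion class, emotion intensity in [0, 1].
rows = [
    [0, "I am furious about the delay", "anger", 0.83],
    [1, "What a wonderful surprise", "joy", 0.71],
    [2, "That noise in the dark scared me", "fear", 0.66],
    [3, "I miss my old friends so much", "sadness", 0.58],
]
pd.DataFrame(rows, columns=["id", "text", "emotion", "intensity"]).to_csv(
    "Emotion_Dataset2.csv", index=False)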