By Dr. Niraj Kumar, in Deep Learning, Multi-task Deep Learning
A Basic Multi-Tasking System
We can divide a basic multi-task learning system into four layers. Here, the shared layer jointly learns important features from the text input and plays a very important role. Finally, the task layer uses these jointly learned features for different task-specific predictions. However, in complex multi-task learning, the task layer can use additional features (beyond those learned from the shared layers) and additional shared networks; a minimal sketch of this idea follows. The full code below demonstrates a basic hard-parameter-sharing multi-task system. To run it, download "Emotion_Dataset2.csv" and "glove.6B.100d.txt" from the links given below.
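Before the full listing, here is a minimal, hedged sketch of the idea (the layer sizes, the 10,000-word vocabulary, and the 5-dimensional extra task input are illustrative assumptions, not values from the code below): one shared trunk feeds two task-specific heads, and one head also consumes additional task-specific features.

from keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate
from keras.models import Model

# shared layers: learned jointly by both tasks (sizes are illustrative assumptions)
text_in = Input(shape=(30,), name="text")
shared = Embedding(10000, 100)(text_in)
shared = Bidirectional(LSTM(64))(shared)
# task head A: classification using only the shared features
task_a = Dense(4, activation="softmax", name="task_a")(shared)
# task head B: regression using shared features plus additional task-specific features
extra_in = Input(shape=(5,), name="task_b_extra")
task_b = Dense(1, name="task_b")(Concatenate()([shared, extra_in]))
model = Model(inputs=[text_in, extra_in], outputs=[task_a, task_b])
model.compile(optimizer="adam", loss={"task_a": "categorical_crossentropy", "task_b": "mse"})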
from numpy import array
from numpy import asarray
from numpy import zeros
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.layers import Embedding
from keras.models import load_model
from keras.layers.recurrent import LSTM
from keras.constraints import max_norm
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
import string
import csv
Source_File = "Emotion_Dataset2.csv"
Glov_Model_File = "glove.6B.100d.txt"
class Simple_Emotion_Classifier:
    # fit a tokenizer - this portion of code taken from kaggle
    def create_tokenizer(self, lines):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer

    # encode a list of lines - this portion of code taken from kaggle
    def encode_text(self, tokenizer, lines, length):
        encoded = tokenizer.texts_to_sequences(lines)  # integer encode
        padded = pad_sequences(encoded, maxlen=length, padding='post')  # pad encoded sequences
        return padded
    def define_model(self, length, vocab_size, embedding_matrix1):
        # define model: one shared embedding + BiLSTM trunk, two task-specific heads
        inputs1 = Input(shape=(length,))
        x = Embedding(vocab_size, 100, weights=[embedding_matrix1], trainable=False)(inputs1)
        x = Bidirectional(LSTM(64))(x)
        # task 1: emotion class prediction (4 classes)
        x1 = Dense(6, activation='relu')(x)
        x1 = Dense(4, activation='softmax', name='emotion_classifier')(x1)
        # task 2: emotion strength/weight regression
        x2 = Dense(6, activation='relu', kernel_constraint=max_norm(1.0))(x)
        x2 = Dense(1, kernel_initializer='normal', name='emotion_weight')(x2)
        model = Model(inputs=[inputs1], outputs=[x1, x2])
        # compile with one loss (and loss weight) per task
        losses = {
            "emotion_classifier": "mse",
            "emotion_weight": "mape",
        }
        lossWeights = {"emotion_classifier": 1.0, "emotion_weight": 1.0}
        metric = {
            "emotion_classifier": 'mse',
            "emotion_weight": "mape",
        }
        # model.compile(loss=['categorical_crossentropy', 'mse'], optimizer='adam', metrics=['accuracy', 'mse'])
        model.compile(loss=losses, loss_weights=lossWeights, optimizer='adam', metrics=metric)
        # summarize
        print(model.summary())
        return model
    def Train_Model(self, Source_File, Glov_Model_File, epoch_count1, batch_size1):
        # load training dataset
        List_Txt, List_Class, List_Weight = self.Read_Lines_Classes_Weights(Source_File)
        # encode class values as integers
        encoder = LabelEncoder()
        encoder.fit(List_Class)
        encoded_Y = encoder.transform(List_Class)
        # convert integers to dummy variables (i.e. one hot encoded)
        Train_Labels = np_utils.to_categorical(encoded_Y)
        print(List_Class)
        print(List_Weight)
        print("Text data size ", len(List_Txt))
        print("Class size ", len(encoded_Y))
        print("Class weight size ", len(List_Weight))
        # create tokenizer
        tokenizer = self.create_tokenizer(List_Txt)
        # maximum document length (fixed)
        length = 30
        # calculate vocabulary size
        vocab_size = len(tokenizer.word_index) + 1
        # load the whole GloVe embedding into memory
        embeddings_index = dict()
        f = open(Glov_Model_File, 'r', encoding='utf-8')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        # create a weight matrix for words in training docs
        embedding_matrix1 = zeros((vocab_size, 100))
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix1[i] = embedding_vector
        # encode data
        trainX1 = self.encode_text(tokenizer, List_Txt, length)
        # define model
        model = self.define_model(length, vocab_size, embedding_matrix1)
        # fit model on both targets at once (classification labels and regression weights)
        model.fit([trainX1], [array(Train_Labels), array(List_Weight)], epochs=epoch_count1,
                  batch_size=batch_size1, verbose=1)
        # save the model
        model.save("Multi_Tasking.h5")
        return tokenizer, length
    def Read_Lines_Classes_Weights(self, SourceFile):
        Lines = []  # cleaned (stopword-free, stemmed) sentences
        Train_Labels = []
        Train_Label_Weight = []
        data = pd.read_csv(SourceFile)
        for line in data.values:
            # print(line[1], line[2], line[3])
            line1 = str(line[1]).strip()
            if len(line1) >= 1:
                Train_Labels.append(line[2])  # add train labels
                lab_weight1 = float(line[3])
                Train_Label_Weight.append(lab_weight1)  # add train label weight
                wds_list = line1.split()
                if len(wds_list) > 0:
                    txt1 = ' '.join(wds_list)
                    # split into words
                    tokens = word_tokenize(txt1)
                    # prepare regex for char filtering
                    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
                    # remove punctuation from each word
                    tokens = [re_punc.sub('', w) for w in tokens]
                    # remove remaining tokens that are not alphabetic
                    tokens = [word for word in tokens if word.isalpha()]
                    # remove stopwords
                    stop_words = set(stopwords.words('english'))
                    tokens = [w for w in tokens if w not in stop_words]
                    # filter out empty tokens
                    tokens = [word for word in tokens if len(word) > 0]
                    # stemming of words
                    porter = PorterStemmer()
                    stemmed = [porter.stem(word) for word in tokens]
                    txt2 = ' '.join(stemmed)
                    # fall back to a placeholder if cleaning removed everything
                    if len(txt2.strip()) == 0:
                        txt2 = "hello world"
                    Lines.append(txt2)
                else:
                    Lines.append("hello world")
        return Lines, Train_Labels, Train_Label_Weight
    def Emotion_Classifier_Test(self, Input_text, tokenizer1, length1, model1):
        cln_test_lines = []
        line1 = str(Input_text).strip().lower()
        tokens = word_tokenize(line1)
        # prepare regex for char filtering
        re_punc = re.compile('[%s]' % re.escape(string.punctuation))
        # remove punctuation from each word
        tokens = [re_punc.sub('', w) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # filter out empty tokens
        tokens = [word for word in tokens if len(word) > 0]
        # stemming of words removes noisy variants
        porter = PorterStemmer()
        stemmed = [porter.stem(word) for word in tokens]
        txt2 = ' '.join(stemmed)
        if len(txt2) > 0:
            cln_test_lines.append(txt2)
        else:
            cln_test_lines.append("Very Good")
        testX1 = self.encode_text(tokenizer1, cln_test_lines, length1)
        # predict with both heads: pred[0] holds the class probabilities, pred[1] the strength score
        pred = model1.predict([testX1])
        print(pred)
        pred_Anger = pred[0][0][0]
        print("Anger ", pred_Anger)
        pred_Fear = pred[0][0][1]
        print("Fear ", pred_Fear)
        pred_Joy = pred[0][0][2]
        print("Joy ", pred_Joy)
        pred_Sad = pred[0][0][3]
        print("Sadness ", pred_Sad)
        pred_Strength = pred[1][0][0]
        print("prediction_strength ", pred_Strength)
        pred1_array = pred[0]
        CATEGORIES = ["Anger", "Fear", "Joy", "Sadness"]
        pred_name = CATEGORIES[np.argmax(pred1_array)]
        print("Predicted Class => ", pred_name)
if __name__ == "__main__":
    print("Calling Simple Emotion Classifier Training and Test")
    S_EMO_CLS = Simple_Emotion_Classifier()
    # Train the model
    tokenizer, length = S_EMO_CLS.Train_Model(Source_File, Glov_Model_File, epoch_count1=80, batch_size1=50)
    model1 = load_model("Multi_Tasking.h5")
    input_text1 = " Jimmy Carr makes me want to cry and cry *shiver*"
    S_EMO_CLS.Emotion_Classifier_Test(input_text1, tokenizer, length, model1)
Details of the dataset:
The dataset used in this code is available HERE. Download the README (for dataset)
Impact of Social Networking Techniques and Shannon Information Theory on Text Mining
Part-1: Centrality Entropy
1. Entropy
The following equation presents Shannon's definition of the entropy of a random variable $X$ that can take $n$ values.
$H(X) = -\sum_{i=1}^{n} p(x_i)\,\log_2 p(x_i)$    (1)
In the Shannon entropy equation, $p(x_i)$ represents the probability of the given symbol $x_i$. The Shannon entropy equation provides a way to estimate the average minimum number of bits needed to encode a string of symbols, based on the frequency of the symbols. The minimum number of bits per symbol can then be calculated as:
$\text{minimum bits per symbol} = \lceil H(X) \rceil$    (2)
Example: Suppose we take the string "Shannon" and calculate its entropy:
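Using the character frequencies of "Shannon" ('n' occurs 3 times; 'S', 'h', 'a', 'o' once each; 7 characters in total), the entropy works out to:

$H(X) = -\tfrac{3}{7}\log_2\tfrac{3}{7} - 4\cdot\tfrac{1}{7}\log_2\tfrac{1}{7} \approx 0.524 + 1.604 \approx 2.128$ bits per symbol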
Discussion: Now, according to Eq-2, the ceiling value of 2.128 is 3. So each symbol will be encoded with 3 bits, and the string "Shannon" will require 21 bits in total (as it contains 7 characters).
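The same numbers can be reproduced with a short, self-contained Python sketch (standard library only; the helper name is illustrative):

import math
from collections import Counter

def shannon_entropy(text):
    # H(X) = -sum over distinct symbols of p(x) * log2(p(x))
    counts = Counter(text)
    total = len(text)
    return -sum((c / total) * math.log2(c / total) for c in counts.values())

s = "Shannon"
H = shannon_entropy(s)                 # ~2.128 bits per symbol
bits_per_symbol = math.ceil(H)         # ceiling of the entropy -> 3 bits (Eq-2)
total_bits = bits_per_symbol * len(s)  # 3 bits x 7 characters = 21 bits
print(H, bits_per_symbol, total_bits)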
2. Social Networking (Centrality Measures)
There are a lot of social networking techniques available. However, to demonstrate the combined impact of social networking techniques and Shannon information theory on text mining, I will use "closeness centrality".
2.1. Closeness Centrality: It gives information about how close a node is with respect to the entire network [4].
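As a hedged illustration of how this can be computed in practice (the networkx library and the toy graph are assumptions chosen for demonstration):

import networkx as nx

# toy undirected graph; closeness(v) = (n - 1) / sum of shortest-path distances from v
G = nx.Graph()
G.add_edges_from([("information", "theory"), ("theory", "entropy"),
                  ("entropy", "centrality"), ("centrality", "graph")])
print(nx.closeness_centrality(G))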
3. Impact of Entropy on Centrality Measures
Centrality entropy provides information on the degree of reachability of a node in the graph [5]. However, such calculations may have a different impact on (1) a fully connected graph and (2) a partially connected graph. The following summarizes the impact on both types of graphs [2, 3, 5].
Fully connected graph: In a fully connected graph the removal of any node will have the same effect on centrality entropy as when any other node is removed. All nodes are equally important for the flow of information [5].
Partially connected graph: In partially connected graphs, nodes whose removal splits the graph into two or more parts, or substantially reduces the number of geodesic paths available to reach other nodes, have a higher impact in decreasing the total centrality entropy [5]. Such an event produces the largest change in centrality entropy for the graph.
4. How to apply it in Text Mining
4.1. Representing Text as a Graph
To apply social networking techniques to text, we need to convert the text into a graph. Instead of going through complex steps, we can use a very simple form of word graph, similar to that used in TextRank [6]. This word graph adds a link between any two words in the text if (1) they are adjacent to each other and (2) both carry noun/adjective part-of-speech tags. The sketch below demonstrates it:
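A hedged sketch of such a word graph (NLTK for POS tagging and networkx for the graph are assumptions chosen for illustration):

import networkx as nx
from nltk import pos_tag, word_tokenize

text = "Centrality entropy provides information on the degree of reachability for a node"
tagged = pos_tag(word_tokenize(text.lower()))
# keep only nouns (NN*) and adjectives (JJ*), remembering their positions
keep = {i: w for i, (w, t) in enumerate(tagged) if t.startswith(("NN", "JJ"))}

G = nx.Graph()
G.add_nodes_from(keep.values())
# link two kept words when they are adjacent in the original token sequence
for i in keep:
    if i + 1 in keep:
        G.add_edge(keep[i], keep[i + 1])
print(G.edges())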
NOTE: However, we can use a more informative graph, in which the edge weight is the product of (1) the co-occurrence frequency of the terms and (2) their semantic strength. Such approaches neutralize the impact of noisy words that have a very high occurrence frequency [1, 7]. A sample Java code can be downloaded from my site.
4.2. Applying Entropy of Centrality Measures
For this, we use the text graph from the previous section. Next, we follow the process applied in [5].
Calculation: To calculate the centrality entropy, we first calculate the entropy of the entire graph using the centrality scores of its nodes/edges (as per requirements). Next, we drop a node, together with all edges incident to it, and calculate the entropy of the remaining graph. The importance score of that node is the difference between the entropy scores calculated before and after its removal. By repeating the same process, we can collect the scores of all nodes in the graph, as in the sketch below.
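A hedged sketch of this calculation (closeness centrality is used as the node score here, and networkx and the toy graph are assumptions for illustration):

import math
import networkx as nx

def centrality_entropy(G):
    # turn closeness-centrality scores into a probability distribution, then take its Shannon entropy
    scores = nx.closeness_centrality(G)
    total = sum(scores.values())
    probs = [s / total for s in scores.values() if s > 0]
    return -sum(p * math.log2(p) for p in probs)

G = nx.Graph()
G.add_edges_from([("a", "b"), ("b", "c"), ("c", "d"), ("c", "e")])
H_full = centrality_entropy(G)
for node in list(G.nodes()):
    G2 = G.copy()
    G2.remove_node(node)  # drop the node and all its incident edges
    print(node, H_full - centrality_entropy(G2))  # larger difference => more important node

In this toy graph, removing the hub node "c" (which splits the graph) yields the largest entropy difference, consistent with the observation about partially connected graphs above.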
4.3. Result and Analysis
We use the text given in Section 3 and extract the top-10 ranked words using (1) the PageRank algorithm, (2) the closeness centrality score, and (3) the current setting of entropy of centrality. We find that the top-10 words extracted using PageRank contain 7 informative terms, the top-10 words extracted using closeness centrality contain 6 informative words, and, finally, with the current setting the count is 8 out of 10.
References:
[1] Niraj Kumar, Kannan Srinathan and Vasudeva Varma. "A Graph Based Unsupervised N-gram Filtration Technique for Automatic Keyphrase Extraction." Accepted for publication in the International Journal of Data Mining, Modelling and Management.
[2] Borgatti, S. P. (2003). The key player problem. In Dynamic Social Network Modeling and Analysis: Workshop Summary and Papers. National Academy Press.
[3] Everett, M. G. and Borgatti, S. P. (1999). The centrality of groups and classes. Journal of Mathematical Sociology, Vol. 23, No. 3, pp. 181-201.
[4] Girvan, M. and Newman, M. E. J. (2002). Community structure in social and biological networks. Proc. Natl. Acad. Sci. USA, 99, 7821-7826.
[5] Ortiz-Arroyo, D. (2010). Discovering sets of key players in social networks (pp. 27-47). Springer London.
[6] Mihalcea, Rada, and Paul Tarau (2004). "TextRank: Bringing Order into Texts." Association for Computational Linguistics.
[7] Niraj Kumar, Kannan Srinathan and Vasudeva Varma (2013). "A Knowledge Induced Graph-Theoretical Model for Extract and Abstract Single Document Summarization." CICLing (2) 2013, LNCS 7817, pp. 408-423.