In this blog post we will walk, step by step, through handling a large, imbalanced dataset and building a multiclass text classifier with TensorFlow.
- Import the data, ensuring it’s accurate and reliable.
- Treat the imbalanced data using class weights.
- Use TensorFlow with SoftMax activation for multiclass classification
- Check accuracy and other metrics.
Import Libraries & Datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Load the dataset with pandas. The frame is bound to `df` because the later
# steps read `df`; the original bound it only to `data`, so the very next
# statement raised NameError.
df = pd.read_csv('/content/drive/MyDrive/DataAnalysisProjects/topic_classification_data.csv')
data = df  # keep the original name available as an alias

# Number of rows per label.
df.label.value_counts()
Compute class weights to treat the imbalanced dataset
from sklearn.utils.class_weight import compute_class_weight

# Distinct labels in the order returned by np.unique.
label_classes = np.unique(df['label'])

# One 'balanced' weight per entry of `label_classes`.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_classes,
    y=df['label'],
)

# class_weights[i] belongs to label_classes[i]. Sorting the array in place
# (as the original did) silently breaks that pairing, so only a sorted copy
# is printed and the array itself stays aligned with the classes.
print(np.sort(class_weights))
# Hold out 20% of the rows as the test split; the fixed seed keeps the split repeatable.
X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    random_state=111,
)
# Lookup table that turns each label string into an integer class id.
# A label that is not among the keys maps to the default value, -1.
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']),
        values=tf.constant([0, 1, 2, 3, 4, 5]),
    ),
    default_value=tf.constant(-1),
    name="target_encoding",
)


@tf.function
def target(x):
    """Return what `table` holds for the label tensor `x`."""
    return table.lookup(x)
def show_batch(dataset, size=5):
    """Print the first `size` elements of `dataset` with their encoded labels.

    Args:
        dataset: object whose ``take(size)`` yields ``(batch, label)`` pairs;
            ``batch`` must expose ``.numpy()`` and ``label`` is passed to
            ``target``.
        size: how many elements to take from the dataset.
    """
    for batch, label in dataset.take(size):
        print(batch.numpy())
        # The label is printed as the value returned by `target`, not as the raw label.
        print(target(label).numpy())
# Preview the first six elements of the test dataset.
show_batch(dataset_test, size=6)
One hot encoding for labels
def fetch(text, labels):
    """Return `text` paired with the depth-6 one-hot encoding of `target(labels)`."""
    return text, tf.one_hot(target(labels), 6)


# Apply the label encoding to both splits.
train_data_f = dataset_train.map(fetch)
test_data_f = dataset_test.map(fetch)
# Pull a single batch of five examples out of the mapped training data.
sample_batches = train_data_f.batch(5)
train_data, train_labels = next(iter(sample_batches))
train_data, train_labels  # bare expression: displayed as the notebook cell output
# Token-based, pre-trained embedding layer from Google.
# With this layer no separate tokenizing parameter is needed:
# it tokenizes its string input automatically.
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"

hub_layer = hub.KerasLayer(
    embedding,
    output_shape=[128],
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Try the layer on the first example of the sample batch.
hub_layer(train_data[:1])
model = tf.keras.Sequential()

# The hub layer built earlier serves as the embedding layer of the network.
model.add(hub_layer)

# Four hidden layers, each followed by 30% dropout.
# NOTE(review): indentation was lost in the original; the Dropout is placed
# inside the loop because its comment says "in each hidden layer" — confirm.
for units in [128, 128, 64, 32]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.3))

# Softmax activation over the 6 outputs for multiclass classification.
model.add(tf.keras.layers.Dense(6, activation='softmax'))

# In the original this call was fused into the comment above and never ran.
model.summary()
# NOTE(review): neither `model.compile(...)` nor the definition of `weights`
# is visible in this excerpt — confirm both exist before this cell runs.
# `class_weight=weights` passes the weight assigned to each label so that all
# labels are treated equally; Keras expects a dict mapping class index to
# weight and then applies a weighted cross-entropy.

# Assumption: the mapped datasets yield single examples (they are batched
# explicitly with `train_data_f.batch(5)` before being fed to the hub layer),
# while fit() expects a Dataset that yields batches — so batch them here.
# The batch size is an arbitrary choice — TODO confirm.
BATCH_SIZE = 32

history = model.fit(
    train_data_f.batch(BATCH_SIZE),
    epochs=4,
    validation_data=test_data_f.batch(BATCH_SIZE),
    verbose=1,
    class_weight=weights,
)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Reduce labels and predictions to one index per row with argmax over axis 1
# (presumably one-hot labels and per-class scores — confirm against the cell
# that defines `test_labels` and `y_pred`, which is not visible here).
y_true = test_labels.numpy().argmax(axis=1)
y_hat = y_pred.argmax(axis=1)

print(classification_report(y_true, y_hat))
confusion_matrix(y_true, y_hat)  # bare expression: displayed as the notebook cell output
Be the first to comment