Multi-class Text Classification using TensorFlow

by Nihar Jamdar · September 2023


In this blog we will walk through, step by step, how to handle a large imbalanced dataset and build a multi-class text classifier using TensorFlow:

  1. Import the data, ensuring it's accurate and reliable.
  2. Treat the imbalanced data using class weight scores.
  3. Use TensorFlow with a softmax activation for multi-class classification.
  4. Check accuracy and other metrics.

Import Libraries & Datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Load the dataset with pandas
df = pd.read_csv('/content/drive/MyDrive/DataAnalysisProjects/topic_classification_data.csv')
df.label.value_counts()

Imbalanced Dataset

Import compute_class_weight from scikit-learn to handle the imbalanced dataset:

from sklearn.utils.class_weight import compute_class_weight

# 'balanced' makes each weight inversely proportional to its class frequency.
# Note: do not sort class_weights in place; the order must stay aligned
# with np.unique(df['label']).
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(df['label']),
                                     y=df['label'])

print(class_weights)

Note: compute_class_weight assigns lower weights to labels with more data points and higher weights to labels with fewer data points.
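model.fit expects class_weight as a dict keyed by integer class index, and those indices must match the encoding defined below (Politics=0 … Science=5). Since compute_class_weight returns weights ordered by np.unique(df['label']) (alphabetical), they need re-keying; a minimal sketch:

# compute_class_weight returned weights in np.unique order (alphabetical),
# so map each label name to its weight, then re-key by integer index
label_order = ['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']
weight_by_label = dict(zip(np.unique(df['label']), class_weights))
weights = {i: weight_by_label[label] for i, label in enumerate(label_order)}
print(weights)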
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df, test_size=0.2, random_state=111)
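The later snippets reference dataset_train and dataset_test, tf.data pipelines that the original post builds off-screen. A minimal sketch, assuming the CSV has a text column holding the documents and a label column holding the class names:

import tensorflow as tf

# Assumed column name 'text'; adjust to the actual dataset schema
dataset_train = tf.data.Dataset.from_tensor_slices(
    (X_train['text'].values, X_train['label'].values))
dataset_test = tf.data.Dataset.from_tensor_slices(
    (X_test['text'].values, X_test['label'].values))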
# Map each label string to an integer index
table = tf.lookup.StaticHashTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=tf.constant(['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']),
        values=tf.constant([0, 1, 2, 3, 4, 5]),
    ),
    default_value=tf.constant(-1),
    name="target_encoding"
)

@tf.function
def target(x):
    return table.lookup(x)
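A quick sanity check: looking up label strings returns their integer codes.

print(target(tf.constant(['Health', 'Science'])).numpy())  # -> [1 5]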

def show_batch(dataset, size=5):
    # Print a few (text, encoded label) pairs for inspection
    for batch, label in dataset.take(size):
        print(batch.numpy())
        print(target(label).numpy())

show_batch(dataset_test, 6)

Numerical representation of string labels

One-hot encoding for the labels:

def fetch(text, labels):
    # One-hot encode the integer label into a 6-dimensional vector
    return text, tf.one_hot(target(labels), 6)

train_data_f = dataset_train.map(fetch)
test_data_f = dataset_test.map(fetch)

train_data, train_labels = next(iter(train_data_f.batch(5)))
train_data, train_labels
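Before training, the mapped datasets should be shuffled and batched; the buffer and batch sizes below are assumptions, not from the original post.

train_data_f = train_data_f.shuffle(10000).batch(32)
test_data_f = test_data_f.batch(32)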

import tensorflow_hub as hub

embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
# A token-based pre-trained embedding layer from Google (NNLM).
# It tokenizes the input text itself, so no separate tokenizer is needed.
hub_layer = hub.KerasLayer(embedding, output_shape=[128], input_shape=[],
                           dtype=tf.string, trainable=True)
hub_layer(train_data[:1])
model = tf.keras.Sequential()
model.add(hub_layer)  # the pre-trained embedding layer from above

# Four hidden layers, each followed by 30% dropout
for units in [128, 128, 64, 32]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.3))

# Softmax output for the six classes
model.add(tf.keras.layers.Dense(6, activation='softmax'))

model.summary()
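The original post skips the compile step, but model.fit needs one. With one-hot labels and a softmax output, categorical cross-entropy is the natural loss; the Adam optimizer here is an assumption:

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])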

history = model.fit(train_data_f,
                    epochs=4,
                    validation_data=test_data_f,
                    verbose=1,
                    class_weight=weights)

Passing class_weight=weights scales each sample's contribution to the loss by its class weight, so under-represented labels count as much as frequent ones; in effect TensorFlow optimizes a weighted cross-entropy.

Training accuracy: 96%, test accuracy: 89%.
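The report below uses test_labels and y_pred, which are not defined earlier; one way to obtain them (a sketch, assuming the whole test set fits in memory as a single batch):

test_data, test_labels = next(iter(dataset_test.map(fetch).batch(len(X_test))))
y_pred = model.predict(test_data)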
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(test_labels.numpy().argmax(axis=1),
                            y_pred.argmax(axis=1)))

confusion_matrix(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1))
Confusion Matrix
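Since seaborn is already imported, the matrix can also be plotted as a heatmap (a sketch; the label order matches the integer encoding above):

labels = ['Politics', 'Health', 'Emotion', 'Financial', 'Sport', 'Science']
cm = confusion_matrix(test_labels.numpy().argmax(axis=1), y_pred.argmax(axis=1))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()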


