Fine Tune a BERT Model with TensorFlow


There are two different ways to use pre-trained models in TensorFlow: TensorFlow Hub (whose models are now hosted on Kaggle) and the tensorflow_models (TensorFlow Model Garden) library. In this post, I outline how to load and fine tune models using TensorFlow Hub (via Kaggle).

!pip install "tensorflow==2.16.1"

# TensorFlow Model Garden package
!pip install tf-models-official

# Include the latest changes for the TensorFlow Model Garden package
!pip install tf-models-nightly

# Update Datasets
!pip install --upgrade tensorflow tensorflow-datasets

import os
import json
import math

import numpy as np
import pandas as pd

# https://plotly.com/python/line-charts/
import plotly.express as px

import tensorflow as tf

# Tensorflow Datasets
import tensorflow_datasets as tfds

# Method 0
import tensorflow_hub as hub

# Method 1
import tensorflow_models as tfm


# Load Data
# Split: https://www.tensorflow.org/datasets/splits
BATCH_SIZE=32
glue, info = tfds.load('glue/mrpc',
                       split=['train', 'validation', 'test'],
                       with_info=True,
                       batch_size=BATCH_SIZE,
                       data_dir='/kaggle/working')
# View the loaded dataset
glue
# View information about the dataset
info
# View a sample of the data
list(glue[0].batch(1).take(1).as_numpy_iterator())[0]
# Define Tokenizer using the pre-prepared vocabulary

# Download the vocabulary file
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)

tokenizer = tfm.nlp.layers.FastWordpieceBertTokenizer(
    vocab_file=os.path.join(gs_folder_bert, "vocab.txt"),
    lower_case=True)

The vocab.txt file contains the wordpiece vocabulary used by the pre-trained model: whole words, subwords, characters from different alphabets, and special keywords like [PAD], [CLS], [SEP], and [MASK].

[Image: text from the dataset and characters that appear in vocab.txt]
[Image: special keywords used for training BERT, included in vocab.txt]
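
To get a feel for what the tokenizer works with, the vocabulary file can also be read directly with tf.io.gfile. A minimal sketch (not part of the original notebook), assuming the gs_folder_bert bucket defined above is readable from the notebook:

# Read vocab.txt and inspect a few entries (the uncased BERT vocabulary has ~30,522 tokens)
with tf.io.gfile.GFile(os.path.join(gs_folder_bert, "vocab.txt"), "r") as f:
    vocab = f.read().splitlines()

print('vocab size: ', len(vocab))
print('first tokens: ', vocab[:4])          # special/reserved tokens such as [PAD]
print('sample wordpieces: ', vocab[2000:2005])
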
# Define BertPackInputs: a layer that packs the tokenized sentences into the 3 input matrices BERT expects

# Put the dataset data in the correct format for the BERT model
max_seq_length = 128

# Gives statistics about vocab.txt
special_tokens_dict = tokenizer.get_special_tokens_dict()

packer = tfm.nlp.layers.BertPackInputs(
    seq_length=max_seq_length,
    special_tokens_dict=special_tokens_dict)

class BertInputProcessor(tf.keras.layers.Layer):
    def __init__(self, tokenizer, packer):
        super().__init__()
        self.tokenizer = tokenizer
        self.packer = packer

    def call(self, inputs):
        tok1 = self.tokenizer(inputs['sentence1'])
        tok2 = self.tokenizer(inputs['sentence2'])
        packed = self.packer([tok1, tok2])

        if 'label' in inputs:
            return packed, inputs['label']
        else:
            return packed

# Pack the data in the correct format
bert_inputs_processor = BertInputProcessor(tokenizer, packer)
bert_inputs_processor

# Define the train, val, test datasets

glue_train = glue[0].map(bert_inputs_processor).prefetch(1)
glue_validation = glue[1].map(bert_inputs_processor).prefetch(1)
glue_test = glue[2].map(bert_inputs_processor).prefetch(1)

# View data prepared in the way that BERT requires
example_inputs, example_labels = next(iter(glue_train))
example_inputs
example_labels

Usage 0: load model and input text

The Kaggle TensorFlow BERT models can be found at [4].

PREPROCESS_MODEL = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
preprocess = hub.load(PREPROCESS_MODEL)
print('preprocess :', preprocess)

BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
bert = hub.load(BERT_MODEL)
print('bert :', bert)

# Test the loaded model

# Convert the binary text to string sentences
# https://www.tensorflow.org/hub/tutorials/bert_experts
sentences = list(glue[0].batch(1).take(1).as_numpy_iterator())[0]['sentence1']
sentences = np.ravel(sentences)
sentences = [i.decode("utf-8") for i in sentences]
print('sentences: ', sentences)

inputs = preprocess(sentences)
print('inputs.keys(): ', inputs.keys())

The preprocess model takes a text array as an input, and outputs a dictionary with three keys (input_word_ids, input_mask, input_type_ids).
inputs
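
A quick way to confirm this structure is to print the shape and dtype of each entry; a small sketch using the inputs dictionary produced above (all three tensors should be (num_sentences, 128) for this preprocessing model):

# Each entry is an int32 tensor of shape (num_sentences, seq_length); seq_length defaults to 128
for key, tensor in inputs.items():
    print(key, tensor.shape, tensor.dtype)
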
# Insert the inputs into the BERT model, to obtain several output formats
outputs = bert(inputs)

print('Output keys of model:', list(outputs.keys()))
# ['default', 'sequence_output', 'encoder_outputs', 'pooled_output']

# Dictionary of tensors w/ keys
# outputs

print("outputs['sequence_output'].shape: ", outputs['sequence_output'].shape)
print("outputs['encoder_outputs'].shape: ", outputs['encoder_outputs'][0].shape)
print("outputs['pooled_output'].shape: ", outputs['pooled_output'].shape)

The three main outputs are:

  1. sequence_output: represents each input token in context; a contextual embedding for every token in the input. shape=(32, 128, 128)
  2. encoder_outputs: the intermediate activations of the Transformer blocks; the last entry equals sequence_output
  3. pooled_output: represents each input as a whole; the embedding of the sentence. shape=(32, 128)
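
A small sanity check, sketched below under the assumption that this TF Hub BERT follows the standard output layout, shows how these tensors relate: the last entry of encoder_outputs matches sequence_output, and pooled_output is one fixed-size vector per sentence.

# The final Transformer block's activations should equal sequence_output (difference ~0)
diff = tf.reduce_max(tf.abs(outputs['encoder_outputs'][-1] - outputs['sequence_output']))
print('max |encoder_outputs[-1] - sequence_output|: ', diff.numpy())

# pooled_output: one 128-dimensional vector per input sentence (H=128 for this model)
print('pooled_output shape: ', outputs['pooled_output'].shape)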

Usage 1: using Sequential notation WITHOUT Fine tuning — only give text sentences

model = tf.keras.Sequential()

PREPROCESS_MODEL = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
keras_hub_layer = hub.KerasLayer(PREPROCESS_MODEL,
                                 # input_shape=(BATCH_SIZE, ),
                                 trainable=True,
                                 dtype=tf.string)

# Wrap the KerasLayer object inside a Lambda layer
lambda_layer = tf.keras.layers.Lambda(lambda x: keras_hub_layer(x))
model.add(lambda_layer)

BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
keras_hub_layer2 = hub.KerasLayer(BERT_MODEL,
                                  # input_shape=(BATCH_SIZE, 128),
                                  # output_shape=(BATCH_SIZE, 128),
                                  trainable=True,
                                  dtype=tf.string)

# Outputs all 13 outputs
# lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x))
# out.keys() = dict_keys(['encoder_outputs', 'sequence_output', 'pooled_output', 'default'])

lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x)["pooled_output"])
model.add(lambda_layer2)

# select x = x["pooled_output"] # [batch_size, 128]

model.add(tf.keras.layers.Dropout(0.1))

model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Test model WITHOUT Fine Tuning: on a dataset converted to string
sentences_tensor = tf.convert_to_tensor(sentences)
model(sentences_tensor)
The desired output indicates whether the two GLUE sentences (sentence1 and sentence2) are 'equivalent' (score > 0.5) or 'not equivalent' (score < 0.5).

Note that these results do not mean anything yet, because the model has not been fine tuned against the labels.
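
For illustration only (the model is still untrained at this point), the sigmoid scores can be mapped to labels with a 0.5 threshold; a minimal sketch:

# Hypothetical thresholding: 1 = 'equivalent', 0 = 'not equivalent'
probs = model(sentences_tensor)
predicted_labels = tf.cast(probs > 0.5, tf.int32)
print('predicted labels: ', tf.squeeze(predicted_labels).numpy())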

Usage 2: using Sequential notation WITH Fine tuning — data is already pre-treated, give BERT pre-treated inputs

bert_model = tf.keras.Sequential()

BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
keras_hub_layer2 = hub.KerasLayer(BERT_MODEL,
                                  # input_shape=(BATCH_SIZE, 128),
                                  # output_shape=(BATCH_SIZE, 128),
                                  trainable=True,
                                  dtype=tf.string)

# Outputs all 13 outputs
# lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x))
# out.keys() = dict_keys(['encoder_outputs', 'sequence_output', 'pooled_output', 'default'])

lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x)["pooled_output"])
bert_model.add(lambda_layer2)

# select x = x["pooled_output"] # [batch_size, 128]

bert_model.add(tf.keras.layers.Dropout(0.2))

bert_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

EPOCHS = 30

train_data_size = len(glue[0])  # NOTE: glue[0] is already batched, so this is the number of batches; see info.splits['train'].num_examples for the example count
steps_per_epoch = int(train_data_size / BATCH_SIZE)
num_train_steps = steps_per_epoch * EPOCHS
initial_learning_rate=2e-5
end_learning_rate=0

print('train_data_size: ', train_data_size)
print('steps_per_epoch: ', steps_per_epoch)
print('num_train_steps: ', num_train_steps)
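
One caveat: because tfds.load was called with batch_size=BATCH_SIZE, glue[0] yields batches, so len(glue[0]) counts batches rather than individual examples. A sketch of the alternative mentioned in the comment above, using info:

# Number of individual training examples (the MRPC train split has 3,668 examples)
num_examples = info.splits['train'].num_examples
steps_per_epoch_alt = num_examples // BATCH_SIZE
print('num_examples: ', num_examples)
print('steps_per_epoch_alt: ', steps_per_epoch_alt)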

# Learning Rate Schedule for Fine Tuning
def exponential_lr(epoch,
                   start_lr=initial_learning_rate,
                   min_lr=0.00001, max_lr=0.001,
                   rampup_epochs=5, sustain_epochs=0,
                   exp_decay=0.8):

    def lr(epoch, start_lr, min_lr, max_lr, rampup_epochs, sustain_epochs, exp_decay):
        lr = ((max_lr - min_lr) *
              exp_decay**(epoch - rampup_epochs - sustain_epochs) +
              min_lr)
        return lr

    return lr(epoch,
              start_lr,
              min_lr,
              max_lr,
              rampup_epochs,
              sustain_epochs,
              exp_decay)

# The learning rate schedule was tuned so that the loss would decay correctly
lr_callback = tf.keras.callbacks.LearningRateScheduler(exponential_lr, verbose=True)

rng = [i for i in range(EPOCHS)]
y = [exponential_lr(x) for x in rng]

df = pd.DataFrame({'step': rng, 'rate': y})
fig = px.line(df, x="step", y="rate")
fig.show()
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
# OR
# opt = 'adam'

bert_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
bert_model.summary()

patience = 10
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=patience, mode='min')

history = bert_model.fit(
    glue_train,
    validation_data=glue_validation,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    verbose=1,
    # callbacks=[lr_callback, early_stopping],
    callbacks=[lr_callback]
)

# lr = 0.00001, accuracy= 0.4062
# learning_rate callback, accuracy=

Without the learning rate callback, the accuracy remains around chance level (0.4 to 0.5). With the learning rate callback, the accuracy steadily increases to about 0.64, but the loss and accuracy values oscillate between good and bad performance, indicating that the learning rate is sometimes too large.
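
One alternative worth trying (a sketch, not part of the original run) is to build the decay directly into the optimizer with a Keras schedule, reusing the initial_learning_rate, end_learning_rate, and num_train_steps values defined earlier:

# Linear decay from 2e-5 down to 0 over the full number of training steps
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=num_train_steps,
    end_learning_rate=end_learning_rate,
    power=1.0)

opt_decay = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# bert_model.compile(optimizer=opt_decay, loss='binary_crossentropy', metrics=['accuracy'])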

Usage 3: using custom Subclassing notation WITHOUT Fine tuning

class preprocess_bert_model(tf.keras.Model):
    def __init__(self):
        super(preprocess_bert_model, self).__init__()

        PREPROCESS_MODEL = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
        preprocess = hub.load(PREPROCESS_MODEL)
        self.x_pp = hub.KerasLayer(preprocess, trainable=False)

        BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
        bert = hub.load(BERT_MODEL)
        self.x_bert = hub.KerasLayer(bert, trainable=True)

        self.dropout = tf.keras.layers.Dropout(0.1)

        self.dense = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.x_pp(inputs)
        x = self.x_bert(x)
        x = x["pooled_output"]  # [batch_size, 128]
        # sequence_output = x["sequence_output"]  # [batch_size, seq_length, 128]

        x = self.dropout(x)
        x = self.dense(x)

        return x

embedding_model = preprocess_bert_model()
# Test model WITHOUT Fine Tuning: on a dataset converted to string
sentences_tensor = tf.convert_to_tensor(sentences)
embedding_model(sentences_tensor)

Usage 4: using custom Subclassing notation WITH Fine tuning

class bert_model(tf.keras.Model):
    def __init__(self):
        super(bert_model, self).__init__()

        BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
        bert = hub.load(BERT_MODEL)
        self.x_bert = hub.KerasLayer(bert, trainable=True)

        self.dropout = tf.keras.layers.Dropout(0.1)

        self.dense = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.x_bert(inputs)
        x = x["pooled_output"]  # [batch_size, 128]
        # sequence_output = x["sequence_output"]  # [batch_size, seq_length, 128]

        x = self.dropout(x)
        x = self.dense(x)

        return x

bert_classifier = bert_model()

bert_classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary (note: for a subclassed model, summary() only works after the model has been built, e.g. after calling it on a batch of data)
bert_classifier.summary()

patience = 10
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=patience, mode='min')

history = bert_classifier.fit(
    glue_train,
    validation_data=glue_validation,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    verbose=1,
    # callbacks=[lr_callback, early_stopping],
    callbacks=[lr_callback]
)
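
After training, the fine-tuned classifier can be checked against the packed validation split; a minimal sketch (not shown in the original post):

# Evaluate on the packed validation data and inspect a few predictions
val_loss, val_accuracy = bert_classifier.evaluate(glue_validation, verbose=0)
print('validation accuracy: ', val_accuracy)

example_inputs, example_labels = next(iter(glue_validation))
probs = bert_classifier(example_inputs)
print('predicted labels: ', tf.squeeze(tf.cast(probs > 0.5, tf.int32)).numpy()[:10])
print('true labels:      ', example_labels.numpy()[:10])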

Happy Practicing! 👋

  1. Direct downloads of trained model parameters: https://github.com/google-research/bert?tab=readme-ov-file
  2. Code tutorials for training the BERT model: https://github.com/tensorflow/models/tree/master/official/common
  3. Fine Tune GLUE tutorial: https://www.tensorflow.org/tfmodels/nlp/fine_tune_bert
  4. Model URL for BERT on Kaggle: https://www.kaggle.com/models/tensorflow/bert


