There are two different ways to use pre-trained models in TensorFlow: TensorFlow Hub (via Kaggle) and the tensorflow_models library. In this post, I outline how to load models using TensorFlow Hub (via Kaggle).
!pip install "tensorflow==2.16.1"# TensorFlow Model Garden package
!pip install tf-models-official
# Include the latest changes for the TensorFlow Model Garden package
!pip install tf-models-nightly
# Update Datasets
!pip install --upgrade tensorflow tensorflow-datasets
import os
import json
import math
import numpy as np
import pandas as pd
# https://plotly.com/python/line-charts/
import plotly.express as px
import tensorflow as tf
# Tensorflow Datasets
import tensorflow_datasets as tfds
# Method 0
import tensorflow_hub as hub
# Method 1
import tensorflow_models as tfm
# Load Data
# Split: https://www.tensorflow.org/datasets/splits
BATCH_SIZE=32
glue, info = tfds.load('glue/mrpc',
split=['train','validation','test'],
with_info=True,
batch_size=BATCH_SIZE,
data_dir='/kaggle/working')
# View the loaded dataset
glue
# View information about the dataset
info
# View the data
list(glue[0].batch(1).take(1).as_numpy_iterator())[0]
# Define the tokenizer using the pre-prepared vocabulary
# Download the vocabulary file
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)
tokenizer = tfm.nlp.layers.FastWordpieceBertTokenizer(vocab_file=os.path.join(gs_folder_bert, "vocab.txt"), lower_case=True)
The vocab.txt file contains the WordPiece vocabulary used by the tokenizer: whole words plus additional tokens (subwords, characters from different alphabets, and special tokens such as [PAD]).
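As a quick sanity check (a sketch I'm adding here, not part of the original notebook), you can read vocab.txt directly and run the tokenizer on a made-up sentence to see the WordPiece ids it produces:

# Sanity check (illustrative): inspect the vocabulary and tokenize a sample sentence
vocab_path = os.path.join(gs_folder_bert, "vocab.txt")
with tf.io.gfile.GFile(vocab_path) as f:
    vocab = f.read().splitlines()
print("vocab size:", len(vocab))    # roughly 30,000 entries for this checkpoint
print("first entries:", vocab[:5])  # special tokens such as [PAD] sit near the top
# The tokenizer maps a batch of strings to a RaggedTensor of WordPiece ids
sample = tf.constant(["Hello TensorFlow, this is a test sentence."])
print("token ids:", tokenizer(sample))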
# Define BertPackInputs: a layer that creates the 3 matrices BERT expects
# Put the dataset in the correct format for the BERT model
max_seq_length = 128
# Get the ids of the special tokens (e.g. [CLS], [SEP], [PAD], [MASK]) and the vocab size
special_tokens_dict = tokenizer.get_special_tokens_dict()
packer = tfm.nlp.layers.BertPackInputs(
seq_length=max_seq_length, special_tokens_dict=special_tokens_dict)
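To see what the packer produces, here is a minimal sketch (the sentence pair is made up for illustration) that tokenizes two sentences and packs them into the three tensors BERT expects:

# Illustrative example: pack one sentence pair and inspect the three output tensors
s1 = tokenizer(tf.constant(["The company said it expects higher profits."]))
s2 = tokenizer(tf.constant(["The firm forecast an increase in profit."]))
packed = packer([s1, s2])
print(packed.keys())                   # dict_keys(['input_word_ids', 'input_mask', 'input_type_ids'])
print(packed['input_word_ids'].shape)  # (1, 128): token ids padded/truncated to max_seq_length
print(packed['input_mask'].shape)      # (1, 128): 1 for real tokens, 0 for padding
print(packed['input_type_ids'].shape)  # (1, 128): 0 for sentence1 tokens, 1 for sentence2 tokens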
class BertInputProcessor(tf.keras.layers.Layer):
    def __init__(self, tokenizer, packer):
        super().__init__()
        self.tokenizer = tokenizer
        self.packer = packer

    def call(self, inputs):
        tok1 = self.tokenizer(inputs['sentence1'])
        tok2 = self.tokenizer(inputs['sentence2'])
        packed = self.packer([tok1, tok2])
        if 'label' in inputs:
            return packed, inputs['label']
        else:
            return packed
# Pack the data in the correct format
bert_inputs_processor = BertInputProcessor(tokenizer, packer)
bert_inputs_processor
# Define the train, val, test datasets
glue_train = glue[0].map(bert_inputs_processor).prefetch(1)
glue_validation = glue[1].map(bert_inputs_processor).prefetch(1)
glue_test = glue[2].map(bert_inputs_processor).prefetch(1)
# View data prepared in the way that BERT requires
example_inputs, example_labels = next(iter(glue_train))
example_inputs
example_labels
Usage 0: load model and input text
The TensorFlow BERT model on Kaggle can be found at [4].
PREPROCESS_MODEL = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
preprocess = hub.load(PREPROCESS_MODEL)
print('preprocess :', preprocess)
BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
bert = hub.load(BERT_MODEL)
print('bert :', bert)
# Test the loaded model
# Convert the binary text to string sentences
# https://www.tensorflow.org/hub/tutorials/bert_experts
sentences = list(glue[0].batch(1).take(1).as_numpy_iterator())[0]['sentence1']
sentences = np.ravel(sentences)
sentences = [i.decode("utf-8") for i in sentences]
print('sentences: ', sentences)
inputs = preprocess(sentences)
print('inputs.keys(): ', inputs.keys())
inputs
# Insert the inputs into the BERT model, to obtain several output formats
outputs = bert(inputs)
print('Output keys of model:', list(outputs.keys()))
# ['default', 'sequence_output', 'encoder_outputs', 'pooled_output']
# Dictionary of tensors w/ keys
# outputs
print("outputs['sequence_output'].shape: ", outputs['sequence_output'].shape)
print("outputs['encoder_outputs'].shape: ", outputs['encoder_outputs'][0].shape)
print("outputs['pooled_output'].shape: ", outputs['pooled_output'].shape)
The three main outputs are:
- sequence_output: represents each input token in context; a contextual embedding for every token in the input. shape=(32, 128, 128)
- encoder_outputs: a list with the intermediate activations of each Transformer block; the last entry equals sequence_output
- pooled_output: represents each input as a whole; the embedding of the sentence
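A small check (a sketch I'm adding, using the outputs variable computed above) makes these relationships concrete:

# Illustrative check of how the three outputs relate to each other
print(len(outputs['encoder_outputs']))  # one tensor per Transformer block (10 for this L-10 model)
# The last encoder output is the same tensor as sequence_output
diff = tf.reduce_max(tf.abs(outputs['encoder_outputs'][-1] - outputs['sequence_output']))
print(diff.numpy())                     # ~0.0
# pooled_output holds one 128-dimensional embedding per input sentence
print(outputs['pooled_output'].shape)   # (32, 128) for the batch above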
Usage 1: using Sequential notation WITHOUT Fine tuning — only give text sentences
model = tf.keras.Sequential()
PREPROCESS_MODEL = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
keras_hub_layer = hub.KerasLayer(PREPROCESS_MODEL,
# input_shape=(BATCH_SIZE, ),
trainable=True,
dtype=tf.string)
# Wrap the KerasLayer object inside a Lambda layer
lambda_layer = tf.keras.layers.Lambda(lambda x: keras_hub_layer(x))
model.add(lambda_layer)
BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
keras_hub_layer2 = hub.KerasLayer(BERT_MODEL,
# input_shape=(BATCH_SIZE, 128),
# output_shape=(BATCH_SIZE, 128),
trainable=True,
dtype=tf.string)
# Outputs all 13 outputs
# lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x))
# out.keys() = dict_keys(['encoder_outputs', 'sequence_output', 'pooled_output', 'default'])
lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x)["pooled_output"])
model.add(lambda_layer2)
# select x = x["pooled_output"] # [batch_size, 128]
model.add(tf.keras.layers.Dropout(0.1))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# Test model WITHOUT Fine Tuning: on a dataset converted to string
sentences_tensor = tf.convert_to_tensor(sentences)
model(sentences_tensor)
Note that the results do not mean anything yet, because the model has not been fine-tuned on labelled data.
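For completeness, here is how the sigmoid outputs would be turned into hard MRPC predictions once the model is trained (a sketch; the 0.5 threshold is my assumption, not something tuned in this post):

# Illustrative: convert sigmoid probabilities to 0/1 labels with a 0.5 threshold
probs = model(sentences_tensor)         # shape (num_sentences, 1), values in [0, 1]
preds = tf.cast(probs > 0.5, tf.int32)
print(preds.numpy().ravel())            # meaningless here, since the model has not been fine-tuned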
Usage 2: using Sequential notation WITH Fine tuning — the data is already preprocessed, so give BERT the packed inputs
bert_model = tf.keras.Sequential()
BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
keras_hub_layer2 = hub.KerasLayer(BERT_MODEL,
# input_shape=(BATCH_SIZE, 128),
# output_shape=(BATCH_SIZE, 128),
trainable=True,
dtype=tf.string)
# Outputs all 13 outputs
# lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x))
# out.keys() = dict_keys(['encoder_outputs', 'sequence_output', 'pooled_output', 'default'])
lambda_layer2 = tf.keras.layers.Lambda(lambda x: keras_hub_layer2(x)["pooled_output"])
bert_model.add(lambda_layer2)
# select x = x["pooled_output"] # [batch_size, 128]
bert_model.add(tf.keras.layers.Dropout(0.2))
bert_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
EPOCHS = 30
# len(glue[0]) counts batches (the dataset was loaded with batch_size), so use the split info for the number of examples
train_data_size = info.splits['train'].num_examples
steps_per_epoch = int(train_data_size / BATCH_SIZE)
num_train_steps = steps_per_epoch * EPOCHS
initial_learning_rate=2e-5
end_learning_rate=0
print('train_data_size: ', train_data_size)
print('steps_per_epoch: ', steps_per_epoch)
print('num_train_steps: ', num_train_steps)
# Learning Rate Schedule for Fine Tuning
def exponential_lr(epoch,
                   start_lr=initial_learning_rate,
                   min_lr=0.00001, max_lr=0.001,
                   rampup_epochs=5, sustain_epochs=0,
                   exp_decay=0.8):

    def lr(epoch, start_lr, min_lr, max_lr, rampup_epochs, sustain_epochs, exp_decay):
        lr = ((max_lr - min_lr) *
              exp_decay**(epoch - rampup_epochs - sustain_epochs) +
              min_lr)
        return lr

    return lr(epoch,
              start_lr,
              min_lr,
              max_lr,
              rampup_epochs,
              sustain_epochs,
              exp_decay)
# they tuned the learning rate so that the loss would decay correctly
lr_callback = tf.keras.callbacks.LearningRateScheduler(exponential_lr, verbose=True)
rng = [i for i in range(EPOCHS)]
y = [exponential_lr(x) for x in rng]
df = pd.DataFrame({'step': rng, 'rate': y})
fig = px.line(df, x="step", y="rate")
fig.show()
opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
# OR
# opt = 'adam'
bert_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
# Print model summary
bert_model.summary()
patience = 10
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=patience, mode='min')
history = bert_model.fit(
glue_train,
validation_data=glue_validation,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
steps_per_epoch=steps_per_epoch,
verbose=1,
# callbacks=[lr_callback, early_stopping],
callbacks=[lr_callback]
)
# lr = 0.00001, accuracy= 0.4062
# learning_rate callback, accuracy=
Without the learning rate callback, the accuracy remains around chance level (0.4-0.5). With the learning rate callback, the accuracy increases steadily to about 0.64, but the loss and accuracy oscillate between good and bad performance, indicating that the learning rate is sometimes too large.
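One way to tame the oscillation (a sketch, not something run in this post) is to replace the epoch-level callback with a per-step decaying schedule built from the initial_learning_rate, end_learning_rate, and num_train_steps already computed above:

# Illustrative alternative: a linearly decaying learning rate applied at every training step
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,  # 2e-5, defined above
    decay_steps=num_train_steps,
    end_learning_rate=end_learning_rate)          # 0, defined above
opt_decay = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# bert_model.compile(optimizer=opt_decay, loss='binary_crossentropy', metrics=['accuracy'])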
Usage 3: using custom Subclassing notation WITHOUT Fine tuning
class preprocess_bert_model(tf.keras.Model):
    def __init__(self):
        super(preprocess_bert_model, self).__init__()
        PREPROCESS_MODEL = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
        preprocess = hub.load(PREPROCESS_MODEL)
        self.x_pp = hub.KerasLayer(preprocess, trainable=False)
        BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
        bert = hub.load(BERT_MODEL)
        self.x_bert = hub.KerasLayer(bert, trainable=True)
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.dense = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.x_pp(inputs)
        x = self.x_bert(x)
        x = x["pooled_output"]  # [batch_size, 128]
        # sequence_output = x["sequence_output"]  # [batch_size, seq_length, 128]
        x = self.dropout(x)
        x = self.dense(x)
        return x
embedding_model = preprocess_bert_model()
# Test model WITHOUT Fine Tuning: on a dataset converted to string
sentences_tensor = tf.convert_to_tensor(sentences)
embedding_model(sentences_tensor)
Usage 4: using custom Subclassing notation WITH Fine tuning
class bert_model(tf.keras.Model):
    def __init__(self):
        super(bert_model, self).__init__()
        BERT_MODEL = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-10-h-128-a-2/versions/2"
        bert = hub.load(BERT_MODEL)
        self.x_bert = hub.KerasLayer(bert, trainable=True)
        self.dropout = tf.keras.layers.Dropout(0.1)
        self.dense = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.x_bert(inputs)
        x = x["pooled_output"]  # [batch_size, 128]
        # sequence_output = x["sequence_output"]  # [batch_size, seq_length, 128]
        x = self.dropout(x)
        x = self.dense(x)
        return x
bert_classifier = bert_model()
bert_classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summary() only works once the model has been built (e.g. after calling it on a batch of inputs)
bert_classifier.summary()
patience = 10
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=patience, mode='min')
history = bert_classifier.fit(
glue_train,
validation_data=glue_validation,
batch_size=BATCH_SIZE,
epochs=EPOCHS,
steps_per_epoch=steps_per_epoch,
verbose=1,
# callbacks=[lr_callback, early_stopping],
callbacks=[lr_callback]
)
Happy Practicing! 👋
🎁 Donate: support the blog! | 💻 GitHub | 🔔 Subscribe
1. Direct downloads of trained model parameters: https://github.com/google-research/bert?tab=readme-ov-file
2. Code tutorials on how to train the BERT model: https://github.com/tensorflow/models/tree/master/official/common
3. Fine-tune BERT on GLUE tutorial: https://www.tensorflow.org/tfmodels/nlp/fine_tune_bert
4. Model URL for BERT on Kaggle: https://www.kaggle.com/models/tensorflow/bert