keras-team / tf-keras

The TensorFlow-specific implementation of the Keras API, which was the default Keras from 2019 to 2023.
Apache License 2.0

Incompatible shapes issue when prediction output and actual output dimensions differ #491

Closed kkadu closed 2 years ago

kkadu commented 2 years ago

I am trying to create a multi-class classification model with Keras, and when I fit the model, the prediction output shapes differ from the target shapes, even though I am not explicitly defining any shapes in the code.

#######################################
### -------- Load libraries ------- ###
#Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
#Then import what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
import pandas as pd
from sklearn.model_selection import train_test_split

#Import data from csv
data = pd.read_csv('local_folder/datasets/training.csv')

#Remove a row if any of the selected columns is missing
data = data[['id', 'code1', 'code2', 'code3', 'code4', 'text']].dropna()

#Remove rows where the label is present only once (can't be split)
data = data.groupby('code1').filter(lambda x : len(x) > 1)
data = data.groupby('code2').filter(lambda x : len(x) > 1)
data = data.groupby('code3').filter(lambda x : len(x) > 1)
data = data.groupby('code4').filter(lambda x : len(x) > 1)

#Convert each label column to a categorical dtype (numeric codes are added below)
data['code1'] = pd.Categorical(data['code1'])
data['code2'] = pd.Categorical(data['code2'])
data['code3'] = pd.Categorical(data['code3'])
data['code4'] = pd.Categorical(data['code4'])

# Transform your output to numeric
data['code1_label'] = data['code1'].cat.codes
data['code2_label'] = data['code2'].cat.codes
data['code3_label'] = data['code3'].cat.codes
data['code4_label'] = data['code4'].cat.codes

#Only keep records whose label value occurs more than 5 times
data = data[data['code1'].map(data['code1'].value_counts()) > 5]
data = data[data['code2'].map(data['code2'].value_counts()) > 5]
data = data[data['code3'].map(data['code3'].value_counts()) > 5]
data = data[data['code4'].map(data['code4'].value_counts()) > 5]

# Split into train and test - stratify over label
data, data_test = train_test_split(data, test_size = 0.2, stratify = data[['code1_label']])

#######################################
### --------- Setup BERT ---------- ###

#Name of the BERT model to use
model_name = 'bert-base-uncased'

#Max length of tokens
max_length = 100

#Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

#Load BERT tokenizer
#tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name)
tokenizer.save_pretrained("./BERT-Topic_Trained_Model/models/tokenizer/")

#Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

#######################################
### ------- Build the model ------- ###

#TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

#Load the MainLayer
bert = transformer_model.layers[0]

#Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32') 
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
#inputs = {'input_ids': input_ids}

#Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

#Then build your model output
code1 = Dense(units=len(data.code1.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='code1')(pooled_output)
code2 = Dense(units=len(data.code2.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='code2')(pooled_output)
code3 = Dense(units=len(data.code3.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='code3')(pooled_output)
code4 = Dense(units=len(data.code4.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='code4')(pooled_output)
outputs = {'code1': code1, 'code2': code2, 'code3': code3, 'code4': code4}

#And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

#Take a look at the model
model.summary()

This is what the output looks like:

Model built in 1.570626974105835 seconds
Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
==================================================================================================
 attention_mask (InputLayer)    [(None, 100)]        0           []                               

 input_ids (InputLayer)         [(None, 100)]        0           []                               

 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['attention_mask[0][0]',         
                                thPoolingAndCrossAt               'input_ids[0][0]']              
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                               
                                 768),                                                            
                                 pooler_output=(Non                                               
                                e, 768),                                                          
                                 past_key_values=No                                               
                                ne, hidden_states=N                                               
                                one, attentions=Non                                               
                                e, cross_attentions                                               
                                =None)                                                            

 pooled_output (Dropout)        (None, 768)          0           ['bert[0][1]']                   

 code1 (Dense)                   (None, 57)           43833       ['pooled_output[0][0]']          

 code2 (Dense)                   (None, 100)          76900       ['pooled_output[0][0]']          

 code3 (Dense)                   (None, 149)          114581      ['pooled_output[0][0]']          

 code4 (Dense)                   (None, 265)          203785      ['pooled_output[0][0]']          

==================================================================================================
Total params: 109,921,339
Trainable params: 109,921,339
Non-trainable params: 0
__________________________________________________________________________________________________

As you can see, the output shape for the code4 head is (None, 265).
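(For context, a quick sketch of where the 265 comes from, assuming the categorical setup above: value_counts() on a pandas categorical column reports every declared category, even ones with zero rows in the current slice, so the Dense width tracks the full label set rather than only the labels actually present.)

import pandas as pd

#Illustrative only: 'd' is a declared category with no observations
s = pd.Series(pd.Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']))
print(s.value_counts())       #includes 'd' with count 0
print(len(s.value_counts()))  #4, not 3: this is what sizes the Dense heads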

#######################################
### ------- Train the model ------- ###

#Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

#Set loss and metrics
loss = {'code1': CategoricalCrossentropy(from_logits = True),
       'code2': CategoricalCrossentropy(from_logits = True),
       'code3': CategoricalCrossentropy(from_logits = True),
       'code4': CategoricalCrossentropy(from_logits = True)}
metric = {'code1': CategoricalAccuracy('accuracy'),
         'code2': CategoricalAccuracy('accuracy'),
         'code3': CategoricalAccuracy('accuracy'),
         'code4': CategoricalAccuracy('accuracy')}

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

# Ready output data for the model
y_code1 = to_categorical(data['code1_label'])
y_code2 = to_categorical(data['code2_label'])
y_code3 = to_categorical(data['code3_label'])
y_code4 = to_categorical(data['code4_label'])

# Tokenize the input (takes some time)
x = tokenizer(
    text=data['text'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Fit the model
history = model.fit(
    x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    #x={'input_ids': x['input_ids']},
    y={'code1': y_code1, 'code2': y_code2, 'code3': y_code3, 'code4': y_code4},
    validation_split=0.2,
    batch_size=128,
    epochs=5)

Error Message:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [23], in <cell line: 48>()
48 history = model.fit(
49    x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
50    #x={'input_ids': x['input_ids']},
51    y={'code1': y_code1, 'code2': y_code2, 'code3': y_code3, 'code4': y_code4},
52    validation_split=0.2,
53    batch_size=128,
54    epochs=5)

File /opt/conda/lib/python3.8/site-packages/keras/utils/traceback_utils.py:67, in filter_traceback.<locals>.error_handler(*args, **kwargs)
     65 except Exception as e:  # pylint: disable=broad-except
     66   filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67   raise e.with_traceback(filtered_tb) from None
     68 finally:
     69   del filtered_tb

File /tmp/__autograph_generated_filekacvt2lr.py:15, in outer_factory.<locals>.inner_factory.<locals>.tf__train_function(iterator)
     13 try:
     14     do_return = True
---> 15     retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
     16 except:
     17     do_return = False

ValueError: in user code:

    File "/opt/conda/lib/python3.8/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.8/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.8/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/opt/conda/lib/python3.8/site-packages/keras/engine/training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.8/site-packages/keras/engine/training.py", line 948, in compute_loss
        return self.compiled_loss(
    File "/opt/conda/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/conda/lib/python3.8/site-packages/keras/losses.py", line 139, in __call__
        losses = call_fn(y_true, y_pred)
    File "/opt/conda/lib/python3.8/site-packages/keras/losses.py", line 243, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/conda/lib/python3.8/site-packages/keras/losses.py", line 1787, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/opt/conda/lib/python3.8/site-packages/keras/backend.py", line 5119, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 263) and (None, 265) are incompatible

Although this error message points at code4 specifically from the shape perspective, I believe the same mismatch could happen with any of the 4 labels. Is there a clean way to handle this without hardcoding the shapes (if that is even possible), since the dataset size will differ over time?
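(For reference, a minimal sketch of one way to avoid the mismatch without hardcoding anything, assuming the label columns are pandas Categoricals as in the code above; the n_code4 name is illustrative. to_categorical infers its width from the largest code it actually sees, so a split that happens to miss the highest codes yields a narrower target than the Dense head. Deriving num_classes from the full category list keeps both sides in sync.)

#Sketch: derive the class count once from the categorical dtype
n_code4 = len(data['code4'].cat.categories)   #full label set, e.g. 265

#Use it for the one-hot targets...
y_code4 = to_categorical(data['code4_label'], num_classes=n_code4)

#...and for the matching Dense head, so output and target widths always agree:
#code4 = Dense(units=n_code4, ...)(pooled_output)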

sushreebarsa commented 2 years ago

@kkadu In order to expedite the troubleshooting process, please provide the dataset needed to reproduce the issue reported here. I tried to replicate the issue and ran into a different error. Thank you!
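(A self-contained sketch that reproduces the same width mismatch without any dataset, in case it helps; all names and values are illustrative.)

import pandas as pd
from tensorflow.keras.utils import to_categorical

#Five declared categories, but the last one is rare
labels = pd.Series(pd.Categorical(list('aaaabbbbccccdddd') + ['e']))
codes = labels.cat.codes

train_codes = codes[:16]                 #the lone 'e' (code 4) falls outside
y_train = to_categorical(train_codes)    #width inferred as 4
n_classes = len(labels.value_counts())   #a Dense head sized this way gets 5

print(y_train.shape[1], n_classes)       #4 5 -> "Shapes (None, 4) and (None, 5)"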

google-ml-butler[bot] commented 2 years ago

This issue has been automatically marked as stale because it has no recent activity. It will be closed if no further activity occurs. Thank you.

google-ml-butler[bot] commented 2 years ago

Closing as stale. Please reopen if you'd like to work on this further.
