davidtvs / pytorch-lr-finder

A learning rate range test implementation in PyTorch
MIT License

Obtaining ValueError on m-BERT even after using TrainDataLoaderIter #73

Closed ishandutta0098 closed 7 months ago

ishandutta0098 commented 3 years ago

I am training a multilingual-BERT model for a sentiment classification task. My torch dataset returns a dictionary. I tried to run lr_finder.range_test(....) both with and without TrainDataLoaderIter, but I get the same ValueError both times.

Torch Dataset

class JigsawDataset:
    def __init__(self, df, train_transforms = None):
        self.comment_text = df["comment_text"].values
        self.target = df["toxic"].values
        self.tokenizer = config.BERT_TOKENIZER
        self.max_len = config.MAX_LEN
        self.langs = df["lang"].values
        self.train_transforms = train_transforms

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, item):
        comment_text = str(self.comment_text[item])
        comment_text = " ".join(comment_text.split())
        lang = self.langs[item]

        if self.train_transforms:
            comment_text, _ = self.train_transforms(data=(comment_text, lang))['data']

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            truncation=True
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        data_loader_dict = {}
        data_loader_dict["ids"] = torch.tensor(ids, dtype=torch.long)
        data_loader_dict["mask"] = torch.tensor(mask, dtype=torch.long)
        data_loader_dict["token_type_ids"] = torch.tensor(token_type_ids, dtype=torch.long)
        data_loader_dict["targets"] = torch.tensor(self.target[item], dtype=torch.float)

        return data_loader_dict

Run Function

%%time

def run():

    class CustomTrainIter(TrainDataLoaderIter):
        def input_labels_from_batch(self, batch_data):
            return batch_data["ids"], batch_data["mask"], batch_data["token_type_ids"], batch_data["targets"]

    def loss_fn(outputs, targets):
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def train_fn(data_loader, model, optimizer, device,):

        model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)
        model.train()

        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

            loss = loss_fn(outputs, targets)

            if bi % 1000 == 0:
                print(f"bi={bi}, loss={loss}")

            accelerator.backward(loss)
            optimizer.step()

    def eval_fn(data_loader, model, device):
        model.eval()
        fin_targets = []
        fin_outputs = []

        with torch.no_grad():
            for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
                ids = d["ids"]
                token_type_ids = d["token_type_ids"]
                mask = d["mask"]
                targets = d["targets"]

                ids = ids.to(device, dtype=torch.long)
                token_type_ids = token_type_ids.to(device, dtype=torch.long)
                mask = mask.to(device, dtype=torch.long)
                targets = targets.to(device, dtype=torch.float)

                outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
                fin_targets.extend(targets.cpu().detach().numpy().tolist())
                fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        return fin_outputs, fin_targets

    df1 = pd.read_csv(
        "/workspace/data/jigsaw-multilingual/input/jigsaw-data/jigsaw-toxic-comment-train.csv", 
        usecols = ["comment_text", "toxic"]    
    )

    df1 = df1.head(1000)

    df2 = pd.read_csv(
        "/workspace/data/jigsaw-multilingual/input/jigsaw-data/jigsaw-unintended-bias-train.csv",
        usecols = ["comment_text", "toxic"]
    )

    df2 = df2.head(1000)

    df_train = pd.concat([df1, df2], axis = 0).reset_index(drop = True)
    df_train["comment_text"] = df_train["comment_text"].apply(clean_text)

    df_valid = pd.read_csv("/workspace/data/jigsaw-multilingual/input/jigsaw-data/Translated Datasets/jigsaw_miltilingual_valid_translated.csv")
    df_valid["comment_text"] = df_valid["translated"]
    df_valid.drop("translated", axis = 1, inplace = True)
    df_valid["comment_text"] = df_valid["comment_text"].apply(clean_text)

    nlp_transform = NLPTransform()

    df_train['lang'] = 'en'
    non_toxic_sentences = set()
    for comment_text in tqdm(df_train['comment_text'], total=df_train.shape[0]):
        non_toxic_sentences.update(nlp_transform.get_sentences(comment_text), 'en')

    transform = AddNonToxicSentencesTransform(non_toxic_sentences=list(non_toxic_sentences), p=1.0, sentence_range=(1,2))

    train_dataset = JigsawDataset(
       df =  df_train,
       train_transforms = get_train_transforms()
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, 
        batch_size=config.TRAIN_BATCH_SIZE, 
        num_workers=4
    )

    valid_dataset = JigsawDataset(
        df = df_valid,
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, 
        batch_size=config.VALID_BATCH_SIZE, 
        num_workers=1
    )

    device = torch.device(config.DEVICE)
    model = BERTModel()

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)

    criterion = nn.BCEWithLogitsLoss()
    lr_finder = LRFinder(
        model, 
        optimizer, 
        criterion, 
        device = config.DEVICE
    )

    custom_train_iter = CustomTrainIter(train_data_loader)

    lr_finder.range_test(
        custom_train_iter, 
        end_lr = 10, 
        num_iter = 100, 
        step_mode = "exp"
    )

    best_accuracy = 0
    for epoch in range(config.EPOCHS):

        print(f"----------EPOCH: {epoch}----------")
        train_fn(train_data_loader, model, optimizer, device)
        outputs, targets = eval_fn(valid_data_loader, model, device)
        targets = np.array(targets) >= 0.5
        accuracy = metrics.roc_auc_score(targets, outputs)
        print(f"----------ROC AUC Score = {accuracy}----------")
        print()
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy

if __name__ == "__main__": run()

Error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed exec> in <module>

<timed exec> in run()

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in range_test(self, train_loader, val_loader, start_lr, end_lr, num_iter, step_mode, smooth_f, diverge_th, accumulation_steps, non_blocking_transfer)
    318                 train_iter,
    319                 accumulation_steps,
--> 320                 non_blocking_transfer=non_blocking_transfer,
    321             )
    322             if val_loader:

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in _train_batch(self, train_iter, accumulation_steps, non_blocking_transfer)
    369         self.optimizer.zero_grad()
    370         for i in range(accumulation_steps):
--> 371             inputs, labels = next(train_iter)
    372             inputs, labels = self._move_to_device(
    373                 inputs, labels, non_blocking=non_blocking_transfer

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in __next__(self)
     57         try:
     58             batch = next(self._iterator)
---> 59             inputs, labels = self.inputs_labels_from_batch(batch)
     60         except StopIteration:
     61             if not self.auto_reset:

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in inputs_labels_from_batch(self, batch_data)
     34                 "Your batch type is not supported: {}. Please inherit from "
     35                 "`TrainDataLoaderIter` or `ValDataLoaderIter` and override the "
---> 36                 "`inputs_labels_from_batch` method.".format(type(batch_data))
     37             )
     38 

ValueError: Your batch type is not supported: <class 'dict'>. Please inherit from `TrainDataLoaderIter` or `ValDataLoaderIter` and override the `inputs_labels_from_batch` method.
NaleRaphael commented 3 years ago

Update_1: fixed the typo in the method name inputs_labels_from_batch (input should be plural here).
Update_2: added the missing invocation of super() in the subclass of nn.Module.

Hi @ishandutta0098,

You should return only 2 values from the method inputs_labels_from_batch(). You can try this snippet to see whether it works:

class CustomTrainIter(TrainDataLoaderIter):
    def inputs_labels_from_batch(self, batch_data):
        inputs = batch_data["ids"], batch_data["mask"], batch_data["token_type_ids"]
        labels = batch_data["targets"]
        return inputs, labels

And the following code shows why we have to format the output like that: https://github.com/davidtvs/pytorch-lr-finder/blob/acc5e7ee7711a460bf3e1cc5c5f05575ba1e1b4b/torch_lr_finder/lr_finder.py#L371-L378

  1. In L371, a data sample is retrieved from a TrainDataLoaderIter. The contents of inputs and labels are determined by the implementation of TrainDataLoaderIter.inputs_labels_from_batch().
  2. In L377, the model takes inputs as the input data of model.forward(). In your case, the signature of model.forward() should be something like this:
    class Model(nn.Module):
        def forward(self, ids=None, mask=None, token_type_ids=None):
            # ...

    Since the method invocation self.model(inputs) (in L377) takes only 1 variable, we have to pack all custom inputs into 1 value. That's why inputs is written as inputs = batch_data["ids"], batch_data["mask"], batch_data["token_type_ids"] in the snippet above.
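
To make that calling convention concrete, here is a minimal, self-contained sketch. DummyModel and the tensor shapes below are made up purely for illustration (they are not your model or part of this library); the point is only that a tuple packed as inputs arrives at forward() as one positional argument:

import torch
import torch.nn as nn

class DummyModel(nn.Module):
    # Hypothetical stand-in for the wrapped model; it only demonstrates
    # that the packed tuple arrives as a single positional argument.
    def forward(self, inputs):
        ids, mask, token_type_ids = inputs
        return ids.float().mean(dim=1, keepdim=True)

batch = {
    "ids": torch.ones(2, 8, dtype=torch.long),
    "mask": torch.ones(2, 8, dtype=torch.long),
    "token_type_ids": torch.zeros(2, 8, dtype=torch.long),
    "targets": torch.zeros(2),
}

# Same packing as in the snippet above: three tensors become one tuple.
inputs = batch["ids"], batch["mask"], batch["token_type_ids"]
labels = batch["targets"]

outputs = DummyModel()(inputs)  # mirrors `outputs = self.model(inputs)` in L377
loss = nn.BCEWithLogitsLoss()(outputs, labels.view(-1, 1))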


Besides, since your model.forward() takes only keyword arguments, you might want to guarantee the order of inputs. To achieve this, you can create a wrapper for your model, and use it in lr_finder.range_test():

class ModelWrapper(nn.Module):
    def __init__(self, model):
        super(ModelWrapper, self).__init__()
        self.model = model

    def forward(self, inputs):
        # You can organize the order of inputs here, or even handle more complicated forward logic for model.
        ids, mask, token_type_ids = inputs
        return self.model(ids=ids, mask=mask, token_type_ids=token_type_ids)

# ... code for utilizing lr_finder
my_model = Model(...)        # your original model
model_wrapper = ModelWrapper(my_model)

lr_finder = LRFinder(
    model_wrapper, 
    optimizer, 
    criterion, 
    device = config.DEVICE
)

If there is any further question, please feel free to let me know!

ishandutta0098 commented 3 years ago

Hi @NaleRaphael, I tried as you said but now I am getting two different errors.

Original Model

class BERTModel(nn.Module):
    def __init__(self):
        super(BERTModel, self).__init__()
        self.bert = transformers.BertModel.from_pretrained("bert-base-multilingual-uncased")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768 * 2, 1) # *2 since we have 2 pooling layers

    def forward(self, ids, mask, token_type_ids):
        o1, _ = self.bert(
            ids, 
            attention_mask=mask, 
            token_type_ids=token_type_ids
        )

        mean_pooling = torch.mean(o1, 1)
        max_pooling, _ = torch.max(o1, 1)
        cat = torch.cat((mean_pooling, max_pooling), 1)

        bo = self.bert_drop(cat)
        output = self.out(bo)
        return output

Case-1:

Model Wrapper

class ModelWrapper(nn.Module):
    def __init__(self, model):
        self.model = model

    def forward(self, inputs):
        ids, mask, token_type_ids = inputs

        return self.model(
            ids = ids, 
            mask = mask,
            token_type_ids = token_type_ids
        )

CustomTrainerIter

class CustomTrainIter(TrainDataLoaderIter):
    def input_labels_from_batch(self, batch_data):
        inputs = batch_data["ids"], batch_data["mask"], batch_data["token_type_ids"]
        labels = batch_data["targets"]
        return inputs, labels

lr_finder Implementation

criterion = nn.BCEWithLogitsLoss()
lr_finder = LRFinder(
    model_wrapper,
    optimizer,
    criterion,
    device = config.DEVICE
)

custom_train_iter = CustomTrainIter(train_data_loader)

lr_finder.range_test(
    custom_train_iter,
    end_lr = 10,
    num_iter = 100,
    step_mode = "exp"
)

lr_finder.plot()

Error

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<timed exec> in <module>

<timed exec> in run()

<ipython-input-64-162d67c07fe1> in __init__(self, model)
      1 class ModelWrapper(nn.Module):
      2     def __init__(self, model):
----> 3         self.model = model
      4 
      5     def forward(self, inputs):

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in __setattr__(self, name, value)
    803                 if modules is None:
    804                     raise AttributeError(
--> 805                         "cannot assign module before Module.__init__() call")
    806                 remove_from(self.__dict__, self._parameters, self._buffers, self._non_persistent_buffers_set)
    807                 modules[name] = value

AttributeError: cannot assign module before Module.__init__() call

Case-2:

Model Wrapper

class ModelWrapper(nn.Module):
    def __init__(self, model):
        super(ModelWrapper, self).__init__()
        self.model = model

    def forward(self, inputs):
        ids, mask, token_type_ids = inputs

        return self.model(
            ids = ids, 
            mask = mask,
            token_type_ids = token_type_ids
        )

CustomTrainerIter

class CustomTrainIter(TrainDataLoaderIter):
    def input_labels_from_batch(self, batch_data):
        inputs = batch_data["ids"], batch_data["mask"], batch_data["token_type_ids"]
        labels = batch_data["targets"]
        return inputs, labels

lr_finder Implementation

criterion = nn.BCEWithLogitsLoss()
lr_finder = LRFinder(
    model_wrapper,
    optimizer,
    criterion,
    device = config.DEVICE
)

custom_train_iter = CustomTrainIter(train_data_loader)

lr_finder.range_test(
    custom_train_iter,
    end_lr = 10,
    num_iter = 100,
    step_mode = "exp"
)

lr_finder.plot()

Error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed exec> in <module>

<timed exec> in run()

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in range_test(self, train_loader, val_loader, start_lr, end_lr, num_iter, step_mode, smooth_f, diverge_th, accumulation_steps, non_blocking_transfer)
    318                 train_iter,
    319                 accumulation_steps,
--> 320                 non_blocking_transfer=non_blocking_transfer,
    321             )
    322             if val_loader:

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in _train_batch(self, train_iter, accumulation_steps, non_blocking_transfer)
    369         self.optimizer.zero_grad()
    370         for i in range(accumulation_steps):
--> 371             inputs, labels = next(train_iter)
    372             inputs, labels = self._move_to_device(
    373                 inputs, labels, non_blocking=non_blocking_transfer

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in __next__(self)
     57         try:
     58             batch = next(self._iterator)
---> 59             inputs, labels = self.inputs_labels_from_batch(batch)
     60         except StopIteration:
     61             if not self.auto_reset:

/opt/conda/lib/python3.6/site-packages/torch_lr_finder/lr_finder.py in inputs_labels_from_batch(self, batch_data)
     34                 "Your batch type is not supported: {}. Please inherit from "
     35                 "`TrainDataLoaderIter` or `ValDataLoaderIter` and override the "
---> 36                 "`inputs_labels_from_batch` method.".format(type(batch_data))
     37             )
     38 

ValueError: Your batch type is not supported: <class 'dict'>. Please inherit from `TrainDataLoaderIter` or `ValDataLoaderIter` and override the `inputs_labels_from_batch` method.
NaleRaphael commented 3 years ago

Hi, @ishandutta0098

Sorry for the late response, things are busy here.

That's my fault, there were some typos in my previous comment, and the error raised in your Case-1 indicates the missing invocation of super(). The model wrapper should look like this:

class ModelWrapper(nn.Module):
    def __init__(self, model):
        super(ModelWrapper, self).__init__()
        self.model = model

    def forward(self, inputs):
        ids, mask, token_type_ids = inputs
        return self.model(ids=ids, mask=mask, token_type_ids=token_type_ids)

And the other typo is in the method name input_labels_from_batch: it should be inputs_labels_from_batch instead (input should be plural).
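
For reference, the corrected iterator (the same snippet as before, just with the plural method name) would be:

class CustomTrainIter(TrainDataLoaderIter):
    def inputs_labels_from_batch(self, batch_data):  # "inputs" is plural here
        inputs = batch_data["ids"], batch_data["mask"], batch_data["token_type_ids"]
        labels = batch_data["targets"]
        return inputs, labels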

I'll update that comment in case anyone copies the incorrect code. Hope it works this time, and please do let me know if there is any further problem.

davidtvs commented 7 months ago

Closing due to inactivity