huggingface / accelerate

🚀 A simple way to launch, train, and use PyTorch models on almost any device and distributed configuration, automatic mixed precision (including fp8), and easy-to-configure FSDP and DeepSpeed support
https://huggingface.co/docs/accelerate

A Question about finetuning models using adapters with FSDP with accelerate #2835

Closed · Abhrant closed this issue 4 weeks ago

Abhrant commented 4 weeks ago

I am trying to use a PEFT method, SSF (https://arxiv.org/pdf/2210.08823), to finetune large models using FSDP. This is my code for creating the new layers:

    import warnings

    import torch
    import torch.nn as nn


    class SSFLayer:
        def __init__(self, dim) -> None:
            # create the SSF parameters directly on the weight's device
            # (note: torch.ones/zeros use torch's default dtype, fp32, unless a dtype is passed)
            self.ssf_scale = nn.Parameter(torch.ones(dim, device=self.weight.device))
            self.ssf_shift = nn.Parameter(torch.zeros(dim, device=self.weight.device))
            self.merged = False
            nn.init.normal_(self.ssf_scale, mean=1, std=0.02)
            nn.init.normal_(self.ssf_shift, std=0.02)

        def ssf_ada(self, x):
            self.to(x.device)
            if x.dtype == torch.float16:
                ssf_scale = self.ssf_scale.half()
                ssf_shift = self.ssf_shift.half()
            else:
                ssf_scale = self.ssf_scale.float()
                ssf_shift = self.ssf_shift.float()

            if x.shape[-1] == ssf_scale.shape[0]:
                return x * ssf_scale + ssf_shift
            elif x.shape[1] == ssf_scale.shape[0]:
                return x * ssf_scale.view(1, -1, 1, 1) + ssf_shift.view(1, -1, 1, 1)
            else:
                raise ValueError(
                    "The input tensor shape does not match the shape of the scale factor."
                )

        def merge(self) -> None:
            if self.merged:
                warnings.warn("Already merged. Nothing to do.")
                return
            if self.weight.device != self.ssf_scale.device:
                # .to() is not in-place; reassign the data so the move actually takes effect
                self.ssf_scale.data = self.ssf_scale.data.to(self.weight.device)

            if self.weight.shape == self.ssf_scale.shape:  # layernorm
                self.weight.data = self.weight.data * self.ssf_scale.to(self.weight.device)

            elif (
                self.weight.shape[0] == self.ssf_scale.shape[0]
                and len(self.weight.shape) == 2
            ):  # linear
                self.weight.data = self.weight.data * self.ssf_scale.to(
                    self.weight.device
                ).view(-1, 1)

            elif (
                self.weight.shape[0] == self.ssf_scale.shape[0]
                and len(self.weight.shape) == 4
            ):  # conv
                self.weight.data = self.weight.data * self.ssf_scale.to(
                    self.weight.device
                ).view(-1, 1, 1, 1)

            else:
                raise ValueError(
                    "the input tensor shape does not match the shape of the scale factor."
                )
            if self.bias is None:
                factory_kwargs = {"device": self.weight.device, "dtype": torch.float16}
                # Conv2d here is the corresponding SSF conv wrapper (not shown in this snippet)
                if isinstance(self, Conv2d):
                    self.bias = nn.Parameter(
                        torch.zeros(self.out_channels, **factory_kwargs)
                    )
                else:
                    self.bias = nn.Parameter(
                        torch.zeros(self.out_features, **factory_kwargs)
                    )

            self.bias.data = self.ssf_shift.to(
                self.bias.device
            ) + self.bias.data * self.ssf_scale.to(self.bias.device)

            self.merged = True

    class Linear(nn.Linear, SSFLayer):
        def __init__(self, in_features, out_features, dim, **kwargs):
            nn.Linear.__init__(self, in_features, out_features, **kwargs)
            SSFLayer.__init__(self, dim)
            self.weight.requires_grad = False

        def forward(self, x):
            result = super().forward(x)
            result = self.ssf_ada(result)
            return result


    class LayerNorm(nn.LayerNorm, SSFLayer):
        def __init__(self, dim, **kwargs):
            nn.LayerNorm.__init__(self, dim)
            SSFLayer.__init__(self, dim)
            self.weight.requires_grad = False

        def forward(self, x):
            result = super().forward(x)
            result = self.ssf_ada(result)
            return result

This code runs on a single GPU without any issues, but when I run the same code on multiple GPUs (L4) using FSDP via accelerate, I keep getting this error: ValueError: Must flatten tensors with uniform dtype but got torch.float16 and torch.float32

I don't understand why this keeps happening. Could someone please help?

P.S. I have also tried explicitly converting parameters to .float() and .half() based on the original param dtype, but that didn't help.
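
For reference, a quick way to see which parameter dtypes FSDP will try to flatten together is to list them just before wrapping. The helper below is only a diagnostic sketch; `model` and the function name are placeholders, not part of the original code:

    from collections import Counter

    import torch

    # Diagnostic sketch: FSDP flattens all parameters of a wrapped module into one
    # flat tensor, so every parameter in that unit must share a single dtype.
    # Listing the dtypes before accelerator.prepare()/FSDP wrapping shows which
    # parameters (e.g. newly added SSF scale/shift or base weights) are still fp32.
    def report_param_dtypes(model):
        print("dtype counts:", dict(Counter(p.dtype for p in model.parameters())))
        for name, p in model.named_parameters():
            if p.dtype == torch.float32:
                print("fp32 param:", name, tuple(p.shape))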

Abhrant commented 4 weeks ago

Tagging huggingface/peft#1834 for reference.

BenjaminBossan commented 4 weeks ago

Did you try out what I mentioned in my comment? This is not something that accelerate can solve automatically for you.

Abhrant commented 4 weeks ago

Fixed it!

nn.Linear.__init__(self, in_features, out_features, **kwargs) was creating the layer in fp32 by default. I now pass the dtype arg in kwargs and it works!
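
In case it helps someone else, this is roughly what the change amounts to, using the Linear class from the snippet above. The sizes and the fp16 dtype are just example values; the dtype should match the rest of the model's parameters:

    # Sketch of the fix: forward dtype (and optionally device) through kwargs so
    # nn.Linear.__init__ creates the frozen base weight in the model's dtype
    # instead of torch's default fp32.
    ssf_linear = Linear(
        in_features=1024,
        out_features=1024,
        dim=1024,
        dtype=torch.float16,  # example value; use the base model's dtype
    )
    print(ssf_linear.weight.dtype)  # torch.float16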

Thanks @BenjaminBossan