DeepSpeed still changes metric states from fp32 to fp16

🐛 Bug

SpearmanCorrCoef does not work with the DeepSpeed strategy when precision is 16. I believe this is caused by an unexpected conversion of the metric states from float32 to float16. Spearman logging works as expected when precision is set to 32. This appears to be nearly the same issue as #1561, except that I'm using SpearmanCorrCoef instead of PearsonCorrCoef.

To Reproduce

See the code sample below, which is a modified version of the example from #1561.

Code sample

```py
import os
import torch
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning import LightningModule, Trainer, LightningDataModule
from torchmetrics import PearsonCorrCoef, MeanAbsoluteError, SpearmanCorrCoef


# Dataset for testing
class RandomDataset(Dataset):
    def __init__(self, size, num_samples):
        self.len = num_samples
        self.data = torch.randn(num_samples, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class PlDataModule(LightningDataModule):
    def train_dataloader(self):
        return DataLoader(RandomDataset(32, 64), batch_size=2)


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 32)
        self.metric = PearsonCorrCoef()
        self.metric2 = SpearmanCorrCoef()
        self.mae = MeanAbsoluteError()

        print("Before DeepSpeed initialization")
        print("self.metric2.preds", self.metric.mean_x)
        print("self.metric2.preds.dtype", self.metric.mean_x.dtype)
        print("self.metric.mean_x", self.metric.mean_x)
        print("self.metric.mean_x.dtype", self.metric.mean_x.dtype)
        print("self.metric.mean_y", self.metric.mean_y)
        print("self.metric.dtype", self.metric.mean_y.dtype)
        print("self.metric.var_x", self.metric.var_x)
        print("self.metric.var_x.dtype", self.metric.var_x.dtype)
        print("self.metric.var_y", self.metric.var_y)
        print("self.metric.var_y.dtype", self.metric.var_y.dtype)
        print("self.metric.corr_xy", self.metric.corr_xy)
        print("self.metric.corr_xy.dtype", self.metric.corr_xy.dtype)
        print("self.metric.n_total", self.metric.n_total)
        print("self.mae.sum_abs_error.dtype", self.mae.sum_abs_error.dtype)
        print("self.mae.total.dtype", self.mae.total.dtype)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        pred = self.forward(batch)
        loss = self(batch).sum()
        self.metric.update(torch.flatten(pred), torch.flatten(batch))

        print("After DeepSpeed initialization")
        print("self.metric2.preds", self.metric.mean_x)
        print("self.metric2.preds.dtype", self.metric.mean_x.dtype)
        print("self.metric.mean_x", self.metric.mean_x)
        print("self.metric.mean_x.dtype", self.metric.mean_x.dtype)
        print("self.metric.mean_y", self.metric.mean_y)
        print("self.metric.dtype", self.metric.mean_y.dtype)
        print("self.metric.var_x", self.metric.var_x)
        print("self.metric.var_x.dtype", self.metric.var_x.dtype)
        print("self.metric.var_y", self.metric.var_y)
        print("self.metric.var_y.dtype", self.metric.var_y.dtype)
        print("self.metric.corr_xy", self.metric.corr_xy)
        print("self.metric.corr_xy.dtype", self.metric.corr_xy.dtype)
        print("self.metric.n_total", self.metric.n_total)
        print("self.mae.sum_abs_error.dtype", self.mae.sum_abs_error.dtype)
        print("self.mae.total.dtype", self.mae.total.dtype)

        return {"loss": loss}

    def configure_optimizers(self):
        return torch.optim.SGD(self.layer.parameters(), lr=0.1)


def run():
    model = BoringModel()
    trainer = Trainer(
        default_root_dir=".",
        # default_root_dir=os.getcwd(),
        limit_train_batches=10,
        num_sanity_val_steps=0,
        max_epochs=1,
        strategy="deepspeed_stage_1",
        accelerator="gpu",
        precision=16
    )
    trainer.fit(model, datamodule=PlDataModule())


run()
```

My output

``` Before DeepSpeed initialization self.metric2.preds tensor([0.]) self.metric2.preds.dtype torch.float32 self.metric.mean_x tensor([0.]) self.metric.mean_x.dtype torch.float32 self.metric.mean_y tensor([0.]) self.metric.dtype torch.float32 self.metric.var_x tensor([0.]) self.metric.var_x.dtype torch.float32 self.metric.var_y tensor([0.]) self.metric.var_y.dtype torch.float32 self.metric.corr_xy tensor([0.]) self.metric.corr_xy.dtype torch.float32 self.metric.n_total tensor([0.]) self.mae.sum_abs_error.dtype torch.float32 self.mae.total.dtype torch.int64 /opt/conda/envs/py3.9/lib/python3.9/site-packages/lightning_fabric/connector.py:558: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead! [2024-01-17 20:10:01,703] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores IPU available: False, using: 0 IPUs HPU available: False, using: 0 HPUs initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/4 Before DeepSpeed initialization self.metric2.preds tensor([0.]) self.metric2.preds.dtype torch.float32 self.metric.mean_x tensor([0.]) self.metric.mean_x.dtype torch.float32 self.metric.mean_y tensor([0.]) self.metric.dtype torch.float32 self.metric.var_x tensor([0.]) self.metric.var_x.dtype torch.float32 self.metric.var_y tensor([0.]) self.metric.var_y.dtype torch.float32 self.metric.corr_xy tensor([0.]) self.metric.corr_xy.dtype torch.float32 self.metric.n_total tensor([0.]) self.mae.sum_abs_error.dtype torch.float32 self.mae.total.dtype torch.int64 Before DeepSpeed initialization self.metric2.preds tensor([0.]) self.metric2.preds.dtype torch.float32 self.metric.mean_x tensor([0.]) self.metric.mean_x.dtype torch.float32 self.metric.mean_y tensor([0.]) self.metric.dtype torch.float32 self.metric.var_x tensor([0.]) self.metric.var_x.dtype torch.float32 self.metric.var_y 
tensor([0.]) self.metric.var_y.dtype torch.float32 self.metric.corr_xy tensor([0.]) self.metric.corr_xy.dtype torch.float32 self.metric.n_total tensor([0.]) self.mae.sum_abs_error.dtype torch.float32 self.mae.total.dtype torch.int64 Before DeepSpeed initialization self.metric2.preds tensor([0.]) self.metric2.preds.dtype torch.float32 self.metric.mean_x tensor([0.]) self.metric.mean_x.dtype torch.float32 self.metric.mean_y tensor([0.]) self.metric.dtype torch.float32 self.metric.var_x tensor([0.]) self.metric.var_x.dtype torch.float32 self.metric.var_y tensor([0.]) self.metric.var_y.dtype torch.float32 self.metric.corr_xy tensor([0.]) self.metric.corr_xy.dtype torch.float32 self.metric.n_total tensor([0.]) self.mae.sum_abs_error.dtype torch.float32 self.mae.total.dtype torch.int64 [2024-01-17 20:10:06,324] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-01-17 20:10:06,346] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-01-17 20:10:06,346] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/4 initializing deepspeed distributed: GLOBAL_RANK: 3, MEMBER: 4/4 initializing deepspeed distributed: GLOBAL_RANK: 2, MEMBER: 3/4 Enabling DeepSpeed FP16. Model parameters and inputs will be cast to `float16`. You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. 
For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1,2,3] LOCAL_RANK: 2 - CUDA_VISIBLE_DEVICES: [0,1,2,3] LOCAL_RANK: 3 - CUDA_VISIBLE_DEVICES: [0,1,2,3] LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3] [2024-01-17 20:10:12,542] [WARNING] [engine.py:1163:_do_optimizer_sanity_check] **** You are using ZeRO with an untested optimizer, proceed with caution ***** | Name | Type | Params ---------------------------------------------- 0 | layer | Linear | 1.1 K 1 | metric | PearsonCorrCoef | 0 2 | metric2 | SpearmanCorrCoef | 0 3 | mae | MeanAbsoluteError | 0 ---------------------------------------------- 1.1 K Trainable params 0 Non-trainable params 1.1 K Total params 0.004 Total estimated model params size (MB) /opt/conda/envs/py3.9/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance. /opt/conda/envs/py3.9/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:293: The number of training batches (8) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch. Epoch 0: 0%| | 0/8 [00:00

Expected behavior

I expect the metric states to remain float32 when precision is set to 16, and the logged metric values to be non-zero.

Environment

Output of torch.utils.collect_env

``` Collecting environment information... PyTorch version: 2.1.2+cu121 Is debug build: False CUDA used to build PyTorch: 12.1 ROCM used to build PyTorch: N/A OS: Ubuntu 22.04.3 LTS (x86_64) GCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 Clang version: Could not collect CMake version: version 3.22.1 Libc version: glibc-2.35 Python version: 3.9.18 | packaged by conda-forge | (main, Dec 23 2023, 16:33:10) [GCC 12.3.0] (64-bit runtime) Python platform: Linux-5.10.0-27-cloud-amd64-x86_64-with-glibc2.35 Is CUDA available: True CUDA runtime version: 12.1.105 CUDA_MODULE_LOADING set to: LAZY GPU models and configuration: GPU 0: NVIDIA L4 GPU 1: NVIDIA L4 GPU 2: NVIDIA L4 GPU 3: NVIDIA L4 Nvidia driver version: 535.86.10 cuDNN version: Could not collect HIP runtime version: N/A MIOpen runtime version: N/A Is XNNPACK available: True CPU: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Address sizes: 46 bits physical, 48 bits virtual Byte Order: Little Endian CPU(s): 48 On-line CPU(s) list: 0-47 Vendor ID: GenuineIntel Model name: Intel(R) Xeon(R) CPU @ 2.20GHz CPU family: 6 Model: 85 Thread(s) per core: 2 Core(s) per socket: 24 Socket(s): 1 Stepping: 7 BogoMIPS: 4400.36 Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves arat avx512_vnni md_clear arch_capabilities Hypervisor vendor: KVM Virtualization type: full L1d cache: 768 KiB (24 instances) L1i cache: 768 KiB (24 instances) L2 cache: 24 MiB (24 instances) L3 cache: 38.5 MiB (1 instance) NUMA node(s): 1 NUMA 
node0 CPU(s): 0-47 Vulnerability Gather data sampling: Not affected Vulnerability Itlb multihit: Not affected Vulnerability L1tf: Not affected Vulnerability Mds: Mitigation; Clear CPU buffers; SMT Host state unknown Vulnerability Meltdown: Not affected Vulnerability Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown Vulnerability Retbleed: Mitigation; Enhanced IBRS Vulnerability Spec rstack overflow: Not affected Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization Vulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence Vulnerability Srbds: Not affected Vulnerability Tsx async abort: Mitigation; Clear CPU buffers; SMT Host state unknown Versions of relevant libraries: [pip3] numpy==1.24.1 [pip3] pytorch-lightning==2.1.3 [pip3] pytorch-ranger==0.1.1 [pip3] torch==2.1.2+cu121 [pip3] torch-optimizer==0.3.0 [pip3] torchaudio==2.1.2+cu121 [pip3] torchmetrics==1.3.0 [pip3] torchvision==0.16.2+cu121 [pip3] triton==2.1.0 [conda] numpy 1.24.1 pypi_0 pypi [conda] pytorch-lightning 2.1.3 pypi_0 pypi [conda] pytorch-ranger 0.1.1 pypi_0 pypi [conda] torch 2.1.2+cu121 pypi_0 pypi [conda] torch-optimizer 0.3.0 pypi_0 pypi [conda] torchaudio 2.1.2+cu121 pypi_0 pypi [conda] torchmetrics 1.3.0 pypi_0 pypi [conda] torchvision 0.16.2+cu121 pypi_0 pypi [conda] triton 2.1.0 pypi_0 pypi ```

Additional context

All tests were run within a VS Code devcontainer.

Lightning-AI / torchmetrics