NVIDIA / TransformerEngine

A library for accelerating Transformer models on NVIDIA GPUs, including using 8-bit floating point (FP8) precision on Hopper and Ada GPUs, to provide better performance with lower memory utilization in both training and inference.
https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html
Apache License 2.0

FP8 Unable to achieve the expected FLOPS indicator in 4090 #606

Open kaijun924 opened 10 months ago

kaijun924 commented 10 months ago

Hi experts, I tried to use Transformer Engine to measure the FLOPS that a 4090 can achieve with FP8. I used te.Linear for my evaluation and got a maximum of only 150+ TFLOPS. For FP16, the maximum is only 80+ TFLOPS. As far as I know, the 4090 can reach 660 TFLOPS in FP8 and 330 TFLOPS in FP16, and 82.6 TFLOPS for FP16 without Tensor Cores. Hence, does Transformer Engine fail to call the Tensor Cores?

Can you tell me what went wrong? Thanks!
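One way to check whether Tensor Core GEMM kernels are actually being launched is to profile a single te.Linear forward pass and look at the kernel names (a minimal sketch using the stock PyTorch profiler; the size 4096 here is an arbitrary example, not one of my measured configurations):

import torch
import transformer_engine.pytorch as te
from torch.profiler import profile, ProfilerActivity
from transformer_engine.common import recipe

# sketch: run one te.Linear forward under fp8_autocast and list the CUDA kernels it launches
fp8_recipe = recipe.DelayedScaling(fp8_format=recipe.Format.HYBRID)
model = te.Linear(4096, 4096).to(dtype=torch.float16).cuda()
inp = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")

with profile(activities=[ProfilerActivity.CUDA]) as prof:
    with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
        model(inp)
    torch.cuda.synchronize()

# the kernel names in this table show which GEMM implementation actually ran
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))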

Docker image: nvcr.io/nvidia/pytorch:23.08-py3

Here is my code and the results:

import os
import numpy as np
import torch
import transformer_engine.pytorch as te
from time import perf_counter
from torch import nn
from transformer_engine.common import recipe
import matplotlib.pyplot as plt
# Benchmark configuration

os.environ['CUDA_VISIBLE_DEVICES'] = "0"
sizes = 0                # current square matrix size, set inside the sweep loop
iter_size = 64           # number of sizes to sweep (multiples of 256, up to 16384)
warmup_iters = 10
timing_iters = 100
fp8_recipe = recipe.DelayedScaling(fp8_format=recipe.Format.HYBRID, amax_history_len=32, amax_compute_algo="max")
fp8_recipe.reduce_amax = True
type = "HYBRID"
data_type = torch.float16
fp8_scale = False        # set True to run the forward pass under fp8_autocast
title = f"Single Linear with {data_type} and FP8={fp8_scale} in 4090"
torch.cuda.set_device(0)
def check_flops(model, inp, label):
    torch.cuda.synchronize()

    # warmup iterations (excluded from timing)
    for _ in range(warmup_iters):
        if fp8_scale:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                model(inp)
        else:
            model(inp)
    torch.cuda.synchronize()

    # timed iterations
    start = perf_counter()
    for _ in range(timing_iters):
        if fp8_scale:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                model(inp)
        else:
            model(inp)
    torch.cuda.synchronize()
    end = perf_counter()
    time = end - start
    FLOPS = cal_flop() / (time / timing_iters)
    print(f"FLOPS = {FLOPS/1e12} TFLOPS in {label}")
    return FLOPS / 1e12

def cal_flop():
    # FLOPs per forward pass, counted here as N^3 for one N x N by N x N GEMM
    return sizes * sizes * sizes

flops_record = []
for i in range(iter_size):
    sizes = (i + 1) * 256                # square matrix size for this run
    model = te.Linear(sizes, sizes)
    model.to(dtype=torch.float16).cuda()
    inp = torch.randn(sizes, sizes, dtype=torch.float16).to('cuda')
    flops = check_flops(model, inp, sizes)
    flops_record.append(flops)

# plot TFLOPS against matrix size
x = np.array([(i + 1) * 256 for i in range(iter_size)])
plt.figure(figsize=(20, 10))
plt.plot(x, flops_record, label=type)

plt.xlabel('Sizes')
plt.ylabel('TFLOPS')
plt.title(title)

plt.legend()

plt.savefig(f"{title}.png")

[Attached plots: "Single Linear with torch.float16 and FP8=True in 4090" and "Single Linear with torch.float16 and FP8=False in 4090"]
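For a cross-check that does not go through Transformer Engine at all, a plain torch.matmul benchmark like the sketch below could be compared against the numbers above (my assumptions: the common 2*N^3 FLOP count for one N x N by N x N GEMM, and torch.cuda.Event timing so that only device time is measured; note that cal_flop() above counts N^3 per forward pass):

import torch

# cross-check sketch: fp16 torch.matmul throughput with CUDA-event timing
N, warmup, iters = 8192, 10, 100
a = torch.randn(N, N, dtype=torch.float16, device="cuda")
b = torch.randn(N, N, dtype=torch.float16, device="cuda")

for _ in range(warmup):
    torch.matmul(a, b)
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(iters):
    torch.matmul(a, b)
end.record()
torch.cuda.synchronize()

seconds_per_iter = start.elapsed_time(end) / 1e3 / iters   # elapsed_time() returns milliseconds
tflops = 2 * N ** 3 / seconds_per_iter / 1e12              # assumes 2*N^3 FLOPs per GEMM
print(f"{tflops:.1f} TFLOPS (fp16 torch.matmul, N={N})")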

kozuch commented 5 months ago

Any update on this?