Hi experts, I tried to use Transformer Engine to measure the FLOPS that a 4090 can achieve using FP8. I used te.Linear for my evaluation and got a maximum of only 150+ TFLOPS. For FP16, the maximum is only 80+ TFLOPS.
What I know is that the 4090 can achieve 660 TFLOPS in FP8 and 330 TFLOPS in FP16, and 82.6 TFLOPS for FP16 without Tensor Cores. Hence, does Transformer Engine fail to call the Tensor Cores?
Can you tell me what went wrong? Thanks!
Docker image: nvcr.io/nvidia/pytorch:23.08-py3
Here is my code and the results:
import os
from time import perf_counter

import matplotlib.pyplot as plt
import numpy as np
import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe

# Restrict to a single GPU
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

sizes = 0
iter_size = 64
warmup_iters = 10
timing_iters = 100

fp8_recipe = recipe.DelayedScaling(
    fp8_format=recipe.Format.HYBRID, amax_history_len=32, amax_compute_algo="max"
)
fp8_recipe.reduce_amax = True
fp8_format = "HYBRID"
data_type = torch.float16
fp8_scale = False
title = f"Single Linear with {data_type} and FP8={fp8_scale} in 4090"
torch.cuda.set_device(0)  # torch.cuda.device(0) without `with` is a no-op

def check_flops(model, inp, name):
    torch.cuda.synchronize()
    # Warmup iterations (not timed).  Note that forward passes run with
    # autograd enabled, which adds some framework overhead.
    for _ in range(warmup_iters):
        if fp8_scale:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                model(inp)
        else:
            model(inp)
    torch.cuda.synchronize()
    # Timed iterations
    start = perf_counter()
    for _ in range(timing_iters):
        if fp8_scale:
            with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
                model(inp)
        else:
            model(inp)
    torch.cuda.synchronize()
    end = perf_counter()
    elapsed = end - start
    flops = cal_flop() / (elapsed / timing_iters)
    print(f"FLOPS = {flops / 1e12} TFLOPS at size {name}")
    return flops / 1e12

def cal_flop():
    # Counts a fused multiply-add as one FLOP.  The usual GEMM convention
    # is 2 * m * n * k, which would double every TFLOPS number reported here.
    return sizes * sizes * sizes

flops_record = []
for i in range(iter_size):
    sizes = (i + 1) * 256
    model = te.Linear(sizes, sizes)
    model.to(dtype=torch.float16).cuda()
    inp = torch.randn(sizes, sizes, dtype=torch.float16).to('cuda')
    flops_record.append(check_flops(model, inp, sizes))

# Plot TFLOPS against matrix size
x = np.array([(i + 1) * 256 for i in range(iter_size)])
plt.figure(figsize=(20, 10))
plt.plot(x, flops_record, label=fp8_format)
plt.xlabel('Sizes')
plt.ylabel('TFLOPS')
plt.title(title)
plt.legend()
plt.savefig(f"{title}.png")
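To see whether Tensor Core GEMM kernels are actually being launched, a short profiling pass can be appended after the loop. This is a minimal sketch using torch.profiler, reusing the model and inp from the last loop iteration; the kernel-name patterns mentioned in the comment are heuristics, not guarantees:

from torch.profiler import profile, ProfilerActivity

# Profile a few forward passes and list the CUDA kernels that ran.
# Tensor Core GEMMs from cuBLAS/CUTLASS typically have names containing
# "gemm" together with markers such as "tensorop" or an "sm8x"/"16816"
# tag; a non-Tensor-Core fallback shows generic sgemm/hgemm names.
with profile(activities=[ProfilerActivity.CUDA]) as prof:
    for _ in range(10):
        model(inp)
    torch.cuda.synchronize()
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

If GEMM kernels dominate the table but the measured TFLOPS still looks low, keep in mind that the often-quoted 660 (FP8) and 330 (FP16) TFLOPS figures for the 4090 are the 2:4 structured-sparsity peaks; the dense peaks are half that, and the multiply-add counting convention in cal_flop halves the reported number again.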
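It may also be worth confirming that this build enables FP8 on the GPU at all. Another minimal sketch; the check_fp8_support helper is assumed to exist in this Transformer Engine version and may be named differently or absent in older builds:

import torch

# RTX 4090 (Ada) should report compute capability (8, 9); FP8 Tensor Core
# paths require sm_89 or newer plus a CUDA/TE build that uses them.
print(torch.cuda.get_device_name(0), torch.cuda.get_device_capability(0))

# Assumed helper: recent Transformer Engine releases expose
# check_fp8_support(), which returns a (supported, reason) tuple.
from transformer_engine.pytorch.fp8 import check_fp8_support
ok, reason = check_fp8_support()
print(f"FP8 supported: {ok} ({reason})")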