Open CharlieL7 opened 1 week ago
--fp8
--int8
@24 = gpu::code_object[code_object=8920,symbol_name=mlir_quantizelinear_quant_dot_dequantizelinear_add_add,global=1769472,local=256,](@18,@21,@23,@15,@22) -> half_type, {64, 384, 2304}, {884736, 2304, 1} @25 = load[offset=603979776,end=622854144](@1) -> fp8e4m3fnuz_type, {64, 12, 64, 384}, {294912, 24576, 384, 1} @26 = slice[axes={2},starts={768},ends={1536}](@24) -> half_type, {64, 384, 768}, {884736, 2304, 1} @27 = reshape_lazy[dims={64, 384, 12, 64}](@26) -> half_type, {64, 384, 12, 64}, {884736, 2304, 64, 1} @28 = transpose[permutation={0, 2, 3, 1}](@27) -> half_type, {64, 12, 64, 384}, {884736, 64, 1, 2304} @29 = gpu::code_object[code_object=6816,symbol_name=quantizelinear_kernel,global=1179648,local=256,](@28,@25) -> fp8e4m3fnuz_type, {64, 12, 64, 384}, {294912, 24576, 384, 1} @30 = load[offset=150994944,end=603979776](@1) -> float_type, {64, 12, 384, 384}, {1769472, 147456, 384, 1} @31 = gpu::code_object[code_object=7000,symbol_name=mlir_slice_reshape_transpose_quantizelinear_quant_dot,global=3538944,local=256,](@24,@29,@30) -> float_type, {64, 12, 384, 384}, {1769472, 147456, 384, 1}
bin/driver perf /codes/distilgpt2_1_fp16_gpu.onnx --fp8 --fill1 input_ids --input-dim @input_ids 64 384 --batch 64
@26
@31
--fp8
flag and probably also `--int8`
bin/driver perf /codes/distilgpt2_1_fp16_gpu.onnx --fp8 --fill1 input_ids --input-dim @input_ids 64 384 --batch 64
@26
and `@31`. The quantizelinear kernel remains unfused.