quic / cloud-ai-sdk

The Qualcomm Cloud AI SDK (Platform and Apps) enables high-performance deep learning inference on Qualcomm Cloud AI platforms, delivering high throughput and low latency across Computer Vision, Object Detection, Natural Language Processing, and Generative AI models.
https://quic.github.io/cloud-ai-sdk-pages/latest/

qaic-exec mxfp6 differs from microscaling #7

Open PengchengWang opened 5 months ago

PengchengWang commented 5 months ago

In https://www.qualcomm.com/developer/blog/2024/01/qualcomm-cloud-ai-100-accelerates-large-language-model-inference-2x-using-microscaling-mx, the AI 100 achieves significant results in LLM inference. I'm now using an AI 100 on an AWS DL2q instance to study the -mxfp6-matmul option provided by qaic-exec; another way to study mxfp6 is the code in https://github.com/microsoft/microxcaling. However, I found some inconsistencies between the two, as shown below:

Microscaling

import torch
import sys
import numpy as np
sys.path.append('/root/microxcaling')
from mx.mx_ops import _quantize_mx

if __name__ == '__main__':
    scale_bits = 8
    elem_format = 'fp6_e2m3'
    block_size = 8
    round = 'even'
    rnd_mode = 'round'
    flush_fp32_subnorms = False
    device = 'cpu'
    custom_cuda = False

    print(f"\nformat: {elem_format}, block_size: {block_size}, round {round}, rnd_mode {rnd_mode}, flush_fp32_subnorms: {flush_fp32_subnorms}")

    x1 = torch.tensor([0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 8.0], device='cpu')
    print("x: ", x1.shape, x1, flush = True)

    y1 = _quantize_mx(x1, scale_bits, elem_format,
                      block_size=block_size,
                      axes=[-1],
                      round=round,
                      rnd_mode=rnd_mode,
                      flush_fp32_subnorms=flush_fp32_subnorms,
                      custom_cuda=custom_cuda)
    print("y: ", y1.shape, y1, flush = True)

Outputs:

format: fp6_e2m3, block_size: 8, round even, rnd_mode round, flush_fp32_subnorms: False
x:  torch.Size([8]) tensor([0.0156, 0.0312, 0.0625, 0.1250, 0.2500, 0.5000, 1.0000, 8.0000])
y:  torch.Size([8]) tensor([0.0000, 0.0000, 0.0000, 0.0000, 0.2500, 0.5000, 1.0000, 8.0000])

0.0156, 0.0312, 0.0625, and 0.1250 were clamped to 0.
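This flushing is expected under the shared-exponent scheme microxcaling implements. The sketch below is my reading of the MX recipe, not the library's exact code: the block scale is taken from the largest magnitude in the block, and anything that falls below the smallest fp6_e2m3 subnormal after scaling rounds to zero.

import math
import torch

x = torch.tensor([0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 8.0])

emax_e2m3 = 2                                    # largest element exponent of fp6_e2m3
shared_exp = math.floor(math.log2(float(x.abs().max()))) - emax_e2m3   # 3 - 2 = 1
scale = 2.0 ** shared_exp                        # shared block scale = 2

print(x / scale)  # 0.0078, 0.0156, 0.0312, 0.0625, 0.125, 0.25, 0.5, 4.0
# The smallest fp6_e2m3 subnormal is 0.125, so with round-to-nearest-even every
# scaled value at or below 0.0625 becomes 0; scaling back by 2 reproduces
# [0, 0, 0, 0, 0.25, 0.5, 1, 8] from the output above.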

AI 100

import torch
import torch.nn as nn
import torch.nn.functional as F

# Model defined
class MyModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(MyModel, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        x = self.linear(x)
        return x

input_size = 32
output_size = 1
model = MyModel(input_size, output_size)

data = torch.ones((1, 32))

w_data = torch.zeros((1, 32))
w_data[0][:8] = torch.tensor([0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 8.0])

model.linear.weight.data = w_data
model.linear.bias.data = torch.zeros(output_size)

a = torch.ones(1, input_size)
print(model(data))  # FP32 reference: sum of the eight nonzero weights = 9.984375
torch.onnx.export(model,    # PyTorch model
    a,                      # Input tensor
    'my_model_a.onnx',        # Output file
    export_params = True,      # Export the model parameters
    input_names   = ['input'], # Input tensor names
    output_names  = ['output'] # Output tensor names
)
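As an optional host-side sanity check (not part of the original steps; it assumes onnxruntime is installed), running the exported ONNX model in FP32 should reproduce the unquantized sum:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('my_model_a.onnx')
out = sess.run(None, {'input': np.ones((1, 32), dtype=np.float32)})
print(out)  # FP32 reference before any MXFP6 quantization: 9.984375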

After the ONNX model is generated, compile it for the device with qaic-exec, enabling -mxfp6-matmul:

qaic-exec \
  -m=my_model_a.onnx \
  -aic-hw \
  -aic-hw-version=2.0 \
  -convert-to-fp16 \
  -mxfp6-matmul \
  -aic-num-cores=4 \
  -compile-only \
  -aic-binary-dir=qaic_my_modela

Finally, run the compiled model (programqpc.bin under qaic_my_modela/) with the Python API:

import qaic
import numpy
import pickle
import sys

dirs = 'qaic_my_modela'  # directory passed to -aic-binary-dir above
print(f'{dirs}/programqpc.bin')

qaic_sess = qaic.Session(model_path=f'{dirs}/programqpc.bin', num_activations=1, time_passes=True, aic_perf_metrics=True)

input_shape, input_type = qaic_sess.model_input_shape_dict['input']
print("input_shape: ", input_shape)
print("input_type: ", input_type)

input_data = numpy.ones(input_shape, numpy.float32)
input_dict = {'input': input_data}

output_shape, output_type = qaic_sess.model_output_shape_dict['output']
print("output_shape: ", output_shape)
print("output_type: ", output_type)

qaic_sess.setup()                   # Load the model to the device.
output = qaic_sess.run(input_dict)  # Execute on AIC100

print("output: ", output)

Output:

input_shape:  (1, 32)
input_type:  float32
output_shape:  (1, 1)
output_type:  float32
output:  {'output': array([[ 9.50]], dtype=float32)}

And the result is 9.5, which differs from the expected 9.75 (0.25 + 0.5 + 1 + 8).
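For reference, 9.5 is exactly 0.5 + 1 + 8, which looks as if 0.25 were also flushed to zero on the device. The 9.75 expectation can be reproduced on the host with the same _quantize_mx call, treating the whole 32-element weight row as one block; the block size and fp6_e2m3 element format assumed for -mxfp6-matmul here are guesses, not confirmed behavior:

import torch
from mx.mx_ops import _quantize_mx

w = torch.zeros((1, 32))
w[0][:8] = torch.tensor([0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 8.0])

# Same call as in the Microscaling section, but with one 32-element block per row
# (an assumption about what -mxfp6-matmul does, not documented behavior).
w_q = _quantize_mx(w, 8, 'fp6_e2m3',
                   block_size=32,
                   axes=[-1],
                   round='even',
                   rnd_mode='round',
                   flush_fp32_subnorms=False,
                   custom_cuda=False)

# With an all-ones input and zero bias, the Linear output is just the sum of the
# quantized weights.
print(w_q.sum())  # tensor(9.7500)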

Question

Are the MXFP6 implementations in qaic-exec (-mxfp6-matmul) and microxcaling not completely consistent?