quic / qidk


The output from QNN differs from the PyTorch and ONNX models #44

Closed xuli-vecml closed 1 month ago

xuli-vecml commented 1 month ago

I have a workflow where I create a PyTorch model, export it to ONNX, and then use QNN to convert it into a .so library. The model is a simple forward model with a single linear layer.

However, when I test the model with the same input across all three versions (PyTorch, ONNX, and QNN), the output from the QNN model differs significantly from the outputs of the PyTorch and ONNX models (which are almost identical).
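By "almost identical" I mean they agree to float32 round-off. For illustration (the variable names here are hypothetical; the actual comparison is in compare_output_diff.py below), this is the kind of check I have in mind:

import numpy as np

# pt_out / onnx_out: outputs of the PyTorch and ONNX runs (hypothetical names)
assert np.allclose(pt_out, onnx_out, rtol=1e-5, atol=1e-6)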

I have three scripts:

1. simple_model.py to build the PyTorch model and export it to ONNX.
2. run_debug.sh to run the QNN commands and compile the model.
3. compare_output_diff.py to compare the outputs of all three models.

simple_model.py

import torch
import torch.nn as nn

class SimpleForwardModel(nn.Module):
    def __init__(self, hidden_size=8):
        super(SimpleForwardModel, self).__init__()
        self.hidden_size = hidden_size
        self.linear = nn.Linear(hidden_size, hidden_size)

    def forward(self, input_tensor):
        return self.linear(input_tensor)

if __name__ == "__main__":
    # Instantiate the model
    model = SimpleForwardModel()

    # Create dummy input
    batch_size = 1
    seq_length = 128
    hidden_size = 8
    dummy_input = torch.randn(batch_size, seq_length, hidden_size, dtype=torch.float32)

    # Save the model to simple.pt
    torch.save(model, 'simple.pt')
    print("Model saved to simple.pt")

    # Convert to ONNX
    torch.onnx.export(model, dummy_input, "simple.onnx",
                      input_names=['input'],
                      output_names=['output'],
                      dynamic_axes={'input': {0: 'batch_size', 1: 'sequence_length'},
                                    'output': {0: 'batch_size', 1: 'sequence_length'}})
    print("Model exported to simple.onnx")

compare_output_diff.py

import torch
import onnx
import onnxruntime as ort
import numpy as np
import argparse
import os
import subprocess
import sys

# the model class must be importable so torch.load can unpickle simple.pt
from simple_model import *

torch.manual_seed(0)

def save_as_rawtensor(array, filename):
    with open(filename, 'wb') as f:
        f.write(array.tobytes())

def generate_qnn_inputs(sample):
    input = np.array(sample['tokens'], dtype=np.float32)
    # Create a directory for input files if it doesn't exist
    os.makedirs('qnn_inputs', exist_ok=True)

    # Save each input as a separate .rawtensor file
    print(f"input: {input.dtype}")
    print(f"input: {input.shape}")
    save_as_rawtensor(input, 'qnn_inputs/input.rawtensor')

    # Create the input list file
    with open('qnn_input_list.txt', 'w') as f:
        f.write("input:=qnn_inputs/input.rawtensor\n")

# example cmd: qnn-net-run --input_list qnn_input_list.txt --backend ${QNN_SDK_ROOT}/lib/x86_64-linux-clang/libQnnCpu.so --model <path to the compiled model .so>

def run_qnn_net_run(model_path):
    qnn_sdk_root = os.environ.get('QNN_SDK_ROOT')
    if not qnn_sdk_root:
        raise EnvironmentError("QNN_SDK_ROOT environment variable is not set")

    cmd = [
        "qnn-net-run",
        "--input_list", "qnn_input_list.txt",
        "--backend", f"{qnn_sdk_root}/lib/x86_64-linux-clang/libQnnCpu.so",
        "--model", model_path
    ]

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print("Command executed successfully")
        print("Output:", result.stdout)
    except subprocess.CalledProcessError as e:
        print("Error executing command:")
        print("Return code:", e.returncode)
        print("Error output:", e.stderr)
        raise

def get_qnn_embedding(model_path, sample):
    # Generate the QNN input files, then run qnn-net-run on them
    generate_qnn_inputs(sample)
    run_qnn_net_run(model_path)
    # the raw output is saved in output/Result_0/output.raw; load it and convert to a numpy array
    with open('output/Result_0/output.raw', 'rb') as f:
        result = f.read()
    result = np.frombuffer(result, dtype=np.float32)
    print(result.shape)
    return result.reshape(1, sample['tokens'].shape[1], -1)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pt_model_path", type=str, default="simple.pt")
    parser.add_argument("--onnx_model_path", type=str, default="simple.fixed.onnx")
    parser.add_argument("--qnn_model_path", type=str, default="simple_model_qnn.so")
    return parser.parse_args()

def load_pt_model(model_path):
    return torch.load(model_path)

def load_onnx_model(model_path):
    return ort.InferenceSession(model_path)

def generate_random_input(batch_size=1, seq_length=128):
    input = torch.randn(batch_size, seq_length, 8).to(torch.float32)
    return input

def run_pt_model(model, input):
    with torch.no_grad():
        outputs = model(input)
    return outputs.numpy()  # keep the batch dimension so the shape matches the ONNX/QNN outputs

def run_onnx_model(session, input):
    ort_inputs = {
        'input': input.numpy()
    }
    ort_outputs = session.run(None, ort_inputs)
    return ort_outputs[0]

def run_qnn_model(model_path, input):
    sample = {
        'tokens': input  # Assuming batch size is 1
    }
    return get_qnn_embedding(model_path, sample)

def compare_outputs(pt_output, onnx_output, qnn_output):
    print(f"pt_output: {pt_output.shape}")
    print(f"onnx_output: {onnx_output.shape}")
    print(f"qnn_output: {qnn_output.shape}")
    pt_onnx_diff = np.abs(pt_output - onnx_output).mean()
    pt_qnn_diff = np.abs(pt_output - qnn_output).mean()

    print(f"Mean difference between PyTorch and ONNX: {pt_onnx_diff}")
    print(f"Mean difference between PyTorch and QNN: {pt_qnn_diff}")

def main():
    args = parse_args()

    # Load models
    pt_model = load_pt_model(args.pt_model_path)
    print(f"torch model: {pt_model}")

    onnx_model = load_onnx_model(args.onnx_model_path)

    # Generate random input
    input = generate_random_input()

    # Run models
    pt_output = run_pt_model(pt_model, input)
    onnx_output = run_onnx_model(onnx_model, input)
    qnn_output = run_qnn_model(args.qnn_model_path, input)
    # Compare outputs
    compare_outputs(pt_output, onnx_output, qnn_output)

if __name__ == "__main__":
    main()
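One early sanity check I could add in get_qnn_embedding (a sketch of my own, assuming the fixed 1x128x8 shape set in run_debug.sh below) is to verify that the raw output holds exactly the expected number of float32 values before reshaping, so a dtype or layout mismatch fails loudly:

import numpy as np

# expected element count for the fixed ONNX shape: batch * seq_length * hidden_size
expected = 1 * 128 * 8
raw = np.fromfile('output/Result_0/output.raw', dtype=np.float32)
assert raw.size == expected, f"got {raw.size} float32 values, expected {expected}"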

run_debug.sh

export ANDROID_NDK_ROOT="$HOME/env/android-ndk-r26d"
export QNN_SDK_ROOT="$HOME/env/qairt/2.25.0.240728"  # folder that contains the QNN lib/ directory
export LD_LIBRARY_PATH="$HOME/miniconda3/envs/qnn/lib:$HOME/miniconda3/pkgs/libcxx-14.0.0-hf52228f_0/lib:$QNN_SDK_ROOT/lib/x86_64-linux-clang/:$LD_LIBRARY_PATH"
export PATH="$ANDROID_NDK_ROOT:$QNN_SDK_ROOT/bin/x86_64-linux-clang:$PATH"
export PYTHONPATH="$ANDROID_NDK_ROOT/build:$QNN_SDK_ROOT/lib/python/:$PYTHONPATH"
export QNN_TARGET_ARCH=aarch64-android

python simple_model.py

python -m onnxruntime.tools.make_dynamic_shape_fixed --dim_param batch_size --dim_value 1 ./simple.onnx ./simple.batch_fixed.onnx
python -m onnxruntime.tools.make_dynamic_shape_fixed --dim_param sequence_length --dim_value 128 ./simple.batch_fixed.onnx ./simple.fixed.onnx

mkdir simple_model/
qnn-onnx-converter --input_network ./simple.fixed.onnx --output_path ./simple_model/simple_model.cpp

qnn-model-lib-generator \
  -c ./simple_model/simple_model.cpp \
  -b ./simple_model/simple_model.bin \
  -o ./simple_model/model_libs 

python compare_output_diff.py --qnn_model_path ./simple_model/model_libs/x86_64-linux-clang/libsimple_model.so
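To narrow down where the divergence enters, one check (again a sketch of my own; it assumes the exported initializers keep the PyTorch parameter names, which torch.onnx.export usually does but does not guarantee) is to confirm simple.fixed.onnx still carries the exact PyTorch weights:

import numpy as np
import onnx
import torch
from onnx import numpy_helper
from simple_model import SimpleForwardModel  # needed to unpickle simple.pt

pt_model = torch.load('simple.pt')
onnx_model = onnx.load('simple.fixed.onnx')

# map initializer names to arrays and compare against the PyTorch parameters
inits = {t.name: numpy_helper.to_array(t) for t in onnx_model.graph.initializer}
for name, param in pt_model.named_parameters():
    if name in inits:
        diff = np.abs(inits[name] - param.detach().numpy()).max()
        print(name, inits[name].shape, 'max abs diff:', diff)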

When I run bash run_debug.sh, everything completes and I get:

Mean difference between PyTorch and ONNX: 2.6827365218196064e-08
Mean difference between PyTorch and QNN: 0.662140965461731
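For scale (my own framing, not part of the script output): with randn inputs and a default-initialized Linear(8, 8), the reference outputs themselves have a mean absolute value of roughly 0.5, so a mean difference of 0.66 means the QNN output is essentially unrelated to the reference rather than off by a small numerical error:

    # inside compare_outputs, alongside the existing prints
    print(f"Reference magnitude: {np.abs(pt_output).mean()}")  # roughly 0.5 here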

It is not related to the structure of the network: I have tried several other architectures and they give similar results.

Is there anything wrong with how I compile the QNN model (the qnn-onnx-converter and qnn-model-lib-generator steps in run_debug.sh above)?


Or is it the way I create the input and run the QNN model (save_as_rawtensor, generate_qnn_inputs, run_qnn_net_run, and get_qnn_embedding in compare_output_diff.py above)?
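One way to rule out the serialization side (a sketch of my own, reusing save_as_rawtensor from above) is to round-trip a .rawtensor file and confirm the bytes come back bit-exact in C order:

import numpy as np

arr = np.random.randn(1, 128, 8).astype(np.float32)
save_as_rawtensor(arr, 'qnn_inputs/roundtrip_check.rawtensor')
back = np.fromfile('qnn_inputs/roundtrip_check.rawtensor', dtype=np.float32).reshape(arr.shape)
assert np.array_equal(arr, back)  # tobytes() writes C order; fromfile reads it back bit-exact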

