## QNN Backend New Features and Refactor

### QNN new frontend

Implemented end-to-end inference with the new QNN frontend in demo_qnn.cpp, which is more consistent with the other new demos and more user friendly. It uses QNN for Qwen1.8B prefilling and the CPU for Qwen1.8B-q4k decoding. `QNN_OLD_FRONTEND` in build_qnn_android.sh should be set to `OFF`.

An example of modeling with the new QNN frontend:
```cpp
class QwenNPU_CPUDecoder final : public Module {
    int hidden_size;
    int num_heads;
    int head_dim;
    int num_key_value_heads;
    int num_key_value_groups;
    Layer input_layernorm;
    Layer pre_attn_quantize;
    QwenDecoderNPUPart1 part1;
    QwenQKVmm qkv_mm;
    QwenDecoderNPUPart2 part2;

public:
    QwenNPU_CPUDecoder() = default;
    QwenNPU_CPUDecoder(const QWenConfig &config, const QWenNameConfig &names, const string &base_name) {
        hidden_size = config.hidden_size;
        num_heads = config.num_attention_heads;
        head_dim = config.hidden_size / num_heads;
        num_key_value_heads = config.num_key_value_heads;
        num_key_value_groups = num_heads / num_key_value_heads;

        input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._attn_norm_name);
        pre_attn_quantize = Quantize(true, base_name + names._attn_base_name + names._q_proj_name + ".quantize");

        // The Q/K/V projections (part1) and the output projection plus MLP (part2)
        // run on the NPU; the attention matmuls (qkv_mm) run on the CPU.
        part1 = QwenDecoderNPUPart1(config, names, base_name + names._attn_base_name);
        part1.to(MLLM_QNN);
        qkv_mm = QwenQKVmm(config, names, base_name + names._attn_base_name);
        qkv_mm.to(MLLM_CPU);
        part2 = QwenDecoderNPUPart2(config, names, base_name);
        part2.to(MLLM_QNN);
    }

    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        auto x = input_layernorm(inputs[0]);
        x = pre_attn_quantize(x);

        // Move activations to the QNN device before entering an NPU subgraph.
        if (x.device() != MLLM_QNN) {
            x = Tensor::toQNN({x})[0];
        }
        auto q_k_v = part1({x}); // q, k, v
        auto o_x = qkv_mm(q_k_v)[0];
        if (o_x.device() != MLLM_QNN) {
            o_x = Tensor::toQNN({o_x})[0];
        }
        // The residual input must also live on the QNN device for part2.
        if (inputs[0].device() != MLLM_QNN) {
            inputs[0] = Tensor::toQNN({inputs[0]})[0];
        }
        x = part2({o_x, inputs[0]})[0];
        return {x};
    }
};
```
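For orientation, here is a minimal sketch of how such hybrid decoder blocks could be stacked into a full model, reusing the declarations from the snippet above. The `QwenNPUBlocks` class name, the `model.layers.` name prefix, and the `num_hidden_layers` field are assumptions for illustration, not code taken from demo_qnn.cpp:

```cpp
// Hypothetical sketch: stack QwenNPU_CPUDecoder blocks in the same
// Module/Forward style as the example above. Names are illustrative.
class QwenNPUBlocks final : public Module {
    std::vector<QwenNPU_CPUDecoder> blocks;

public:
    QwenNPUBlocks(const QWenConfig &config, const QWenNameConfig &names) {
        // One hybrid NPU/CPU decoder per transformer layer
        // (assuming QWenConfig exposes num_hidden_layers).
        for (int i = 0; i < config.num_hidden_layers; ++i) {
            blocks.push_back(QwenNPU_CPUDecoder(config, names, "model.layers." + std::to_string(i) + "."));
        }
    }

    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
        auto x = inputs[0];
        for (auto &block : blocks) {
            x = block({x})[0];
        }
        return {x};
    }
};
```

Because each `QwenNPU_CPUDecoder` applies its own residual connection inside `part2`, the outer loop only needs to thread the hidden states through the blocks.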
### Refactor

- Improved QNN op performance, reaching a prefilling speed of up to 500+ tokens/s in main_qwen_npu.cpp.
- Support `warmup()` for QNN graph building before calling execute, as sketched below.
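A rough usage sketch of the warmup step, in the same Module style as the example above; `model` and `input` are placeholders, and the exact `warmup()` signature should be checked against the demos:

```cpp
// Hypothetical sketch: trigger QNN graph building once, ahead of timed runs.
// The warmup() call pattern is assumed to mirror Module's call operator.
void prefill_once(Module &model, Tensor &input) {
    model.warmup({input});        // build and finalize the QNN graphs up front
    auto out = model({input})[0]; // execution now reuses the pre-built graphs
}
```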