RK3588推理MiniCPM-V 2.6(Qwen2 7B)当`base_domain_id`设置为非0时`rkllm_init()`段错误

run_rknn.py

import os
import time
import numpy as np
from rkllm_binding import *
from rknnlite.api.rknn_lite import RKNNLite
import signal
import cv2

# Model files, given as relative paths (resolved against the current working directory).
MODEL_PATH = "qwen.rkllm"
VISION_ENCODER_PATH = "vision_transformer.rknn"
# RKLLM handle; stays None until init() has returned, which lets the Ctrl-C
# handler decide whether there is anything to clean up.
handle = None
# Side length in pixels that the input image is resized to (square).
img_size = 448

# exit on ctrl-c
def signal_handler(signal, frame):
    """Handle SIGINT: stop a running generation, release the RKLLM handle, exit.

    Args:
        signal: Signal number supplied by the interpreter (unused).
        frame: Interrupted stack frame supplied by the interpreter (unused).

    Raises:
        SystemExit: Always, with exit status 0, once cleanup has been attempted.
        RuntimeError: If destroy() reports a failure.
    """
    print("Ctrl-C pressed, exiting...")
    global handle
    if handle:
        try:
            abort(handle)
        except RuntimeError as err:
            # A failed abort must not prevent the handle from being released.
            print(f"Ignoring abort failure during shutdown: {err}")
        destroy(handle)
        # Forget the handle so no later code path can destroy it a second time.
        handle = None
    # Same effect as exit(0) without relying on the site-provided helper.
    raise SystemExit(0)

# Route Ctrl-C through signal_handler so an interrupted run still calls abort()/destroy().
signal.signal(signal.SIGINT, signal_handler)

# Same as `export RKLLM_LOG_LEVEL=1`, but set from inside the script.
# NOTE(review): rkllm_binding (which loads librkllmrt.so) is imported before this
# line runs — confirm the runtime reads the variable late enough to see it.
os.environ["RKLLM_LOG_LEVEL"] = "1"

import codecs

# Number of text chunks received so far, and the wall-clock time at which the
# run was started (assigned right before run() is called).
inference_count = 0
inference_start_time = 0
# Incremental decoder: a multi-byte UTF-8 character split across two callbacks
# is buffered until complete instead of raising UnicodeDecodeError.
_utf8_decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")


def result_callback(result, userdata, state):
    """Stream generated text to stdout and report the time to the first token.

    Args:
        result: ctypes pointer to an RKLLMResult; only ``contents.text`` is read.
        userdata: Opaque pointer passed through by the runtime (unused).
        state: Integer state code, compared against LLMCallState members.
    """
    global inference_count
    if state == LLMCallState.RKLLM_RUN_NORMAL:
        if inference_count == 0:
            first_token_time = time.time()
            print(f"Time to first token: {first_token_time - inference_start_time:.2f} seconds")
        inference_count += 1
        raw_text = result.contents.text
        # text is a c_char_p field and reads as None for a NULL pointer.
        if raw_text:
            print(_utf8_decoder.decode(raw_text), end="", flush=True)
    elif state == LLMCallState.RKLLM_RUN_FINISH:
        # Emit whatever is still buffered, then leave the decoder clean for the next run.
        print(_utf8_decoder.decode(b"", final=True), end="", flush=True)
        _utf8_decoder.reset()
        print("\n\n(finished)")
    elif state == LLMCallState.RKLLM_RUN_ERROR:
        _utf8_decoder.reset()
        print("\nError occurred during LLM call")

# Initialize vision encoder
vision_encoder = RKNNLite(verbose=False)
model_size = os.path.getsize(VISION_ENCODER_PATH)
print(f"Start loading vision encoder model (size: {model_size / 1024 / 1024:.2f} MB)")
start_time = time.time()
ret = vision_encoder.load_rknn(VISION_ENCODER_PATH)
end_time = time.time()
# RKNNLite reports failure through a non-zero return code rather than an
# exception; stop here instead of failing later with a less obvious error.
if ret != 0:
    raise RuntimeError(f"Failed to load vision encoder {VISION_ENCODER_PATH!r}: {ret}")
# Guard the throughput computation against a zero elapsed time.
load_seconds = max(end_time - start_time, 1e-9)
print(f"Vision encoder loaded in {load_seconds:.2f} seconds (speed: {model_size / load_seconds / 1024 / 1024:.2f} MB/s)")
ret = vision_encoder.init_runtime()
if ret != 0:
    raise RuntimeError(f"Failed to initialize vision encoder runtime: {ret}")

# image embedding
img_path = "test.jpg"

normalize_mean = [0.5, 0.5, 0.5]
normalize_std = [0.5, 0.5, 0.5]

img = cv2.imread(img_path)
# cv2.imread signals a missing or unreadable file by returning None, not by raising.
if img is None:
    raise RuntimeError(f"Could not read image (missing or unsupported file): {img_path}")
img = cv2.resize(img, (img_size, img_size))
# NOTE(review): cv2.imread yields BGR channel order and the conversion below is
# disabled — confirm which channel order the vision encoder expects.
# img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img = img.astype(np.float32)
# NOTE(review): pixel values are still in the 0..255 range at this point —
# confirm that mean/std of 0.5 is meant to apply to unscaled values.
img = (img - normalize_mean) / normalize_std
# Add the leading batch dimension: (H, W, C) -> (1, H, W, C).
img = img[np.newaxis, :, :, :]
print(img.shape)
start_time = time.time()
# The arithmetic with Python float lists above promotes to float64, hence the cast back.
encoder_outputs = vision_encoder.inference(inputs=[img.astype(np.float32)], data_type="float32", data_format="nhwc")
end_time = time.time()
if encoder_outputs is None:
    raise RuntimeError("Vision encoder inference failed (no outputs returned)")
image_embeddings = encoder_outputs[0]
print(f"Vision encoder inference time: {end_time - start_time:.2f} seconds")
print(image_embeddings.shape)
print(image_embeddings)

#vision_encoder.release() # free memory, rockchip plz fix this

# Initialize RKLLM
# Start from the runtime's defaults and override only what this script needs.
param = create_default_param()
param.model_path = MODEL_PATH.encode()
# Presumably the markers that delimit the image placeholder in the prompt
# (the prompt below contains "<image>") — verify against the runtime docs.
param.img_start = "<image>".encode()
param.img_end = "</image>".encode()
# param.img_content = "<unk>".encode()
extend_param = RKLLMExtendParam()
extend_param.base_domain_id = 1  # iommu domain 0 for vision encoder
# Assigning a ctypes structure to a structure field copies its bytes, so
# extend_param must be fully populated before this line.
param.extend_param = extend_param
model_size = os.path.getsize(MODEL_PATH)
print(f"Start loading language model (size: {model_size / 1024 / 1024:.2f} MB)")
start_time = time.time()
# The issue title reports that this call segfaults when base_domain_id is non-zero.
handle = init(param, result_callback)
end_time = time.time()
print(f"Language model loaded in {end_time - start_time:.2f} seconds (speed: {model_size / (end_time - start_time) / 1024 / 1024:.2f} MB/s)")

# Create input
prompt = """<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<image>
详细介绍一下这张图片. 如果没有图片，就重复一下这句话上面的文字。<|im_end|>
<|im_start|>assistant

"""

# The input structure only stores a raw pointer into the array's buffer, so the
# float32 array must stay bound to a name until run() has returned. Passing a
# temporary (e.g. the result of astype()) would leave the pointer dangling.
image_embed_f32 = np.ascontiguousarray(image_embeddings, dtype=np.float32)
rkllm_input = create_rkllm_input(RKLLMInputType.RKLLM_INPUT_MULTIMODAL, prompt=prompt, image_embed=image_embed_f32)

# Create inference parameters
infer_param = RKLLMInferParam()
infer_param.mode = RKLLMInferMode.RKLLM_INFER_GENERATE.value

# Run RKLLM
print("Start inference...")
inference_start_time = time.time()
try:
    run(handle, rkllm_input, infer_param, None)
except Exception:
    # Release the model before propagating the failure. SystemExit from the
    # Ctrl-C handler is deliberately not caught: that handler already destroys
    # the handle itself.
    destroy(handle)
    handle = None
    raise

# Clean up
destroy(handle)
handle = None

rkllm_binding.py

import ctypes
import numpy as np
from enum import IntEnum
from typing import Callable, Any

# Load the shared library
# The name is resolved by the system's dynamic loader, so librkllmrt.so has to be
# on its search path when this module is imported.
_lib = ctypes.CDLL("librkllmrt.so")  # Adjust the library name if necessary

# Define enums
class LLMCallState(IntEnum):
    """State codes delivered as the third (int) argument of the result callback.

    NOTE(review): the numeric values presumably mirror the C enum in the
    runtime's header — verify against the installed rkllm.h.
    """
    RKLLM_RUN_NORMAL = 0
    RKLLM_RUN_WAITING = 1
    RKLLM_RUN_FINISH = 2
    RKLLM_RUN_ERROR = 3
    RKLLM_RUN_GET_LAST_HIDDEN_LAYER = 4

class RKLLMInputType(IntEnum):
    """Selector stored in RKLLMInput.input_type; one value per member of the input union.

    NOTE(review): values presumably mirror the C enum — verify against rkllm.h.
    """
    RKLLM_INPUT_PROMPT = 0
    RKLLM_INPUT_TOKEN = 1
    RKLLM_INPUT_EMBED = 2
    RKLLM_INPUT_MULTIMODAL = 3

class RKLLMInferMode(IntEnum):
    """Values for RKLLMInferParam.mode.

    NOTE(review): values presumably mirror the C enum — verify against rkllm.h.
    """
    RKLLM_INFER_GENERATE = 0
    RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1

# Define structures
class RKLLMExtendParam(ctypes.Structure):
    """Extension block embedded by value in RKLLMParam.

    Holds a 32-bit base_domain_id followed by 112 reserved bytes.
    """
    _fields_ = [
        ("base_domain_id", ctypes.c_int32),
        ("reserved", ctypes.c_uint8 * 112)
    ]

class RKLLMParam(ctypes.Structure):
    """Parameter block returned by rkllm_createDefaultParam and passed by pointer to rkllm_init.

    ctypes derives the memory layout from the order and types in _fields_, so
    both must match the runtime's C struct exactly.
    """
    _fields_ = [
        ("model_path", ctypes.c_char_p),  # path to the model file, as bytes
        ("max_context_len", ctypes.c_int32),
        ("max_new_tokens", ctypes.c_int32),
        ("top_k", ctypes.c_int32),
        ("top_p", ctypes.c_float),
        ("temperature", ctypes.c_float),
        ("repeat_penalty", ctypes.c_float),
        ("frequency_penalty", ctypes.c_float),
        ("presence_penalty", ctypes.c_float),
        ("mirostat", ctypes.c_int32),
        ("mirostat_tau", ctypes.c_float),
        ("mirostat_eta", ctypes.c_float),
        ("skip_special_token", ctypes.c_bool),
        ("is_async", ctypes.c_bool),
        ("img_start", ctypes.c_char_p),
        ("img_end", ctypes.c_char_p),
        ("img_content", ctypes.c_char_p),
        ("extend_param", RKLLMExtendParam)  # embedded by value, not a pointer
    ]

class RKLLMLoraAdapter(ctypes.Structure):
    """Argument structure passed by pointer to rkllm_load_lora: adapter path, adapter name and a float scale."""
    _fields_ = [
        ("lora_adapter_path", ctypes.c_char_p),
        ("lora_adapter_name", ctypes.c_char_p),
        ("scale", ctypes.c_float)
    ]

class RKLLMEmbedInput(ctypes.Structure):
    """Embedding input (the embed_input member of RKLLMInput's union): a float pointer plus a token count."""
    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),  # raw pointer; the owning buffer must outlive its use
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMTokenInput(ctypes.Structure):
    """Token-id input (the token_input member of RKLLMInput's union): an int32 pointer plus a token count."""
    _fields_ = [
        ("input_ids", ctypes.POINTER(ctypes.c_int32)),  # raw pointer; the owning buffer must outlive its use
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMMultiModelInput(ctypes.Structure):
    """Multimodal input (the multimodal_input member of RKLLMInput's union).

    Combines a text prompt with a pointer to float image embeddings and the
    number of image tokens.
    """
    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("image_embed", ctypes.POINTER(ctypes.c_float)),  # raw pointer; the owning buffer must outlive its use
        ("n_image_tokens", ctypes.c_size_t)
    ]

class RKLLMInput(ctypes.Structure):
    """Tagged union passed by pointer to rkllm_run / rkllm_run_async.

    input_type holds an RKLLMInputType value that says which member of _input is valid.
    """
    class _InputUnion(ctypes.Union):
        # All members share the same storage; only the one selected by input_type is meaningful.
        _fields_ = [
            ("prompt_input", ctypes.c_char_p),
            ("embed_input", RKLLMEmbedInput),
            ("token_input", RKLLMTokenInput),
            ("multimodal_input", RKLLMMultiModelInput)
        ]
    _fields_ = [
        ("input_type", ctypes.c_int),
        ("_input", _InputUnion)
    ]

class RKLLMLoraParam(ctypes.Structure):
    """LoRA selection referenced by pointer from RKLLMInferParam.lora_params: a single adapter name."""
    _fields_ = [
        ("lora_adapter_name", ctypes.c_char_p)
    ]

class RKLLMPromptCacheParam(ctypes.Structure):
    """Prompt-cache settings referenced by pointer from RKLLMInferParam.prompt_cache_params."""
    _fields_ = [
        ("save_prompt_cache", ctypes.c_int),  # presumably a 0/1 flag — verify against rkllm.h
        ("prompt_cache_path", ctypes.c_char_p)
    ]

class RKLLMInferParam(ctypes.Structure):
    """Per-call inference parameters passed by pointer to rkllm_run / rkllm_run_async.

    A freshly constructed instance has both pointer fields set to NULL.
    """
    _fields_ = [
        ("mode", ctypes.c_int),  # an RKLLMInferMode value
        ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
        ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam))
    ]

class RKLLMResultLastHiddenLayer(ctypes.Structure):
    """Hidden-state payload embedded in RKLLMResult: a float pointer plus two int sizes."""
    _fields_ = [
        ("hidden_states", ctypes.POINTER(ctypes.c_float)),
        ("embd_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int)
    ]

class RKLLMResult(ctypes.Structure):
    """Result record the runtime hands to the result callback by pointer."""
    _fields_ = [
        ("text", ctypes.c_char_p),  # reads as bytes, or None when the C pointer is NULL
        ("token_id", ctypes.c_int32),
        ("last_hidden_layer", RKLLMResultLastHiddenLayer)
    ]

# Define callback type
# C-callable signature: void callback(RKLLMResult *result, void *userdata, int state).
# Objects created from this type must stay referenced from Python for as long
# as C code may call them.
LLMResultCallback = ctypes.CFUNCTYPE(None, ctypes.POINTER(RKLLMResult), ctypes.c_void_p, ctypes.c_int)

# Define function prototypes
# Declaring argtypes/restype makes ctypes convert arguments and results with
# the right widths instead of assuming plain C ints.
# rkllm_createDefaultParam returns an RKLLMParam by value.
_lib.rkllm_createDefaultParam.restype = RKLLMParam
# rkllm_init receives a pointer to the handle (an out-parameter), the parameters and the callback.
_lib.rkllm_init.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(RKLLMParam), LLMResultCallback]
_lib.rkllm_init.restype = ctypes.c_int
_lib.rkllm_load_lora.argtypes = [ctypes.c_void_p, ctypes.POINTER(RKLLMLoraAdapter)]
_lib.rkllm_load_lora.restype = ctypes.c_int
_lib.rkllm_load_prompt_cache.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
_lib.rkllm_load_prompt_cache.restype = ctypes.c_int
_lib.rkllm_release_prompt_cache.argtypes = [ctypes.c_void_p]
_lib.rkllm_release_prompt_cache.restype = ctypes.c_int
_lib.rkllm_destroy.argtypes = [ctypes.c_void_p]
_lib.rkllm_destroy.restype = ctypes.c_int
# rkllm_run and rkllm_run_async share one signature: handle, input, inference parameters, userdata.
_lib.rkllm_run.argtypes = [ctypes.c_void_p, ctypes.POINTER(RKLLMInput), ctypes.POINTER(RKLLMInferParam), ctypes.c_void_p]
_lib.rkllm_run.restype = ctypes.c_int
_lib.rkllm_run_async.argtypes = [ctypes.c_void_p, ctypes.POINTER(RKLLMInput), ctypes.POINTER(RKLLMInferParam), ctypes.c_void_p]
_lib.rkllm_run_async.restype = ctypes.c_int
_lib.rkllm_abort.argtypes = [ctypes.c_void_p]
_lib.rkllm_abort.restype = ctypes.c_int
_lib.rkllm_is_running.argtypes = [ctypes.c_void_p]
_lib.rkllm_is_running.restype = ctypes.c_int

# Python wrapper functions
def create_default_param() -> RKLLMParam:
    """Return the RKLLMParam structure produced by the runtime's rkllm_createDefaultParam()."""
    default_param = _lib.rkllm_createDefaultParam()
    return default_param

# Callback thunks handed to the runtime. ctypes does not keep them alive on its
# own: once the last Python reference goes away the thunk is freed and the
# function pointer held by C dangles, so every thunk is retained here for the
# lifetime of the process.
_callback_refs = []


def init(param: RKLLMParam, callback: Callable[[RKLLMResult, Any, LLMCallState], None]) -> ctypes.c_void_p:
    """Create an RKLLM instance and return its handle.

    Args:
        param: Parameter block; passed to the runtime by reference.
        callback: Python callable wrapped as an LLMResultCallback. The runtime
            calls it with (pointer to RKLLMResult, userdata pointer, int state).

    Returns:
        The handle filled in by rkllm_init.

    Raises:
        RuntimeError: If rkllm_init returns a non-zero status.
    """
    handle = ctypes.c_void_p()
    c_callback = LLMResultCallback(callback)
    # Retain the thunk before the call in case the runtime already invokes it during init.
    _callback_refs.append(c_callback)
    status = _lib.rkllm_init(ctypes.byref(handle), ctypes.byref(param), c_callback)
    if status != 0:
        raise RuntimeError(f"Failed to initialize RKLLM: {status}")
    return handle

def load_lora(handle: ctypes.c_void_p, lora_adapter: RKLLMLoraAdapter) -> None:
    """Load a LoRA adapter via rkllm_load_lora.

    Raises:
        RuntimeError: If the runtime returns a non-zero status.
    """
    adapter_ref = ctypes.byref(lora_adapter)
    ret = _lib.rkllm_load_lora(handle, adapter_ref)
    if ret == 0:
        return
    raise RuntimeError(f"Failed to load Lora adapter: {ret}")

def load_prompt_cache(handle: ctypes.c_void_p, prompt_cache_path: str) -> None:
    """Load a prompt cache file via rkllm_load_prompt_cache.

    The path is encoded to bytes with str.encode() before it is passed to C.

    Raises:
        RuntimeError: If the runtime returns a non-zero status.
    """
    encoded_path = prompt_cache_path.encode()
    ret = _lib.rkllm_load_prompt_cache(handle, encoded_path)
    if ret == 0:
        return
    raise RuntimeError(f"Failed to load prompt cache: {ret}")

def release_prompt_cache(handle: ctypes.c_void_p) -> None:
    """Release the prompt cache via rkllm_release_prompt_cache.

    Raises:
        RuntimeError: If the runtime returns a non-zero status.
    """
    ret = _lib.rkllm_release_prompt_cache(handle)
    if ret == 0:
        return
    raise RuntimeError(f"Failed to release prompt cache: {ret}")

def destroy(handle: ctypes.c_void_p) -> None:
    """Destroy the RKLLM instance via rkllm_destroy.

    Raises:
        RuntimeError: If the runtime returns a non-zero status.
    """
    ret = _lib.rkllm_destroy(handle)
    if ret == 0:
        return
    raise RuntimeError(f"Failed to destroy RKLLM: {ret}")

def run(handle: ctypes.c_void_p, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata: Any) -> None:
    """Run inference via rkllm_run.

    The input and the inference parameters are passed by reference; userdata
    is forwarded unchanged as the void* argument.

    Raises:
        RuntimeError: If the runtime returns a non-zero status.
    """
    input_ref = ctypes.byref(rkllm_input)
    params_ref = ctypes.byref(rkllm_infer_params)
    ret = _lib.rkllm_run(handle, input_ref, params_ref, userdata)
    if ret == 0:
        return
    raise RuntimeError(f"Failed to run RKLLM: {ret}")

def run_async(handle: ctypes.c_void_p, rkllm_input: RKLLMInput, rkllm_infer_params: RKLLMInferParam, userdata: Any) -> None:
    """Start inference via rkllm_run_async.

    The input and the inference parameters are passed by reference; userdata
    is forwarded unchanged as the void* argument.

    Raises:
        RuntimeError: If the runtime returns a non-zero status.
    """
    input_ref = ctypes.byref(rkllm_input)
    params_ref = ctypes.byref(rkllm_infer_params)
    ret = _lib.rkllm_run_async(handle, input_ref, params_ref, userdata)
    if ret == 0:
        return
    raise RuntimeError(f"Failed to run RKLLM asynchronously: {ret}")

def abort(handle: ctypes.c_void_p) -> None:
    """Abort the current task via rkllm_abort.

    Raises:
        RuntimeError: If the runtime returns a non-zero status.
    """
    ret = _lib.rkllm_abort(handle)
    if ret == 0:
        return
    raise RuntimeError(f"Failed to abort RKLLM: {ret}")

def is_running(handle: ctypes.c_void_p) -> bool:
    """Return True when rkllm_is_running reports status 0 for this handle.

    NOTE(review): this treats a status of 0 as "a task is running" — confirm
    against the runtime's header.
    """
    status = _lib.rkllm_is_running(handle)
    return status == 0

# Helper function to convert numpy array to C array
def numpy_to_c_array(arr: np.ndarray, c_type):
    """Return a ctypes pointer of element type ``c_type`` into ``arr``'s data buffer.

    No data is copied: the pointer addresses the array's own memory, so the
    array has to stay alive for as long as the pointer value is used.
    """
    pointer_type = ctypes.POINTER(c_type)
    return arr.ctypes.data_as(pointer_type)

# Helper function to create RKLLMInput
def create_rkllm_input(input_type: RKLLMInputType, **kwargs) -> RKLLMInput:
    """Build an RKLLMInput for the given input type.

    Keyword arguments by input type:
        RKLLM_INPUT_PROMPT: ``prompt`` (str).
        RKLLM_INPUT_EMBED: ``embed`` (array; the token count is read from ``shape[1]``).
        RKLLM_INPUT_TOKEN: ``tokens`` (array; the token count is read from ``shape[1]``).
        RKLLM_INPUT_MULTIMODAL: ``prompt`` (str) and ``image_embed`` (array; the
            image-token count is read from ``shape[1]``).

    The structure stores only raw pointers into the array buffers. Each array
    is therefore converted to a C-contiguous buffer of the element type the
    pointer declares (float32 or int32), and that buffer is attached to the
    returned object so it lives as long as the structure does, even when the
    caller passed a temporary.

    Raises:
        KeyError: If a keyword argument required by ``input_type`` is missing.
    """
    rkllm_input = RKLLMInput()
    rkllm_input.input_type = input_type.value
    # Python objects whose memory the structure points into.
    keepalive = []

    if input_type == RKLLMInputType.RKLLM_INPUT_PROMPT:
        prompt_bytes = kwargs['prompt'].encode()
        keepalive.append(prompt_bytes)
        rkllm_input._input.prompt_input = prompt_bytes
    elif input_type == RKLLMInputType.RKLLM_INPUT_EMBED:
        # No copy is made when the array is already float32 and C-contiguous.
        embed = np.ascontiguousarray(kwargs['embed'], dtype=np.float32)
        keepalive.append(embed)
        rkllm_input._input.embed_input.embed = numpy_to_c_array(embed, ctypes.c_float)
        rkllm_input._input.embed_input.n_tokens = embed.shape[1]
    elif input_type == RKLLMInputType.RKLLM_INPUT_TOKEN:
        tokens = np.ascontiguousarray(kwargs['tokens'], dtype=np.int32)
        keepalive.append(tokens)
        rkllm_input._input.token_input.input_ids = numpy_to_c_array(tokens, ctypes.c_int32)
        rkllm_input._input.token_input.n_tokens = tokens.shape[1]
    elif input_type == RKLLMInputType.RKLLM_INPUT_MULTIMODAL:
        prompt_bytes = kwargs['prompt'].encode()
        keepalive.append(prompt_bytes)
        rkllm_input._input.multimodal_input.prompt = prompt_bytes
        image_embed = np.ascontiguousarray(kwargs['image_embed'], dtype=np.float32)
        keepalive.append(image_embed)
        rkllm_input._input.multimodal_input.image_embed = numpy_to_c_array(image_embed, ctypes.c_float)
        rkllm_input._input.multimodal_input.n_image_tokens = image_embed.shape[1]

    # A plain Python attribute on the structure instance; it is not part of the C layout.
    rkllm_input._keepalive = keepalive
    return rkllm_input

报错：

W rknn-toolkit-lite2 version: 2.2.0
Start loading vision encoder model (size: 942.29 MB)
Vision encoder loaded in 16.59 seconds (speed: 56.79 MB/s)
I RKNN: [04:02:01.796] RKNN Runtime Information, librknnrt version: 2.2.0 (c195366594@2024-09-14T12:18:56)
I RKNN: [04:02:01.796] RKNN Driver Information, version: 0.9.8
I RKNN: [04:02:01.797] RKNN Model Information, version: 6, toolkit version: 2.2.0(compiler version: 2.2.0 (c195366594@2024-09-14T12:24:14)), target: RKNPU v2, target platform: rk3588, framework name: ONNX, framework layout: NCHW, model inference type: dynamic_shape
(1, 448, 448, 3)
Vision encoder inference time: 4.82 seconds
(1, 64, 3584)
Start loading language model (size: 7810.02 MB)
I rkllm: rkllm-runtime version: 1.1.0, rknpu driver version: 0.9.8, platform: RK3588

I RKNN: [04:02:27.763] RKNN Driver Information, version: 0.9.8
fish: Job 1, 'python ./run_rknn.py' terminated by signal SIGSEGV (Address boundary error)

backtrace:

(gdb) bt
#0  0x0000007fe306b740 in  () at /lib/librkllmrt.so
#1  0x0000007fe306c248 in  () at /lib/librkllmrt.so
#2  0x0000007fe30664e4 in  () at /lib/librkllmrt.so
#3  0x0000007fe3064ed0 in  () at /lib/librkllmrt.so
#4  0x0000007fe2fc089c in  () at /lib/librkllmrt.so
#5  0x0000007fe2f769b8 in  () at /lib/librkllmrt.so
#6  0x0000007fe2f9db30 in  () at /lib/librkllmrt.so
#7  0x0000007fe2eee488 in  () at /lib/librkllmrt.so
#8  0x0000007fe2ed8094 in  () at /lib/librkllmrt.so
#9  0x0000007fe2ed5c84 in rkllm_init () at /lib/librkllmrt.so

airockchip / rknn-llm

RK3588推理MiniCPM-V 2.6(Qwen2 7B)当`base_domain_id`设置为非0时`rkllm_init()`段错误 #99