pupumao opened 4 months ago
@AllentDan is it related to #1789?
No. https://github.com/InternLM/lmdeploy/pull/1789 only handles stuck requests, not illegal memory access. And as mentioned above, the bug cannot be reproduced on an A100.
There is a peculiar situation: the issue seems to arise when the torch inference inside ImageEncoder and the turbomind language-model inference run at the same time. In vl_async_engine, if I use a torch.rand tensor for the features and do not run ImageEncoder inference at all, the problem does not occur. My experimental model is Llava. If I start a completely independent thread in Llava that continuously loops encode_images, the issue occurs more quickly.
@pupumao Could you share your experimental code?
The ImageEncoder code was updated recently; could you try the latest code and see if the issue still happens?
I tried the latest code; the issue still happens.
Here is my experimental code (lmdeploy/vl/model/llava.py):
# Copyright (c) OpenMMLab. All rights reserved.
# Modified from
# https://github.com/haotian-liu/LLaVA.git
import warnings
from contextlib import contextmanager
from typing import List, Union
import torch
from PIL.Image import Image
from transformers import AutoModelForCausalLM
from lmdeploy.utils import get_logger
from lmdeploy.vl.model.base import VisonModel
from lmdeploy.vl.model.utils import disable_logging, rewrite_ctx
logger = get_logger('lmdeploy')
def check_llava_install():
"""check llava install."""
try:
import llava # noqa: F401
except ImportError:
raise ImportError(
'To use LlavaVLModel, please install llava by '
'pip install git+https://github.com/haotian-liu/LLaVA.git --no-deps' # noqa: E501
)
def _clip_vision_tower_load_model(self, **kwargs):
logger.info(f'CLIPVisionTower.load_model: {self.vision_tower_name}')
from transformers import (CLIPImageProcessor, CLIPVisionConfig,
CLIPVisionModel)
self.image_processor = CLIPImageProcessor.from_pretrained(
self.vision_tower_name)
config = CLIPVisionConfig.from_pretrained(self.vision_tower_name,
trust_remote_code=True)
self.vision_tower = CLIPVisionModel._from_config(config=config)
self.vision_tower.requires_grad_(False)
self.is_loaded = True
@contextmanager
def init_llava_vision_tower(config):
"""skip download vision model if possible."""
if getattr(config, 'unfreeze_mm_vision_tower', False):
origin_func_path = [
'llava.model.multimodal_encoder.clip_encoder.CLIPVisionTower.load_model' # noqa: E501
]
rewrite_func = [_clip_vision_tower_load_model]
with rewrite_ctx(origin_func_path, rewrite_func):
yield
else:
yield
class LlavaVisionModel(VisonModel):
"""Llava visual model."""
def __init__(self, model_path, with_llm: bool = False):
self.with_llm = with_llm
self.model_path = model_path
self.build_model()
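        # Experimental addition for this bug report: kick off a background
        # thread that loops vision inference (see start_work below).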
self.start_work()
def build_model(self):
"""build model & load weights."""
# check llava install
check_llava_install()
# currently, only support llava llama
from llava.model.language_model.llava_llama import ( # noqa
LlavaConfig, LlavaLlamaForCausalLM)
self.config = LlavaConfig.from_pretrained(self.model_path)
assert self.config.model_type in ['llava', 'llava_llama'], \
'currently, only support llava llama'
from accelerate import init_empty_weights
# init empty model, skip layer initialization
with init_empty_weights(), warnings.catch_warnings(), \
init_llava_vision_tower(self.config):
warnings.simplefilter('ignore')
self.config.quantization_config = {
} # disable vision part quantization
model = AutoModelForCausalLM.from_config(self.config,
trust_remote_code=True)
if not self.with_llm:
del model.lm_head
del model.model.embed_tokens
del model.model.layers
del model.model.norm
else:
self.vl_model = model
# init empty vision_tower,
with init_llava_vision_tower(self.config):
vision_tower = model.get_vision_tower()
vision_tower.is_loaded = False
vision_tower.load_model()
# for llava-v1.5, the vit is not in llm ckpt
vision_tower.to(dtype=torch.half)
from accelerate import load_checkpoint_and_dispatch
with disable_logging():
load_checkpoint_and_dispatch(
model=model,
checkpoint=self.model_path,
device_map='auto' if not self.with_llm else {'': 'cpu'},
no_split_module_classes=['CLIPEncoderLayer'],
dtype=torch.half)
self.model = model.model
self.vision_tower = model.model.vision_tower.half()
self.mm_projector = model.model.mm_projector.half()
    def start_work(self):
        from threading import Thread
        # Completely independent thread, not synchronized with the engine.
        self.work_thd = Thread(target=self.encode_images_work)
        self.work_thd.start()
    def encode_images_work(self):
        # Stress loop: repeatedly run vision-tower + projector inference so
        # the illegal memory access shows up faster.
        print('encode_images_work in')
        while True:
            images = torch.rand(1, 3, 336, 336).cuda().half()
            with torch.no_grad():
                image_features = self.vision_tower(images)
                image_features = self.mm_projector(image_features)
    def encode_images(self, images: torch.Tensor) -> torch.Tensor:
        """encode images."""
        # Real path disabled for the experiment; return fake features so the
        # LLM still runs while ImageEncoder inference is bypassed.
        # image_features = self.vision_tower(images)
        # image_features = self.mm_projector(image_features)
        # return image_features
        fake = torch.rand(images.shape[0], 576, 4096).cuda().half()
        return fake
def preprocess(
self,
images: List[Image]) -> Union[torch.Tensor, List[torch.Tensor]]:
"""preprocess."""
# TODO: gpu processor
from llava.mm_utils import process_images
images = [x.convert('RGB') for x in images]
image_processor = self.vision_tower.image_processor
outputs = process_images(images, image_processor, self.config)
return outputs
@torch.no_grad()
def forward(self, images: List[Image]) -> List[torch.Tensor]:
"""forward."""
from llava.model.llava_arch import (get_anyres_image_grid_shape,
unpad_image)
image_sizes = [x.size for x in images]
images = self.preprocess(images)
if isinstance(images, list):
images = [
x.to(device=self.vision_tower.device, dtype=torch.float16)
for x in images
]
else:
images = images.to(device=self.vision_tower.device,
dtype=torch.float16)
if type(images) is list or images.ndim == 5:
if type(images) is list:
images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
concat_images = torch.cat([image for image in images], dim=0)
image_features = self.encode_images(concat_images)
split_sizes = [image.shape[0] for image in images]
image_features = torch.split(image_features, split_sizes, dim=0)
mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type',
'flat')
image_aspect_ratio = getattr(self.config, 'image_aspect_ratio',
'square')
if mm_patch_merge_type == 'flat':
image_features = [x.flatten(0, 1) for x in image_features]
elif mm_patch_merge_type.startswith('spatial'):
new_image_features = []
for image_idx, image_feature in enumerate(image_features):
if image_feature.shape[0] > 1:
base_image_feature = image_feature[0]
image_feature = image_feature[1:]
height = width = self.vision_tower.num_patches_per_side
assert height * width == base_image_feature.shape[0]
if image_aspect_ratio == 'anyres':
num_patch_width, num_patch_height = \
get_anyres_image_grid_shape(
image_sizes[image_idx],
self.config.image_grid_pinpoints,
self.vision_tower.config.image_size)
image_feature = image_feature.view(
num_patch_height, num_patch_width, height,
width, -1)
else:
raise NotImplementedError
if 'unpad' in mm_patch_merge_type:
image_feature = image_feature.permute(
4, 0, 2, 1, 3).contiguous()
image_feature = image_feature.flatten(1,
2).flatten(
2, 3)
image_feature = unpad_image(
image_feature, image_sizes[image_idx])
image_feature = torch.cat((
image_feature,
self.model.image_newline[:, None, None].expand(
*image_feature.shape[:-1], 1).to(
image_feature.device)),
dim=-1)
image_feature = image_feature.flatten(1,
2).transpose(
0, 1)
else:
image_feature = image_feature.permute(
0, 2, 1, 3, 4).contiguous()
image_feature = image_feature.flatten(0, 3)
image_feature = torch.cat(
(base_image_feature, image_feature), dim=0)
else:
image_feature = image_feature[0]
if 'unpad' in mm_patch_merge_type:
image_feature = torch.cat(
(image_feature,
self.model.image_newline[None].to(
image_feature.device)),
dim=0)
new_image_features.append(image_feature)
image_features = new_image_features
else:
raise ValueError('Unexpected mm_patch_merge_type: '
f'{self.config.mm_patch_merge_type}')
else:
image_features = self.encode_images(images)
image_features = [x for x in image_features]
return image_features
@irexyc I cloned the latest code from GitHub, built it from source, and ran this llava experiment code with self.start_work(), which starts a separate thread for inference. I got the same error.
I added a traceback to the C++ code and got two different error positions for "an illegal memory access" across experiments:
stack trace:
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__nv_bfloat16>::Finish(turbomind::GenerationState&)+0x16a
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__nv_bfloat16>::InternalThreadEntry(int)+0x982
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : ()+0x27da84
/lib64/libpthread.so.0 : ()+0x7ea5
/lib64/libc.so.6 : clone()+0x6d
terminate called after throwing an instance of 'std::runtime_error'
what(): [TM][ERROR] CUDA runtime error: an illegal memory access was encountered .../lmdeploy/src/turbomind/models/llama/LlamaBatch.h:136
Aborted (core dumped)
stack trace:
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::NcclGuard::~NcclGuard()+0x106
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__nv_bfloat16>::AllocatePersistantBuffer(unsigned long, int)+0xa0b
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__nv_bfloat16>::LlamaBatch(turbomind::EngineParams const&, int, int, turbomind::LlamaV2<__nv_bfloat16>*)+0x771
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaV2<__nv_bfloat16>::LlamaV2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, turbomind::LlamaAttentionParams const&, int, int, int, int, bool, turbomind::EngineParams const&, turbomind::LoraParams const&, std::shared_ptr<turbomind::LlamaV2<__nv_bfloat16>::SharedState>, turbomind::LlamaWeight<__nv_bfloat16>*, turbomind::NcclParam, CUstream_st*, turbomind::cublasMMWrapper*, turbomind::IAllocator*, bool, cudaDeviceProp*)+0x448
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : LlamaTritonModel<__nv_bfloat16>::createSharedModelInstance(int, int, std::pair<std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> >, std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> > >, std::shared_ptr<turbomind::AbstractCustomComm>)+0x5af
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : LlamaTritonModel<__nv_bfloat16>::createModelInstance(int, int, CUstream_st*, std::pair<std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> >, std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> > >, std::shared_ptr<turbomind::AbstractCustomComm>)+0x6bf
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : ()+0xb506d
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : ()+0xcdfa2
python : ()+0x1445a6
python : _PyObject_MakeTpCall()+0x26b
python : ()+0x150866
python : _PyEval_EvalFrameDefault()+0x4c12
python : ()+0x1506d8
python : _PyEval_EvalFrameDefault()+0x2d83
python : _PyFunction_Vectorcall()+0x6c
python : _PyEval_EvalFrameDefault()+0x72c
python : _PyFunction_Vectorcall()+0x6c
python : _PyEval_EvalFrameDefault()+0x72c
python : ()+0x150804
python : ()+0x228372
python : ()+0x228324
/lib64/libpthread.so.0 : ()+0x7ea5
/lib64/libc.so.6 : clone()+0x6d
terminate called after throwing an instance of 'std::runtime_error'
what(): [TM][ERROR] CUDA runtime error: an illegal memory access was encountered .../lmdeploy/src/turbomind/utils/nccl_utils.cc:227
Aborted (core dumped)
Thanks for investigating the problem!
Please set the environment variable TM_DEBUG_LEVEL=DEBUG before trying to get the stack trace. It synchronizes kernel launches so the trace shows the accurate position of where things go wrong.
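For example (test.py stands in for whatever script reproduces the crash; the variable just needs to be set before the process starts):
TM_DEBUG_LEVEL=DEBUG CUDA_VISIBLE_DEVICES=0 python test.py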
@lzhangzz With TM_DEBUG_LEVEL=DEBUG set, here is the log and stack trace:
[TM][DEBUG] Set logger level by DEBUG
[TM][DEBUG] turbomind::Allocator<turbomind::AllocatorType::CUDA>::Allocator(int)
[WARNING] gemm_config.in is not found; using default GEMM algo
[TM][DEBUG] turbomind::cublasMMWrapper::cublasMMWrapper(cublasHandle_t, cublasLtHandle_t, cudaStream_t, turbomind::cublasAlgoMap*, std::mutex*, turbomind::IAllocator*)
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = void; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x302000000 with size 33554432
[TM][DEBUG] turbomind::LlamaV2<T>::LlamaV2(size_t, size_t, size_t, size_t, size_t, size_t, float, const turbomind::LlamaAttentionParams&, int, int, int, int, bool, const turbomind::EngineParams&, const turbomind::LoraParams&, std::shared_ptr<turbomind::LlamaV2<T>::SharedState>, turbomind::LlamaWeight<T>*, turbomind::NcclParam, cudaStream_t, turbomind::cublasMMWrapper*, turbomind::IAllocator*, bool, cudaDeviceProp*) [with T = __half; size_t = long unsigned int; cudaStream_t = CUstream_st*]
[TM][INFO] NCCL group_id = 0
[TM][INFO] [BlockManager] block_size = 32 MB
[TM][INFO] [BlockManager] max_block_count = 755
[TM][INFO] [BlockManager] chunk_size = 755
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x304000000 with size 25333596160
[TM][DEBUG] void turbomind::LlamaBatch<T>::AllocateBuffer(size_t, size_t, int) [with T = __half; size_t = long unsigned int]
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = __half; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ea000000 with size 33619968
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = __half; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ec010000 with size 33619968
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee020000 with size 16416
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = __half; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee024200 with size 1048576
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = __half; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee124200 with size 1048576
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee224200 with size 2101248
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee425200 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee425400 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee425600 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee425800 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee425a00 with size 544
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = long unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee425e00 with size 66592
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ee436400 with size 16384000
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8ef3d6400 with size 16384000
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0376400 with size 524288
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f03f6400 with size 524288
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0476400 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0476600 with size 4202496
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = bool; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0878600 with size 128
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0878800 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0878a00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0878c00 with size 32768
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0880c00 with size 32768
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e00200 with size 32768
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e08200 with size 32768
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e10200 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e10400 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e10600 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e10800 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e10a00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = long long unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e10c00 with size 1024
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = long long unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0888c00 with size 1024
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = curandStateXORWOW; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e11000 with size 6144
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = curandStateXORWOW; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0889000 with size 6144
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f088a800 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e12800 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f088aa00 with size 2101248
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = curandStateXORWOW; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0a8ba00 with size 6144
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0a8d200 with size 2101248
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = curandStateXORWOW; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0c8e200 with size 6144
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0c8fa00 with size 2101248
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = curandStateXORWOW; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8f0e90a00 with size 6144
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f12c7c00000 with size 2101248
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e12a00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e12c00 with size 544
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = long unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e13000 with size 66560
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e23400 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e23600 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = bool; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e23800 with size 256
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e23a00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e23c00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e23e00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = bool; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e24000 with size 256
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e24200 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e24400 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e24600 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = bool; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e24800 with size 256
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e24a00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = unsigned int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f17c7e24c00 with size 512
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = int; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x7f12d7a00000 with size 2101248
[TM][DEBUG] void turbomind::ftNcclStreamSynchronize(turbomind::NcclParam, turbomind::NcclParam, cudaStream_t) start
stack trace:
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::NcclGuard::~NcclGuard()+0x106
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__half>::AllocatePersistantBuffer(unsigned long, int)+0x88b
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__half>::LlamaBatch(turbomind::EngineParams const&, int, int, turbomind::LlamaV2<__half>*)+0x771
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaV2<__half>::LlamaV2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, float, turbomind::LlamaAttentionParams const&, int, int, int, int, bool, turbomind::EngineParams const&, turbomind::LoraParams const&, std::shared_ptr<turbomind::LlamaV2<__half>::SharedState>, turbomind::LlamaWeight<__half>*, turbomind::NcclParam, CUstream_st*, turbomind::cublasMMWrapper*, turbomind::IAllocator*, bool, cudaDeviceProp*)+0x448
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : LlamaTritonModel<__half>::createSharedModelInstance(int, int, std::pair<std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> >, std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> > >, std::shared_ptr<turbomind::AbstractCustomComm>)+0x5c1
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : LlamaTritonModel<__half>::createModelInstance(int, int, CUstream_st*, std::pair<std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> >, std::vector<turbomind::NcclParam, std::allocator<turbomind::NcclParam> > >, std::shared_ptr<turbomind::AbstractCustomComm>)+0x6bf
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : ()+0xb506d
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : ()+0xcdfa2
python : ()+0x1445a6
python : _PyObject_MakeTpCall()+0x26b
python : ()+0x150866
python : _PyEval_EvalFrameDefault()+0x4c12
python : ()+0x1506d8
python : _PyEval_EvalFrameDefault()+0x2d83
python : _PyFunction_Vectorcall()+0x6c
python : _PyEval_EvalFrameDefault()+0x72c
python : _PyFunction_Vectorcall()+0x6c
python : _PyEval_EvalFrameDefault()+0x72c
python : ()+0x150804
python : ()+0x228372
python : ()+0x228324
/lib64/libpthread.so.0 : ()+0x7ea5
/lib64/libc.so.6 : clone()+0x6d
terminate called after throwing an instance of 'std::runtime_error'
what(): [TM][ERROR] CUDA runtime error: an illegal memory access was encountered .../lmdeploy/src/turbomind/utils/nccl_utils.cc:227
Aborted (core dumped)
@lzhangzz here is part of the log for the other failing case:
2024-06-24 14:08:56,256 - lmdeploy - INFO - ImageEncoder forward 1 images, cost 0.044s
2024-06-24 14:08:56,256 - lmdeploy - INFO - ImageEncoder done 1 images, left 0 images.
2024-06-24 14:08:56,257 - lmdeploy - INFO - ImageEncoder received 1 images, left 1 images.
2024-06-24 14:08:56,257 - lmdeploy - INFO - ImageEncoder process 1 images, left 0 images.
2024-06-24 14:08:56,257 - lmdeploy - INFO - prompt="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <IMAGE_TOKEN>\ndescribe this image ASSISTANT:", gen_config=EngineGenerationConfig(n=1, max_new_tokens=1, top_p=0.8, top_k=40, temperature=0.8, repetition_penalty=1.0, ignore_eos=False, random_seed=14629431361338060508, stop_words=[2], bad_words=None, min_new_tokens=None, skip_special_tokens=True, logprobs=None), prompt_token_id=[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255, 29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568, 6089, 304, 278, 5199, 29915, 29879, 5155, 29889, 3148, 1001, 29901, 29871, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29871, 13, 2783, 29581, 445, 1967, 319, 1799, 9047, 13566, 29901], adapter_name=None.
2024-06-24 14:08:56,257 - lmdeploy - INFO - session_id=7, history_tokens=0, input_tokens=622, max_new_tokens=1, seq_start=True, seq_end=True, step=0, prep=True
[TM][DEBUG] Set logger level by DEBUG
[TM][DEBUG] std::shared_ptr<std::unordered_map<std::basic_string<char>, triton::Tensor> > LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::basic_string<char>, triton::Tensor> >, turbomind::AbstractInstanceComm*) [with T = __half]
[TM][DEBUG] std::unordered_map<std::basic_string<char>, turbomind::Tensor> LlamaTritonModelInstance<T>::convert_inputs(std::shared_ptr<std::unordered_map<std::basic_string<char>, triton::Tensor> >) [with T = __half]
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8c7027c00 with size 16416
[TM][DEBUG] void* turbomind::IAllocator::reMalloc(T*, size_t, bool, bool) [with T = float; size_t = long unsigned int]
[TM][DEBUG] Cannot find buffer (nil), mallocing new one.
[TM][DEBUG] virtual void* turbomind::Allocator<turbomind::AllocatorType::CUDA>::malloc(size_t, bool, bool)
[TM][DEBUG] malloc buffer 0x8c702be00 with size 32
[TM][DEBUG] bool turbomind::TensorMap::isExist(const string&) const for key: CORRID
[TM][DEBUG] T turbomind::Tensor::getVal() const [with T = long unsigned int] start
[TM][DEBUG] getVal with type x, but data type is: u8
[TM][DEBUG] T turbomind::Tensor::getVal(size_t) const [with T = long unsigned int; size_t = long unsigned int] start
[TM][DEBUG] getVal with type x, but data type is: u8
[TM][DEBUG] bool turbomind::TensorMap::isExist(const string&) const for key: START
[TM][DEBUG] T turbomind::Tensor::getVal() const [with T = int] start
[TM][DEBUG] T turbomind::Tensor::getVal(size_t) const [with T = int; size_t = long unsigned int] start
[TM][DEBUG] bool turbomind::TensorMap::isExist(const string&) const for key: END
[TM][DEBUG] T turbomind::Tensor::getVal() const [with T = int] start
[TM][DEBUG] T turbomind::Tensor::getVal(size_t) const [with T = int; size_t = long unsigned int] start
[TM][DEBUG] bool turbomind::TensorMap::isExist(const string&) const for key: STOP
[TM][DEBUG] T turbomind::Tensor::getVal() const [with T = int] start
[TM][DEBUG] T turbomind::Tensor::getVal(size_t) const [with T = int; size_t = long unsigned int] start
[TM][INFO] [forward] Enqueue requests
[TM][INFO] [forward] Wait for requests to complete ...
stack trace:
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__half>::Finish(turbomind::GenerationState&)+0x16a
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : turbomind::LlamaBatch<__half>::InternalThreadEntry(int)+0x982
.../lmdeploy/lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so : ()+0x27da84
/lib64/libpthread.so.0 : ()+0x7ea5
/lib64/libc.so.6 : clone()+0x6d
terminate called after throwing an instance of 'std::runtime_error'
what(): [TM][ERROR] CUDA runtime error: an illegal memory access was encountered .../lmdeploy/src/turbomind/models/llama/LlamaBatch.h:136
Aborted (core dumped)
Describe the bug
We use vl_pipeline following the "Batch prompts inference" example.
We got this error:
Here is the code in LlamaBatch.h:
Reproduction
Command to reproduce: CUDA_VISIBLE_DEVICES=0 python test.py
We found that the smaller max_batch_size is set, the easier this issue is to reproduce; if the size of the prompt_list equals max_batch_size, the issue basically does not occur. We also found that it cannot be reproduced on an A100. A sketch of the kind of script we run is below.
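A minimal sketch of such a repro script, following the "Batch prompts inference" example (the model path, image URL, and prompt count are illustrative placeholders, not the exact contents of our test.py):
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
# Placeholder model and image; the real test.py uses our own inputs.
pipe = pipeline('liuhaotian/llava-v1.5-13b',
                backend_config=TurbomindEngineConfig(max_batch_size=4))
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
# A prompt_list larger than max_batch_size reproduces the issue more easily.
prompts = [('describe this image', image)] * 16
responses = pipe(prompts)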
Environment