NiloufarAb opened this issue 4 weeks ago
Hello @NiloufarAb! I'm here to assist you with any bugs, questions, or contributions you may have while waiting for a human maintainer.
To run `HuggingFaceEmbedding` on multiple GPUs, you need to modify the class to support multi-GPU setups using `torch.nn.DataParallel`. The current implementation does not support this out of the box. Here is an example of how you can modify the `HuggingFaceEmbedding` class to use multiple GPUs:
```python
from typing import Any, List, Optional

import torch

# BaseEmbedding, CallbackManager, DEFAULT_EMBED_BATCH_SIZE, Pooling,
# get_pooling_mode, get_cache_dir, and infer_torch_device come from the
# original llama-index module this class lives in.


class HuggingFaceEmbedding(BaseEmbedding):
    # ... [other parts of the class]

    def __init__(
        self,
        model_name: Optional[str] = None,
        tokenizer_name: Optional[str] = None,
        pooling: Optional[str] = None,
        max_length: Optional[int] = None,
        query_instruction: Optional[str] = None,
        text_instruction: Optional[str] = None,
        normalize: bool = True,
        model: Optional[Any] = None,
        tokenizer: Optional[Any] = None,
        embed_batch_size: int = DEFAULT_EMBED_BATCH_SIZE,
        cache_folder: Optional[str] = None,
        trust_remote_code: bool = False,
        device: Optional[str] = None,
        callback_manager: Optional[CallbackManager] = None,
        parallel_process: bool = False,
        target_devices: Optional[List[str]] = None,
        **model_kwargs,
    ):
        try:
            from transformers import AutoModel, AutoTokenizer
        except ImportError:
            raise ImportError(
                "HuggingFaceEmbedding requires transformers to be installed.\n"
                "Please install transformers with `pip install transformers`."
            )

        self._device = device or infer_torch_device()
        self._parallel_process = parallel_process
        self._target_devices = target_devices

        cache_folder = cache_folder or get_cache_dir()

        if model is None:  # Use model_name with AutoModel
            model_name = (
                model_name
                if model_name is not None
                else DEFAULT_HUGGINGFACE_EMBEDDING_MODEL
            )
            model = AutoModel.from_pretrained(
                model_name, cache_dir=cache_folder, trust_remote_code=trust_remote_code
            )
        elif model_name is None:  # Extract model_name from model
            model_name = model.name_or_path
        self._model = model

        # Key modification: wrap the model in DataParallel when more than one
        # CUDA device is visible. DataParallel splits each input batch across
        # the GPUs and gathers the outputs back on the primary device.
        if torch.cuda.device_count() > 1:
            self._model = torch.nn.DataParallel(self._model)
        self._model.to(self._device)

        if tokenizer is None:  # Use tokenizer_name with AutoTokenizer
            tokenizer_name = (
                model_name or tokenizer_name or DEFAULT_HUGGINGFACE_EMBEDDING_MODEL
            )
            tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_name, cache_dir=cache_folder
            )
        elif tokenizer_name is None:  # Extract tokenizer_name from tokenizer
            tokenizer_name = tokenizer.name_or_path
        self._tokenizer = tokenizer

        if max_length is None:
            try:
                # DataParallel hides the wrapped model behind `.module`, so
                # read the config from the underlying model, not the wrapper.
                base_model = getattr(self._model, "module", self._model)
                max_length = int(base_model.config.max_position_embeddings)
            except AttributeError as exc:
                raise ValueError(
                    "Unable to find max_length from model config. "
                    "Please specify max_length."
                ) from exc

        if not pooling:
            pooling = get_pooling_mode(model_name)
        try:
            pooling = Pooling(pooling)
        except ValueError as exc:
            raise NotImplementedError(
                f"Pooling {pooling} unsupported, please pick one in"
                f" {[p.value for p in Pooling]}."
            ) from exc

        super().__init__(
            embed_batch_size=embed_batch_size,
            callback_manager=callback_manager,
            model_name=model_name,
            tokenizer_name=tokenizer_name,
            max_length=max_length,
            pooling=pooling,
            normalize=normalize,
            query_instruction=query_instruction,
            text_instruction=text_instruction,
        )

    # ... [other methods of the class]
```
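With this change, usage stays the same as with the stock class. A minimal sketch (the model name below is illustrative, not required):

```python
# Hypothetical usage of the modified class; any HuggingFace embedding
# model name works here.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
embedding = embed_model.get_text_embedding("Hello, multi-GPU world!")
print(len(embedding))  # dimensionality of the embedding vector
```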
This modification uses `torch.nn.DataParallel` to distribute the model across multiple GPUs. Note that `DataParallel` is a simple way to parallelize across multiple GPUs but may not be the most efficient for all use cases. For more advanced parallelism, consider using `torch.nn.parallel.DistributedDataParallel` [1][2].
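For reference, sentence-transformers exposes the same multi-GPU behavior directly through its multi-process pool, which is what the `parallel_process`/`target_devices` options are modeled on in recent llama-index versions. A minimal sketch of that API (the model name is illustrative; wrap in a main guard because worker processes are spawned):

```python
from sentence_transformers import SentenceTransformer

if __name__ == "__main__":
    # One worker process is spawned per target device.
    model = SentenceTransformer("BAAI/bge-small-en-v1.5")
    pool = model.start_multi_process_pool(
        target_devices=["cuda:0", "cuda:1", "cuda:2", "cuda:3"]
    )

    sentences = ["first document", "second document"] * 1000
    embeddings = model.encode_multi_process(sentences, pool, batch_size=32)
    print(embeddings.shape)

    model.stop_multi_process_pool(pool)
```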
Question
I'm trying to load an embedding model from HuggingFace onto multiple available GPUs using this code:

```python
embed_model = HuggingFaceEmbedding(
    self._base_model,
    parallel_process=True,
    target_devices=["cuda:0", "cuda:1", "cuda:2", "cuda:3"],
)
```

However, it still uses only the first GPU and throws a CUDA out-of-memory error. Could someone help me with that?