I'm encountering the following error when running my code (see below) on multiple GPUs (it works fine on a single GPU).

stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
RuntimeError: Tensors must be CUDA and dense

My code:

from dataclasses import dataclass, field
from typing import Optional

import torch
import tyro
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline, HfArgumentParser

from trl import (
from trl.core import LengthSampler
from PIL import Image


model_name = "meta-llama/Llama-2-7b-chat-hf"

def hex_string_to_hex(hex_string):
    """Normalize a string of hex values into space-separated byte tokens.

    Commas and ``0x`` prefixes are treated as separators, the remaining text
    is split on whitespace, and every single-digit token is left-padded with
    a ``"0"`` so that it becomes a two-digit byte (``"f"`` -> ``"0f"``).

    Args:
        hex_string: Text such as ``"0x1, 0x2a"`` or ``"1 2a"``.

    Returns:
        The normalized tokens joined by single spaces; an empty string when
        the input contains no tokens.
    """
    hex_values = hex_string.replace(",", " ").replace("0x", " ")

    sections = hex_values.split()  # Split the string into sections

    # Add a leading zero to single-digit sections and collect every section.
    # NOTE(review): sections longer than two characters are kept unchanged;
    # confirm whether they should instead be rejected or split into bytes.
    result = []
    for section in sections:
        if len(section) == 1:
            section = "0" + section
        result.append(section)

    # Join the sections back together with spaces
    return " ".join(result)

class ScriptArguments:
    ppo_config: PPOConfig = field(
        default_factory=lambda: PPOConfig(
            # query_dataset="imdb",
            # reward_model="sentiment-analysis:lvwerra/distilbert-imdb",
    use_peft: bool = True
    """whether to use peft"""
    peft_config: Optional[LoraConfig] = field(
        default_factory=lambda: LoraConfig(
    trust_remote_code: bool = field(
        default=True, metadata={"help": "Enable `trust_remote_code`"}

args = tyro.cli(ScriptArguments)

# We then define the arguments to pass to the sentiment analysis pipeline.
# We set `return_all_scores` to True to get the score for every label, not just the top one.
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

# Below is an example function to build the dataset. In our case, we use the IMDB dataset
# from the `datasets` library. One should customize this function to train the model on
# its own dataset.
def build_dataset(
    tokenizer_name=model_name, input_min_text_length=2, input_max_text_length=8
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

        dataloader (``):
            The dataloader for the dataset.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = ds = load_dataset(

    # ds = ds.rename_columns({"text": "review"})
    # ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["context"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds =, batched=False)
    return ds

def collator(data):
    """Turn a list of sample dicts into a single dict of lists.

    The keys are taken from the first sample; each key maps to the list of
    that key's values across all samples, in their original order.
    """
    first_sample = data[0]
    batch = {}
    for key in first_sample:
        batch[key] = [sample[key] for sample in data]
    return batch

def main():
    # We retrieve the dataloader by calling the `build_dataset` function.
    dataset = build_dataset()

    # set seed before initializing value head for deterministic eval

    # Now let's build the model, the reference model, and the tokenizer.
    if args.use_peft:
        peft_config = args.peft_config
        ref_model = None
        # Copy the model to each device
        device_map = {"": Accelerator().local_process_index}
        ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
            args.ppo_config.model_name, trust_remote_code=args.trust_remote_code
        device_map = None
        peft_config = None

    model = AutoModelForCausalLMWithValueHead.from_pretrained(

    tokenizer = AutoTokenizer.from_pretrained(args.ppo_config.model_name)

    # Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # We then build the PPOTrainer, passing the model, the reference model, the tokenizer
    ppo_trainer = PPOTrainer(

    # We then build the sentiment analysis pipeline, passing the model name and the
    # sentiment analysis pipeline arguments. Let's also make sure to set the device
    # to the same device as the PPOTrainer.
    device = ppo_trainer.accelerator.device
    if ppo_trainer.accelerator.num_processes == 1:
        device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
    # ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
    # task, model_name = args.ppo_config.reward_model.split(":")

    # if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
    #     with ds_plugin.zero3_init_context_manager(enable=False):
    #         sentiment_pipe = pipeline(task, model=model_name, device=device)
    # else:
    #     sentiment_pipe = pipeline(task, model=model_name, device=device)

    # # Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
    # if sentiment_pipe.tokenizer.pad_token_id is None:
    #     sentiment_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id

    # if sentiment_pipe.model.config.pad_token_id is None:
    #     sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id

    # We then define the arguments to pass to the `generate` function. These arguments
    # are passed to the `generate` function of the PPOTrainer, which is a wrapper around
    # the `generate` function of the trained model.
    generation_kwargs = {
        "min_length": -1,
        "top_k": 0.0,
        "top_p": 1.0,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "max_new_tokens": 32,

    for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
        query_tensors = batch["input_ids"]

        # Get response from the policy model
        response_tensors = ppo_trainer.generate(
            # generate_ref_response=False,
        batch["response"] = tokenizer.batch_decode(response_tensors)
        # batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)

        # Compute sentiment score
        # texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        # pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
        # rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

        rewards = []
        for r in zip(batch["response"]):
            # Todo:: seeds rename
                with open("./seeds/" + str(epoch), "wb") as file:
      "./seeds/" + str(epoch))
            except Exception as e:

        rewards = [torch.tensor(reward) for reward in rewards]

        # rewards = [torch.tensor(1.0) for i in batch["response"]]
        # ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
        # ref_pipe_outputs = sentiment_pipe(ref_texts, **sent_kwargs)
        # ref_rewards = [torch.tensor(output[1]["score"]) for output in ref_pipe_outputs]

        # batch["ref_rewards"] = ref_rewards

        # Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

if __name__ == "__main__":


younesbelkada commented 10 months ago

Hi @harrison4ride What is the TRL version you are using and can you share the full traceback of the issue? 🙏

harrison4ride commented 10 months ago

Thank you for your reply. My TRL version is 0.7.1, and the full traceback is:

harrison4ride commented 10 months ago

I fixed this error by upgrading TRL to version 0.7.2. However, I then ran into another problem: GPU memory usage increases after each step, eventually causing an OOM error. Is it normal for GPU memory usage to increase at every step? And are there any ways to reduce memory usage besides reducing `batch_size`?

fedem96 commented 9 months ago

> But I encountered another error after each step the GPU memory usage increased causing OOM, is it normal the GPU memory usage increase in each step? And are there any ways to reduce the memory usage beside reduce the batch_size.

I'm also having OOM issues from trl 0.7.2 (also 0.7.3 and 0.7.4). Only versions up to 0.7.1 don't have this memory issue

harrison4ride commented 9 months ago

> > But I encountered another error after each step the GPU memory usage increased causing OOM, is it normal the GPU memory usage increase in each step? And are there any ways to reduce the memory usage beside reduce the batch_size.

> I'm also having OOM issues from trl 0.7.2 (also 0.7.3 and 0.7.4). Only versions up to 0.7.1 don't have this memory issue

Good to know. I fixed the OOM by emptying the CUDA cache after each step with `torch.cuda.empty_cache()`.