Open asifehmad opened 1 year ago
try setting this up os.getenv('WORLD_SIZE', '1')
try setting this up os.getenv('WORLD_SIZE', '1')
Hi, I used it something like this:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import deepspeed
# Repro script from the report: load Llama-2-7b-chat in fp16, wrap it with
# DeepSpeed inference, and time a single generate() call.
#
# os.getenv returns a string, so world_size is a str here (the fallback is the
# string '2'); it is passed to mp_size below without conversion to int.
world_size = os.getenv('WORLD_SIZE', '2')
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_fast=True)
# NOTE(review): device_map='balanced' presumably spreads the layers over all
# visible GPUs inside every launched process, which would be consistent with
# the cuda:0 / cuda:1 device-mismatch error reported below -- confirm against
# the Accelerate device_map documentation.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True,device_map='balanced')
ds_model = deepspeed.init_inference(
model=model,
# mp_size receives the string held in world_size, not an integer.
mp_size=world_size,
dtype=torch.float16,
replace_method="auto",
replace_with_kernel_inject=True)
import time
# The timer starts after the model has been loaded, so the reported time
# covers tokenization, generation, decoding and the print of the output.
start = time.time()
prompt = 'What is the Capital of France? '
# The literal below is a plain string, not an f-string: the word "prompt" is
# encoded verbatim and the `prompt` variable above is never used.
inputs = tokenizer.encode("<human>: prompt \n<bot>:", return_tensors='pt').to(model.device)
outputs = ds_model.generate(inputs, max_new_tokens=256)
output_str = tokenizer.decode(outputs[0])
print(output_str)
end = time.time()
print('Inference Time is:', end - start)
I put the code in model.py
and ran it using deepspeed --num_gpus 2 model.py
but got this error:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
[2023-08-15 09:03:03,284] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 3411
[2023-08-15 09:03:03,581] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 3412
[2023-08-15 09:03:03,582] [ERROR] [launch.py:321:sigkill_handler] ['/usr/bin/python', '-u', 'model.py', '--local_rank=1']
exits with return code = 1
I also tried setting world_size = os.getenv('WORLD_SIZE', '1'),
but got the same error as above.
try setting this up os.getenv('WORLD_SIZE', '1')
Okay, so it works now for this script:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import deepspeed
import time
# Revised script: both environment values are converted to int before use.
# LOCAL_RANK falls back to 0 and WORLD_SIZE to 1 when the variables are unset.
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_fast=True)
# device_map is the integer local rank here (the first script used 'balanced').
# NOTE(review): presumably this places the whole model on this process's own
# GPU before DeepSpeed wraps it -- confirm against the Accelerate docs.
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True,device_map=local_rank)
ds_model = deepspeed.init_inference(
model=model, mp_size=world_size, dtype=torch.float16, replace_method="auto", replace_with_kernel_inject=True)
# The timer starts after model loading, so only tokenization, generation,
# decoding and the print of the output are measured.
start = time.time()
prompt = 'Write a detailed note on AI'
# f-string: the prompt variable is interpolated into the chat template.
inputs = tokenizer.encode(f"<human>: {prompt} \n<bot>:", return_tensors='pt').to(model.device)
outputs = ds_model.generate(inputs, max_new_tokens=2000)
output_str = tokenizer.decode(outputs[0])
# Both prints are unconditional (there is no check on local_rank), so every
# process that runs this script prints its own copy of the output.
# NOTE(review): this looks like the source of the duplicated response reported
# below when launched with --num_gpus 2 -- confirm.
print(output_str)
end = time.time()
print('Inference Time is:', end - start)
It is giving me two responses instead of one, this is a new problem I am facing.
Hello again, I came here from discuss.huggingface :)
How can I modify this code to use this solution? Which part should I change to run on 2 GPUs?
from huggingface_hub import hf_hub_download
import torch
import os
from open_flamingo import create_model_and_transforms
from accelerate import Accelerator
from einops import repeat
from PIL import Image
import sys
sys.path.append('..')
from src.utils import FlamingoProcessor
from demo_utils import image_paths, clean_generation
def main():
    """Run the Med-Flamingo few-shot demo and print the generated answer.

    Builds the Flamingo model from a local Llama checkpoint, loads the
    Med-Flamingo weights downloaded from the Hugging Face Hub, prepares the
    model with ``accelerate``, encodes the demo images together with a
    few-shot prompt, and generates up to 10 new tokens.

    Raises:
        ValueError: if the local Llama path does not exist.
    """
    accelerator = Accelerator() #when using cpu: cpu=True
    device = accelerator.device

    print('Loading model..')

    # >>> add your local path to Llama-7B (v1) model here:
    llama_path = '../models/llama-7b-hf'
    if not os.path.exists(llama_path):
        raise ValueError('Llama model not yet set up, please check README for instructions!')

    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
        clip_vision_encoder_pretrained="openai",
        lang_encoder_path=llama_path,
        tokenizer_path=llama_path,
        cross_attn_every_n_layers=4
    )

    # load med-flamingo checkpoint:
    checkpoint_path = hf_hub_download("med-flamingo/med-flamingo", "model.pt")
    print(f'Downloaded Med-Flamingo checkpoint to {checkpoint_path}')
    # strict=False: keys that do not match between the checkpoint and the
    # model are tolerated instead of raising.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device), strict=False)
    processor = FlamingoProcessor(tokenizer, image_processor)

    # go into eval model and prepare:
    model = accelerator.prepare(model)
    # Kept from the original script; the flag is not read anywhere below.
    is_main_process = accelerator.is_main_process
    model.eval()

    # Step 1: Load images
    demo_images = [Image.open(path) for path in image_paths]

    # Step 2: Define multimodal few-shot prompt
    # example few-shot prompt:
    prompt = "You are a helpful medical assistant. You are being provided with images, a question about the image and an answer. Follow the examples and answer the last question. <image>Question: What is/are the structure near/in the middle of the brain? Answer: pons.<|endofchunk|><image>Question: Is there evidence of a right apical pneumothorax on this chest x-ray? Answer: yes.<|endofchunk|><image>Question: Is/Are there air in the patient's peritoneal cavity? Answer: no.<|endofchunk|><image>Question: Does the heart appear enlarged? Answer: yes.<|endofchunk|><image>Question: What side are the infarcts located? Answer: bilateral.<|endofchunk|><image>Question: Which image modality is this? Answer: mr flair.<|endofchunk|><image>Question: Where is the largest mass located in the cerebellum? Answer:"

    # Step 3: Preprocess data
    print('Preprocess data')
    pixels = processor.preprocess_images(demo_images)
    # Add a leading batch axis (b=1) and a T axis (T=1) to the image tensor.
    pixels = repeat(pixels, 'N c h w -> b N T c h w', b=1, T=1)
    tokenized_data = processor.encode_text(prompt)

    # Step 4: Generate response
    # actually run few-shot prompt through model:
    print('Generate from multimodal few-shot prompt')
    generated_text = model.generate(
        vision_x=pixels.to(device),
        lang_x=tokenized_data["input_ids"].to(device),
        attention_mask=tokenized_data["attention_mask"].to(device),
        max_new_tokens=10,
    )
    response = processor.tokenizer.decode(generated_text[0])
    response = clean_generation(response)

    print(f'{response=}')


if __name__ == "__main__":
    main()
Hello again, I came here from discuss.huggingface :)
How can I modify this code to use this solution? Which part should I change to run on 2 GPUs?
from huggingface_hub import hf_hub_download import torch import os from open_flamingo import create_model_and_transforms from accelerate import Accelerator from einops import repeat from PIL import Image import sys sys.path.append('..') from src.utils import FlamingoProcessor from demo_utils import image_paths, clean_generation
def main(): accelerator = Accelerator() #when using cpu: cpu=True
device = accelerator.device print('Loading model..') # >>> add your local path to Llama-7B (v1) model here: llama_path = '../models/llama-7b-hf' if not os.path.exists(llama_path): raise ValueError('Llama model not yet set up, please check README for instructions!') model, image_processor, tokenizer = create_model_and_transforms( clip_vision_encoder_path="ViT-L-14", clip_vision_encoder_pretrained="openai", lang_encoder_path=llama_path, tokenizer_path=llama_path, cross_attn_every_n_layers=4 ) # load med-flamingo checkpoint: checkpoint_path = hf_hub_download("med-flamingo/med-flamingo", "model.pt") print(f'Downloaded Med-Flamingo checkpoint to {checkpoint_path}') model.load_state_dict(torch.load(checkpoint_path, map_location=device), strict=False) processor = FlamingoProcessor(tokenizer, image_processor) # go into eval model and prepare: model = accelerator.prepare(model) is_main_process = accelerator.is_main_process model.eval() """ Step 1: Load images """ demo_images = [Image.open(path) for path in image_paths] """ Step 2: Define multimodal few-shot prompt """ # example few-shot prompt: prompt = "You are a helpful medical assistant. You are being provided with images, a question about the image and an answer. Follow the examples and answer the last question. <image>Question: What is/are the structure near/in the middle of the brain? Answer: pons.<|endofchunk|><image>Question: Is there evidence of a right apical pneumothorax on this chest x-ray? Answer: yes.<|endofchunk|><image>Question: Is/Are there air in the patient's peritoneal cavity? Answer: no.<|endofchunk|><image>Question: Does the heart appear enlarged? Answer: yes.<|endofchunk|><image>Question: What side are the infarcts located? Answer: bilateral.<|endofchunk|><image>Question: Which image modality is this? Answer: mr flair.<|endofchunk|><image>Question: Where is the largest mass located in the cerebellum? 
Answer:" """ Step 3: Preprocess data """ print('Preprocess data') pixels = processor.preprocess_images(demo_images) pixels = repeat(pixels, 'N c h w -> b N T c h w', b=1, T=1) tokenized_data = processor.encode_text(prompt) """ Step 4: Generate response """ # actually run few-shot prompt through model: print('Generate from multimodal few-shot prompt') generated_text = model.generate( vision_x=pixels.to(device), lang_x=tokenized_data["input_ids"].to(device), attention_mask=tokenized_data["attention_mask"].to(device), max_new_tokens=10, ) response = processor.tokenizer.decode(generated_text[0]) response = clean_generation(response) print(f'{response=}')
if __name__ == "__main__":
main()
Hi, can you write your code again in the proper format?
Hi, can you write your code again in the proper format?
Hello, I've just edited my comment to write code in the proper format.
Hi, I am facing the same problem as you; this is my code:
from transformers import FuyuForCausalLM, AutoTokenizer, FuyuProcessor, FuyuImageProcessor
from PIL import Image
import os
from accelerate import Accelerator
import deepspeed
import time
# Fuyu-8b captioning script: load the model from a local directory, prepare it
# with accelerate, run one image + text prompt through generate(), and decode
# the last 7 generated tokens.
pretrained_path = "./fuyu-8b"
accelerator = Accelerator()
device = accelerator.device
# NOTE(review): device_map="auto" is passed to the tokenizer here, while the
# model below is loaded without any device_map -- confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(pretrained_path,use_fast=True,device_map="auto")
image_processor = FuyuImageProcessor()
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
model = FuyuForCausalLM.from_pretrained(pretrained_path)
model = accelerator.prepare(model)
# Not read anywhere later in this snippet.
is_main_process = accelerator.is_local_main_process
model.eval()
text_prompt = "Generate a coco-style caption.\n"
image_path = "./1163433113_img.jpg" # https://huggingface.co/adept-hf-collab/fuyu-8b/blob/main/bus.png
image_pil = Image.open(image_path)
model_inputs = processor(text=text_prompt, images=[image_pil], device=device)
# Move every processor output onto the accelerator's device before generation.
for k, v in model_inputs.items():
    model_inputs[k] = v.to(device)
generation_output = model.generate(**model_inputs, max_new_tokens=7)
# Slice off the last 7 positions, matching max_new_tokens above.
generation_text = processor.batch_decode(generation_output[:, -7:], skip_special_tokens=True)
My machine contains 4 GPUs (A10); I still get this error. Please help me.
try setting this up os.getenv('WORLD_SIZE', '1')
Actually, I have 4 GPUs on the machine; why can I only use one?
Hi, I loaded this model from HF using the code below on 2xA100s:
Then I loaded it using deepspeed.init_inference() as:
and then for inference I used:
Unfortunately, I got this error:
I tried setting mp_size to 1 too, but in vain.
Any guidance/help would be highly appreciated, thanks in anticipation!