Closed bltcn closed 1 year ago
可能24G显存微调不是太够?我也不确定。你用qlora训练试试呢,看是否报错
和数据量有关系吗?样例数据是能跑的,但是我用30w数据微调就不行了
哦,目前实现的数据集是一下子全读进内存。你可能需要改一下写法:`__init__` 的时候只记录图片路径,等到 `__getitem__` 的时候再把图片读到内存里处理。
谢谢,是 FewShotDataset 的 `__init__` 吧,我这样修改是否正确?
class FewShotDataset(Dataset):
    """Image + prompt/answer dataset that loads images lazily.

    The constructor tokenizes every sample up front but stores only the
    image *path*; the image file is opened and run through ``processor``
    in ``__getitem__``. This keeps memory use independent of the number of
    images in the dataset.
    """

    def __init__(self, path, processor, tokenizer, args):
        """Read the JSON sample list at ``path`` and pre-tokenize the text.

        Args:
            path: JSON file containing a list of items, each with the keys
                ``'img'`` (image path), ``'prompt'`` and ``'label'``.
            processor: Callable applied to an RGB PIL image in
                ``__getitem__``.
            tokenizer: Object providing ``encode``,
                ``build_inputs_with_special_tokens``, ``pad_token_id`` and
                ``bos_token_id``.
            args: Object providing ``max_source_length``,
                ``max_target_length``, ``image_length`` and
                ``ignore_pad_token_for_loss``.
        """
        max_seq_length = args.max_source_length + args.max_target_length
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Kept on the instance so __getitem__ can preprocess on demand.
        self.processor = processor
        self.images = []
        self.input_ids = []
        self.labels = []

        # The "<img>" prefix and the image placeholder slots are identical
        # for every sample, so build them once. Doing it before the loop
        # also means pre_image is defined for an empty dataset.
        input0 = tokenizer.encode("<img>", add_special_tokens=False)
        input1 = [tokenizer.pad_token_id] * args.image_length
        self.pre_image = len(input0)

        for item in data:
            input2 = tokenizer.encode("</img>问:"+item['prompt']+"\n答:", add_special_tokens=False)
            a_ids = input0 + input1 + input2
            b_ids = tokenizer.encode(text=item['label'], add_special_tokens=False)

            # Truncate so that prompt + answer + the special tokens added
            # below fit into max_seq_length.
            a_ids = a_ids[: args.max_source_length - 1]
            b_ids = b_ids[: args.max_target_length - 2]

            input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)
            # Everything before the BOS token is context and is excluded
            # from the loss via the -100 label.
            context_length = input_ids.index(tokenizer.bos_token_id)
            labels = [-100] * context_length + input_ids[context_length:]

            pad_len = max_seq_length - len(input_ids)
            input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
            labels = labels + [tokenizer.pad_token_id] * pad_len
            if args.ignore_pad_token_for_loss:
                labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]

            # Store only the path; the image is loaded in __getitem__.
            self.images.append(item['img'])
            self.input_ids.append(input_ids)
            self.labels.append(labels)

    def __len__(self):
        """Return the number of samples."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load and preprocess the image for sample ``idx`` and return it
        together with the pre-tokenized ``input_ids`` and ``labels``."""
        # Open the file only now, and close it as soon as it is converted.
        with Image.open(self.images[idx]) as img:
            image = self.processor(img.convert('RGB'))
        return {
            "image": image,
            "input_ids": self.input_ids[idx],
            "labels": self.labels[idx],
            "pre_image": self.pre_image
        }
环境:nvidia a10 24g显存,docker:nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04,cpu:Intel® Xeon® Silver 4314×2,mem:256G