Closed ZouJoelin closed 1 year ago
Hi, thank you for your interest in our work.
Yes, you need to skip removed images since they are not owned by us (they are owned by Reddit users), and we cannot directly distribute them.
Below is sample code for the data/mpchat_nrp.py file. It skips any example that contains a removed image, whether that is the dialogue image or one of the persona images.
class MpchatClipClipNrpDataset(Dataset):
    """NRP examples read from ``mpchat_nrp.json``, skipping missing images.

    One example is built for every turn written by a dialog's
    ``main_author``. A dialog contributes no examples at all when any of
    its persona image files, or its own dialog image file, is absent from
    disk.
    """

    def __init__(self,
                 args,
                 tokenizer,
                 clip_processor,
                 mode):
        """Load the ``mode`` split and collect its examples in ``self.examples``.

        Args:
            args: Object providing ``dialog_data_dir`` (directory holding
                ``mpchat_nrp.json``), ``persona_image_data_dir`` and
                ``dialog_image_data_dir``.
            tokenizer: Accepted for interface compatibility; not used by
                this constructor.
            clip_processor: Stored on the instance as ``self.clip_processor``.
            mode: One of ``'train'``, ``'val'`` or ``'test'``; selects the
                top-level key read from the JSON file.

        Raises:
            AssertionError: If ``mode`` is not a known split, or if an
                evaluation turn's response is not the first entry of its
                candidate response list.
        """
        super(MpchatClipClipNrpDataset, self).__init__()
        assert mode in ['train', 'val', 'test']

        self.args = args
        self.clip_processor = clip_processor
        self.mode = mode
        self.examples = []

        # JSON is UTF-8 by specification; do not rely on the locale default.
        json_path = os.path.join(args.dialog_data_dir, 'mpchat_nrp.json')
        with open(json_path, 'r', encoding='utf-8') as fp:
            data = json.load(fp)[f'{mode}']

        # Running index stored in each evaluation example; it only counts
        # examples that were actually kept.
        num_examples = 0
        for dialog in data:
            main_author = dialog['main_author']
            turn_indices = [
                turn_idx
                for turn_idx, author in enumerate(dialog['authors'])
                if author == main_author
            ]
            if not turn_indices:
                continue

            # Everything below depends only on the dialog, not on the turn,
            # so it is computed (and the disk is checked) once per dialog.
            personas = dialog['candidate_personas']
            persona_sentences = ' '.join([f"{x['title']}" for x in personas])
            persona_fpaths = [
                os.path.join(args.persona_image_data_dir, x['file_name'])
                for x in personas
            ]
            if dialog['has_image']:
                dialog_fpath = os.path.join(args.dialog_image_data_dir,
                                            dialog['file_name'])
            else:
                # Empty string marks a dialog without an image.
                dialog_fpath = ''

            # Skip the dialog if any image does not exist.
            if not all(os.path.exists(fpath) for fpath in persona_fpaths):
                continue
            if dialog_fpath != '' and not os.path.exists(dialog_fpath):
                continue

            for turn_idx in turn_indices:
                context = ' '.join(dialog['messages'][:turn_idx])
                response = dialog['messages'][turn_idx]
                # Each example gets its own list so examples never alias
                # one another's persona paths.
                turn_persona_fpaths = list(persona_fpaths)

                if mode == 'train':
                    self.examples.append((context, response, dialog_fpath,
                                          persona_sentences,
                                          turn_persona_fpaths, mode))
                else:
                    candidates = dialog['nrp_candidate_responses'][turn_idx]
                    assert response == candidates[0]
                    self.examples.append((context, candidates, dialog_fpath,
                                          persona_sentences,
                                          turn_persona_fpaths,
                                          num_examples, 0, mode))
                    num_examples += 1

        print(f'num. of {mode} dataset: {len(self.examples)}')
Please let me know if you have any questions.
Great, it works excellently. I really appreciate your help. BTW, I am very impressed with your efficiency. Also, I ran into a "CUDA out of memory" error after that, for which I will open another issue. Could you give me a hand with that too?
Hello, how should I deal with removed images? May I add code to skip data with removed images? If so, where and how should I add such code?