nlpyang / PreSumm

Code for the EMNLP 2019 paper Text Summarization with Pretrained Encoders
MIT License

Step 4. Format to Simpler Json Files #98


fatmalearning commented 4 years ago

From Step 3 I preprocessed my own dataset and obtained the json files for Step 4 (Format to Simpler Json Files). There I get this error:

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Abstractive/pre-dataset/maping_data/mapping_valid.txt'

How can I get the urls (mapping) files for my own dataset? Could you help me?

shashankMadan-designEsthetics commented 4 years ago

Same problem here. I don't think they have coded the preprocessing for any data except CNN/DailyMail.
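
The files it cannot find are the CNN/DailyMail url lists (mapping_train.txt, mapping_valid.txt, mapping_test.txt) that the stock format_to_lines reads from -map_path to decide which story file belongs to which split, so they simply do not exist for a custom corpus. One workaround is to split the Step 3 json files yourself. A rough sketch of that idea (the helper name and the 90/5/5 ratios are my own choices, not anything from the repo):

import glob
import random
from os.path import join as pjoin

def custom_split(raw_path, ratios=(0.9, 0.05, 0.05), seed=42):
    # Randomly assign the tokenized json files to train/valid/test
    # instead of looking them up in the CNN/DM mapping files.
    files = sorted(glob.glob(pjoin(raw_path, '*.json')))
    random.Random(seed).shuffle(files)
    n_train = int(len(files) * ratios[0])
    n_valid = int(len(files) * ratios[1])
    return {'train': files[:n_train],
            'valid': files[n_train:n_train + n_valid],
            'test': files[n_train + n_valid:]}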

v-zmiycharov commented 4 years ago

Same problem here.

shashankMadan-designEsthetics commented 4 years ago
# Drop these functions into src/prepro/data_builder.py; they reuse its
# existing module-level imports (gc, glob, json, torch, pjoin, Pool,
# logger) and its helpers BertData, greedy_selection and _format_to_lines.
def custom_format_to_lines(args):
    # Like format_to_lines(), but without the CNN/DM url mapping files:
    # every json file under args.raw_path is treated as training data.
    train_files = []
    for f in glob.glob(pjoin(args.raw_path, '*.json')):
        train_files.append(f)

    corpora = {'train': train_files}
    for corpus_type in ['train']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0  # shard counter
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            # flush a full shard to disk
            if (len(dataset) > args.shard_size):
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                    p_ct += 1
                    dataset = []

        pool.close()
        pool.join()
        # flush the last, partially filled shard
        if (len(dataset) > 0):
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []
def custom_format_to_bert(args):
    # Like format_to_bert(), but defaults to the 'train' split only
    # (or to a single split named via -dataset).
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train']
    for corpus_type in datasets:
        a_lst = []
        # Match every shard written by custom_format_to_lines():
        # <prefix>.train.0.json, <prefix>.train.1.json, ...
        for json_f in glob.glob(pjoin(args.raw_path, '*' + corpus_type + '.[0-9]*.json')):
            real_name = json_f.split('/')[-1]
            a_lst.append((corpus_type, json_f, args,
                          pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()

def _format_to_bert(params):
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        source, tgt = d['src'], d['tgt']

        sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer, is_test=is_test)

        if (b_data is None):
            continue
        src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        b_data_dict = {"src": src_subtoken_idxs, "tgt": tgt_subtoken_idxs,
                       "src_sent_labels": sent_labels, "segs": segments_ids, 'clss': cls_ids,
                       'src_txt': src_txt, "tgt_txt": tgt_txt}
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    datasets = []
    gc.collect()

I wrote some custom functions (just edited the existing ones to my own needs); you should add them to data_builder.py.

I also noticed an argument called args.dataset. I haven't personally used it, but see whether it fits your use case; you can read the source of data_builder.py to learn more.
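
For reference, preprocess.py dispatches on -mode via eval('data_builder.' + args.mode + '(args)'), so once the functions above are in data_builder.py they should be callable like the stock steps, something along these lines (all paths are placeholders):

python preprocess.py -mode custom_format_to_lines -raw_path JSON_PATH -save_path SHARD_PREFIX -shard_size 2000 -n_cpus 4 -log_file ../logs/custom.log
python preprocess.py -mode custom_format_to_bert -raw_path SHARD_DIR -save_path BERT_DATA_PATH -lower -n_cpus 4 -log_file ../logs/custom.log

Note that custom_format_to_lines only produces train shards; to get valid/test shards as well you could swap corpora = {'train': train_files} for a three-way split (e.g. the custom_split sketch in my earlier comment) and loop over ['train', 'valid', 'test'].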

Tobias289 commented 4 years ago

Hi, I'm trying to build a dataset in Swedish and I'm running into some problems with the preprocess.py code and the changes above. Could you point me in the right direction as to where encoding might be an issue in the changes you made?
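
My best guess so far is the plain open() calls, which fall back to the platform's default encoding; a UTF-8-safe variant of the I/O in the functions above (assuming the Step 3 output is UTF-8) would be:

# read each shard explicitly as UTF-8 instead of the locale default
jobs = json.load(open(json_file, encoding='utf-8'))

# write shards as UTF-8; ensure_ascii=False keeps characters such as
# å/ä/ö readable instead of escaping them to \uXXXX
with open(pt_file, 'w', encoding='utf-8') as save:
    save.write(json.dumps(dataset, ensure_ascii=False))

But I'm not sure that covers everything.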

Dod-o commented 4 years ago

Same problem.

They only consider CNN/DM...

LusianaSiahaan commented 2 years ago

@shashankMadan-designEsthetics does your modified code only generate train files?