OpenMOSS / MOSS

An open-source tool-augmented conversational language model from Fudan University
https://txsun1997.github.io/blogs/moss.html
Apache License 2.0

How to fix "cannot find ./sft_data/train.jsonl" #282

Open liujiabin20810 opened 1 year ago

liujiabin20810 commented 1 year ago

We can convert the dataset in SFT_data/conversations to ./sft_data/train.jsonl.

Here is the code:

# prepare_moss_sft.py
"""
Prepare the train and val datasets for MOSS SFT.

This script does not generate train.jsonl; instead it saves the train and val
datasets to train_data and val_data, which can also be loaded by the SFTDataset class.
"""
import os
import json
import copy
import torch
from transformers import AutoTokenizer

def tokenize_dataset(file, tokenizer):
    """
    Read a JSON conversation file and tokenize it.
    """
    data = []
    no_loss_spans = []
    with open(file, "r") as f:
        try:
            sample = json.load(f)
            chat = sample['chat']
            num_turns = int(sample['num_turns'])

            meta_instruction = sample['meta_instruction']
            instruction_ids = tokenizer.encode(meta_instruction)
            assert isinstance(instruction_ids, list) and len(instruction_ids) > 0

            input_ids = copy.deepcopy(instruction_ids)
            _no_loss_spans = [(0, len(instruction_ids))]

            for i in range(num_turns):
                cur_turn_ids = []
                cur_no_loss_spans = []
                cur_turn = chat[f'turn_{i+1}']
                for key, value in cur_turn.items():

                    cur_ids = tokenizer.encode(value)

                    if key == 'Tool Responses':
                        # Only the tool-response content is excluded from the loss; the
                        # format tokens (<|Results|>:...<eor>\n) should keep their losses.
                        cur_no_loss_spans.append((len(input_ids + cur_turn_ids) + 5, len(input_ids + cur_turn_ids + cur_ids) - 2))

                    assert isinstance(cur_ids, list) and len(cur_ids) > 0

                    cur_turn_ids.extend(cur_ids)

                # Stop once the context would exceed 2048 tokens; drop the remaining turns.
                if len(input_ids + cur_turn_ids) > 2048:
                    break

                input_ids.extend(cur_turn_ids)
                _no_loss_spans.extend(cur_no_loss_spans)

            assert len(input_ids) > 0 and len(input_ids) <= 2048

            data = input_ids
            no_loss_spans = _no_loss_spans
        except json.JSONDecodeError as e:
            print(f"Error parsing {file_path}: {e}")

    return data, no_loss_spans

def combine_json(dir, tokenizer, split):
    files = []
    # there may be several JSON files in dir
    for item in os.listdir(dir):
        item_path = os.path.join(dir, item)
        if os.path.isfile(item_path) and item_path.endswith(".json"):  # only json file
            files.append(item_path)

    data = []
    no_loss_spans = []
    for file in files:
        # each file becomes one sample
        _data, _no_loss_spans = tokenize_dataset(file, tokenizer)
        data.append(_data)
        no_loss_spans.append(_no_loss_spans)

    # split data to train data and val data
    train_len = int(len(data) * split)
    train_data = data[:train_len]
    val_data = data[train_len:]

    train_len = int(len(no_loss_spans) * split)
    train_no_loss_spans =  no_loss_spans[:train_len]
    val_no_loss_spans =  no_loss_spans[train_len:]

    print("train len:", len(train_data), len(train_no_loss_spans))
    print("val len:", len(val_data), len(val_no_loss_spans))

    return train_data, train_no_loss_spans, val_data, val_no_loss_spans

def prepare_dataset(data_dir, out_dir, tokenizer, split=0.85):
    '''
    Parse all JSON files under data_dir and save the tokenized train/val splits to out_dir.
    '''
    train_data = []
    train_no_loss_spans = []
    val_data = []
    val_no_loss_spans = []
    for root, dirs, _ in os.walk(data_dir):
        for dir in dirs:
            dir =  os.path.join(root, dir)
            print(dir)
            _train_data, _train_no_loss_spans, _val_data, _val_no_loss_spans = combine_json(dir, tokenizer, split)
            train_data += _train_data
            val_data += _val_data
            train_no_loss_spans += _train_no_loss_spans
            val_no_loss_spans += _val_no_loss_spans

    os.makedirs(out_dir, exist_ok=True)  # out_dir may not exist yet

    data_type = ['train', 'val']
    data_file = os.path.join(out_dir, f'{data_type[0]}_data')
    no_loss_spans_file = os.path.join(out_dir, f'{data_type[0]}_no_loss_spans')

    torch.save(train_data, data_file)
    torch.save(train_no_loss_spans, no_loss_spans_file)

    data_file = os.path.join(out_dir, f'{data_type[1]}_data')
    no_loss_spans_file = os.path.join(out_dir, f'{data_type[1]}_no_loss_spans')
    torch.save(val_data, data_file)
    torch.save(val_no_loss_spans, no_loss_spans_file)

    print("train samples:", len(train_data), len(train_no_loss_spans))
    print("val samples:", len(val_data), len(val_no_loss_spans))

if __name__ == "__main__":
    # tokenizer
    model_name_or_path = "fnlp/moss-moon-003-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    tokenizer.eos_token_id = 106068  # the base model's eos_token_id is 106028; we need to map the eos token to <eom> (token id 106068)

    indir = "./SFT_data/conversations/conversation_without_plugins/"
    outdir = "./sft_data/"
    prepare_dataset(indir, outdir, tokenizer)

This method does not generate train.jsonl; instead it saves the train and val datasets to train_data and val_data, which can also be loaded by the SFTDataset class.
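For reference, here is a minimal sketch of how those saved files could be consumed, assuming the layout produced above (one token-id list plus one span list per sample). The class name and details are illustrative, not the repository's exact SFTDataset; it masks the no-loss spans with -100, the default ignore index of PyTorch's cross-entropy loss.

# sft_dataset_sketch.py -- illustrative only, not the exact SFTDataset in finetune_moss.py
import torch
from torch.utils.data import Dataset

class SimpleSFTDataset(Dataset):
    """Load the train_data / train_no_loss_spans files saved by prepare_moss_sft.py."""
    def __init__(self, data_file, no_loss_spans_file):
        self.data = torch.load(data_file)                    # list of token-id lists
        self.no_loss_spans = torch.load(no_loss_spans_file)  # list of (start, end) span lists

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.data[idx], dtype=torch.long)
        labels = input_ids.clone()
        # Spans such as the meta instruction and tool-response content
        # should not contribute to the loss (-100 is ignored by CrossEntropyLoss).
        for start, end in self.no_loss_spans[idx]:
            labels[start:end] = -100
        return input_ids, labels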

usun1997 commented 1 year ago

You said 'We can convert the dataset in SFT_data/conversations to ./sft_data/train.jsonl. Here is the code:' at the beginning, and then you said 'This method does not generate train.jsonl; instead it saves the train and val datasets to train_data and val_data, which can also be loaded by the SFTDataset class.' wtf?

liujiabin20810 commented 1 year ago

> You said 'We can convert the dataset in SFT_data/conversations to ./sft_data/train.jsonl. Here is the code:' at the beginning, and then you said 'This method does not generate train.jsonl; instead it saves the train and val datasets to train_data and val_data, which can also be loaded by the SFTDataset class.' wtf?

Maybe I didn't explain clearly. The load_data method in finetune_moss.py has two ways to load data: one reads the dataset from a jsonl file, extracts the content, and saves it to train_data and val_data; the other reads directly from the train_data and val_data saved earlier.

The code above saves train_data and val_data, not train.jsonl.
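A rough sketch of that branching logic follows; the function name and signature here are assumptions for illustration, not the exact code in finetune_moss.py:

# Illustrative sketch of load_data's two paths; names and signature are assumed.
import os
import torch

def load_data(data_dir, split):
    data_file = os.path.join(data_dir, f'{split}_data')
    no_loss_spans_file = os.path.join(data_dir, f'{split}_no_loss_spans')
    if os.path.exists(data_file) and os.path.exists(no_loss_spans_file):
        # Path 1: read the tokenized tensors saved by prepare_moss_sft.py.
        data = torch.load(data_file)
        no_loss_spans = torch.load(no_loss_spans_file)
    else:
        # Path 2: parse {split}.jsonl and tokenize it -- this is the branch
        # that fails with "cannot find ./sft_data/train.jsonl".
        raise FileNotFoundError(f"{split}.jsonl not found in {data_dir}")
    return data, no_loss_spans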

usun1997 commented 1 year ago

> > You said 'We can convert the dataset in SFT_data/conversations to ./sft_data/train.jsonl. Here is the code:' at the beginning, and then you said 'This method does not generate train.jsonl; instead it saves the train and val datasets to train_data and val_data, which can also be loaded by the SFTDataset class.' wtf?
>
> Maybe I didn't explain clearly. The load_data method in finetune_moss.py has two ways to load data: one reads the dataset from a jsonl file, extracts the content, and saves it to train_data and val_data; the other reads directly from the train_data and val_data saved earlier.
>
> The code above saves train_data and val_data, not train.jsonl.

I got you. I have successfully generated the train_data and val_data and finetuned my custom model. I would say that your shared code is extremely helpful! Thanks a lot!