ssalawu opened this issue 4 years ago
Same problem here. Does anyone know how to solve it?
In the end, I just moved to a Linux machine for the multi-GPU setup. On Windows, I had to use a machine with a single GPU, since even setting the number of GPUs to use to 1 didn't work in my Windows setup.
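For anyone on Windows who wants to try the single-GPU route first anyway, here is a minimal sketch of one way to pin a single device. The `CUDA_VISIBLE_DEVICES` approach is a general PyTorch/CUDA technique, not something specific to fast-bert, and in my setup it still did not make the error go away:

```python
# Hide all but the first GPU before torch initializes CUDA,
# then disable fast-bert's multi-GPU path explicitly.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # must be set before the first CUDA call

import torch

device = torch.device("cuda")        # now refers to the single visible GPU
print(torch.cuda.device_count())     # should report 1
multi_gpu = False                    # pass this to BertLMDataBunch / BertLMLearner
```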
I get the above error when performing fine-tuning with fast-bert. My environment is Windows 10 (2 GPUs), Python 3.7, Fast-Bert 1.6.5, CUDA 10.1. The code I'm using is below:
```python
import logging
from pathlib import Path

import pandas as pd
import torch
from box import Box
from fast_bert.data_lm import BertLMDataBunch
from fast_bert.learner_lm import BertLMLearner

# set parameters
logger = logging.getLogger()
device = torch.device("cuda")

# read custom text for fine-tuning
df = pd.read_csv('data/text.csv')
texts = df['Tweet'].astype(str).tolist()

args = Box({
    "seed": 42,
    "task_name": 'twitter_cyberbullying',
    "model_name": 'bert-base-uncased',
    "model_type": 'bert',
    "train_batch_size": 16,
    "learning_rate": 4e-5,
    "num_train_epochs": 20,
    "fp16": True,
    "fp16_opt_level": "O2",
    "warmup_steps": 1000,
    "logging_steps": 0,
    "max_seq_length": 512,
    "multi_gpu": True if torch.cuda.device_count() > 1 else False
})

DATA_PATH = Path('../lm_data/')
LOG_PATH = Path('../logs')
MODEL_PATH = Path('../lmmodel{}/'.format(args.model_type))

DATA_PATH.mkdir(exist_ok=True)
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)

# build a language-model databunch from the raw corpus
databunch_lm = BertLMDataBunch.from_raw_corpus(
    data_dir=DATA_PATH,
    text_list=texts,
    tokenizer=args.model_name,
    batch_size_per_gpu=args.train_batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    model_type=args.model_type,
    logger=logger)

# create the LM learner from the pretrained model
learner = BertLMLearner.from_pretrained_model(
    dataBunch=databunch_lm,
    pretrained_path=args.model_name,
    output_dir=MODEL_PATH,
    metrics=[],
    device=device,
    logger=logger,
    multi_gpu=args.multi_gpu,
    logging_steps=args.logging_steps,
    fp16_opt_level=args.fp16_opt_level)

learner.fit(epochs=6,
            lr=6e-5,
            validate=True,  # evaluate the model after each epoch
            schedule_type="warmup_cosine",
            optimizer_type="lamb")

learner.save_model()
```
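If you want one script that runs on both platforms, a hedged variant of the `multi_gpu` flag that simply never enables multi-GPU on Windows (the platform check is my own addition, not part of fast-bert):

```python
import platform

import torch

# Only use fast-bert's multi-GPU path when not on Windows,
# where it never worked in my setup.
use_multi_gpu = torch.cuda.device_count() > 1 and platform.system() != "Windows"
print("multi_gpu:", use_multi_gpu)
```

Then pass `use_multi_gpu` as the `multi_gpu` argument to `BertLMDataBunch.from_raw_corpus` and `BertLMLearner.from_pretrained_model` instead of `args.multi_gpu`.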