bigscience-workshop / multilingual-modeling

BLOOM+1: Adapting BLOOM model to support a new unseen language
https://arxiv.org/abs/2212.09535
Apache License 2.0

[Bug, needs reproduction] [low priority] Weird loss behavior with `pfeiffer+inv` config #40

Closed. haileyschoelkopf closed this issue 2 years ago.

haileyschoelkopf commented 2 years ago

I'm seeing something very strange when I try to train with `pfeiffer+inv` adapters. I'll try to replicate this once some more GPUs free up on my end, and no problem if no one else can check it (it's probably hard to replicate, and I'd bet it'll mysteriously go away at some point :P).

The loss explodes to {'loss': 1604337.28, 'learning_rate': 0.0009997200000000002, 'epoch': 0.02} and then drops to 0.0 by the next logging period, at around 1k steps.

This uses the script below and the current main branch. It happens the same way both at commit f55ab013599088a35c87a880ba13a6d912e27ef4 and on the current bloom branch.
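For context, the `pfeiffer+inv` config is a Pfeiffer-style bottleneck adapter in each transformer layer plus invertible adapters at the embedding layer. As a rough illustration only (not the actual code path in madx_run_clm.py, and assuming an adapter-transformers build with BLOOM support such as the bloom branch), the equivalent setup through the public adapter-transformers API looks roughly like this:

from transformers import AutoModelForCausalLM
from transformers.adapters import AdapterConfig

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b3")

# "pfeiffer+inv": Pfeiffer bottleneck adapters per layer + invertible adapters
# at the embeddings; reduction_factor sets the bottleneck width (16 here).
config = AdapterConfig.load("pfeiffer+inv", reduction_factor=16)
model.add_adapter("de", config=config)

# Freeze the base model and mark only the adapter weights as trainable.
model.train_adapter("de")

The full launch script: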

# axis
LANG="de"
MAX_TRAIN_SAMPLES=100_000
BIGS_MODEL="bigscience/bloom-1b3" # "/users/zyong2/data/zyong2/huggingface/bigscience/tr5b-1B3-multilingual-alpha-checkpoints"
ADPT_REDUCTION_FACTOR=16
adapter_config="pfeiffer+inv"

ADPT_STRAT="emb-and-adpt"
EMB_STRAT="overlap-replace"

tokenizer_dir=./trained_tokenizers/tok_bloom-1b3_de_oscar_100000samples_24000vocab_replace
cache_dir=./cache #"/users/zyong2/data/zyong2/huggingface/"
output_dir="./KEEP_RESULTS/de/1b3-postlora-"$adapter_config"-"$MAX_TRAIN_SAMPLES"samples-"$EMB_STRAT"-"$ADPT_STRAT-"$ADPT_REDUCTION_FACTOR"reduction""
logging_dir="./KEEP_RESULTS/de/1b3-postlora-"$adapter_config"-"$MAX_TRAIN_SAMPLES"samples-"$EMB_STRAT"-"$ADPT_STRAT-"$ADPT_REDUCTION_FACTOR"reduction""
mkdir -p $output_dir
mkdir -p $logging_dir

cp ./run_clm_adpt_verify.sh $output_dir/run_clm_adpt.sh

# CUDA_VISIBLE_DEVICES=4 python ../../../dev/multilingual-modeling/scripts/lang_adapt/madx_run_clm.py \
CUDA_VISIBLE_DEVICES=4 python ./madx_run_clm.py \
    --fp16 \
    --seed 0 \
    --model_name_or_path $BIGS_MODEL \
    --tokenizer_name $tokenizer_dir \
    --dataset_name oscar \
    --cache_dir $cache_dir \
    --dataset_config_name "unshuffled_deduplicated_$LANG" \
    --logging_dir $logging_dir \
    --report_to "tensorboard" \
    --learning_rate 0.001 \
    --do_train \
    --do_eval \
    --output_dir $output_dir \
    --preprocessing_num_workers 8 \
    --overwrite_output_dir \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --per_device_eval_batch_size 1 \
    --eval_accumulation_steps 8 \
    --eval_steps 5000 \
    --logging_steps 100 \
    --evaluation_strategy "steps" \
    --max_eval_samples 5000 \
    --save_steps 25000 \
    --save_strategy "steps" \
    --max_train_samples $MAX_TRAIN_SAMPLES \
    --max_steps 50000 \
    --train_adapter \
    --lang_adapt_strategies $ADPT_STRAT \
    --embedding_strategies $EMB_STRAT \
    --adapter_reduction_factor $ADPT_REDUCTION_FACTOR \
    --adapter_config ${adapter_config} \
    --language $LANG \
    --load_best_model_at_end
    # --gradient_checkpointing
haileyschoelkopf commented 2 years ago

Thanks, Yong, for trying to reproduce this bug! It didn't reproduce, so it seems to be a (hardware?) issue on my end causing NaNs / overflow or underflow in FP16.
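In case anyone hits this again: a cheap way to confirm the FP16 overflow/underflow theory is to flag non-finite (or implausibly large) logged losses during training. The callback below is a hypothetical debugging sketch built on the standard transformers TrainerCallback API; it is not part of madx_run_clm.py:

import math

from transformers import TrainerCallback

class SuspiciousLossCallback(TrainerCallback):
    # Warn whenever the logged training loss is NaN/inf or implausibly large.
    def on_log(self, args, state, control, logs=None, **kwargs):
        loss = (logs or {}).get("loss")
        if loss is not None and (not math.isfinite(loss) or loss > 1e5):
            print(f"step {state.global_step}: suspicious loss {loss} "
                  f"(possible fp16 overflow/underflow)")

# Usage, assuming a Trainer instance named `trainer`:
# trainer.add_callback(SuspiciousLossCallback())

The other obvious sanity check is rerunning the same script with --fp16 removed (or with --bf16, if the hardware and installed transformers version support it).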