add dataset prepare module

Ready to merge; this shouldn't introduce any breaking changes. `wikipedia-nq/run.py` can now use datasets hosted on Hugging Face directly.

# Training example: run.py with --do_train, using the dataset name
# Tevatron/wikipedia-nq/train (hosted on Hugging Face); output goes to temp_out.
CUDA_VISIBLE_DEVICES=0 python run.py \
  --do_train \
  --model_name_or_path bert-base-uncased \
  --dataset_name Tevatron/wikipedia-nq/train \
  --output_dir temp_out \
  --save_steps 20000 \
  --num_train_epochs 1 \
  --learning_rate 1e-5 \
  --per_device_train_batch_size 2 \
  --train_n_passages 2 \
  --q_max_len 32 \
  --p_max_len 156 \
  --fp16

# Encoding example: run.py with --do_encode on the dataset name
# Tevatron/wikipedia-nq/corpus, shard index 0 of 50 shards, with the
# encoded output path set to temp_out/000.pt.
CUDA_VISIBLE_DEVICES=0 python run.py \
  --do_encode \
  --model_name_or_path bert-base-uncased \
  --dataset_name Tevatron/wikipedia-nq/corpus \
  --output_dir temp_out \
  --encoded_save_path temp_out/000.pt \
  --encode_num_shard 50 \
  --encode_shard_index 0 \
  --per_device_eval_batch_size 156 \
  --fp16

luyug / Dense

add dataset prepare module #2