csinva / imodelsX

Scikit-learn friendly library to interpret and prompt-engineer text datasets using large language models.
https://csinva.io/imodelsX/
MIT License

D3 consuming an excessive amount of RAM #14

Open SalvatoreRa opened 3 months ago

SalvatoreRa commented 3 months ago

Very nice work!

I was trying to run the D3 example on my VM (4 GPUs, 208 GB total RAM) and ran out of memory.

---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
Cell In[1], line 27
     11 positive_samples = [
     12     "How much in miles is a ten K run?",
     13     "When is the Jimmy Buffett concert coming to the E center in Camden NJ?",
   (...)
     16     "How old was Elvis Presley when he died?"
     17 ]
     19 negative_samples = [
     20     "What is the daily requirement of folic acid for an expectant mother?",
     21     "What type of bridge is the Golden Gate Bridge?",
   (...)
     24     "What college football team did Knute Rockne build into a power?"
     25 ]
---> 27 hypotheses, hypothesis_scores = imodelsx.explain_dataset_d3(
     28     pos=positive_samples, # List[str] of positive examples
     29     neg=negative_samples, # another List[str]
     30     num_steps=20,
     31     num_folds=2,
     32     batch_size=16,
     33 )
     35 print('learned hypotheses', hypotheses)
     36 print('corresponding scores', hypothesis_scores)

File /opt/conda/lib/python3.10/site-packages/imodelsx/d3/d3.py:89, in explain_dataset_d3(pos, neg, proposer_name, verifier_name, save_folder, num_steps, num_folds, batch_size, verbose)
     87     print('\nStep 2/3: propose hypothesis...')
     88 pos2score, neg2score = extreme_vals['pos2score'], extreme_vals['neg2score']
---> 89 proposer = init_proposer(proposer_name)
     90 proposed_hypotheses = proposer.propose_hypothesis(pos2score, neg2score)
     91 pkl.dump(proposed_hypotheses, open(os.path.join(
     92     save_folder, '02_proposed_hypotheses.pkl'), 'wb'))

File /opt/conda/lib/python3.10/site-packages/imodelsx/d3/step2_proposer.py:182, in init_proposer(proposer_name)
    180 def init_proposer(proposer_name):
    181     if proposer_name[:2] == 't5':
--> 182         return T5Proposer(proposer_name[2:])
    183     if proposer_name[:4] == 'gpt3':
    184         return T5Proposer(proposer_name[4:])

File /opt/conda/lib/python3.10/site-packages/imodelsx/d3/step2_proposer.py:123, in T5Proposer.__init__(self, model_name, verbose)
    120 if verbose:
    121     print('loading model')
    122 self.model = transformers.T5ForConditionalGeneration.from_pretrained(
--> 123     model_name).half().to(device)
    124 self.model.eval()
    125 if verbose:

File /opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py:2861, in PreTrainedModel.to(self, *args, **kwargs)
   2856     if dtype_present_in_args:
   2857         raise ValueError(
   2858             "You cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired"
   2859             " `dtype` by passing the correct `torch_dtype` argument."
   2860         )
-> 2861 return super().to(*args, **kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1174, in Module.to(self, *args, **kwargs)
   1171         else:
   1172             raise
-> 1174 return self._apply(convert)

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:780, in Module._apply(self, fn, recurse)
    778 if recurse:
    779     for module in self.children():
--> 780         module._apply(fn)
    782 def compute_should_use_set_data(tensor, tensor_applied):
    783     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    784         # If the new tensor has compatible tensor type as the existing tensor,
    785         # the current behavior is to change the tensor in-place using `.data =`,
   (...)
    790         # global flag to let the user control whether they want the future
    791         # behavior of overwriting the existing tensor or not.

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:780, in Module._apply(self, fn, recurse)
    778 if recurse:
    779     for module in self.children():
--> 780         module._apply(fn)
    782 def compute_should_use_set_data(tensor, tensor_applied):
    783     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    784         # If the new tensor has compatible tensor type as the existing tensor,
    785         # the current behavior is to change the tensor in-place using `.data =`,
   (...)
    790         # global flag to let the user control whether they want the future
    791         # behavior of overwriting the existing tensor or not.

    [... skipping similar frames: Module._apply at line 780 (4 times)]

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:780, in Module._apply(self, fn, recurse)
    778 if recurse:
    779     for module in self.children():
--> 780         module._apply(fn)
    782 def compute_should_use_set_data(tensor, tensor_applied):
    783     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    784         # If the new tensor has compatible tensor type as the existing tensor,
    785         # the current behavior is to change the tensor in-place using `.data =`,
   (...)
    790         # global flag to let the user control whether they want the future
    791         # behavior of overwriting the existing tensor or not.

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:805, in Module._apply(self, fn, recurse)
    801 # Tensors stored in modules are graph leaves, and we don't want to
    802 # track autograd history of `param_applied`, so we have to use
    803 # `with torch.no_grad():`
    804 with torch.no_grad():
--> 805     param_applied = fn(param)
    806 p_should_use_set_data = compute_should_use_set_data(param, param_applied)
    808 # subclasses may have multiple child tensors so we need to use swap_tensors

File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1160, in Module.to.<locals>.convert(t)
   1153     if convert_to_format is not None and t.dim() in (4, 5):
   1154         return t.to(
   1155             device,
   1156             dtype if t.is_floating_point() or t.is_complex() else None,
   1157             non_blocking,
   1158             memory_format=convert_to_format,
   1159         )
-> 1160     return t.to(
   1161         device,
   1162         dtype if t.is_floating_point() or t.is_complex() else None,
   1163         non_blocking,
   1164     )
   1165 except NotImplementedError as e:
   1166     if str(e) == "Cannot copy out of meta tensor; no data!":

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 4.94 MiB is free. Including non-PyTorch memory, this process has 15.76 GiB memory in use. Of the allocated memory 15.31 GiB is allocated by PyTorch, and 77.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
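
From the traceback, the failure happens while T5Proposer loads the proposer model in half precision onto a single GPU (GPU 0, 15.77 GiB, with only ~5 MiB free), so the 208 GB of system RAM never comes into play. A possible mitigation, sketched below and untested here, is to pass a smaller proposer model through the proposer_name argument that explain_dataset_d3 already exposes. The 't5' prefix routing comes from init_proposer in the traceback (it strips the leading 't5' and hands the rest to T5Proposer); the specific checkpoint 't5-small' is only an assumption, not a documented default.

import imodelsx

positive_samples = [
    "How much in miles is a ten K run?",
    "When is the Jimmy Buffett concert coming to the E center in Camden NJ?",
    "How old was Elvis Presley when he died?",
]
negative_samples = [
    "What is the daily requirement of folic acid for an expectant mother?",
    "What type of bridge is the Golden Gate Bridge?",
    "What college football team did Knute Rockne build into a power?",
]

# Untested sketch: init_proposer does T5Proposer(proposer_name[2:]) when the
# name starts with 't5', so 't5' + 't5-small' should load the much smaller
# t5-small checkpoint instead of the default proposer. Whether t5-small
# yields useful hypotheses for D3 is an assumption.
hypotheses, hypothesis_scores = imodelsx.explain_dataset_d3(
    pos=positive_samples,
    neg=negative_samples,
    proposer_name='t5' + 't5-small',
    num_steps=20,
    num_folds=2,
    batch_size=16,
)

print('learned hypotheses', hypotheses)
print('corresponding scores', hypothesis_scores)

Separately, since the error reports GPU 0 as essentially full before the 32 MiB allocation, checking whether another process already occupies that card (e.g. with nvidia-smi) and freeing it first may be enough on its own; the PYTORCH_CUDA_ALLOC_CONF suggestion in the message addresses fragmentation, which does not appear to be the problem here.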