Closed chbensch closed 3 years ago
Therefore, when I try to concatenate larger datasets (5× 35 GB datasets), I also get an out-of-memory error, since over 90 GB of swap space was in use at the time of the crash:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-6-9766d77530b9> in <module>
20 print(file_name)
21 cv_batch = load_from_disk(file_name)
---> 22 cv_sampled_train = concatenate_datasets([cv_sampled_train, cv_batch])
23
24 print("Saving to disk!")
C:\ProgramData\Anaconda3\lib\site-packages\datasets\arrow_dataset.py in concatenate_datasets(dsets, info, split, axis)
2891
2892 # Concatenate tables
-> 2893 table = concat_tables([dset._data for dset in dsets if len(dset._data) > 0], axis=axis)
2894 table = update_metadata_with_features(table, None)
2895
C:\ProgramData\Anaconda3\lib\site-packages\datasets\table.py in concat_tables(tables, axis)
837 if len(tables) == 1:
838 return tables[0]
--> 839 return ConcatenationTable.from_tables(tables, axis=axis)
840
841
C:\ProgramData\Anaconda3\lib\site-packages\datasets\table.py in from_tables(cls, tables, axis)
697 return result
698
--> 699 blocks = to_blocks(tables[0])
700 for table in tables[1:]:
701 table_blocks = to_blocks(table)
C:\ProgramData\Anaconda3\lib\site-packages\datasets\table.py in to_blocks(table)
669 return [[InMemoryTable(table)]]
670 elif isinstance(table, ConcatenationTable):
--> 671 return copy.deepcopy(table.blocks)
672 else:
673 return [[table]]
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_list(x, memo, deepcopy)
203 append = y.append
204 for a in x:
--> 205 append(deepcopy(a, memo))
206 return y
207 d[list] = _deepcopy_list
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_list(x, memo, deepcopy)
203 append = y.append
204 for a in x:
--> 205 append(deepcopy(a, memo))
206 return y
207 d[list] = _deepcopy_list
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
151 copier = getattr(x, "__deepcopy__", None)
152 if copier is not None:
--> 153 y = copier(memo)
154 else:
155 reductor = dispatch_table.get(cls)
C:\ProgramData\Anaconda3\lib\site-packages\datasets\table.py in __deepcopy__(self, memo)
143 # by adding it to the memo, self.table won't be copied
144 memo[id(self.table)] = self.table
--> 145 return _deepcopy(self, memo)
146
147 def __getstate__(self):
C:\ProgramData\Anaconda3\lib\site-packages\datasets\table.py in _deepcopy(x, memo)
62 memo[id(x)] = result
63 for k, v in x.__dict__.items():
---> 64 setattr(result, k, copy.deepcopy(v, memo))
65 return result
66
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_list(x, memo, deepcopy)
203 append = y.append
204 for a in x:
--> 205 append(deepcopy(a, memo))
206 return y
207 d[list] = _deepcopy_list
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
173
174 # If is its own copy, don't memoize.
C:\ProgramData\Anaconda3\lib\copy.py in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
262 if deep and args:
263 args = (deepcopy(arg, memo) for arg in args)
--> 264 y = func(*args)
265 if deep:
266 memo[id(x)] = y
C:\ProgramData\Anaconda3\lib\copy.py in <genexpr>(.0)
261 deep = memo is not None
262 if deep and args:
--> 263 args = (deepcopy(arg, memo) for arg in args)
264 y = func(*args)
265 if deep:
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_list(x, memo, deepcopy)
203 append = y.append
204 for a in x:
--> 205 append(deepcopy(a, memo))
206 return y
207 d[list] = _deepcopy_list
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
170 y = x
171 else:
--> 172 y = _reconstruct(x, memo, *rv)
173
174 # If is its own copy, don't memoize.
C:\ProgramData\Anaconda3\lib\copy.py in _reconstruct(x, memo, func, args, state, listiter, dictiter, deepcopy)
262 if deep and args:
263 args = (deepcopy(arg, memo) for arg in args)
--> 264 y = func(*args)
265 if deep:
266 memo[id(x)] = y
C:\ProgramData\Anaconda3\lib\copy.py in <genexpr>(.0)
261 deep = memo is not None
262 if deep and args:
--> 263 args = (deepcopy(arg, memo) for arg in args)
264 y = func(*args)
265 if deep:
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_tuple(x, memo, deepcopy)
208
209 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 210 y = [deepcopy(a, memo) for a in x]
211 # We're not going to put the tuple in the memo, but it's still important we
212 # check for it, in case the tuple contains recursive mutable structures.
C:\ProgramData\Anaconda3\lib\copy.py in <listcomp>(.0)
208
209 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 210 y = [deepcopy(a, memo) for a in x]
211 # We're not going to put the tuple in the memo, but it's still important we
212 # check for it, in case the tuple contains recursive mutable structures.
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_list(x, memo, deepcopy)
203 append = y.append
204 for a in x:
--> 205 append(deepcopy(a, memo))
206 return y
207 d[list] = _deepcopy_list
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_tuple(x, memo, deepcopy)
208
209 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 210 y = [deepcopy(a, memo) for a in x]
211 # We're not going to put the tuple in the memo, but it's still important we
212 # check for it, in case the tuple contains recursive mutable structures.
C:\ProgramData\Anaconda3\lib\copy.py in <listcomp>(.0)
208
209 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 210 y = [deepcopy(a, memo) for a in x]
211 # We're not going to put the tuple in the memo, but it's still important we
212 # check for it, in case the tuple contains recursive mutable structures.
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
144 copier = _deepcopy_dispatch.get(cls)
145 if copier is not None:
--> 146 y = copier(x, memo)
147 else:
148 if issubclass(cls, type):
C:\ProgramData\Anaconda3\lib\copy.py in _deepcopy_list(x, memo, deepcopy)
203 append = y.append
204 for a in x:
--> 205 append(deepcopy(a, memo))
206 return y
207 d[list] = _deepcopy_list
C:\ProgramData\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
159 reductor = getattr(x, "__reduce_ex__", None)
160 if reductor is not None:
--> 161 rv = reductor(4)
162 else:
163 reductor = getattr(x, "__reduce__", None)
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\io.pxi in pyarrow.lib.Buffer.__reduce_ex__()
C:\ProgramData\Anaconda3\lib\site-packages\pyarrow\io.pxi in pyarrow.lib.Buffer.to_pybytes()
MemoryError:
Hi! This looks like an important issue. Let me try to reproduce it. Cc @samsontmr — this might be related to the memory issue you reported in #2134.
@lhoestq Just went to open a similar issue.
It seems like deep copying the dataset object (tested on master) writes the table's record batches (dset._data._batches) into RAM.
To find the bug, I modified the _deepcopy function in table.py as follows:
def _deepcopy(x, memo: dict):
"""deepcopy a regular class instance"""
import psutil # pip install this package
import time
cls = x.__class__
result = cls.__new__(cls)
memo[id(x)] = result
for k, v in x.__dict__.items():
print("="* 50)
print("Current memory:", psutil.virtual_memory().percent)
print(f"Saving object {k} with value {v}")
setattr(result, k, copy.deepcopy(v, memo))
time.sleep(5)
print("Memory after copy:", psutil.virtual_memory().percent)
return result
Test script:
import copy
from datasets import load_dataset
bk = load_dataset("bookcorpus", split="train")
bk_copy = copy.deepcopy(bk)
Thanks for the insights @mariosasko ! I'm working on a fix. Since this is a big issue I'll make a patch release as soon as this is fixed
Hi @samsontmr @TaskManager91, the fix is on the master branch — feel free to install datasets from source and let us know if you still have issues.
We just released datasets 1.6.2, which includes the fix :)
Thanks, it works like a charm! :)
Describe the bug
When I try to concatenate 2 datasets (10 GB each), the entire dataset is loaded into memory instead of being written directly to disk.
Interestingly, this happens both when trying to save the new dataset to disk and when concatenating it again.
Steps to reproduce the bug
Expected results
The data should be loaded into memory in batches and then saved directly to disk.
Actual results
The entire dataset is loaded into memory and then saved to disk.
Versions
Paste the output of the following code: