I have tried to execute the code on two machines: one with 14 GB and the other with 28 GB of RAM. The result is:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-10-443b3ed80ee7> in <module>
3
4 # Pull all of the data
----> 5 oj_sales_files = OjSalesSimulated.get_file_dataset()
6
After further investigation, it does not seem to be a memory problem when profiling the memory of the Compute Instance.
It looks like a parsing problem:
MemoryError Traceback (most recent call last)
in
3
4 # Pull all of the data
----> 5 oj_sales_files = OjSalesSimulated.get_file_dataset()
6
7 # Pull only the first `dataset_maxfiles` files
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/opendatasets/_oj_sales_simulated.py in get_file_dataset(cls, enable_telemetry)
33 open_datasets = cls._get_open_dataset(
34 enable_telemetry=enable_telemetry)
---> 35 return open_datasets.get_file_dataset()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/opendatasets/accessories/open_dataset_base.py in _instance_get_file_dataset(self, start_date, end_date, enable_telemetry, **kwargs)
289 if DefaultArgKey.ENDDATE.value in kwargs:
290 kwargs.pop(DefaultArgKey.ENDDATE.value)
--> 291 return self.__class__.get_file_dataset(start_date, end_date, enable_telemetry, **kwargs)
292
293 def _to_spark_dataframe(self):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/opendatasets/accessories/_loggerfactory.py in wrapper(*args, **kwargs)
138 with _LoggerFactory.track_activity(logger, func.__name__, activity_type, custom_dimensions) as al:
139 try:
--> 140 return func(*args, **kwargs)
141 except Exception as e:
142 al.activity_info['error_message'] = str(e)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/opendatasets/accessories/open_dataset_base.py in get_file_dataset(cls, start_date, end_date, enable_telemetry, **kwargs)
266 if end_date:
267 kwargs[DefaultArgKey.ENDDATE.value] = end_date
--> 268 return cls._blob_accessor.get_file_dataset(**kwargs)
269
270 def _instance_get_file_dataset(
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/opendatasets/dataaccess/_blob_accessor.py in get_file_dataset(self, **kwargs)
151 properties["opendatasets"] = self.id
152 ds = FileDataset._create(self.get_file_dataflow(
--> 153 **kwargs), properties=properties)
154 ds._telemetry_info = _DatasetTelemetryInfo(
155 entry_point='PythonSDK:OpenDataset')
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/opendatasets/dataaccess/_blob_accessor.py in get_file_dataflow(self, **kwargs)
158 def get_file_dataflow(self, **kwargs) -> dprep.Dataflow:
159 self._check_dataprep()
--> 160 dflow = dprep.Dataflow.get_files(self.get_urls(**kwargs))
161 dflow = self._filter_file_dataflow(dflow, **kwargs)
162 # skip for now and wait for DataPrep Official release to studio
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py in get_files(path)
2396 """
2397 Expands the path specified by reading globs and files in folders and outputs one record per file found.
-> 2398
2399 :param path: The path or paths to expand.
2400 :return: A new Dataflow.
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py in _path_to_get_files_block(path, archive_options)
2469
2470 self._set_values_to_find(replace_dict, find)
-> 2471
2472 if replace_with is None:
2473 replace_dict['replaceWithType'] = FieldType.NULL
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/dataflow.py in _get_files(path, archive_options)
2490 error_replace_with = str(error_replace_with) if error_replace_with is not None else None
2491 return self.add_step('Microsoft.DPrep.ReplaceBlock', {
-> 2492 'columns': column_selection_to_selector_value(columns),
2493 'valueToFindType': replace_dict['valueToFindType'],
2494 'stringValueToFind': replace_dict['stringValueToFind'],
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py in get_engine_api()
17 if not _engine_api:
18 _engine_api = EngineAPI()
---> 19
20 from .._dataset_resolver import register_dataset_resolver
21 register_dataset_resolver(_engine_api.requests_channel)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py in __init__(self)
118 return typedefinitions.ExecuteInspectorCommonResponse.from_pod(response) if response is not None else None
119
--> 120 @update_aml_env_vars(get_engine_api)
121 def execute_inspectors(self, message_args: List[typedefinitions.ExecuteInspectorsMessageArguments], cancellation_token: CancellationToken = None) -> Dict[str, typedefinitions.ExecuteInspectorCommonResponse]:
122 response = self._message_channel.send_message('Engine.ExecuteInspectors', message_args, cancellation_token)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py in connect_to_requests_channel()
105 @update_aml_env_vars(get_engine_api)
106 def execute_anonymous_activity(self, message_args: typedefinitions.ExecuteAnonymousActivityMessageArguments, cancellation_token: CancellationToken = None) -> None:
--> 107 response = self._message_channel.send_message('Engine.ExecuteActivity', message_args, cancellation_token)
108 return response
109
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/_aml_helper.py in wrapper(op_code, message, cancellation_token)
36 if len(changed) > 0:
37 engine_api_func().update_environment_variable(changed)
---> 38 return send_message_func(op_code, message, cancellation_token)
39
40 return wrapper
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/api.py in sync_host_secret(self, message_args, cancellation_token)
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py in send_message(self, op_code, message, cancellation_token)
273 self._process = self._process_opener()
274 self._renew_response_thread()
--> 275 self._renew_wait_thread()
276 _LoggerFactory.trace(log, 'MultiThreadMessageChannel_create_engine', { 'engine_pid': self._process.pid } )
277 with self._messages_lock:
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py in process_responses()
221 self._responses_thread = Thread(target=process_responses, daemon=True)
222 self._responses_thread.start()
--> 223
224 def on_relaunch(self, callback: Callable[[], None]):
225 self._relaunch_callback = callback
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/azureml/dataprep/api/engineapi/engine.py in _read_response(self, caller)
146 parsed = json.loads(string)
147 finally:
--> 148 if parsed is None: # Exception is being thrown
149 print('Line read from engine could not be parsed as JSON. Line:')
150 try:
MemoryError: Engine process terminated. This is most likely due to system running out of memory. Please retry with increased memory. |session_id=10efdc4c-45af-4701-ba6c-bbc5ee225681
# Trying a workaround:
Following the documentation for the OjSalesSimulated dataset (https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/open-datasets/dataset-oj-sales-simulated.md), I tried the following:
`oj_sales_files = OjSalesSimulated.get_file_dataset(num_files=10)`
```
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
3
4 # Pull all of the data
----> 5 oj_sales_files = OjSalesSimulated.get_file_dataset(num_files=10)
6
7 # Pull only the first `dataset_maxfiles` files
TypeError: get_file_dataset() got an unexpected keyword argument 'num_files'
```
The method does not accept the `num_files` parameter.
If the library does not support the `num_files` parameter, what is the recommended hardware for downloading the dataset?
I was trying to follow these instructions about training many models in Azure Machine Learning: https://github.com/microsoft/solution-accelerator-many-models
When trying to prepare the data and pull all of the data inside the dataset, a memory error is raised when executing (in Python):
oj_sales_files = OjSalesSimulated.get_file_dataset()
It is a memory exception:
I have tried to execute the code on two machines: one with 14 GB and the other with 28 GB of RAM. The result is:
After further investigation, it does not seem to be a memory problem when profiling the memory of the Compute Instance.
It looks like a parsing problem:
MemoryError Traceback (most recent call last)