Closed: technocreep closed this issue 1 week ago
I'm going to follow these steps to help you solve the GitHub issue:
1. Merge the `load_from_tsfile_to_dataframe` method from `sktime.datasets` and the `_load_from_tsfile_to_dataframe` method in the `DataLoader` class into a single method.
2. Refactor the `read_train_test_files` method in the `DataLoader` class to extract the code for reading different file formats (`.tsv`, `.txt`, `.ts`, `.arff`) into separate methods.
3. Refactor the `load_data` method in the `DataLoader` class to extract the code for downloading and extracting data into a separate method.

Here are the changes we decided to make. I'm done making edits and now I'm just validating the changes using a linter to catch any mistakes like syntax errors or undefined variables:
fedot_ind/tools/loader.py
1. Remove the `_load_from_tsfile_to_dataframe` method from the `DataLoader` class since it is a duplicate of the `load_from_tsfile_to_dataframe` method from `sktime.datasets`. Update … [instruction truncated]
---
+++
@@ -1,439 +0,0 @@
- def _load_from_tsfile_to_dataframe(
- self,
- full_file_path_and_name,
- return_separate_X_and_y=True,
- replace_missing_vals_with='NaN'):
- """Loads data from a .ts file into a Pandas DataFrame.
- Taken from https://github.com/ChangWeiTan/TS-Extrinsic-Regression/blob/master/utils/data_loader.py
-
- Args:
- full_file_path_and_name: The full pathname of the .ts file to read. return_separate_X_and_y: true if X
- and Y values should be returned as separate Data Frames (X) and a numpy array (y),
- false otherwise.
- replace_missing_vals_with: The value that missing values in the text file should be replaced with prior to
- parsing.
-
- Returns:
- If ``return_separate_X_and_y`` then a tuple containing a DataFrame and a numpy array containing the
- relevant time-series and corresponding class values. If not ``return_separate_X_and_y`` then a single
- DataFrame containing all time-series and (if relevant) a column ``class_vals`` the associated class values.
-
- """
- # Initialize flags and variables used when parsing the file
- metadata_started = False
- data_started = False
- has_problem_name_tag = False
- has_timestamps_tag = False
- has_univariate_tag = False
- has_class_labels_tag = False
- has_target_labels_tag = False
- has_data_tag = False
- previous_timestamp_was_float = None
- previous_timestamp_was_int = None
- previous_timestamp_was_timestamp = None
- num_dimensions = None
- is_first_case = True
- instance_list = []
- class_val_list = []
- line_num = 0
- TsFileParseException = Exception
- encoding = self.predict_encoding(full_file_path_and_name)
- with open(full_file_path_and_name, 'r', encoding=encoding) as file:
- dataset_name = os.path.basename(full_file_path_and_name)
- for line in tqdm(
- file.readlines(),
- desc='Loading data',
- leave=False,
- postfix=dataset_name,
- unit='lines'):
- line = line.strip().lower()
- if line:
- if line.startswith("@problemname"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len == 1:
- raise TsFileParseException(
- "problemname tag requires an associated value")
- has_problem_name_tag = True
- metadata_started = True
- elif line.startswith("@timestamps"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len != 2:
- raise TsFileParseException(
- "timestamps tag requires an associated Boolean value")
- elif tokens[1] == "true":
- timestamps = True
- elif tokens[1] == "false":
- timestamps = False
- else:
- raise TsFileParseException(
- "invalid timestamps value")
- has_timestamps_tag = True
- metadata_started = True
- elif line.startswith("@univariate"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len != 2:
- raise TsFileParseException(
- "univariate tag requires an associated Boolean value")
- elif tokens[1] == "true":
- pass
- elif tokens[1] == "false":
- pass
- else:
- raise TsFileParseException(
- "invalid univariate value")
- has_univariate_tag = True
- metadata_started = True
- elif line.startswith("@classlabel"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len == 1:
- raise TsFileParseException(
- "classlabel tag requires an associated Boolean value")
- if tokens[1] == "true":
- class_labels = True
- elif tokens[1] == "false":
- class_labels = False
- else:
- raise TsFileParseException(
- "invalid classLabel value")
- if token_len == 2 and class_labels:
- raise TsFileParseException(
- "if the classlabel tag is true then class values must be supplied")
- has_class_labels_tag = True
- class_label_list = [token.strip()
- for token in tokens[2:]]
- metadata_started = True
- elif line.startswith("@targetlabel"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len == 1:
- raise TsFileParseException(
- "targetlabel tag requires an associated Boolean value")
- if tokens[1] == "true":
- target_labels = True
- elif tokens[1] == "false":
- target_labels = False
- else:
- raise TsFileParseException(
- "invalid targetLabel value")
- has_target_labels_tag = True
- class_val_list = []
- metadata_started = True
- elif line.startswith("@data"):
- if line != "@data":
- raise TsFileParseException(
- "data tag should not have an associated value")
- if data_started and not metadata_started:
- raise TsFileParseException(
- "metadata must come before data")
- else:
- has_data_tag = True
- data_started = True
- elif data_started:
- incomplete_regression_meta_data = not has_problem_name_tag or not has_timestamps_tag or \
- not has_univariate_tag or not has_target_labels_tag or \
- not has_data_tag
- incomplete_classification_meta_data = \
- not has_problem_name_tag or not has_timestamps_tag \
- or not has_univariate_tag or not has_class_labels_tag \
- or not has_data_tag
- if incomplete_regression_meta_data and incomplete_classification_meta_data:
- raise TsFileParseException(
- "a full set of metadata has not been provided before the data")
- line = line.replace("?", replace_missing_vals_with)
- if timestamps:
- has_another_value = False
- has_another_dimension = False
- timestamps_for_dimension = []
- values_for_dimension = []
- this_line_num_dimensions = 0
- line_len = len(line)
- char_num = 0
- while char_num < line_len:
- while char_num < line_len and str.isspace(
- line[char_num]):
- char_num += 1
- if char_num < line_len:
- if line[char_num] == ":":
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- instance_list[this_line_num_dimensions].append(
- pd.Series())
- this_line_num_dimensions += 1
- has_another_value = False
- has_another_dimension = True
- timestamps_for_dimension = []
- values_for_dimension = []
- char_num += 1
- else:
- if line[char_num] != "(" and target_labels:
- class_val = line[char_num:].strip()
- class_val_list.append(
- float(class_val))
- char_num = line_len
- has_another_value = False
- has_another_dimension = False
- timestamps_for_dimension = []
- values_for_dimension = []
- else:
- if line[char_num] != "(" and not target_labels:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " does not start with a '('")
- char_num += 1
- tuple_data = ""
- while char_num < line_len and line[char_num] != ")":
- tuple_data += line[char_num]
- char_num += 1
- if char_num >= line_len or line[char_num] != ")":
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " does not end with a ')'")
- char_num += 1
- while char_num < line_len and str.isspace(
- line[char_num]):
- char_num += 1
- if char_num >= line_len:
- has_another_value = False
- has_another_dimension = False
- elif line[char_num] == ",":
- has_another_value = True
- has_another_dimension = False
- elif line[char_num] == ":":
- has_another_value = False
- has_another_dimension = True
- char_num += 1
- last_comma_index = tuple_data.rfind(
- ',')
- if last_comma_index == -1:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1)
- + " contains a tuple that has no comma inside of it")
- try:
- value = tuple_data[last_comma_index + 1:]
- value = float(value)
- except ValueError:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1)
- + " contains a tuple that does not have a valid numeric value")
- timestamp = tuple_data[0: last_comma_index]
- try:
- timestamp = int(timestamp)
- timestamp_is_int = True
- timestamp_is_timestamp = False
- except ValueError:
- timestamp_is_int = False
- if not timestamp_is_int:
- try:
- timestamp = float(
- timestamp)
- timestamp_is_float = True
- timestamp_is_timestamp = False
- except ValueError:
- timestamp_is_float = False
- if not timestamp_is_int and not timestamp_is_float:
- try:
- timestamp = timestamp.strip()
- timestamp_is_timestamp = True
- except ValueError:
- timestamp_is_timestamp = False
- if not timestamp_is_timestamp and not timestamp_is_int \
- and not timestamp_is_float:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains a tuple that has an invalid timestamp '"
- + timestamp + "'")
- if previous_timestamp_was_float is not None \
- and previous_timestamp_was_float and not timestamp_is_float:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains tuples where the timestamp format is inconsistent")
- if previous_timestamp_was_int is not \
- None and previous_timestamp_was_int and not timestamp_is_int:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains tuples where the timestamp format is inconsistent")
- if previous_timestamp_was_timestamp is not None \
- and previous_timestamp_was_timestamp and not timestamp_is_timestamp:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains tuples where the timestamp format is inconsistent")
- timestamps_for_dimension += [
- timestamp]
- values_for_dimension += [value]
- if previous_timestamp_was_timestamp is None and timestamp_is_timestamp:
- previous_timestamp_was_timestamp = True
- previous_timestamp_was_int = False
- previous_timestamp_was_float = False
- if previous_timestamp_was_int is None and timestamp_is_int:
- previous_timestamp_was_timestamp = False
- previous_timestamp_was_int = True
- previous_timestamp_was_float = False
- if previous_timestamp_was_float is None and timestamp_is_float:
- previous_timestamp_was_timestamp = False
- previous_timestamp_was_int = False
- previous_timestamp_was_float = True
- if not has_another_value:
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- if timestamp_is_timestamp:
- timestamps_for_dimension = pd.DatetimeIndex(
- timestamps_for_dimension)
- instance_list[this_line_num_dimensions].append(
- pd.Series(index=timestamps_for_dimension,
- data=values_for_dimension))
- this_line_num_dimensions += 1
- timestamps_for_dimension = []
- values_for_dimension = []
- elif has_another_value:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ',' that is not followed by another tuple")
- elif has_another_dimension and target_labels:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ':' while it should list a class value")
- elif has_another_dimension and not target_labels:
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- instance_list[this_line_num_dimensions].append(
- pd.Series(dtype=np.float32))
- this_line_num_dimensions += 1
- num_dimensions = this_line_num_dimensions
- if not has_another_value and not has_another_dimension:
- if num_dimensions is None:
- num_dimensions = this_line_num_dimensions
- if num_dimensions != this_line_num_dimensions:
- raise TsFileParseException(
- "line " +
- str(
- line_num +
- 1) +
- " does not have the same number of dimensions as the previous line of data")
- if has_another_value:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ',' that is not followed by another tuple")
- elif has_another_dimension and target_labels:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ':' while it should list a class value")
- elif has_another_dimension and not target_labels:
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- instance_list[this_line_num_dimensions].append(
- pd.Series())
- this_line_num_dimensions += 1
- num_dimensions = this_line_num_dimensions
- if not has_another_value and num_dimensions != this_line_num_dimensions:
- raise TsFileParseException(
- "line " +
- str(
- line_num +
- 1) +
- "does not have the same number of dimensions as the "
- "previous line of data")
- if target_labels and len(class_val_list) == 0:
- raise TsFileParseException(
- "the cases have no associated class values")
- else:
- dimensions = line.split(":")
- if is_first_case:
- num_dimensions = len(dimensions)
- if target_labels:
- num_dimensions -= 1
- for dim in range(0, num_dimensions):
- instance_list.append([])
- is_first_case = False
- this_line_num_dimensions = len(dimensions)
- if target_labels:
- this_line_num_dimensions -= 1
- if this_line_num_dimensions != num_dimensions:
- print(
- "inconsistent number of dimensions. Expecting " +
- str(num_dimensions) +
- " but have read " +
- str(this_line_num_dimensions))
- for dim in range(0, num_dimensions):
- try:
- dimension = dimensions[dim].strip()
- if dimension:
- data_series = dimension.split(",")
- data_series = [float(i)
- for i in data_series]
- instance_list[dim].append(
- pd.Series(data_series))
- else:
- instance_list[dim].append(pd.Series())
- except Exception:
- _ = 1
- if target_labels:
- try:
- class_val_list.append(
- float(dimensions[num_dimensions].strip()))
- except Exception:
- _ = 1
- line_num += 1
- if line_num:
- complete_regression_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag \
- and has_target_labels_tag and has_data_tag
- complete_classification_meta_data = \
- has_problem_name_tag and has_timestamps_tag \
- and has_univariate_tag and has_class_labels_tag and has_data_tag
- if metadata_started and not complete_regression_meta_data and not complete_classification_meta_data:
- raise TsFileParseException("metadata incomplete")
- elif metadata_started and not data_started:
- raise TsFileParseException(
- "file contained metadata but no data")
- elif metadata_started and data_started and len(instance_list) == 0:
- raise TsFileParseException(
- "file contained metadata but no data")
- data = pd.DataFrame(dtype=np.float32)
- for dim in range(0, num_dimensions):
- data['dim_' + str(dim)] = instance_list[dim]
- if target_labels:
- if return_separate_X_and_y:
- return data, np.asarray(class_val_list)
- else:
- data['class_vals'] = pd.Series(class_val_list)
- return data
- else:
- return data
- else:
- raise TsFileParseException("empty file")
fedot_ind/tools/loader.py
2. In the `read_ts_files` method of the `DataLoader` class, remove the `try`/`except` block that falls back to using `_load_from_tsfile_to_dataframe`, and just use `load_from_tsfile_to_dataframe` directly.
---
+++
@@ -1,16 +1,7 @@
def read_ts_files(self, dataset_name, data_path):
- try:
- x_test, y_test = load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TEST.ts', return_separate_X_and_y=True)
- x_train, y_train = load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN.ts',
- return_separate_X_and_y=True)
- return x_train, y_train, x_test, y_test
- except Exception:
- x_test, y_test = self._load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TEST.ts',
- return_separate_X_and_y=True)
- x_train, y_train = self._load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN.ts',
- return_separate_X_and_y=True)
- return x_train, y_train, x_test, y_test
+ x_test, y_test = load_from_tsfile_to_dataframe(
+ data_path + '/' + dataset_name + f'/{dataset_name}_TEST.ts', return_separate_X_and_y=True)
+ x_train, y_train = load_from_tsfile_to_dataframe(
+ data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN.ts',
+ return_separate_X_and_y=True)
+ return x_train, y_train, x_test, y_test
fedot_ind/tools/loader.py
3. Extract the code for reading `.tsv` files from `read_train_test_files` into a new method called `_read_tsv_files`.
---
+++
@@ -1,13 +1,16 @@
+ def _read_tsv_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_tsv(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, False
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the … [instruction truncated]
---
+++
@@ -1,13 +1,16 @@
+ def _read_tsv_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_tsv(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, False
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the … [instruction truncated]
---
+++
@@ -1,21 +1,20 @@
+ def _read_txt_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_txt_files(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, False
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_txt_files(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_txt_files(dataset_name, data_path)
# If data unpacked as .ts file
elif os.path.isfile(file_path + '.ts'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the … [instruction truncated]
---
+++
@@ -1,29 +1,24 @@
+ def _read_ts_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_ts_files(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, True
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_txt_files(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_txt_files(dataset_name, data_path)
# If data unpacked as .ts file
elif os.path.isfile(file_path + '.ts'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_ts_files(
- dataset_name, data_path)
- is_multi = True
+ x_train, y_train, x_test, y_test, is_multi = self._read_ts_files(dataset_name, data_path)
# If data unpacked as .arff file
elif os.path.isfile(file_path + '.arff'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the … [instruction truncated]
---
+++
@@ -1,37 +1,28 @@
+ def _read_arff_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_arff_files(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, True
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_txt_files(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_txt_files(dataset_name, data_path)
# If data unpacked as .ts file
elif os.path.isfile(file_path + '.ts'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_ts_files(
- dataset_name, data_path)
- is_multi = True
+ x_train, y_train, x_test, y_test, is_multi = self._read_ts_files(dataset_name, data_path)
# If data unpacked as .arff file
elif os.path.isfile(file_path + '.arff'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_arff_files(
- dataset_name, data_path)
- is_multi = True
+ x_train, y_train, x_test, y_test, is_multi = self._read_arff_files(dataset_name, data_path)
elif os.path.isfile(file_path + '.csv'):
self.logger.info(
fedot_ind/tools/loader.py
8. Extract the code for downloading and extracting data from `load_data` into a new method called `_download_and_extract_data`.
---
+++
@@ -1,3 +1,34 @@
+ def _download_and_extract_data(self, dataset_name):
+ self.logger.info('Downloading...')
+
+ # Create temporary folder for downloaded data
+ cache_path = os.path.join(PROJECT_PATH, 'temp_cache/')
+ download_path = cache_path + 'downloads/'
+ temp_data_path = cache_path + 'temp_data/'
+ filename = 'temp_data_{}'.format(dataset_name)
+ for _ in (download_path, temp_data_path):
+ os.makedirs(_, exist_ok=True)
+
+ url = f"http://www.timeseriesclassification.com/aeon-toolkit/{dataset_name}.zip"
+ request.urlretrieve(url, download_path + filename)
+ try:
+ zipfile.ZipFile(
+ download_path +
+ filename).extractall(
+ temp_data_path +
+ dataset_name)
+ except zipfile.BadZipFile:
+ raise FileNotFoundError(
+ f'Cannot extract data: {dataset_name} dataset not found in UCR archive')
+
+ self.logger.info(f'{dataset_name} data downloaded. Unpacking...')
+ train_data, test_data = self.extract_data(
+ dataset_name, temp_data_path)
+
+ shutil.rmtree(cache_path)
+
+ return train_data, test_data
+
def load_data(self, shuffle=True) -> tuple:
"""Load data for classification experiment locally or externally from UCR archive.
@@ -15,37 +46,8 @@
dataset_name=dataset_name, data_path=data_path, shuffle=shuffle)
if train_data is None:
- self.logger.info('Downloading...')
+ train_data, test_data = self._download_and_extract_data(dataset_name)
- # Create temporary folder for downloaded data
- cache_path = os.path.join(PROJECT_PATH, 'temp_cache/')
- download_path = cache_path + 'downloads/'
- temp_data_path = cache_path + 'temp_data/'
- filename = 'temp_data_{}'.format(dataset_name)
- for _ in (download_path, temp_data_path):
- os.makedirs(_, exist_ok=True)
-
- url = f"http://www.timeseriesclassification.com/aeon-toolkit/{dataset_name}.zip"
- request.urlretrieve(url, download_path + filename)
- try:
- zipfile.ZipFile(
- download_path +
- filename).extractall(
- temp_data_path +
- dataset_name)
- except zipfile.BadZipFile:
- raise FileNotFoundError(
- f'Cannot extract data: {dataset_name} dataset not found in UCR archive')
-
- self.logger.info(f'{dataset_name} data downloaded. Unpacking...')
- train_data, test_data = self.extract_data(
- dataset_name, temp_data_path)
-
- shutil.rmtree(cache_path)
-
- # if type(train_data[0])
-
- # return train_data, test_data
self.logger.info('Data read successfully from local folder')
if isinstance(train_data[0].iloc[0, 0], pd.Series):
I'm currently validating your changes using parsers and linters to check for mistakes like syntax errors or undefined variables. If I see any of these errors, I will automatically fix them.
[!TIP] To recreate the pull request, edit the issue title or description.
This is an automated message generated by Sweep AI.
I'm sorry, but it looks like an error has occurred due to a code validation failure. The error message is 401 {"message": "Bad credentials", "documentation_url": "https://docs.github.com/rest", "status": "401"}. Here were the changes I had planned:
fedot_ind/tools/loader.py
1. Remove the `_load_from_tsfile_to_dataframe` method from the `DataLoader` class since it is a duplicate of the `load_from_tsfile_to_dataframe` method from `sktime.datasets`. Update … [instruction truncated]
---
+++
@@ -1,439 +0,0 @@
- def _load_from_tsfile_to_dataframe(
- self,
- full_file_path_and_name,
- return_separate_X_and_y=True,
- replace_missing_vals_with='NaN'):
- """Loads data from a .ts file into a Pandas DataFrame.
- Taken from https://github.com/ChangWeiTan/TS-Extrinsic-Regression/blob/master/utils/data_loader.py
-
- Args:
- full_file_path_and_name: The full pathname of the .ts file to read. return_separate_X_and_y: true if X
- and Y values should be returned as separate Data Frames (X) and a numpy array (y),
- false otherwise.
- replace_missing_vals_with: The value that missing values in the text file should be replaced with prior to
- parsing.
-
- Returns:
- If ``return_separate_X_and_y`` then a tuple containing a DataFrame and a numpy array containing the
- relevant time-series and corresponding class values. If not ``return_separate_X_and_y`` then a single
- DataFrame containing all time-series and (if relevant) a column ``class_vals`` the associated class values.
-
- """
- # Initialize flags and variables used when parsing the file
- metadata_started = False
- data_started = False
- has_problem_name_tag = False
- has_timestamps_tag = False
- has_univariate_tag = False
- has_class_labels_tag = False
- has_target_labels_tag = False
- has_data_tag = False
- previous_timestamp_was_float = None
- previous_timestamp_was_int = None
- previous_timestamp_was_timestamp = None
- num_dimensions = None
- is_first_case = True
- instance_list = []
- class_val_list = []
- line_num = 0
- TsFileParseException = Exception
- encoding = self.predict_encoding(full_file_path_and_name)
- with open(full_file_path_and_name, 'r', encoding=encoding) as file:
- dataset_name = os.path.basename(full_file_path_and_name)
- for line in tqdm(
- file.readlines(),
- desc='Loading data',
- leave=False,
- postfix=dataset_name,
- unit='lines'):
- line = line.strip().lower()
- if line:
- if line.startswith("@problemname"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len == 1:
- raise TsFileParseException(
- "problemname tag requires an associated value")
- has_problem_name_tag = True
- metadata_started = True
- elif line.startswith("@timestamps"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len != 2:
- raise TsFileParseException(
- "timestamps tag requires an associated Boolean value")
- elif tokens[1] == "true":
- timestamps = True
- elif tokens[1] == "false":
- timestamps = False
- else:
- raise TsFileParseException(
- "invalid timestamps value")
- has_timestamps_tag = True
- metadata_started = True
- elif line.startswith("@univariate"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len != 2:
- raise TsFileParseException(
- "univariate tag requires an associated Boolean value")
- elif tokens[1] == "true":
- pass
- elif tokens[1] == "false":
- pass
- else:
- raise TsFileParseException(
- "invalid univariate value")
- has_univariate_tag = True
- metadata_started = True
- elif line.startswith("@classlabel"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len == 1:
- raise TsFileParseException(
- "classlabel tag requires an associated Boolean value")
- if tokens[1] == "true":
- class_labels = True
- elif tokens[1] == "false":
- class_labels = False
- else:
- raise TsFileParseException(
- "invalid classLabel value")
- if token_len == 2 and class_labels:
- raise TsFileParseException(
- "if the classlabel tag is true then class values must be supplied")
- has_class_labels_tag = True
- class_label_list = [token.strip()
- for token in tokens[2:]]
- metadata_started = True
- elif line.startswith("@targetlabel"):
- if data_started:
- raise TsFileParseException(
- "metadata must come before data")
- tokens = line.split(' ')
- token_len = len(tokens)
- if token_len == 1:
- raise TsFileParseException(
- "targetlabel tag requires an associated Boolean value")
- if tokens[1] == "true":
- target_labels = True
- elif tokens[1] == "false":
- target_labels = False
- else:
- raise TsFileParseException(
- "invalid targetLabel value")
- has_target_labels_tag = True
- class_val_list = []
- metadata_started = True
- elif line.startswith("@data"):
- if line != "@data":
- raise TsFileParseException(
- "data tag should not have an associated value")
- if data_started and not metadata_started:
- raise TsFileParseException(
- "metadata must come before data")
- else:
- has_data_tag = True
- data_started = True
- elif data_started:
- incomplete_regression_meta_data = not has_problem_name_tag or not has_timestamps_tag or \
- not has_univariate_tag or not has_target_labels_tag or \
- not has_data_tag
- incomplete_classification_meta_data = \
- not has_problem_name_tag or not has_timestamps_tag \
- or not has_univariate_tag or not has_class_labels_tag \
- or not has_data_tag
- if incomplete_regression_meta_data and incomplete_classification_meta_data:
- raise TsFileParseException(
- "a full set of metadata has not been provided before the data")
- line = line.replace("?", replace_missing_vals_with)
- if timestamps:
- has_another_value = False
- has_another_dimension = False
- timestamps_for_dimension = []
- values_for_dimension = []
- this_line_num_dimensions = 0
- line_len = len(line)
- char_num = 0
- while char_num < line_len:
- while char_num < line_len and str.isspace(
- line[char_num]):
- char_num += 1
- if char_num < line_len:
- if line[char_num] == ":":
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- instance_list[this_line_num_dimensions].append(
- pd.Series())
- this_line_num_dimensions += 1
- has_another_value = False
- has_another_dimension = True
- timestamps_for_dimension = []
- values_for_dimension = []
- char_num += 1
- else:
- if line[char_num] != "(" and target_labels:
- class_val = line[char_num:].strip()
- class_val_list.append(
- float(class_val))
- char_num = line_len
- has_another_value = False
- has_another_dimension = False
- timestamps_for_dimension = []
- values_for_dimension = []
- else:
- if line[char_num] != "(" and not target_labels:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " does not start with a '('")
- char_num += 1
- tuple_data = ""
- while char_num < line_len and line[char_num] != ")":
- tuple_data += line[char_num]
- char_num += 1
- if char_num >= line_len or line[char_num] != ")":
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " does not end with a ')'")
- char_num += 1
- while char_num < line_len and str.isspace(
- line[char_num]):
- char_num += 1
- if char_num >= line_len:
- has_another_value = False
- has_another_dimension = False
- elif line[char_num] == ",":
- has_another_value = True
- has_another_dimension = False
- elif line[char_num] == ":":
- has_another_value = False
- has_another_dimension = True
- char_num += 1
- last_comma_index = tuple_data.rfind(
- ',')
- if last_comma_index == -1:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1)
- + " contains a tuple that has no comma inside of it")
- try:
- value = tuple_data[last_comma_index + 1:]
- value = float(value)
- except ValueError:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1)
- + " contains a tuple that does not have a valid numeric value")
- timestamp = tuple_data[0: last_comma_index]
- try:
- timestamp = int(timestamp)
- timestamp_is_int = True
- timestamp_is_timestamp = False
- except ValueError:
- timestamp_is_int = False
- if not timestamp_is_int:
- try:
- timestamp = float(
- timestamp)
- timestamp_is_float = True
- timestamp_is_timestamp = False
- except ValueError:
- timestamp_is_float = False
- if not timestamp_is_int and not timestamp_is_float:
- try:
- timestamp = timestamp.strip()
- timestamp_is_timestamp = True
- except ValueError:
- timestamp_is_timestamp = False
- if not timestamp_is_timestamp and not timestamp_is_int \
- and not timestamp_is_float:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains a tuple that has an invalid timestamp '"
- + timestamp + "'")
- if previous_timestamp_was_float is not None \
- and previous_timestamp_was_float and not timestamp_is_float:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains tuples where the timestamp format is inconsistent")
- if previous_timestamp_was_int is not \
- None and previous_timestamp_was_int and not timestamp_is_int:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains tuples where the timestamp format is inconsistent")
- if previous_timestamp_was_timestamp is not None \
- and previous_timestamp_was_timestamp and not timestamp_is_timestamp:
- raise TsFileParseException(
- "dimension " + str(
- this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) +
- " contains tuples where the timestamp format is inconsistent")
- timestamps_for_dimension += [
- timestamp]
- values_for_dimension += [value]
- if previous_timestamp_was_timestamp is None and timestamp_is_timestamp:
- previous_timestamp_was_timestamp = True
- previous_timestamp_was_int = False
- previous_timestamp_was_float = False
- if previous_timestamp_was_int is None and timestamp_is_int:
- previous_timestamp_was_timestamp = False
- previous_timestamp_was_int = True
- previous_timestamp_was_float = False
- if previous_timestamp_was_float is None and timestamp_is_float:
- previous_timestamp_was_timestamp = False
- previous_timestamp_was_int = False
- previous_timestamp_was_float = True
- if not has_another_value:
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- if timestamp_is_timestamp:
- timestamps_for_dimension = pd.DatetimeIndex(
- timestamps_for_dimension)
- instance_list[this_line_num_dimensions].append(
- pd.Series(index=timestamps_for_dimension,
- data=values_for_dimension))
- this_line_num_dimensions += 1
- timestamps_for_dimension = []
- values_for_dimension = []
- elif has_another_value:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ',' that is not followed by another tuple")
- elif has_another_dimension and target_labels:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ':' while it should list a class value")
- elif has_another_dimension and not target_labels:
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- instance_list[this_line_num_dimensions].append(
- pd.Series(dtype=np.float32))
- this_line_num_dimensions += 1
- num_dimensions = this_line_num_dimensions
- if not has_another_value and not has_another_dimension:
- if num_dimensions is None:
- num_dimensions = this_line_num_dimensions
- if num_dimensions != this_line_num_dimensions:
- raise TsFileParseException(
- "line " +
- str(
- line_num +
- 1) +
- " does not have the same number of dimensions as the previous line of data")
- if has_another_value:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ',' that is not followed by another tuple")
- elif has_another_dimension and target_labels:
- raise TsFileParseException(
- "dimension " + str(this_line_num_dimensions + 1) + " on line " + str(
- line_num + 1) + " ends with a ':' while it should list a class value")
- elif has_another_dimension and not target_labels:
- if len(instance_list) < (
- this_line_num_dimensions + 1):
- instance_list.append([])
- instance_list[this_line_num_dimensions].append(
- pd.Series())
- this_line_num_dimensions += 1
- num_dimensions = this_line_num_dimensions
- if not has_another_value and num_dimensions != this_line_num_dimensions:
- raise TsFileParseException(
- "line " +
- str(
- line_num +
- 1) +
- "does not have the same number of dimensions as the "
- "previous line of data")
- if target_labels and len(class_val_list) == 0:
- raise TsFileParseException(
- "the cases have no associated class values")
- else:
- dimensions = line.split(":")
- if is_first_case:
- num_dimensions = len(dimensions)
- if target_labels:
- num_dimensions -= 1
- for dim in range(0, num_dimensions):
- instance_list.append([])
- is_first_case = False
- this_line_num_dimensions = len(dimensions)
- if target_labels:
- this_line_num_dimensions -= 1
- if this_line_num_dimensions != num_dimensions:
- print(
- "inconsistent number of dimensions. Expecting " +
- str(num_dimensions) +
- " but have read " +
- str(this_line_num_dimensions))
- for dim in range(0, num_dimensions):
- try:
- dimension = dimensions[dim].strip()
- if dimension:
- data_series = dimension.split(",")
- data_series = [float(i)
- for i in data_series]
- instance_list[dim].append(
- pd.Series(data_series))
- else:
- instance_list[dim].append(pd.Series())
- except Exception:
- _ = 1
- if target_labels:
- try:
- class_val_list.append(
- float(dimensions[num_dimensions].strip()))
- except Exception:
- _ = 1
- line_num += 1
- if line_num:
- complete_regression_meta_data = has_problem_name_tag and has_timestamps_tag and has_univariate_tag \
- and has_target_labels_tag and has_data_tag
- complete_classification_meta_data = \
- has_problem_name_tag and has_timestamps_tag \
- and has_univariate_tag and has_class_labels_tag and has_data_tag
- if metadata_started and not complete_regression_meta_data and not complete_classification_meta_data:
- raise TsFileParseException("metadata incomplete")
- elif metadata_started and not data_started:
- raise TsFileParseException(
- "file contained metadata but no data")
- elif metadata_started and data_started and len(instance_list) == 0:
- raise TsFileParseException(
- "file contained metadata but no data")
- data = pd.DataFrame(dtype=np.float32)
- for dim in range(0, num_dimensions):
- data['dim_' + str(dim)] = instance_list[dim]
- if target_labels:
- if return_separate_X_and_y:
- return data, np.asarray(class_val_list)
- else:
- data['class_vals'] = pd.Series(class_val_list)
- return data
- else:
- return data
- else:
- raise TsFileParseException("empty file")
fedot_ind/tools/loader.py
2. In the `read_ts_files` method of the `DataLoader` class, remove the `try`/`except` block that falls back to using `_load_from_tsfile_to_dataframe`, and just use `load_from_tsfile_to_dataframe` directly.
---
+++
@@ -1,16 +1,7 @@
def read_ts_files(self, dataset_name, data_path):
- try:
- x_test, y_test = load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TEST.ts', return_separate_X_and_y=True)
- x_train, y_train = load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN.ts',
- return_separate_X_and_y=True)
- return x_train, y_train, x_test, y_test
- except Exception:
- x_test, y_test = self._load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TEST.ts',
- return_separate_X_and_y=True)
- x_train, y_train = self._load_from_tsfile_to_dataframe(
- data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN.ts',
- return_separate_X_and_y=True)
- return x_train, y_train, x_test, y_test
+ x_test, y_test = load_from_tsfile_to_dataframe(
+ data_path + '/' + dataset_name + f'/{dataset_name}_TEST.ts', return_separate_X_and_y=True)
+ x_train, y_train = load_from_tsfile_to_dataframe(
+ data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN.ts',
+ return_separate_X_and_y=True)
+ return x_train, y_train, x_test, y_test
fedot_ind/tools/loader.py
3. Extract the code for reading `.tsv` files from `read_train_test_files` into a new method called `_read_tsv_files`.
---
+++
@@ -1,13 +1,16 @@
+ def _read_tsv_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_tsv(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, False
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the
---
+++
@@ -1,13 +1,16 @@
+ def _read_tsv_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_tsv(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, False
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the
---
+++
@@ -1,21 +1,20 @@
+ def _read_txt_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_txt_files(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, False
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_txt_files(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_txt_files(dataset_name, data_path)
# If data unpacked as .ts file
elif os.path.isfile(file_path + '.ts'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the
---
+++
@@ -1,29 +1,24 @@
+ def _read_ts_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_ts_files(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, True
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_txt_files(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_txt_files(dataset_name, data_path)
# If data unpacked as .ts file
elif os.path.isfile(file_path + '.ts'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_ts_files(
- dataset_name, data_path)
- is_multi = True
+ x_train, y_train, x_test, y_test, is_multi = self._read_ts_files(dataset_name, data_path)
# If data unpacked as .arff file
elif os.path.isfile(file_path + '.arff'):
fedot_ind/tools/loader.py
Rewritten instructions to resolve the error. Update the original_code and new_code blocks as required, ensuring that the
---
+++
@@ -1,37 +1,28 @@
+ def _read_arff_files(self, dataset_name, data_path):
+ self.logger.info(
+ f'Reading data from {data_path + "/" + dataset_name}')
+ x_train, y_train, x_test, y_test = self.read_arff_files(
+ dataset_name, data_path)
+ return x_train, y_train, x_test, y_test, True
+
def read_train_test_files(self, data_path, dataset_name, shuffle=True):
file_path = data_path + '/' + dataset_name + f'/{dataset_name}_TRAIN'
# If data unpacked as .tsv file
if os.path.isfile(file_path + '.tsv'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_tsv(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_tsv_files(dataset_name, data_path)
# If data unpacked as .txt file
elif os.path.isfile(file_path + '.txt'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_txt_files(
- dataset_name, data_path)
- is_multi = False
+ x_train, y_train, x_test, y_test, is_multi = self._read_txt_files(dataset_name, data_path)
# If data unpacked as .ts file
elif os.path.isfile(file_path + '.ts'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_ts_files(
- dataset_name, data_path)
- is_multi = True
+ x_train, y_train, x_test, y_test, is_multi = self._read_ts_files(dataset_name, data_path)
# If data unpacked as .arff file
elif os.path.isfile(file_path + '.arff'):
- self.logger.info(
- f'Reading data from {data_path + "/" + dataset_name}')
- x_train, y_train, x_test, y_test = self.read_arff_files(
- dataset_name, data_path)
- is_multi = True
+ x_train, y_train, x_test, y_test, is_multi = self._read_arff_files(dataset_name, data_path)
elif os.path.isfile(file_path + '.csv'):
self.logger.info(
fedot_ind/tools/loader.py
8. Extract the code for downloading and extracting data from `load_data` into a new method called `_download_and_extract_data`.
---
+++
@@ -1,3 +1,34 @@
+ def _download_and_extract_data(self, dataset_name):
+ self.logger.info('Downloading...')
+
+ # Create temporary folder for downloaded data
+ cache_path = os.path.join(PROJECT_PATH, 'temp_cache/')
+ download_path = cache_path + 'downloads/'
+ temp_data_path = cache_path + 'temp_data/'
+ filename = 'temp_data_{}'.format(dataset_name)
+ for _ in (download_path, temp_data_path):
+ os.makedirs(_, exist_ok=True)
+
+ url = f"http://www.timeseriesclassification.com/aeon-toolkit/{dataset_name}.zip"
+ request.urlretrieve(url, download_path + filename)
+ try:
+ zipfile.ZipFile(
+ download_path +
+ filename).extractall(
+ temp_data_path +
+ dataset_name)
+ except zipfile.BadZipFile:
+ raise FileNotFoundError(
+ f'Cannot extract data: {dataset_name} dataset not found in UCR archive')
+
+ self.logger.info(f'{dataset_name} data downloaded. Unpacking...')
+ train_data, test_data = self.extract_data(
+ dataset_name, temp_data_path)
+
+ shutil.rmtree(cache_path)
+
+ return train_data, test_data
+
def load_data(self, shuffle=True) -> tuple:
"""Load data for classification experiment locally or externally from UCR archive.
@@ -15,37 +46,8 @@
dataset_name=dataset_name, data_path=data_path, shuffle=shuffle)
if train_data is None:
- self.logger.info('Downloading...')
+ train_data, test_data = self._download_and_extract_data(dataset_name)
- # Create temporary folder for downloaded data
- cache_path = os.path.join(PROJECT_PATH, 'temp_cache/')
- download_path = cache_path + 'downloads/'
- temp_data_path = cache_path + 'temp_data/'
- filename = 'temp_data_{}'.format(dataset_name)
- for _ in (download_path, temp_data_path):
- os.makedirs(_, exist_ok=True)
-
- url = f"http://www.timeseriesclassification.com/aeon-toolkit/{dataset_name}.zip"
- request.urlretrieve(url, download_path + filename)
- try:
- zipfile.ZipFile(
- download_path +
- filename).extractall(
- temp_data_path +
- dataset_name)
- except zipfile.BadZipFile:
- raise FileNotFoundError(
- f'Cannot extract data: {dataset_name} dataset not found in UCR archive')
-
- self.logger.info(f'{dataset_name} data downloaded. Unpacking...')
- train_data, test_data = self.extract_data(
- dataset_name, temp_data_path)
-
- shutil.rmtree(cache_path)
-
- # if type(train_data[0])
-
- # return train_data, test_data
self.logger.info('Data read successfully from local folder')
if isinstance(train_data[0].iloc[0, 0], pd.Series):
Feel free to add more details to the issue description so Sweep can better address it. Alternatively, reach out to Kevin or William for help at https://community.sweep.dev/.
[!TIP] To recreate the pull request, edit the issue title or description.
Details
We need to refactor
DataLoader
class located infedot_ind.tools.loader
directory. There are two almost similar methods that need to be merged into single one:load_from_tsfile_to_dataframe
fromsktime.datasets
_load_from_tsfile_to_dataframe
Some of the `DataLoader` class methods are too large, so we need to shorten them by refactoring.
Branch
No response