aimclub / FEDOT

Automated modeling and machine learning framework FEDOT
https://fedot.readthedocs.io
BSD 3-Clause "New" or "Revised" License
619 stars 84 forks

fix pca #1267

Closed valer1435 closed 3 months ago

valer1435 commented 3 months ago

This is a 🐛 bug fix.

Fixes #4567

docu-mentor[bot] commented 3 months ago

👋 Hi, I'm @docu-mentor, an LLM-powered GitHub app backed by Anyscale Endpoints that gives you actionable feedback on your writing.

Simply create a new comment in this PR that says:

@docu-mentor run

and I will start my analysis. I only look at what you changed in this PR. If you only want me to look at specific files or folders, you can specify them like this:

@docu-mentor run doc/ README.md

In this example, I'll have a look at all files contained in the "doc/" folder and the file "README.md". All good? Let's get started!

pep8speaks commented 3 months ago

Hello @valer1435! Thanks for updating this PR. We checked the lines you've touched for PEP 8 issues, and found:

There are currently no PEP 8 issues detected in this Pull Request. Cheers! :beers:

Comment last updated at 2024-03-13 14:06:52 UTC
github-actions[bot] commented 3 months ago

All PEP8 errors have been fixed, thanks :heart:

Comment last updated at
codecov[bot] commented 3 months ago

Codecov Report

Attention: Patch coverage is 95.45455% with 1 line in your changes missing coverage. Please review.

Project coverage is 79.92%. Comparing base (0633078) to head (43c49c0). Report is 2 commits behind head on master.

| Files | Patch % | Lines |
|---|---|---|
| ...tations/data_operations/sklearn_transformations.py | 87.50% | 1 Missing :warning: |
Additional details and impacted files

```diff
@@            Coverage Diff             @@
##           master    #1267      +/-   ##
==========================================
+ Coverage   79.90%   79.92%   +0.02%     
==========================================
  Files         146      146              
  Lines       10031    10049      +18     
==========================================
+ Hits         8015     8032      +17     
- Misses       2016     2017       +1     
```

:umbrella: View full report in Codecov by Sentry.
:loudspeaker: Have feedback on the report? Share it here.

valer1435 commented 3 months ago

@open-code-helper run

open-code-helper[bot] commented 3 months ago

:rocket: Open code helper finished analysing your PR! :rocket:

Take a look at your results:

File fedot/core/constants.py:

```python
from fedot.core.repository.tasks import TaskTypesEnum

MINIMAL_SECONDS_FOR_TUNING = 15
"""Minimal seconds for tuning."""

DEFAULT_TUNING_ITERATIONS_NUMBER = 100000
"""Default number of tuning iterations."""

DEFAULT_API_TIMEOUT_MINUTES = 5.0
"""Default API timeout in minutes."""

DEFAULT_FORECAST_LENGTH = 30
"""Default forecast length."""

COMPOSING_TUNING_PROPORTION = 0.6
"""Proportion of data used for composing tuning."""

BEST_QUALITY_PRESET_NAME = 'best_quality'
"""Name of the preset for best quality."""

FAST_TRAIN_PRESET_NAME = 'fast_train'
"""Name of the preset for fast training."""

AUTO_PRESET_NAME = 'auto'
"""Name of the preset for auto tuning."""

MINIMAL_PIPELINE_NUMBER_FOR_EVALUATION = 100
"""Minimal number of pipelines for evaluation."""

MIN_NUMBER_OF_GENERATIONS = 3
"""Minimum number of generations."""

FRACTION_OF_UNIQUE_VALUES = 0.95
"""Fraction of unique values."""

default_data_split_ratio_by_task = {
    TaskTypesEnum.classification: 0.8,
    TaskTypesEnum.regression: 0.8,
    TaskTypesEnum.ts_forecasting: 0.5
}
"""Default data split ratio by task."""

PCA_MIN_THRESHOLD_TS = 7
"""Minimum threshold for PCA in TS forecasting."""

File fedot/core/operations/evaluation/operation_implementations/data_operations/sklearn_transformations.py:

Here is the improved version of the code with added docstrings and type hints:

```python
import random
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA, KernelPCA, PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler

from fedot.core.constants import PCA_MIN_THRESHOLD_TS
from fedot.core.data.data import InputData, OutputData, data_type_is_table
from fedot.core.data.data_preprocessing import convert_into_column, data_has_categorical_features, \
    divide_data_categorical_numerical, find_categorical_columns, replace_inf_with_nans
from fedot.core.operations.evaluation.operation_implementations. \
    implementation_interfaces import DataOperationImplementation, EncodedInvariantImplementation
from fedot.core.operations.operation_parameters import OperationParameters
from fedot.core.repository.dataset_types import DataTypesEnum
from fedot.preprocessing.data_types import TYPE_TO_ID

class ComponentAnalysisImplementation(DataOperationImplementation):
    """
    Class for applying PCA and kernel PCA models from sklearn

    Args:
        params: OperationParameters with the arguments
    """

    def __init__(self, params: Optional[OperationParameters]):
        super().__init__(params)
        self.pca = None
        self.number_of_features = None
        self.number_of_samples = None

    def fit(self, input_data: InputData) -> PCA:
        """
        The method trains the PCA model

        Args:
            input_data: data with features, target and ids for PCA training

        Returns:
            trained PCA model (optional output)
        """

        self.number_of_samples, self.number_of_features = np.array(input_data.features).shape

        if self.number_of_features > 1:
            self.check_and_correct_params(is_ts_data=input_data.data_type is DataTypesEnum.ts)
            self.pca.fit(input_data.features)

        return self.pca

    def transform(self, input_data: InputData) -> OutputData:
        """
        Method for transforming tabular data using PCA

        Args:
            input_data: data with features, target and ids for PCA applying

        Returns:
            data with transformed features attribute
        """

        if self.number_of_features > 1:
            transformed_features = self.pca.transform(input_data.features)
        else:
            transformed_features = input_data.features

        # Update features
        output_data = self._convert_to_output(input_data, transformed_features)
        self.update_column_types(output_data)
        return output_data

    def check_and_correct_params(self, is_ts_data: bool = False):
        """
        Method checks whether the number of features in the data is enough for the
        ``n_components`` parameter in PCA, and corrects the parameter if it is not
        """
        n_components = self.params.get('n_components')
        if isinstance(n_components, int):
            if n_components > self.number_of_features:
                self.params.update(n_components=self.number_of_features)
        elif n_components == 'mle':
            # Check that n_samples correctly map with n_features
            if self.number_of_samples < self.number_of_features:
                self.params.update(n_components=0.5)
        if is_ts_data and (n_components * self.number_of_features) < PCA_MIN_THRESHOLD_TS:
            self.params.update(n_components=PCA_MIN_THRESHOLD_TS / self.number_of_features)

        self.pca.set_params(**self.params.to_dict())

    @staticmethod
    def update_column_types(output_data: OutputData) -> OutputData:
        """
        Update column types after applying PCA operations
        """

        _, n_cols = output_data.predict.shape
        output_data.supplementary_data.col_type_ids['features'] = np.array([TYPE_TO_ID[float]] * n_cols)
        return output_data

class PCAImplementation(ComponentAnalysisImplementation):
    """
    Class for applying PCA from sklearn

    Args:
        params: OperationParameters with the hyperparameters
    """

    def __init__(self, params: Optional[OperationParameters] = None):
        super().__init__(params)
        if not self.params:
            # Default parameters
            default_params = {'svd_solver': 'full', 'n_components': 'mle'}
            self.params.update(**default_params)
        self.pca = PCA(**self.params.to_dict())
        self.number_of_features = None

class KernelPCAImplementation(ComponentAnalysisImplementation):
    """
    Class for applying kernel PCA from sklearn

    Args:
        params: OperationParameters with the hyperparameters
    """

    def __init__(self, params: Optional[OperationParameters]):
        super().__init__(params)
        self.pca = KernelPCA(**self.params.to_dict())

class FastICAImplementation(ComponentAnalysisImplementation):
    """
    Class for applying FastICA from sklearn

    Args:
        params: OperationParameters with the hyperparameters
    """

    def __init__(self, params: Optional[OperationParameters]):
        super().__init__(params)
        self.pca = FastICA(**self.params.to_dict())

class PolyFeaturesImplementation(EncodedInvariantImplementation):
    """
    Class for applying the :obj:`PolynomialFeatures` operation to data,
    using only features that were not encoded (i.e. not converted from
    categorical via ``OneHot encoding``)

    Args:
        params: OperationParameters with the arguments
    """

    def __init__(self, params: Optional[OperationParameters]):
        super().__init__(params)
        self.th_columns = 10
        if not self.params:
            # Default parameters
            self.operation = PolynomialFeatures(include_bias=False)
        else:
            # Check whether the appropriate params are being used
            poly_params = {k: self.params.get(k) for k in
                           ['degree', 'interaction_only']}
            self.operation = PolynomialFeatures(include_bias=False,
                                                **poly_params)
        self.columns_to_take = None

    def fit(self, input_data: InputData):
        """
        Method for fitting the Poly features operation
        """
        # Check the number of columns in source dataset
        n_rows, n_cols = input_data.features.shape
        if n_cols > self.th_columns:
            # Randomly choose a subsample of feature columns (10 features)
            column_indices = np.arange(n_cols)
            self.columns_to_take = random.sample(list(column_indices), self.th_columns)
            input_data = input_data.subset_features(self.columns_to_take)

        return super().fit(input_data)

    def transform(self, input_data: InputData) -> OutputData:
        """
        Firstly perform filtration of columns
        """

        clipped_input_data = input_data
        if self.columns_to_take is not None:
            clipped_input_data = input_data.subset_features(self.columns_to_take)
        output_data = super().transform(clipped_input_data)

        if self.columns_to_take is not None:
            # Get generated features from poly function
            generated_features = output_data.predict[:, self.th_columns:]
            # Concat source features with generated one
            all_features = np.hstack((input_data.features, generated_features))
            output_data.predict = all_features
        return output_data
```
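
To make the new PCA_MIN_THRESHOLD_TS correction concrete, here is a small standalone sketch of the arithmetic that check_and_correct_params applies for time-series data. The helper name and the sample values are illustrative and not part of the PR.

```python
# Illustrative restatement of the ts branch in check_and_correct_params:
# if n_components * n_features would fall below PCA_MIN_THRESHOLD_TS,
# n_components is raised to PCA_MIN_THRESHOLD_TS / n_features.
PCA_MIN_THRESHOLD_TS = 7


def corrected_n_components(n_components: float, n_features: int, is_ts_data: bool) -> float:
    """Hypothetical helper mirroring the correction logic quoted above."""
    if is_ts_data and n_components * n_features < PCA_MIN_THRESHOLD_TS:
        return PCA_MIN_THRESHOLD_TS / n_features
    return n_components


print(corrected_n_components(0.2, 20, is_ts_data=True))  # 0.35 (0.2 * 20 = 4 < 7, raised to 7 / 20)
print(corrected_n_components(0.2, 50, is_ts_data=True))  # 0.2  (0.2 * 50 = 10 >= 7, unchanged)
```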

File test/integration/real_applications/test_examples.py:

Improved code with docstrings and type hints:

```python
from datetime import timedelta
from pathlib import Path
from typing import Any

import numpy as np
from sklearn.metrics import mean_squared_error

from examples.advanced.multimodal_text_num_example import run_multi_modal_example
from examples.advanced.multiobj_optimisation import run_classification_multiobj_example
from examples.advanced.time_series_forecasting.exogenous import run_exogenous_experiment
from examples.advanced.time_series_forecasting.multistep import run_multistep
from examples.advanced.time_series_forecasting.nemo_multiple import run_multiple_example
from examples.simple.classification.api_classification import run_classification_example
from examples.simple.classification.classification_pipelines import classification_complex_pipeline
from examples.simple.interpretable.api_explain import run_api_explain_example
from examples.simple.pipeline_tune import get_case_train_test_data, pipeline_tuning
from examples.simple.time_series_forecasting.api_forecasting import run_ts_forecasting_example
from examples.simple.time_series_forecasting.gapfilling import run_gapfilling_example
from examples.simple.time_series_forecasting.ts_pipelines import ts_complex_dtreg_pipeline
from fedot.core.utils import fedot_project_root

def test_multiclass_example() -> None:
    """Tests the multiclass classification example."""

    file_path_train: Path = fedot_project_root().joinpath('test/data/multiclass_classification.csv')

    pipeline: Any = get_model(file_path_train, cur_lead_time=timedelta(seconds=5))
    assert pipeline is not None

def test_gapfilling_example() -> None:
    """Tests the gapfilling example."""

    arrays_dict, gap_data, real_data = run_gapfilling_example()

    gap_ids = np.ravel(np.argwhere(gap_data == -100.0))
    for key in arrays_dict.keys():
        arr_without_gaps = arrays_dict.get(key)
        # Get only values in the gap
        predicted_values = arr_without_gaps[gap_ids]
        true_values = real_data[gap_ids]
        model_rmse = mean_squared_error(true_values, predicted_values, squared=False)
        # only ridge correctly interpolates the data
        if key == 'ridge':
            assert model_rmse < 0.5
        else:
            assert model_rmse < 2

def test_exogenous_ts_example() -> None:
    """Tests the exogenous TS forecasting example."""

    path: Path = fedot_project_root().joinpath('test/data/simple_sea_level.csv')
    run_exogenous_experiment(path_to_file=path,
                             len_forecast=50, with_exog=True)

def test_nemo_multiple_points_example() -> None:
    """Tests the Nemo multiple points example."""

    project_root_path: Path = fedot_project_root()
    path: Path = project_root_path.joinpath('test/data/ssh_points_grid_simple.csv')
    exog_path: Path = project_root_path.joinpath('test/data/ssh_nemo_points_grid_simple.csv')
    run_multiple_example(path_to_file=path,
                         path_to_exog_file=exog_path,
                         out_path=None,
                         len_forecast=30)

def test_pipeline_tuning_example() -> None:
    """Tests the pipeline tuning example."""

    train_data, test_data = get_case_train_test_data()

    # Pipeline composition
    pipeline = classification_complex_pipeline()

    # Pipeline tuning
    after_tune_roc_auc, _ = pipeline_tuning(pipeline=pipeline,
                                            train_data=train_data,
                                            test_data=test_data,
                                            local_iter=1,
                                            tuner_iter_num=2)

def test_multistep_example() -> None:
    """Tests the multistep example."""

    pipeline = ts_complex_dtreg_pipeline()
    run_multistep('test_sea', pipeline, step_forecast=20, future_steps=5)

def test_api_classification_example() -> None:
    """Tests the API classification example."""

    prediction = run_classification_example(timeout=1, with_tuning=False)
    assert prediction is not None

def test_api_ts_forecasting_example() -> None:
    """Tests the API TS forecasting example."""

    forecast = run_ts_forecasting_example(dataset='salaries', timeout=2, with_tuning=False)
    assert forecast is not None

def test_api_classification_multiobj_example() -> None:
    """Tests the API classification multiobj example."""

    pareto = run_classification_multiobj_example(timeout=1, with_tuning=False)
    assert pareto is not None

def test_api_explain_example() -> None:
    """Tests the API explain example."""

    explainer = run_api_explain_example(timeout=1, with_tuning=False)
    assert explainer is not None

def test_multi_modal_example() -> None:
    """Tests the multi-modal example."""

    result = run_multi_modal_example(file_path='examples/data/multimodal_wine.csv', with_tuning=False, timeout=2)
    assert result > 0.5
```

Changes:

This bot is powered by NVIDIA AI Foundation Models and Endpoints.