Issue (open) — reported by yeomko22, 3 years ago
# Expected value ranges per feature, determined by inspecting the
# dataframe's .describe() output. Each feature maps to its (min, max) bounds.
_FEATURE_BOUNDS = {
    'sepal length': (4.0, 8.0),
    'sepal width': (1.0, 5.0),
    'petal length': (1.0, 7.0),
    'petal width': (0.1, 3.0),
}

# Schema consumed by the input-data tests: for every feature, the allowed
# numeric range and the expected dtype (all iris features are floats).
iris_schema = {
    feature: {
        'range': {'min': lo, 'max': hi},
        'dtype': float,
    }
    for feature, (lo, hi) in _FEATURE_BOUNDS.items()
}
import unittest
import sys
class TestIrisInputData(unittest.TestCase):
    """Validate that the pipeline's input data conforms to ``iris_schema``.

    Checks two properties of the loaded dataframe:
    - every feature's observed min/max falls inside the schema's range, and
    - every feature column has the dtype declared in the schema.
    """

    def setUp(self):
        # `setUp` runs before each test, ensuring a fresh pipeline per test.
        # See the unittest docs:
        # https://docs.python.org/3/library/unittest.html#unittest.TestCase.setUp
        self.pipeline = SimplePipeline()
        self.pipeline.run_pipeline()

    def test_input_data_ranges(self):
        """Observed per-feature min/max must lie within the schema bounds."""
        # Per-column max and min across the whole dataframe.
        max_values = self.pipeline.frame.max()
        min_values = self.pipeline.frame.min()

        for feature in self.pipeline.feature_names:
            # subTest labels failures with the offending feature, and
            # assertLessEqual/assertGreaterEqual report the actual values —
            # unlike assertTrue, which only says "False is not true".
            with self.subTest(feature=feature):
                self.assertLessEqual(
                    max_values[feature], iris_schema[feature]['range']['max'])
                self.assertGreaterEqual(
                    min_values[feature], iris_schema[feature]['range']['min'])

    def test_input_data_types(self):
        """Each feature column's dtype must match the schema's dtype."""
        data_types = self.pipeline.frame.dtypes  # pandas per-column dtypes
        for feature in self.pipeline.feature_names:
            with self.subTest(feature=feature):
                self.assertEqual(data_types[feature], iris_schema[feature]['dtype'])
import unittest
class TestIrisDataEngineering(unittest.TestCase):
    """Verify that the StandardScaler step standardizes the training data."""

    def setUp(self):
        # Fresh pipeline with the dataset loaded (but not yet scaled)
        # before each test.
        self.pipeline = PipelineWithDataEngineering()
        self.pipeline.load_dataset()

    def test_scaler_preprocessing_brings_x_train_mean_near_zero(self):
        """After scaling, the overall mean should drop below the original and sit near 0."""
        # Given: flatten the dataframe to one column via pandas stack so we
        # get a single overall mean for comparison.
        original_mean = self.pipeline.X_train.stack().mean()

        # When
        self.pipeline.apply_scaler()

        # Diagnostics first: assertions that fail would otherwise skip
        # these prints entirely.
        print(f'Original X train mean: {original_mean}')
        print(f'Transformed X train mean: {self.pipeline.X_train.mean()}')

        # Then: StandardScaler centers the distribution at 0 with unit
        # variance, so the transformed mean must be below the original
        # (iris features are all positive) and ~0 to 3 decimal places.
        # assertLess reports both values on failure, unlike assertTrue.
        # https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertAlmostEqual
        self.assertLess(self.pipeline.X_train.mean(), original_mean)  # X_train is a numpy array here.
        self.assertAlmostEqual(self.pipeline.X_train.mean(), 0.0, places=3)

    def test_scaler_preprocessing_brings_x_train_std_near_one(self):
        """After scaling, the overall standard deviation should be ~1."""
        # When
        self.pipeline.apply_scaler()

        print(f'Transformed X train standard deviation : {self.pipeline.X_train.std()}')

        # Then: unit variance implies std close to 1.
        self.assertAlmostEqual(self.pipeline.X_train.std(), 1.0, places=3)
Why test? To predict reliability and to verify functionality.