ML system testing concepts #121

why test?

predicting reliability


Testing ML Systems

Key testing principles for ML: pre-deployment

testing theory

How much testing?

test data schema

# Expected schema for the iris input data: for every feature, the inclusive
# value range and the dtype that the input-data tests check against.
iris_schema = {
    'sepal length': {
        'range': {
            'min': 4.0,  # determined by looking at the dataframe .describe() method
            'max': 8.0,
        },
        'dtype': float,
    },
    'sepal width': {
        'range': {
            'min': 1.0,
            'max': 5.0,
        },
        'dtype': float,
    },
    'petal length': {
        'range': {
            'min': 1.0,
            'max': 7.0,
        },
        'dtype': float,
    },
    'petal width': {
        'range': {
            'min': 0.1,
            'max': 3.0,
        },
        'dtype': float,
    },
}
import unittest
import sys

class TestIrisInputData(unittest.TestCase):
    """Validate the pipeline's input dataframe against ``iris_schema``."""

    def setUp(self):
        # `setUp` will be run before each test, ensuring that you
        # have a new pipeline to access in your tests. See the
        # unittest docs if you are unfamiliar with unittest.
        self.pipeline = SimplePipeline()

    def test_input_data_ranges(self):
        """Every feature's observed min/max lies within the schema range."""
        # get df max and min values for each column
        max_values = self.pipeline.frame.max()
        min_values = self.pipeline.frame.min()

        # loop over each feature (i.e. all 4 column names)
        for feature in self.pipeline.feature_names:
            # subTest labels a failure with the offending feature and lets the
            # remaining features still be checked instead of stopping early.
            with self.subTest(feature=feature):
                expected_range = iris_schema[feature]['range']

                # Dedicated comparison assertions (rather than assertTrue on a
                # boolean) report both compared values when they fail.
                self.assertLessEqual(max_values[feature], expected_range['max'])
                self.assertGreaterEqual(min_values[feature], expected_range['min'])

    def test_input_data_types(self):
        """Every feature column has the dtype declared in the schema."""
        data_types = self.pipeline.frame.dtypes  # pandas dtypes method

        for feature in self.pipeline.feature_names:
            with self.subTest(feature=feature):
                self.assertEqual(data_types[feature], iris_schema[feature]['dtype'])
testing data engineering

import unittest

class TestIrisDataEngineering(unittest.TestCase):
    """Check the effect of the scaling step on the training features."""

    def setUp(self):
        # A fresh pipeline is created before every test.
        self.pipeline = PipelineWithDataEngineering()

    def test_scaler_preprocessing_brings_x_train_mean_near_zero(self):
        """After scaling, the mean of X_train is below the original and ~0."""
        # Given
        # convert the dataframe to be a single column with pandas stack
        original_mean = self.pipeline.X_train.stack().mean()

        # When
        # NOTE(review): no action is performed here, so X_train is read twice
        # without any visible transformation in between. Presumably the
        # pipeline's scaling step should be invoked at this point - confirm
        # against PipelineWithDataEngineering.

        # Then
        # The idea behind StandardScaler is that it will transform your data
        # to center the distribution at 0 and scale the variance at 1.
        # Therefore we test that the mean has shifted to be less than the original
        # and close to 0 using assertAlmostEqual to check to 3 decimal places:
        transformed_mean = self.pipeline.X_train.mean()  # X_train is a numpy array at this point.
        # assertGreater reports both values on failure, unlike assertTrue(a > b).
        self.assertGreater(original_mean, transformed_mean)
        self.assertAlmostEqual(transformed_mean, 0.0, places=3)
        print(f'Original X train mean: {original_mean}')
        print(f'Transformed X train mean: {transformed_mean}')

    def test_scaler_preprocessing_brings_x_train_std_near_one(self):
        """After scaling, the standard deviation of X_train is ~1."""
        # When
        # NOTE(review): as above, no scaling call is visible in this test -
        # confirm where the transformation is expected to happen.

        # Then
        # We also check that the standard deviation is close to 1
        transformed_std = self.pipeline.X_train.std()
        self.assertAlmostEqual(transformed_std, 1.0, places=3)
        print(f'Transformed X train standard deviation : {transformed_std}')