DOC: "Using ModelBuilder class for deploying PyMC models" example inconsistent

Issue with current documentation:

This is copied verbatim from the "Using ModelBuilder class for deploying PyMC models" example:

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import xarray as xr

from numpy.random import RandomState

# Fixed seed used to construct the random number generator below.
RANDOM_SEED = 8927

# Seeded NumPy Generator.
# NOTE(review): nothing in this snippet draws from `rng`; create_sample_input
# further down uses the global `np.random` state, so its noise is not seeded.
rng = np.random.default_rng(RANDOM_SEED)
# Select the "arviz-darkgrid" plotting style.
az.style.use("arviz-darkgrid")

from pymc_experimental.model_builder import ModelBuilder

class LinearModel(ModelBuilder):
    """
    Example ModelBuilder subclass: a linear regression with intercept "a",
    slope "b", noise scale "eps" and observed variable "y".
    """
    # Give the model a name
    _model_type = "LinearModel"
    # And a version
    version = "0.1"

    def build_model(self, model_config, data=None):
        """
        build_model creates the PyMC model

        Parameters:
        model_config: dictionary
            Prior parameters, read with .get(): a_mu_prior, a_sigma_prior, b_mu_prior,
            b_sigma_prior and eps_prior. A missing *_mu_prior key falls back to 0.0,
            any other missing key falls back to 1.0.
        data: mapping or pd.DataFrame
            Data we want our model fit on. It is indexed with "input" and "output",
            and `.values` is taken from each entry (e.g. DataFrame columns).
            NOTE(review): the default is None, but the body indexes `data`
            unconditionally, so calling without data would fail.
        """
        # Note that we do not have to define a with-context

        # Create mutable data containers
        x_data = pm.MutableData("x_data", data["input"].values)
        y_data = pm.MutableData("y_data", data["output"].values)

        # prior parameters (each falls back to a default when the key is absent)
        a_mu_prior = model_config.get("a_mu_prior", 0.0)
        a_sigma_prior = model_config.get("a_sigma_prior", 1.0)
        b_mu_prior = model_config.get("b_mu_prior", 0.0)
        b_sigma_prior = model_config.get("b_sigma_prior", 1.0)
        eps_prior = model_config.get("eps_prior", 1.0)

        # priors: Normal intercept and slope, HalfNormal noise scale
        a = pm.Normal("a", mu=a_mu_prior, sigma=a_sigma_prior)
        b = pm.Normal("b", mu=b_mu_prior, sigma=b_sigma_prior)
        eps = pm.HalfNormal("eps", eps_prior)

        # likelihood: mean a + b * x, with the shape tied to the x_data container
        obs = pm.Normal("y", mu=a + b * x_data, sigma=eps, shape=x_data.shape, observed=y_data)

    def _data_setter(self, data: pd.DataFrame):
        """
        _data_setter works as a set_data for the model and updates the data whenever we need to.
        Parameters:
        data: pd.DataFrame
            It is the data we need to update for the model. The "input" column always
            replaces "x_data"; "y_data" is replaced only when an "output" column is present.
        """

        with self.model:
            pm.set_data({"x_data": data["input"].values})
            # "output" is optional, so that inputs without targets can be supplied
            if "output" in data.columns:
                pm.set_data({"y_data": data["output"].values})

    @classmethod
    def create_sample_input(cls):
        """
        Creates example input and parameters to test the model on.
        This is optional but useful.

        Returns a tuple (data, model_config, sampler_config): a DataFrame with
        "input" and "output" columns (100 rows), a dictionary of prior parameters,
        and a dictionary of sampler settings.
        """

        # 100 evenly spaced inputs on [0, 1]; outputs follow 0.3 * x + 0.5 plus N(0, 1) noise
        x = np.linspace(start=0, stop=1, num=100)
        y = 0.3 * x + 0.5
        # NOTE(review): drawn from the global np.random state, so not reproducible across runs
        y = y + np.random.normal(0, 1, len(x))
        data = pd.DataFrame({"input": x, "output": y})

        # same keys that build_model reads from model_config
        model_config = {
            "a_mu_prior": 0.0,
            "a_sigma_prior": 1.0,
            "b_mu_prior": 0.0,
            "b_sigma_prior": 1.0,
            "eps_prior": 1.0,
        }

        sampler_config = {
            "draws": 1_000,
            "tune": 1_000,
            "chains": 3,
            "target_accept": 0.95,
        }

        return data, model_config, sampler_config

data, model_config, sampler_config = LinearModel.create_sample_input()  # example data plus both config dictionaries
model = LinearModel(model_config, sampler_config, data)  # three positional arguments: this is the call that raises the TypeError reported below

When I run this code I get the following error:

Traceback (most recent call last):
  File "/home/galen/projects/try-pymc-modelbuilder/canonical_example.py", line 97, in <module>
    model = LinearModel(model_config, sampler_config, data)
TypeError: ModelBuilder.__init__() takes from 1 to 3 positional arguments but 4 were given

This is due to the fact that build_model does not have a parameter for sampler_config, and in the current state there is no explicit handling of it within the definition of build_model either.

Idea or request for content:

Please consider completing the example such that (1) it runs without issue and (2) shows how sampler_config is intended to be used.

@galenseilis We have recently updated the docs, can you check again? #565

Sure thing, @twiecki. Here is the setup:

$ mkdir try-mb-pymc
$ cd try-mb-pymc
$ pip install arviz matplotlib numpy pandas pymc xarray pymc_experimental
$ python -m venv venv
$ source venv/bin/activate

Everything installed fine. Then I copy-pasted this into test.py:

from typing import Dict, List, Optional, Tuple, Union

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
import xarray as xr
from pymc_experimental.model_builder import ModelBuilder

from numpy.random import RandomState

# Fixed seed used to construct the random number generator below.
RANDOM_SEED = 8927

# Seeded NumPy Generator, used for the noise in the data generated below.
rng = np.random.default_rng(RANDOM_SEED)
# Select the "arviz-darkgrid" plotting style.
az.style.use("arviz-darkgrid")

# Generate data: 100 evenly spaced inputs on [0, 1]; y = 0.3 * x + 0.5 plus seeded N(0, 1) noise
# NOTE(review): `y` is reassigned further down, before fitting, using unseeded noise.
x = np.linspace(start=0, stop=1, num=100)
y = 0.3 * x + 0.5 + rng.normal(0, 1, len(x))

class LinearModel(ModelBuilder):
    """
    Example ModelBuilder subclass: linear regression of "y" on a single "input" column,
    with intercept "a", slope "b" and noise scale "eps".
    """
    # Give the model a name
    _model_type = "LinearModel"

    # And a version
    version = "0.1"

    def build_model(self, X: pd.DataFrame, y: Union[pd.Series, np.ndarray], **kwargs):
        """
        build_model creates the PyMC model and stores it on self.model

        Parameters:
        X: pd.DataFrame
            Predictors; only the "input" column is read.
        y: Union[pd.Series, np.ndarray]
            Observed target. A pd.Series is unwrapped to its underlying values,
            anything else is used as given.
        **kwargs
            Accepted but not used in this method.

        Prior parameters are taken from self.model_config (a_mu_prior, a_sigma_prior,
        b_mu_prior, b_sigma_prior, eps_prior), each with a fallback default.
        """
        # Extract raw arrays: X is indexed by the "input" column, y may be a Series or an array
        X_values = X["input"].values
        y_values = y.values if isinstance(y, pd.Series) else y
        # Sets self.model_coords (used just below) as well as self.X and self.y
        self._generate_and_preprocess_model_data(X_values, y_values)

        with pm.Model(coords=self.model_coords) as self.model:

            # Create mutable data containers
            x_data = pm.MutableData("x_data", X_values)
            y_data = pm.MutableData("y_data", y_values)

            # prior parameters (each falls back to a default when the key is absent)
            a_mu_prior = self.model_config.get("a_mu_prior", 0.0)
            a_sigma_prior = self.model_config.get("a_sigma_prior", 1.0)
            b_mu_prior = self.model_config.get("b_mu_prior", 0.0)
            b_sigma_prior = self.model_config.get("b_sigma_prior", 1.0)
            eps_prior = self.model_config.get("eps_prior", 1.0)

            # priors: Normal intercept and slope, HalfNormal noise scale
            a = pm.Normal("a", mu=a_mu_prior, sigma=a_sigma_prior)
            b = pm.Normal("b", mu=b_mu_prior, sigma=b_sigma_prior)
            eps = pm.HalfNormal("eps", eps_prior)

            # likelihood: mean a + b * x, with the shape tied to the x_data container
            obs = pm.Normal("y", mu=a + b * x_data, sigma=eps, shape=x_data.shape, observed=y_data)

    def _data_setter(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray] = None
    ):
        """
        _data_setter replaces the data held by the model's "x_data" container and,
        when y is given, the "y_data" container.

        Parameters:
        X: Union[pd.DataFrame, np.ndarray]
            New predictors. A DataFrame is read through its "input" column; any other
            value is indexed as a 2-D array and its first column is used.
        y: Union[pd.Series, np.ndarray], optional
            New observed values. When None, "y_data" is left untouched.
        """
        if isinstance(X, pd.DataFrame):
            x_values = X["input"].values
        else:
            # Assuming "input" is the first column
            x_values = X[:, 0]

        with self.model:
            pm.set_data({"x_data": x_values})
            if y is not None:
                pm.set_data({"y_data": y.values if isinstance(y, pd.Series) else y})

    @property
    def default_model_config(self) -> Dict:
        """
        default_model_config is a property that returns a dictionary with all the prior values we want to build the model with.
        It supports more complex data structures like lists, dictionaries, etc.
        It will be passed to the class instance on initialization, in case the user doesn't provide any model_config of their own.
        The keys are the same ones build_model reads from self.model_config.
        """
        model_config: Dict = {
            "a_mu_prior": 0.0,
            "a_sigma_prior": 1.0,
            "b_mu_prior": 0.0,
            "b_sigma_prior": 1.0,
            "eps_prior": 1.0,
        }
        return model_config

    @property
    def default_sampler_config(self) -> Dict:
        """
        default_sampler_config is a property that returns a dictionary with the most important sampler parameters.
        It will be used in case the user doesn't provide any sampler_config of their own.
        """
        sampler_config: Dict = {
            "draws": 1_000,
            "tune": 1_000,
            "chains": 3,
            "target_accept": 0.95,
        }
        return sampler_config

    @property
    def output_var(self):
        """
        Name of the observed variable; matches the "y" variable defined in build_model.
        """
        return "y"

    @property
    def _serializable_model_config(self) -> Dict[str, Union[int, float, Dict]]:
        """
        _serializable_model_config is a property that returns a dictionary with all the model parameters that we want to save.
        As some data structures are not JSON serializable, we need to convert them to JSON-serializable objects.
        Some models will need them, others can just define them to return the model_config.
        Here the model_config is returned unchanged.
        """
        return self.model_config

    def _save_input_params(self, idata) -> None:
        """
        Saves any additional model parameters (other than the dataset) to the idata object.

        These parameters are stored within `idata.attrs` using keys that correspond to the parameter names.
        If you don't need to store any extra parameters, you can leave this method unimplemented.
        This example stores nothing: the body is a no-op.

        Example:
            For saving customer IDs provided as a 'customer_ids' input to the model:
            self.customer_ids = customer_ids.values #this line is done outside of the function, preferably at the initialization of the model object.
            idata.attrs["customer_ids"] = json.dumps(self.customer_ids.tolist())  # Convert numpy array to a JSON-serializable list.
        """
        pass

        # NOTE(review): this second `pass` is redundant; one is enough.
        pass

    def _generate_and_preprocess_model_data(
        self, X: Union[pd.DataFrame, pd.Series], y: Union[pd.Series, np.ndarray]
    ) -> None:
        """
        Depending on the model, we might need to preprocess the data before fitting the model.
        All required preprocessing and conditional assignments should be defined here.

        In this example no preprocessing happens: model_coords is set to None and
        X and y are stored on the instance as given.
        NOTE(review): build_model calls this with the extracted value arrays rather
        than with the DataFrame/Series named in the annotations.
        """
        # In our case we're not using coords, but if we were, we would define them here,
        # or later on in the function, if extracting them from the data.
        self.model_coords = None
        # As we don't do any data preprocessing, we just assign the data given by the user. Note that it's a very basic model,
        # and usually we would need to do some preprocessing, or generate the coords from the data.
        self.X = X
        self.y = y

# Training inputs: one "input" column with 100 evenly spaced values on [0, 1]
X = pd.DataFrame(data=np.linspace(start=0, stop=1, num=100), columns=["input"])
# NOTE(review): this uses the module-level array `x` defined earlier, not the DataFrame `X`,
# and it overwrites the seeded `y` generated above.
y = 0.3 * x + 0.5
# NOTE(review): noise comes from the global np.random state rather than the seeded `rng`,
# so the fitted result differs from run to run.
y = y + np.random.normal(0, 1, len(x))

# No arguments are passed, so the class's default configs are relied upon
model = LinearModel()

# Fit on the training data; the result is reused below for plotting
idata = model.fit(X, y)

# Save the fitted model to disk ...
fname = "linear_model_v1.nc"
model.save(fname)

# ... and load it back into a second instance
model_2 = LinearModel.load(fname)

# Ten new inputs drawn uniformly from [1, 2), i.e. outside the training range [0, 1]
x_pred = np.random.uniform(low=1, high=2, size=10)
prediction_data = pd.DataFrame({"input": x_pred})
# NOTE(review): bare expression; it has no effect when run as a script (looks like notebook residue)
type(prediction_data["input"].values)

# point predictions, plotted as crosses below
pred_mean = model_2.predict(prediction_data)
# samples (presumably posterior predictive draws - confirm against ModelBuilder docs);
# `pred_samples` is not used in the rest of the script
pred_samples = model_2.predict_posterior(prediction_data)

fig, ax = plt.subplots(figsize=(7, 7))
# Draw 20 samples from the fitted model and plot one regression line per sample over [1, 2]
posterior = az.extract(idata, num_samples=20)
x_plot = xr.DataArray(np.linspace(1, 2, 100))
y_plot = posterior["b"] * x_plot + posterior["a"]
Line2 = ax.plot(x_plot, y_plot.transpose(), color="C1")
Line1 = ax.plot(x_pred, pred_mean, "x")
ax.set(title="Posterior predictive regression lines", xlabel="x", ylabel="y")
# One legend handle per group: the first line of each plot call
ax.legend(
    handles=[Line1[0], Line2[0]], labels=["predicted average", "inferred regression line"], loc=0
);

plt.show()

And I ran it:

$ python test.py
$ ls 
linear_model_v1.nc  test.py  venv

Resulting plot is this:

Here are the versions:

$ python
Python 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import xarray, arviz, pandas, numpy, matplotlib, pymc, pymc_experimental
>>> [i.__version__ for i in [xarray, arviz, pandas, numpy, matplotlib, pymc, pymc_experimental]]
['2022.12.0', '0.14.0', '1.5.2', '1.23.5', '3.6.2', '5.8.0', '0.0.11']

I'm satisfied that the changes make the example work.

Some minor comments:

The method _save_input_params has pass under it twice. Either a single pass or ... should be fine.
The matplotlib plotting code has a ; which is not needed, although not invalid.
I noticed that my plot differs slightly from the example in the docs.

pymc-devs / pymc-examples

DOC: "Using ModelBuilder class for deploying PyMC models" example inconsistent #571

Issue with current documentation:

Idea or request for content: