bambinos / formulae

Formulas for mixed-effects models in Python
https://bambinos.github.io/formulae/
MIT License
56 stars 14 forks source link

Formula spec fails when more than one function call #87

Closed FabricatiDiem closed 1 year ago

FabricatiDiem commented 1 year ago

I'm trying out a basic polynomial regression with Bambi, but I keep hitting the following error in Formulae when specifying multiple higher-order terms:

TypeError                                 Traceback (most recent call last)
Cell In[68], line 10
      4 test_data_df = pd.DataFrame.from_dict({
      5     'X': [-1, -0.97979798, -0.95959596, -0.93939394, -0.91919192],
      6     'y': [-6.2, -5.93976743, -5.68822415, -5.44516237, -5.21037433]
      7 })
      9 # model = bmb.Model("y ~ X + np.power(X, 2)", test_data_df) -- works fine
---> 10 model = bmb.Model("y ~ X + np.power(X, 2) + np.power(X, 3)", test_data_df)
     11 # model = bmb.Model("y ~ np.power(X, 2) + np.power(X, 3)", test_data_df)  -- also fails for the same reason

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/bambi/models.py:142, in Model.__init__(self, formula, data, family, priors, link, categorical, potentials, dropna, auto_scale, noncentered, priors_cor)
    140 na_action = "drop" if dropna else "error"
    141 self.formula = formula
--> 142 self._design = design_matrices(formula, data, na_action, 1, extra_namespace)
    144 if self._design.response is None:
    145     raise ValueError(
    146         "No outcome variable is set! "
    147         "Please specify an outcome variable using the formula interface."
    148     )

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/matrices.py:497, in design_matrices(formula, data, na_action, env, extra_namespace)
    494 env = Environment.capture(env, reference=1)
    495 env = env.with_outer_namespace(extra_namespace)
--> 497 description = model_description(formula)
    499 # Incomplete rows are calculated using columns involved in model formula only
    500 cols_to_select = description.var_names.intersection(set(data.columns))

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/model_description.py:22, in model_description(formula)
      6 def model_description(formula):
      7     """Interpret model formula and obtain a model description.
      8 
      9     This function receives a string with a formula describing a statistical
   (...)
     20     An object of class ModelTerms with an internal description of the model.
     21     """
---> 22     return Resolver(Parser(Scanner(formula).scan()).parse()).resolve()

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/resolver.py:16, in Resolver.resolve(self)
     15 def resolve(self):
---> 16     return self.expr.accept(self)

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/expr.py:72, in Binary.accept(self, visitor)
     71 def accept(self, visitor):
---> 72     return visitor.visitBinaryExpr(self)

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/resolver.py:24, in Resolver.visitBinaryExpr(self, expr)
     22 otype = expr.operator.kind
     23 if otype == "TILDE":
---> 24     return Response(expr.left.accept(self)) + expr.right.accept(self)
     25 if otype == "PLUS":
     26     return expr.left.accept(self) + expr.right.accept(self)

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/expr.py:72, in Binary.accept(self, visitor)
     71 def accept(self, visitor):
---> 72     return visitor.visitBinaryExpr(self)

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/resolver.py:26, in Resolver.visitBinaryExpr(self, expr)
     24     return Response(expr.left.accept(self)) + expr.right.accept(self)
     25 if otype == "PLUS":
---> 26     return expr.left.accept(self) + expr.right.accept(self)
     27 elif otype == "MINUS":
     28     return expr.left.accept(self) - expr.right.accept(self)

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/terms/terms.py:865, in Model.__add__(self, other)
    863     return self - Intercept()
    864 elif isinstance(other, (Term, GroupSpecificTerm, Intercept)):
--> 865     return self.add_term(other)
    866 elif isinstance(other, type(self)):
    867     for term in other.terms:

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/terms/terms.py:1096, in Model.add_term(self, term)
   1094     return self
   1095 elif isinstance(term, (Term, Intercept)):
-> 1096     if term not in self.common_terms:
   1097         self.common_terms.append(term)
   1098     return self

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/terms/terms.py:237, in Term.__eq__(self, other)
    235     return False
    236 else:
--> 237     return self.components == other.components

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/terms/call.py:51, in Call.__eq__(self, other)
     49 if not isinstance(other, type(self)):
     50     return False
---> 51 return self.call == other.call

File ~/opt/anaconda3/envs/bayesEducation/lib/python3.10/site-packages/formulae/terms/call_resolver.py:225, in LazyCall.__eq__(self, other)
    222 def __eq__(self, other):
    223     return (
    224         self.callee == other.callee
--> 225         and set(self.args) == set(other.args)
    226         and set(self.kwargs) == set(other.kwargs)
    227     )

TypeError: unhashable type: 'LazyValue'

Here is a minimal example to reproduce:

import bambi as bmb
import pandas as pd

test_data_df = pd.DataFrame.from_dict({
    'X': [-1, -0.97979798, -0.95959596, -0.93939394, -0.91919192],
    'y': [-6.2, -5.93976743, -5.68822415, -5.44516237, -5.21037433]
})

# model = bmb.Model("y ~ X + np.power(X, 2)", test_data_df) -- works fine
model = bmb.Model("y ~ X + np.power(X, 2) + np.power(X, 3)", test_data_df)
# model = bmb.Model("y ~ np.power(X, 2) + np.power(X, 3)", test_data_df)  -- also fails for the same reason

Using: Python 3.10, PyMC 5.02, Bambi 0.9.3, Formulae 0.3.4

It seems like something like this used to work before, so I'm curious what I am doing wrong.

tomicapretto commented 1 year ago

@FabricatiDiem thanks for opening the issue!

This is indeed a bug. The problem is not about using multiple functions. The problem is that some internal machinery is not able to compare two values, and that's because of the literal arguments '2' and '3' in the function calls.

If you define custom functions and call them with only variable names as arguments, it works:

test_data_df = pd.DataFrame.from_dict({
    'X': [-1, -0.97979798, -0.95959596, -0.93939394, -0.91919192],
    'y': [-6.2, -5.93976743, -5.68822415, -5.44516237, -5.21037433],
})

def p2(x):
    return np.power(x, 2)

def p3(x):
    return np.power(x, 3)

bmb.Model("y ~ X + p2(X) + p3(X)", test_data_df)
       Formula: y ~ X + p2(X) + p3(X)
        Family: gaussian
          Link: mu = identity
  Observations: 5
        Priors: 
    target = mu
        Common-level effects
            Intercept ~ Normal(mu: -5.6967, sigma: 34.3017)
            X ~ Normal(mu: 0.0, sigma: 30.6203)
            p2(X) ~ Normal(mu: 0.0, sigma: 15.9535)
            p3(X) ~ Normal(mu: 0.0, sigma: 11.0754)

        Auxiliary parameters
            y_sigma ~ HalfStudentT(nu: 4.0, sigma: 0.3499)

So this is just a workaround to make it work now. I'll try to fix the bug.

FabricatiDiem commented 1 year ago

Thank you!

tomicapretto commented 1 year ago

@FabricatiDiem it's fixed if you install from the master branch now :)