lauken13 / mrpkit

Tools and tutorials for multi-level regression and post-stratification of survey data
Other
10 stars 0 forks source link

Bug when prediction done in a saved fit object with data with unordered levels #138

Open Dewi-Amaliah opened 2 years ago

Dewi-Amaliah commented 2 years ago

I made a slight change in the data to be unordered:

shape_survey_edit <- mrpkit::shape_survey %>%
  mutate(age = factor(age, levels = c("36-45", "26-35", "18-25", "46-55", "56-65", "66-75", "76-90")))

pop_data <- mrpkit::approx_voters_popn %>%
  mutate(state = factor(state, levels = c("D", "E", "A", "B", "C")),
         age_group = factor(age_group, levels = c("56-65", "66+", "18-35", "36-55")))

This data are then used to build and fit the model

box_pref <- SurveyData$new(
  data = shape_survey_edit,
  questions = list(
    age = "Please identify your age group",
    gender = "Please select your gender",
    vote_for = "Which party did you vote for in the 2018 election?",
    highest_educ = "Please identify your completed highest education",
    state = "Which State do you live in?",
    y = "If today is election day, will you vote for the Box Party?"
  ),
  responses = list(
    age = levels(shape_survey$age),
    gender = levels(shape_survey$gender),
    vote_for = levels(shape_survey$vote_for),
    highest_educ = levels(shape_survey$highest_educ),
    state = levels(shape_survey$state),
    y = c("no", "yes")
  ),
  weights = "wt",
  design = list(ids =  ~ 1)
)

popn_obj <- SurveyData$new(
  data = pop_data,
  questions = c(
    age_group = "Which age group are you?",
    gender = "Which gender are you identified?",
    vote_pref = "Which party do you prefer to vote?",
    education = "What is the highest grade or level of school you have completed",
    state = "Please identify the state where you live in"
  ),
  responses = list(
    age_group = levels(approx_voters_popn$age_group),
    gender = levels(approx_voters_popn$gender),
    vote_pref = levels(approx_voters_popn$vote_pref),
    education = levels(approx_voters_popn$education),
    state = levels(approx_voters_popn$state)
  ),
  weights = "wt",
  design = list(ids =  ~ 1)
)

# Create QuestionMap$object for the question related to age
q_age <- QuestionMap$new(
  name = "age",
  col_names = c("age", "age_group"),
  values_map = list(
    "18-25" = "18-35",
    "26-35" = "18-35",
    "36-45" = "36-55",
    "46-55" = "36-55",
    "56-65" = "56-65",
    "66-75" = "66+",
    "76-90" = "66+"
  )
)

# Create QuestionMap$object for the question related to party preference
q_party_pref <- QuestionMap$new(
  name = "party_pref",
  col_names = c("vote_for", "vote_pref"),
  values_map = list(
    "Box Party" = "BP",
    "BP" = "BP",
    "Circle Party" = "CP",
    "CP" = "CP"
  )
)

# Create QuestionMap$object for the question related to gender
q_gender <- QuestionMap$new(
  name = "gender",
  col_names = c("gender", "gender"),
  values_map = data.frame(
    "male" = "m",
    "female" = "f",
    "nonbinary" = "nb"
  )
)

# Create QuestionMap$object for the question related to education
q_educ <- QuestionMap$new(
  name = "highest_education",
  col_names = c("highest_educ", "education"),
  values_map = list(
    "no high school" = "no high school",
    "high school" = "high school",
    "some college" = "some college",
    "associates" = "some college",
    "4-year college" = "4-years college",
    "post-graduate" = "post-grad"
  )
)

# Create QuestionMap$object for the question related to state
q_state <- QuestionMap$new(
  name = "state",
  col_names = c("state", "state"),
  values_map = list(
    "State A" = "A",
    "State B" = "B",
    "State C" = "C",
    "State D" = "D",
    "State E" = "E"
  )
)

ex_map <- SurveyMap$new(sample = box_pref, population = popn_obj, q_age, q_educ, q_gender, q_party_pref, q_state)
ex_map$mapping()
ex_map$tabulate()

fit1 <- ex_map$fit(
  fun = rstanarm::stan_glmer,
  formula = y ~ (1 | age) + (1 | gender) + (1 | highest_education) + (1 | state),
  family = "binomial",
  refresh = 100,
  cores = 2)

# save the fit object 
saveRDS(fit1, "fit1_ex.RDS")

From this point forward, I used the ordered levels of the data and reading the saved fit object.

party_pref <- SurveyData$new(
  data = shape_survey,
  questions = list(
    age = "Please identify your age group",
    gender = "Please select your gender",
    vote_for = "Which party did you vote for in the 2018 election?",
    highest_educ = "Please identify your completed highest education",
    state = "Which State do you live in?",
    y = "If today is election day, will you vote for the Box Party?"
  ),
  responses = list(
    age = levels(shape_survey$age),
    gender = levels(shape_survey$gender),
    vote_for = levels(shape_survey$vote_for),
    highest_educ = levels(shape_survey$highest_educ),
    state = levels(shape_survey$state),
    y = c("no", "yes")
  ),
  weights = "wt",
  design = list(ids =  ~ 1)
)

popn <- SurveyData$new(
  data = approx_voters_popn,
  questions = c(
    age_group = "Which age group are you?",
    gender = "Which gender are you identified?",
    vote_pref = "Which party do you prefer to vote?",
    education = "What is the highest grade or level of school you have completed",
    state = "Please identify the state where you live in"
  ),
  responses = list(
    age_group = levels(approx_voters_popn$age_group),
    gender = levels(approx_voters_popn$gender),
    vote_pref = levels(approx_voters_popn$vote_pref),
    education = levels(approx_voters_popn$education),
    state = levels(approx_voters_popn$state)
  ),
  weights = "wt",
  design = list(ids =  ~ 1)
)

# Create QuestionMap$object for the question related to age
q_age <- QuestionMap$new(
  name = "age",
  col_names = c("age", "age_group"),
  values_map = list(
    "18-25" = "18-35",
    "26-35" = "18-35",
    "36-45" = "36-55",
    "46-55" = "36-55",
    "56-65" = "56-65",
    "66-75" = "66+",
    "76-90" = "66+"
  )
)

# Create QuestionMap$object for the question related to party preference
q_party_pref <- QuestionMap$new(
  name = "party_pref",
  col_names = c("vote_for", "vote_pref"),
  values_map = list(
    "Box Party" = "BP",
    "BP" = "BP",
    "Circle Party" = "CP",
    "CP" = "CP"
  )
)

# Create QuestionMap$object for the question related to gender
q_gender <- QuestionMap$new(
  name = "gender",
  col_names = c("gender", "gender"),
  values_map = data.frame(
    "male" = "m",
    "female" = "f",
    "nonbinary" = "nb"
  )
)

# Create QuestionMap$object for the question related to education
q_educ <- QuestionMap$new(
  name = "highest_education",
  col_names = c("highest_educ", "education"),
  values_map = list(
    "no high school" = "no high school",
    "high school" = "high school",
    "some college" = "some college",
    "associates" = "some college",
    "4-year college" = "4-years college",
    "post-graduate" = "post-grad"
  )
)

# Create QuestionMap$object for the question related to state
q_state <- QuestionMap$new(
  name = "state",
  col_names = c("state", "state"),
  values_map = list(
    "State A" = "A",
    "State B" = "B",
    "State C" = "C",
    "State D" = "D",
    "State E" = "E"
  )
)

ex_map <- SurveyMap$new(sample = party_pref, population = popn, q_age, q_educ, q_gender, q_party_pref, q_state)
ex_map$mapping()
ex_map$tabulate()

# read the saved fit object, hence the ordered-levels of the data using the fit from the un-ordered levels.
fit1 <- readRDS("fit1_ex.RDS")
poststrat_est_fit1 <- fit1$population_predict()

# aggregate the predicted value by age using fit1
age_estimation <- fit1$aggregate(poststrat_est_fit1, by = "age")
age_estimation %>%
  group_by(age) %>%
  summarize(mean = mean(value), sd = sd(value))

I fit the model again (from the ordered data) and re-doing the prediction.

# same as fit1 essentially
fit2 <- ex_map$fit(
  fun = rstanarm::stan_glmer,
  formula = y ~ (1 | age) + (1 | gender) + (1 | highest_education) + (1 | state),
  family = "binomial",
  refresh = 100,
  cores = 2)

poststrat_est_fit2 <- fit2$population_predict()
age_estimation2 <- fit2$aggregate(poststrat_est_fit2, by = "age")

age_estimation2 %>%
  group_by(age) %>%
  summarize(mean = mean(value), sd = sd(value))

In which the results are slightly different.

lauken13 commented 2 years ago

Could this be solved by adding a unique check? Perhaps we should discuss tomorrow?

lauken13 commented 2 years ago

Problem isn't illustrated clearly, @lauken13 to investigate