gasparrini / 2012_gasparrini_StatMed_Rcodedata

Updated R code and data from Gasparrini StatMed 2012
5 stars 5 forks source link

Problem with crosspred for lags #1

Closed juandavidgutier closed 2 weeks ago

juandavidgutier commented 2 weeks ago

Hi @gasparrini,

I have a file called top50.csv. In that file, there are 50 municipalities (variable municipality), each with 192 observations (1 per month between 2007 and 2022). What I am trying to do is modify your code to estimate the effect of the variable SoilMoi on the variable Cases. In the first stage, I am testing three different models, each performing a time series analysis specific to each municipality using a quasi-Poisson generalized linear model with a distributed lag model (DLM). In the second stage, I am combining the municipality-specific estimates using a random effects meta-analysis adjusted with restricted maximum likelihood (REML) and nested random effects defined by municipality, so that the combined estimate represents the overall average association between the variables SoilMoi and Cases.

However, when I try to use the function crosspred to estimate the effect for lag values, I get this error: Error in crosspred(blag, coef = coef(mvall), vcov = vcov(mvall), model.link = "log", : coef/vcov not consistent with basis matrix. See help(crosspred)

Here is my code:

'library(dlnm) ; library(mvmeta) ; library(splines) ; library(dplyr); library(ggplot2); library(RCurl); library(MKdescr)

LOAD THE DATASET

url_path = "https://raw.githubusercontent.com/juandavidgutier/soil_moisture_malaria/master/top50.csv" muni_Col <- read.csv(url_path) dim(muni_Col) head(muni_Col)

sd units

muni_Col$Rain <- zscore(muni_Col$Rain, na.rm = TRUE) muni_Col$Runoff <- zscore(muni_Col$Runoff, na.rm = TRUE) muni_Col$SST12 <- zscore(muni_Col$SST12, na.rm = TRUE) muni_Col$SST3 <- zscore(muni_Col$SST3, na.rm = TRUE) muni_Col$SST34 <- zscore(muni_Col$SST34, na.rm = TRUE) muni_Col$SST4 <- zscore(muni_Col$SST4, na.rm = TRUE) muni_Col$NATL <- zscore(muni_Col$NATL, na.rm = TRUE) muni_Col$SATL <- zscore(muni_Col$SATL, na.rm = TRUE) muni_Col$TROP <- zscore(muni_Col$TROP, na.rm = TRUE)

REGIONS

regions <- as.character(unique(muni_Col$municipality))

CREATE A LIST WITH THE REGIONAL SERIES

data <- lapply(regions,function(x) muni_Col[muni_Col$municipality==x,]) names(data) <- regions m <- length(regions)

SoilMoi RANGES

ranges <- t(sapply(data, function(x) range(x$SoilMoi,na.rm=T)))

FUNCTION TO COMPUTE THE Q-AIC IN QUASI-POISSON MODELS

fqaic <- function(model) { loglik <- sum(dpois(model$y,model$fitted.values,log=TRUE)) phi <- summary(model)$dispersion qaic <- -2loglik + 2summary(model)$df[3]*phi return(qaic) }

Lag 0-4 months

lag <- c(0,4)

FIRST STAGE

MAIN MODEL

argvar <- list(fun="poly", degree=2, cen=0.0) arglag <- list(fun="ns", df=2, intercept=FALSE)

ALTERNATIVE MODELS

argvar2 <- list(fun="ns", df=3, cen=0.0) arglag2 <- list(fun="poly", degree=2,intercept=FALSE) argvar3 <- list(fun="bs", df=3, cen=0.0) arglag3 <- list(fun="poly", degree=2, intercept=FALSE)

BUILT OBJECTS WHERE RESULTS WILL BE STORED

y- IS THE MATRIX FOR THE OUTCOME PARAMETERS

S- IS THE LISTS OF (CO)VARIANCE MATRICES

OVERALL CUMULATIVE SUMMARIES

yall <- matrix(NA,length(data),2,dimnames=list(regions,paste("b",seq(2),sep=""))) yall2 <- matrix(NA,length(data),3,dimnames=list(regions,paste("b",seq(3),sep="")))

yall3 <- yall2

(CO)VARIANCE MATRICES

Sall <- vector("list",length(data)) names(Sall) <- regions Sall2 <- Sall3 <- Sall

Q-AIC

qaic <- qaic2 <- qaic3 <- 0

RUN THE MODEL FOR EACH CITY

LOOP FOR CITIES

system.time({ for(i in seq(data)) {

# LOAD
sub <- data[[i]]

# DEFINE THE CROSS-BASES
suppressWarnings({
  #cb <- crossbasis(sub$SoilMoi,lag=lag,argvar=argvar,arglag=arglag)

  cb <- crossbasis(sub$SoilMoi, lag = lag, argvar = argvar, arglag = arglag)
  cb2 <- crossbasis(sub$SoilMoi,lag=lag,argvar=argvar2,arglag=arglag2)
  cb3 <- crossbasis(sub$SoilMoi,lag=lag,argvar=argvar3,arglag=arglag3)
})

# RUN THE FIRST-STAGE MODELS
mfirst <- glm(Cases ~ cb + Rain + SST12 + SST3 + SST34 + SST4 + NATL + SATL + TROP, maxit = 1000,
              family=quasipoisson(), data[[i]], offset = log(data[[i]]$total_population))
mfirst2 <- glm(Cases ~ cb2 + Rain + SST12 + SST3 + SST34 + SST4 + NATL + SATL + TROP, maxit = 1000,
               family=quasipoisson(), data[[i]], offset = log(data[[i]]$total_population))
mfirst3 <- glm(Cases ~ cb3 + Rain + SST12 + SST3 + SST34 + SST4 + NATL + SATL + TROP, maxit = 1000,
               family=quasipoisson(), data[[i]], offset = log(data[[i]]$total_population))

# REDUCTION TO SUMMARY ASSOCIATIONS

# TO OVERALL CUMULATIVE SUMMARY
suppressWarnings({
  crall <- crossreduce(cb,mfirst)
  crall2 <- crossreduce(cb2,mfirst2)
  crall3 <- crossreduce(cb3,mfirst3)
})

# STORE THE RESULTS

# OVERALL CUMULATIVE SUMMARY FOR THE MAIN MODEL
yall[i,] <- coef(crall)
Sall[[i]] <- vcov(crall)

# OVERALL CUMULATIVE SUMMARY FOR THE ALTERNATIVE MODELS
yall2[i,] <- coef(crall2)
yall3[i,] <- coef(crall3)
Sall2[[i]] <- vcov(crall2)
Sall3[[i]] <- vcov(crall3)

# Q-AIC
qaic[i] <- fqaic(mfirst)
qaic2[i] <- fqaic(mfirst2)
qaic3[i] <- fqaic(mfirst3)

} })

GRAND Q-AIC

sum(qaic) ; sum(qaic2) ; sum(qaic3)

SECOND STAGE

PERFORM MULTIVARIATE META-ANALYSIS

SELECT THE ESTIMATION METHOD

method <- "reml"

OVERALL CUMULATIVE SUMMARY FOR THE SECOND MODEL

mvall <- mvmeta(yall2~1,Sall2,method=method) summary(mvall)

BASES OF SoilMoi AND LAG USED TO PREDICT, EQUAL TO THAT USED FOR ESTIMATION

COMPUTED USING THE ATTRIBUTES OF THE CROSS-BASIS USED IN ESTIMATION

bound <- colMeans(ranges)

xvar <- seq(bound[1],bound[2],by=0.1)

bvar <- do.call("onebasis",c(list(x=xvar),attr(cb2,"argvar"))) xlag <- 1:400/100 blag <- do.call("onebasis",c(list(x=xlag),attr(cb2,"arglag")))

REGION-SPECIFIC FIRST-STAGE SUMMARIES

regall <- apply(yall2,1,function(x) exp(bvar%*%x))

PREDICTION FOR SoilMoi FOR SECOND MODEL

cpall <- crosspred(bvar,coef=coef(mvall),vcov=vcov(mvall), cen=0.0, model.link="log",by=0.1,from=bound[1],to=bound[2])

PREDICTION FOR LAG VALUES FOR SECOND MODEL

HERE THE ERROR

cpall_lag <- crosspred(blag, coef = coef(mvall), vcov = vcov(mvall), model.link = "log", by = 0.1)'

gasparrini commented 2 weeks ago

In the last line, you are trying to predict a lag-response curve with the lag basis and coefficients defined over the (overall cumulative) exposure-response curve. The error message tells you that these two are not consistent. If you want to do it correctly, you need to do a different reduction with crossreduce(), specifically a reduction to the dimension of the lag, not the dimension of overall cumulative exposure -response (the default).