Estimating organic carbon from organic matter: lm(log) vs lognormal glm

Pakillo commented 1 year ago

download.file("https://github.com/EcologyR/BlueCarbon/raw/master/data/bluecarbon_data.rda",
              destfile = "bluecarbon_data.rda", mode = "wb")
load("bluecarbon_data.rda")

df <- bluecarbon_data[, c("om", "oc")]
df <- df[complete.cases(df), ]

plot(oc ~ om, df)

plot(log(oc) ~ log(om), df)

library(visreg)

library(DHARMa)
#> This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')

## 1. Linear model

m1 <- lm(oc ~ om, data = df)
summary(m1)
#> 
#> Call:
#> lm(formula = oc ~ om, data = df)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -3.5660 -0.3916 -0.1385  0.2927  7.1870 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.11552    0.06694   1.726   0.0852 .  
#> om           0.30743    0.01399  21.969   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.7842 on 387 degrees of freedom
#> Multiple R-squared:  0.555,  Adjusted R-squared:  0.5538 
#> F-statistic: 482.6 on 1 and 387 DF,  p-value: < 2.2e-16
visreg(m1)

simulateResiduals(m1, plot = TRUE)

#> Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
#>  
#> Scaled residual values: 0.712 0.16 0.968 0.804 0.964 1 1 1 0.996 0.628 0.964 0.988 0.896 0.848 0.732 0.732 0.724 0.996 0.94 0.968 ...
summary(fitted(m1))
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.2717  0.7778  1.0920  1.2985  1.4930  9.0190
summary(df$oc, na.rm = TRUE)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  0.05172  0.52067  1.10691  1.29848  1.59969 12.43664
plot(df$oc, fitted(m1), xlim = c(0, 4), ylim = c(0, 4))
abline(a = 0, b = 1, col = "red")

cor(df$oc, fitted(m1))
#> [1] 0.7449687

## 2. lm(log(oc))

m2 <- lm(log(oc) ~ log(om), data = df)
summary(m2)
#> 
#> Call:
#> lm(formula = log(oc) ~ log(om), data = df)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -1.76185 -0.36345  0.01436  0.40042  1.13577 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -1.39163    0.05390  -25.82   <2e-16 ***
#> log(om)      1.14084    0.04118   27.70   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.5207 on 387 degrees of freedom
#> Multiple R-squared:  0.6647, Adjusted R-squared:  0.6639 
#> F-statistic: 767.3 on 1 and 387 DF,  p-value: < 2.2e-16
visreg(m2)

simulateResiduals(m2, plot = TRUE)

#> Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
#>  
#> Scaled residual values: 0.876 0.004 0.976 0.952 0.936 0.972 0.928 0.924 0.972 0.868 0.96 0.972 0.972 0.928 0.792 0.792 0.88 0.96 0.936 0.948 ...
summary(exp(fitted(m2)))
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.1148  0.5969  0.9294  1.1967  1.3762 11.5698
summary(df$oc, na.rm = TRUE)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  0.05172  0.52067  1.10691  1.29848  1.59969 12.43664
plot(df$oc, exp(fitted(m2)), xlim = c(0, 4), ylim = c(0, 4))
abline(a = 0, b = 1, col = "red")

cor(df$oc, fitted(m2))
#> [1] 0.7052702

## 3. lognormal glm

library(VGAM)
#> Loading required package: stats4
#> Loading required package: splines
m3 <- vglm(oc ~ log(om), data = df, family = lognormal)
summary(m3)
#> 
#> Call:
#> vglm(formula = oc ~ log(om), family = lognormal, data = df)
#> 
#> Coefficients: 
#>               Estimate Std. Error z value Pr(>|z|)    
#> (Intercept):1 -1.39163    0.05376  -25.89   <2e-16 ***
#> (Intercept):2 -0.65519    0.03585  -18.27   <2e-16 ***
#> log(om)        1.14084    0.04108   27.77   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Names of linear predictors: meanlog, loglink(sdlog)
#> 
#> Log-likelihood: -262.1058 on 775 degrees of freedom
#> 
#> Number of Fisher scoring iterations: 7 
#> 
#> No Hauck-Donner effect found in any of the estimates
plot(fitted(m3) ~ log(df$om))

summary(fitted(m3))
#>        V1         
#>  Min.   : 0.1314  
#>  1st Qu.: 0.6830  
#>  Median : 1.0636  
#>  Mean   : 1.3695  
#>  3rd Qu.: 1.5749  
#>  Max.   :13.2402
summary(df$oc, na.rm = TRUE)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  0.05172  0.52067  1.10691  1.29848  1.59969 12.43664
plot(df$oc, fitted(m3), xlim = c(0, 4), ylim = c(0, 4))
abline(a= 0, b = 1, col = "red")

cor(df$oc, fitted(m3))
#>           [,1]
#> [1,] 0.7335657

^{Created on 2023-05-22 with reprex v2.0.2}

Pakillo commented 1 year ago

Now with simulated data:

set.seed(8)
om <- rlnorm(100, meanlog = 1.5, sdlog = 1)
hist(om, breaks = 50)

summary(om)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.2199  2.1947  4.1499  6.8813  8.1795 48.2349

oc <- rlnorm(100, log(0.6*om), 0.5)
hist(oc, breaks = 50)

summary(oc)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  0.07848  1.04400  2.51161  4.58285  5.61627 36.91661

plot(oc ~ om)


plot(log(oc) ~ log(om))

df <- data.frame(om = om, oc = oc)

library(visreg)

library(DHARMa)
#> This is DHARMa 0.4.6. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')

## 1. Linear model

m1 <- lm(oc ~ om, data = df)
summary(m1)
#> 
#> Call:
#> lm(formula = oc ~ om, data = df)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -11.2993  -1.0384  -0.4855   0.5393  24.6540 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  0.62094    0.46198   1.344    0.182    
#> om           0.57575    0.04427  13.005   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 3.473 on 98 degrees of freedom
#> Multiple R-squared:  0.6331, Adjusted R-squared:  0.6294 
#> F-statistic: 169.1 on 1 and 98 DF,  p-value: < 2.2e-16
visreg(m1)

simulateResiduals(m1, plot = TRUE)

#> Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
#>  
#> Scaled residual values: 0.556 0.124 0.352 0.384 0.444 0.352 0.548 0.396 0.48 0.504 0.436 0.284 0.596 0.348 0.792 0.548 0.66 0.708 0.028 0.44 ...
summary(fitted(m1))
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.7476  1.8845  3.0103  4.5828  5.3302 28.3920
summary(df$oc, na.rm = TRUE)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  0.07848  1.04400  2.51161  4.58285  5.61627 36.91661
plot(df$oc, fitted(m1), xlim = c(0, 8), ylim = c(0, 8))
abline(a = 0, b = 1, col = "red")

cor(df$oc, fitted(m1))
#> [1] 0.7956948

## 2. lm(log(oc))

m2 <- lm(log(oc) ~ log(om), data = df)
summary(m2)
#> 
#> Call:
#> lm(formula = log(oc) ~ log(om), data = df)
#> 
#> Residuals:
#>      Min       1Q   Median       3Q      Max 
#> -1.20306 -0.40277  0.07664  0.34591  1.22284 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept) -0.51697    0.08854  -5.839 6.83e-08 ***
#> log(om)      1.00314    0.05004  20.047  < 2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.537 on 98 degrees of freedom
#> Multiple R-squared:  0.804,  Adjusted R-squared:  0.802 
#> F-statistic: 401.9 on 1 and 98 DF,  p-value: < 2.2e-16
visreg(m2)

simulateResiduals(m2, plot = TRUE)

#> Object of Class DHARMa with simulated residuals based on 250 simulations with refit = FALSE . See ?DHARMa::simulateResiduals for help. 
#>  
#> Scaled residual values: 0.664 0.012 0.072 0.056 0.488 0.316 0.72 0.36 0.588 0.728 0.448 0.184 0.74 0.12 0.924 0.924 0.584 0.924 0.116 0.432 ...
summary(exp(fitted(m2)))
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.1305  1.3120  2.4858  4.1345  4.9099 29.1156
summary(df$oc, na.rm = TRUE)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  0.07848  1.04400  2.51161  4.58285  5.61627 36.91661
plot(df$oc, exp(fitted(m2)), xlim = c(0, 8), ylim = c(0, 8))
abline(a = 0, b = 1, col = "red")

cor(df$oc, fitted(m2))
#> [1] 0.7033441

## 3. lognormal glm

library(VGAM)
#> Loading required package: stats4
#> Loading required package: splines
m3 <- vglm(oc ~ log(om), data = df, family = lognormal)
summary(m3)
#> 
#> Call:
#> vglm(formula = oc ~ log(om), family = lognormal, data = df)
#> 
#> Coefficients: 
#>               Estimate Std. Error z value Pr(>|z|)    
#> (Intercept):1 -0.51697    0.08765  -5.898 3.67e-09 ***
#> (Intercept):2 -0.63181    0.07071  -8.935  < 2e-16 ***
#> log(om)        1.00314    0.04954  20.250  < 2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Names of linear predictors: meanlog, loglink(sdlog)
#> 
#> Log-likelihood: -168.1262 on 197 degrees of freedom
#> 
#> Number of Fisher scoring iterations: 9 
#> 
#> No Hauck-Donner effect found in any of the estimates
plot(fitted(m3) ~ log(df$om))

summary(fitted(m3))
#>        V1         
#>  Min.   : 0.1503  
#>  1st Qu.: 1.5111  
#>  Median : 2.8631  
#>  Mean   : 4.7621  
#>  3rd Qu.: 5.6551  
#>  Max.   :33.5350
summary(df$oc, na.rm = TRUE)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>  0.07848  1.04400  2.51161  4.58285  5.61627 36.91661
plot(df$oc, fitted(m3), xlim = c(0, 8), ylim = c(0, 8))
abline(a= 0, b = 1, col = "red")

cor(df$oc, fitted(m3))
#>           [,1]
#> [1,] 0.7954332

^{Created on 2023-05-22 with reprex v2.0.2}

Pakillo commented 1 year ago

lognormal models slightly better, even though lm(log) don't perform bad. lm models on absolute scales do perform badly. See #26

Julenasti commented 1 year ago

Very nice tests Paco! Thanks!! Yes, I agree using a lognormal or lm(log)

EcologyR / BlueCarbon

Estimating organic carbon from organic matter: lm(log) vs lognormal glm #28