GLM Practice

# load libraries
library(ggplot2)
library(Rmisc)

## Warning: package 'Rmisc' was built under R version 4.0.3

## Loading required package: lattice

## Loading required package: plyr

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(modelr)
library(emmeans)

## Warning: package 'emmeans' was built under R version 4.0.3

library(piecewiseSEM)

## Warning: package 'piecewiseSEM' was built under R version 4.0.3

## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car

## 
##   This is piecewiseSEM version 2.1.0.
## 
## 
##   Questions or bugs can be addressed to <LefcheckJ@si.edu>.

library(car)

## Warning: package 'car' was built under R version 4.0.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.0.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(tidyverse)

## -- Attaching packages ------------------------------------------- tidyverse 1.3.0 --

## v tibble  3.0.3     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::arrange()   masks plyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x dplyr::count()     masks plyr::count()
## x dplyr::failwith()  masks plyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x dplyr::id()        masks plyr::id()
## x dplyr::lag()       masks stats::lag()
## x dplyr::mutate()    masks plyr::mutate()
## x car::recode()      masks dplyr::recode()
## x dplyr::rename()    masks plyr::rename()
## x purrr::some()      masks car::some()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()

library(doBy)

## Warning: package 'doBy' was built under R version 4.0.3

## 
## Attaching package: 'doBy'

## The following object is masked from 'package:dplyr':
## 
##     order_by

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.0.3

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:plyr':
## 
##     mutate

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(profileModel)

## Warning: package 'profileModel' was built under R version 4.0.3

library(broom.mixed)

## Warning: package 'broom.mixed' was built under R version 4.0.3

## Registered S3 method overwritten by 'broom.mixed':
##   method      from 
##   tidy.gamlss broom

library(brms)

## Warning: package 'brms' was built under R version 4.0.3

## Loading required package: Rcpp

## Loading 'brms' package (version 2.14.0). Useful instructions
## can be found by typing help('brms'). A more detailed introduction
## to the package is available through vignette('brms_overview').

## 
## Attaching package: 'brms'

## The following object is masked from 'package:stats':
## 
##     ar

library(tidybayes)

## Warning: package 'tidybayes' was built under R version 4.0.3

## 
## Attaching package: 'tidybayes'

## The following objects are masked from 'package:brms':
## 
##     dstudent_t, pstudent_t, qstudent_t, rstudent_t

library(boot)

## 
## Attaching package: 'boot'

## The following object is masked from 'package:car':
## 
##     logit

## The following object is masked from 'package:lattice':
## 
##     melanoma

library(ggdist)

## Warning: package 'ggdist' was built under R version 4.0.3

## 
## Attaching package: 'ggdist'

## The following objects are masked from 'package:brms':
## 
##     dstudent_t, pstudent_t, qstudent_t, rstudent_t

library(loo)

## Warning: package 'loo' was built under R version 4.0.3

## This is loo version 2.3.1

## - Online documentation and vignettes at mc-stan.org/loo

## - As of v2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session.

## - Windows 10 users: loo may be very slow if 'mc.cores' is set in your .Rprofile file (see https://github.com/stan-dev/loo/issues/94).

library(modelr)

1. Comparing Means

To start with, let’s warm up with a simple one-way ANOVA model. This example, from Whitlock and Schluter chapter 15 question 22 looks at the mass of lodgepole pinecones from different habitats.

1.1. Load and plot the data. Choose a plot that not only shows the raw data, but also the means and SE or CI of those means. +1 EC if Michael thinks it’s fancy.

#load data
lodgepole <- read.csv("./raw_data/all_data/chapter15/chap15q23LodgepolePineCones.csv")

# view data
head(lodgepole)

##         habitat conemass
## 1 island.absent      9.6
## 2 island.absent      9.4
## 3 island.absent      8.9
## 4 island.absent      8.8
## 5 island.absent      8.5
## 6 island.absent      8.2

# summarize mean, sd, se and ci of data using Rmisc::summarySE function
lodgepole2 <- summarySE(lodgepole, measurevar="conemass", groupvars="habitat")

# view summary table
lodgepole2

##            habitat N conemass        sd        se        ci
## 1    island.absent 6     8.90 0.5291503 0.2160247 0.5553091
## 2   island.present 5     6.08 0.6220932 0.2782086 0.7724308
## 3 mainland.present 5     6.12 0.4658326 0.2083267 0.5784076

# plot
# Error bars represent standard error of the mean
ggplot(lodgepole2, aes(y=conemass, x=factor(habitat), fill =habitat)) + 
    geom_bar(position=position_dodge(), stat="identity") +
    geom_errorbar(aes(ymin=conemass-se, ymax=conemass+se),
                  width=.2,                    # Width of the error bars
                  position=position_dodge(.9)) +
  labs(y ="Cone Mass", x = "Habitat",
       title = "SE of lodgecone pinecone mass at different habitats")

# Use 95% confidence intervals instead of SEM
ggplot(lodgepole2, aes(y=conemass, x=factor(habitat), fill =habitat)) + 
    geom_bar(position=position_dodge(), stat="identity") +
    geom_errorbar(aes(ymin=conemass-ci, ymax=conemass+ci),
                  width=.2,                    # Width of the error bars
                  position=position_dodge(.9)) +
  labs(y ="Cone Mass", x = "Habitat",
       title = "CI of lodgecone pinecone mass at different habitats")

1.2 Fit a model using least squares and evaluate all relevant assumptions. List them out as you test them. Can we use this model? If not, fix it. But if we can, no fix is needed!

#exploring the data
ggplot(lodgepole, mapping=aes(x=habitat, y=conemass)) +
  stat_summary(color="red", size=1.3) +
    geom_point(alpha=0.7) +
  theme_bw(base_size=17)

## No summary function supplied, defaulting to `mean_se()`

# fit a model
lodgepole_glm <- glm(conemass ~ factor(habitat),
                 data = lodgepole,
                 family = gaussian(link = "identity"))


# assumptions evaluation
#The whole par thing lets me make a multi-panel plot
par(mfrow=c(2,2))
plot(lodgepole_glm, which=c(1,2,5))
par(mfrow=c(1,1))

# compare the distribution of residuals across each habitat
lodgepole <- lodgepole %>%
  add_residuals(lodgepole_glm)

qplot(habitat, resid, data = lodgepole, geom = "boxplot")

# F-test
Anova(lodgepole_glm)

## Analysis of Deviance Table (Type II tests)
## 
## Response: conemass
##                 LR Chisq Df Pr(>Chisq)    
## factor(habitat)   100.17  2  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#post-hocs tests
lodgepole_em <- emmeans::emmeans(lodgepole_glm, specs = ~ habitat)
lodgepole_em

##  habitat          emmean    SE  df asymp.LCL asymp.UCL
##  island.absent      8.90 0.221 Inf      8.47      9.33
##  island.present     6.08 0.242 Inf      5.61      6.55
##  mainland.present   6.12 0.242 Inf      5.65      6.59
## 
## Confidence level used: 0.95

# contrast means 
contrast(lodgepole_em, method = "tukey")

##  contrast                          estimate    SE  df z.ratio p.value
##  island.absent - island.present        2.82 0.328 Inf  8.596  <.0001 
##  island.absent - mainland.present      2.78 0.328 Inf  8.474  <.0001 
##  island.present - mainland.present    -0.04 0.343 Inf -0.117  0.9925 
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

# plot contrasts
plot(contrast(lodgepole_em,
        method = "tukey")) +
  geom_vline(xintercept = 0, color = "red", lty=2)

# see which groups are statistically the same versus different using cld
multcomp::cld(lodgepole_em, adjust="tukey")

## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons

##  habitat          emmean    SE  df asymp.LCL asymp.UCL .group
##  island.present     6.08 0.242 Inf      5.50      6.66  1    
##  mainland.present   6.12 0.242 Inf      5.54      6.70  1    
##  island.absent      8.90 0.221 Inf      8.37      9.43   2   
## 
## Confidence level used: 0.95 
## Conf-level adjustment: sidak method for 3 estimates 
## P value adjustment: tukey method for comparing a family of 3 estimates 
## significance level used: alpha = 0.05

# plot
multcomp::cld(lodgepole_em, adjust="tukey") %>%
  ggplot(aes(x = habitat, y = emmean, 
             ymin = asymp.LCL, ymax = asymp.UCL,
             color = factor(.group))) +
  geom_pointrange()

## Note: adjust = "tukey" was changed to "sidak"
## because "tukey" is only appropriate for one set of pairwise comparisons

1.2 How much variation is explained by your model?

A significant amount of variation was explained by this model. Out of the three pairwise comparisons, two were significantly different where, p < 0.05

1.3 Show which means are different from each other. Are you correcting p-values? If so, how, and justify your choice.

Yes, I am correcting p-values for easy detection of statistically significant differences between all pairwise comparisons.

# p-value correction
contrast(lodgepole_em,
        method = "tukey", adjust="bonferroni")

##  contrast                          estimate    SE  df z.ratio p.value
##  island.absent - island.present        2.82 0.328 Inf  8.596  <.0001 
##  island.absent - mainland.present      2.78 0.328 Inf  8.474  <.0001 
##  island.present - mainland.present    -0.04 0.343 Inf -0.117  1.0000 
## 
## P value adjustment: bonferroni method for 3 tests

2. Comparing Means from Multiple Categories

In a study from Rogers et al. (2020) link, the authors performed an experiment where they moved panels that had been colonized by invertebrates on a dock to a nearby rocky jetty where predators could access panels. To separate out the effects of changes in abiotic environment versus predation, they performed a factorial experiment, either caging or not caging panels and placing them either on the side of a cinder block or hanging on a piece of PVC attached to the block where predators would have little access (but weren’t entirely stopped). They then looked at change in total cover of invertebrates. Using this old data file dug off of my hard drive, let’s see what they found.

2.1. Load and plot the data. We are interested in change in percent cover. Choose a plot that not only shows the raw data, but also the means and SE or CI of those means. +1 EC if Michael thinks it’s fancy.

# download and read data directly and fix names using janitor::clean_names
fouling <- read.csv("./raw_data/fouling_transplant_data.csv") %>%
  janitor::clean_names()

# view data
head(fouling)

##   treatment caged position_on_block botry_init botry_fin water_init water_fin
## 1      DJHC Caged           Hanging         13        13          3         3
## 2      DJHC Caged           Hanging         11        11          5         5
## 3      DJHC Caged           Hanging         21        21          7         8
## 4      DJHC Caged           Hanging         24        22         23        21
## 5      DJHC Caged           Hanging         14        12          9         8
## 6      DJHC Caged           Hanging         28        29         13        10
##   diplo_init diplo_fin distap_init distap_fin bugula_init bugula_fin
## 1          0         0           1          1           4          4
## 2         20        25           2          4           2          3
## 3          0         0           0          3           0          0
## 4          0         0           0          0           0          0
## 5          0         0           0          0           2          1
## 6          0         0           1          1           1          1
##   initial_cover final_cover change_in_cover
## 1            21          21               0
## 2            40          48               8
## 3            28          32               4
## 4            47          43              -4
## 5            25          21              -4
## 6            43          41              -2

# convert double to factor
fouling2 <- fouling %>%
  mutate_at(vars(botry_init, botry_fin, water_init, water_fin, diplo_init, diplo_fin, distap_init, distap_fin, bugula_init, bugula_fin, initial_cover, final_cover, change_in_cover), 
            list(factor)) %>%
  summarise(treatment, caged, position_on_block, change_in_cover)

long_fouling2 <- fouling2 %>%
  pivot_longer(cols = -c(treatment, caged, position_on_block),
               names_to = "variables",
               values_to = "results")

# summarize mean, sd, se and ci of data using Rmisc::summarySE function
sum_fouling <- summaryBy(results ~ treatment + caged + position_on_block, data=long_fouling2, FUN=c(length, mean, sd)) %>%
  mutate(result.se = results.sd / sqrt(results.length)) %>%
  pivot_longer(cols = c("treatment", "caged", "position_on_block"),
               names_to = "conditions")
  
# view data
head(sum_fouling)

## # A tibble: 6 x 6
##   results.length results.mean results.sd result.se conditions        value  
##            <dbl>        <dbl>      <dbl>     <dbl> <chr>             <chr>  
## 1              8        11          4.31      1.52 treatment         DJHC   
## 2              8        11          4.31      1.52 caged             Caged  
## 3              8        11          4.31      1.52 position_on_block Hanging
## 4              8         9.88       5.25      1.86 treatment         DJHO   
## 5              8         9.88       5.25      1.86 caged             Open   
## 6              8         9.88       5.25      1.86 position_on_block Hanging

# plot
# Error bars represent standard error of the mean
ggplot(sum_fouling, aes(y=results.mean, x=value, fill =value)) +
  facet_grid(. ~ conditions) +
  geom_bar(stat="identity") +
  geom_errorbar(aes(ymin=results.mean-result.se, ymax=results.mean+result.se),
                  width=.2,                    # Width of the error bars
                  position="identity") +
  coord_flip() +
  labs(y ="Mean values", x = "Treatments",
       title = "Change in percent cover across blocks")

2.2 Fit a model using likelihood and evaluate all relevant assumptions. Do you meet assumptions?

# Factorial Anova

# fit and assumption evaluation
fouling_mle <- glm(change_in_cover ~ position_on_block*caged, data=fouling2, family="binomial")


#The whole par thing lets me make a multi-panel plot
par(mfrow=c(2,2))
plot(fouling_mle, which=c(1,2,5))

## hat values (leverages) are all = 0.125
##  and there are no factor predictors; no plot no. 5

par(mfrow=c(1,1))

# compare the distribution of residuals across each habitat
fouling2 <- fouling2 %>%
  add_residuals(fouling_mle)

## Warning in Ops.factor(response(model, data), stats::predict(model, data)): '-'
## not meaningful for factors

qplot(position_on_block, resid, data = fouling2, geom = "boxplot")

# F-tests
Anova(fouling_mle)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Analysis of Deviance Table (Type II tests)
## 
## Response: change_in_cover
##                         LR Chisq Df Pr(>Chisq)
## position_on_block          1.453  1     0.2280
## caged                      1.453  1     0.2280
## position_on_block:caged    0.000  1     0.9999

# compare means
fouling_mle_em <- emmeans(fouling_mle, ~position_on_block+caged)

multcomp::cld(fouling_mle_em)

##  position_on_block caged emmean       SE  df asymp.LCL asymp.UCL .group
##  Side              Open    1.95     1.07 Inf -1.49e-01      4.04  1    
##  Hanging           Caged  21.57 10335.23 Inf -2.02e+04  20278.24  1    
##  Side              Caged  21.57 10335.23 Inf -2.02e+04  20278.24  1    
##  Hanging           Open   21.57 10335.23 Inf -2.02e+04  20278.24  1    
## 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05

# posthocs
contrast(fouling_mle_em, method = "dunnett")

##  contrast                     estimate    SE  df z.ratio p.value
##  Side Caged - Hanging Caged        0.0 14616 Inf  0.000  1.0000 
##  Hanging Open - Hanging Caged      0.0 14616 Inf  0.000  1.0000 
##  Side Open - Hanging Caged       -19.6 10335 Inf -0.002  1.0000 
## 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: dunnettx method for 3 tests

# plot
cont <- contrast(emmeans(fouling_mle, ~ position_on_block|caged), method = "tukey")

plot(cont) +
  geom_vline(xintercept = 0, color = "red", lty = 2)

All tested assumptions for this model were not met. Means were homogeneous and residuals of change could not be identified.

2.3 If you answered yes to the above…. you are wrong. It doesn’t! Percentage data is weird. Difference in percentages can be ever weirder! There are three tried and true solutions here. But they MIGHT not all work.

Incorporate initial cover as a covariate. This takes out that influence, and as such we’re looking at residuals of change. This sometimes, but not always, works.

# Factorial Anova

# fit and assumption evaluation
fouling_mle <- glm(initial_cover ~ position_on_block*caged, data=fouling, family="gaussian")


#The whole par thing lets me make a multi-panel plot
par(mfrow=c(2,2))
plot(fouling_mle, which=c(1,2,5))

## hat values (leverages) are all = 0.125
##  and there are no factor predictors; no plot no. 5

par(mfrow=c(1,1))

# compare the distribution of residuals across each habitat
fouling <- fouling %>%
  add_residuals(fouling_mle)

qplot(position_on_block, resid, data = fouling, geom = "boxplot")

# F-tests
Anova(fouling_mle)

## Analysis of Deviance Table (Type II tests)
## 
## Response: initial_cover
##                         LR Chisq Df Pr(>Chisq)
## position_on_block        0.13871  1     0.7096
## caged                    0.87465  1     0.3497
## position_on_block:caged  0.25488  1     0.6137

# compare means
fouling_mle_em <- emmeans(fouling_mle, ~position_on_block+caged)

multcomp::cld(fouling_mle_em)

##  position_on_block caged emmean   SE  df asymp.LCL asymp.UCL .group
##  Side              Caged   38.2 7.55 Inf      23.4      53.1  1    
##  Hanging           Caged   44.9 7.55 Inf      30.1      59.7  1    
##  Hanging           Open    48.1 7.55 Inf      33.3      62.9  1    
##  Side              Open    49.1 7.55 Inf      34.3      63.9  1    
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05

# posthocs
contrast(fouling_mle_em, method = "dunnett")

##  contrast                     estimate   SE  df z.ratio p.value
##  Side Caged - Hanging Caged      -6.62 10.7 Inf -0.620  0.8391 
##  Hanging Open - Hanging Caged     3.25 10.7 Inf  0.304  0.9623 
##  Side Open - Hanging Caged        4.25 10.7 Inf  0.398  0.9344 
## 
## P value adjustment: dunnettx method for 3 tests

# plot
cont <- contrast(emmeans(fouling_mle, ~ position_on_block|caged), method = "tukey")

plot(cont) +
  geom_vline(xintercept = 0, color = "red", lty = 2)

Divide change by initial cover to express change as percent change relative to initial cover.

# add a column for percent change
fouling <-  fouling %>%
  mutate(percent_change = change_in_cover/initial_cover)

# Factorial Anova

# fit and assumption evaluation
fouling_mle <- glm(percent_change ~ position_on_block*caged, data=fouling, family="gaussian")


#The whole par thing lets me make a multi-panel plot
par(mfrow=c(2,2))
plot(fouling_mle, which=c(1,2,5))

## hat values (leverages) are all = 0.125
##  and there are no factor predictors; no plot no. 5

par(mfrow=c(1,1))

# compare the distribution of residuals across each habitat
fouling <- fouling %>%
  add_residuals(fouling_mle)

qplot(position_on_block, resid, data = fouling, geom = "boxplot")

# F-tests
Anova(fouling_mle)

## Analysis of Deviance Table (Type II tests)
## 
## Response: percent_change
##                         LR Chisq Df Pr(>Chisq)  
## position_on_block         4.8150  1    0.02821 *
## caged                     4.6258  1    0.03149 *
## position_on_block:caged   1.4525  1    0.22813  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# compare means
fouling_mle_em <- emmeans(fouling_mle, ~position_on_block+caged)

multcomp::cld(fouling_mle_em)

##  position_on_block caged  emmean     SE  df asymp.LCL asymp.UCL .group
##  Side              Open  -0.3173 0.0805 Inf    -0.475    -0.160  1    
##  Side              Caged -0.0473 0.0805 Inf    -0.205     0.110  12   
##  Hanging           Open  -0.0438 0.0805 Inf    -0.202     0.114  12   
##  Hanging           Caged  0.0322 0.0805 Inf    -0.125     0.190   2   
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05

# posthocs
contrast(fouling_mle_em, method = "dunnett")

##  contrast                     estimate    SE  df z.ratio p.value
##  Side Caged - Hanging Caged    -0.0796 0.114 Inf -0.699  0.7968 
##  Hanging Open - Hanging Caged  -0.0761 0.114 Inf -0.669  0.8137 
##  Side Open - Hanging Caged     -0.3496 0.114 Inf -3.072  0.0061 
## 
## P value adjustment: dunnettx method for 3 tests

# contrast means
cont <- contrast(emmeans(fouling_mle, ~ position_on_block|caged), method = "tukey")

# plot contrast
plot(cont) +
  geom_vline(xintercept = 0, color = "red", lty = 2)

Calculate difference in logit cover (so, logit(initial cover) - logit(final cover)). Logit transformations linearize percent cover data, and are often all that is needed to work percent cover into a linear model. You can use car::logit() for this.

#create a logit column
fouling <- fouling %>%
  mutate(Log = car::logit(initial_cover) - car::logit(final_cover))

# Factorial Anova

# fit and assumption evaluation
fouling_mle <- glm(Log ~ position_on_block*caged, data=fouling, family="gaussian")


#The whole par thing lets me make a multi-panel plot
par(mfrow=c(2,2))
plot(fouling_mle, which=c(1,2,4,5))

## hat values (leverages) are all = 0.125
##  and there are no factor predictors; no plot no. 5

par(mfrow=c(1,1))


# compare the distribution of residuals across each habitat
fouling <- fouling %>%
  add_residuals(fouling_mle)

qplot(position_on_block, resid, data = fouling, geom = "boxplot")

# F-tests
Anova(fouling_mle)

## Analysis of Deviance Table (Type II tests)
## 
## Response: Log
##                         LR Chisq Df Pr(>Chisq)  
## position_on_block         5.8783  1    0.01533 *
## caged                     3.1975  1    0.07375 .
## position_on_block:caged   0.5070  1    0.47642  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# compare means
fouling_mle_em <- emmeans(fouling_mle, ~position_on_block+caged)

multcomp::cld(fouling_mle_em)

##  position_on_block caged  emmean    SE  df asymp.LCL asymp.UCL .group
##  Hanging           Caged -0.3853 0.282 Inf    -0.937     0.167  1    
##  Hanging           Open  -0.0823 0.282 Inf    -0.634     0.470  12   
##  Side              Caged  0.0969 0.282 Inf    -0.455     0.649  12   
##  Side              Open   0.8008 0.282 Inf     0.249     1.353   2   
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 4 estimates 
## significance level used: alpha = 0.05

# posthocs
contrast(fouling_mle_em, method = "dunnett")

##  contrast                     estimate    SE  df z.ratio p.value
##  Side Caged - Hanging Caged      0.482 0.398 Inf 1.211   0.4765 
##  Hanging Open - Hanging Caged    0.303 0.398 Inf 0.761   0.7615 
##  Side Open - Hanging Caged       1.186 0.398 Inf 2.979   0.0083 
## 
## P value adjustment: dunnettx method for 3 tests

# p-value correction
contrast(fouling_mle_em,
        method = "tukey", adjust="bonferroni")

##  contrast                     estimate    SE  df z.ratio p.value
##  Hanging Caged - Side Caged     -0.482 0.398 Inf -1.211  1.0000 
##  Hanging Caged - Hanging Open   -0.303 0.398 Inf -0.761  1.0000 
##  Hanging Caged - Side Open      -1.186 0.398 Inf -2.979  0.0174 
##  Side Caged - Hanging Open       0.179 0.398 Inf  0.450  1.0000 
##  Side Caged - Side Open         -0.704 0.398 Inf -1.768  0.4624 
##  Hanging Open - Side Open       -0.883 0.398 Inf -2.218  0.1594 
## 
## P value adjustment: bonferroni method for 6 tests

# plot
cont <- contrast(emmeans(fouling_mle, ~ position_on_block|caged), method = "tukey")

plot(cont) +
  geom_vline(xintercept = 0, color = "red", lty = 2)

Try all three methods. Which one works so that you can produce valid inference?

The Logit transformation has more reasonable p-values(smaller numbers) compared to the other methods.

2.4 Great! So, take us home! Using NHST with an alpha of 0.08 (why not), what does this fit model tell you about whether predation matters given how I have described the system? Feel free to replot the data or fit model results if helpful.

Significant difference was observed in only one group where p < 0.08 (Hanging Caged - Side Open). This shows that the arrangement of the panel has a strong effect on predation. Predators can have no access when the panel is caged and hanging and have maximum access when the panel is opened and placed by the side.

3. Comparing Means with Covariates

We will wrap up with a model mixing continuous and discrete variables. In this dataset from Scantlebury et al, the authors explored how caste and mass affected the energy level of naked mole rats.

3.1 OK, you know what you are about at this point. Load in the data, plot it, fit it, check assumptions. Use Bayes for this.

# load mole rats data
mole_rats <- read.csv("./raw_data/18e4MoleRatLayabouts.csv")

# view data
head(mole_rats)

##    caste   lnmass lnenergy
## 1 worker 3.850148 3.688879
## 2 worker 3.988984 3.688879
## 3 worker 4.110874 3.688879
## 4 worker 4.174387 3.663562
## 5 worker 4.248495 3.871201
## 6 worker 4.262680 3.850148

# plot
ggplot(mole_rats, aes(x= lnmass, y = lnenergy, fill = caste)) +
  geom_boxplot()

# fit Bayes model and summarize
mole_banova <- brm(lnenergy ~ lnmass+caste,
                   data = mole_rats,
                   chains = 3,
                   family=gaussian())

## Compiling Stan program...

## Start sampling

## 
## SAMPLING FOR MODEL '70a63c5fd575285edb8f7ad3d1fac50e' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.085 seconds (Warm-up)
## Chain 1:                0.057 seconds (Sampling)
## Chain 1:                0.142 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL '70a63c5fd575285edb8f7ad3d1fac50e' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 0 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.09 seconds (Warm-up)
## Chain 2:                0.053 seconds (Sampling)
## Chain 2:                0.143 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL '70a63c5fd575285edb8f7ad3d1fac50e' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 0 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.064 seconds (Warm-up)
## Chain 3:                0.057 seconds (Sampling)
## Chain 3:                0.121 seconds (Total)
## Chain 3:

# compare means
mole_banova_em <- emmeans(mole_banova, ~lnmass+caste)
#view means
mole_banova_em

##  lnmass caste  emmean lower.HPD upper.HPD
##    4.54 lazy     3.95      3.73      4.16
##    4.54 worker   4.35      4.20      4.51
## 
## Point estimate displayed: median 
## HPD interval probability: 0.95

# Banova and variance partitioning
# emmeans helps us out here again as it provides ready access to the caste levels
sd_caste <- gather_emmeans_draws(mole_banova_em) %>%
  group_by(.draw) %>%
  summarize(sd_caste = sd(.value))

## `summarise()` ungrouping output (override with `.groups` argument)

# To get the sd of residuals for each draw of the coefficients
sd_residuals <- residuals(mole_banova, summary=FALSE) %>%
  t() %>%
  as.data.frame() %>%
  summarise_all(sd) %>%
  as.numeric

# group sd for each caste  
sd_groups <- tibble(type = c(rep("caste", 3000),
                             rep("residual", 3000)),
                    value = c(sd_caste$sd_caste, sd_residuals))

# plot
ggplot(sd_groups, 
       aes(x = value, y = type)) +
  geom_halfeyeh()

## Warning: 'geom_halfeyeh' is deprecated.
## Use 'stat_halfeye' instead.
## See help("Deprecated") and help("tidybayes-deprecated").

# make a tibble of posteriors
sd_bycol <- tibble(caste_sd = sd_caste$sd_caste,
                   residuals_sd = sd_residuals)


tidyMCMC(sd_bycol, conf.int = TRUE, conf.method = "HPDinterval")

## # A tibble: 2 x 5
##   term         estimate std.error conf.low conf.high
##   <chr>           <dbl>     <dbl>    <dbl>     <dbl>
## 1 caste_sd        0.280   0.108     0.0499     0.482
## 2 residuals_sd    0.294   0.00999   0.288      0.317

#  get % of variance by transforming sd_bycol to percentages
sd_percent_bycol <- sd_bycol/rowSums(sd_bycol) * 100

tidyMCMC(sd_percent_bycol, estimate.method = "median",
         conf.int = TRUE, conf.method = "HPDinterval")

## # A tibble: 2 x 5
##   term         estimate std.error conf.low conf.high
##   <chr>           <dbl>     <dbl>    <dbl>     <dbl>
## 1 caste_sd         49.0      11.2     22.7      63.7
## 2 residuals_sd     51.0      11.2     36.3      77.3

3.2 Examine whether there is an interaction or not using LOO cross-validation. Is a model with an interaction more predictive?

# Factorial Anova

# Using boot::cv.glm() for LOO or k-fold ####
# validate model with interaction
loo1 <- loo(mole_banova, save_psis = TRUE, cores = 2)

# validate model without interaction
mole_banova2 <- brm(lnmass ~ lnenergy, data = mole_rats,
                 family = gaussian(link = "identity"))

## Compiling Stan program...

## Start sampling

## 
## SAMPLING FOR MODEL '21b08133c08a566f8fd6ff88314fbde8' NOW (CHAIN 1).
## Chain 1: 
## Chain 1: Gradient evaluation took 0 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1: 
## Chain 1: 
## Chain 1: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 1: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 1: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 1: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 1: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 1: 
## Chain 1:  Elapsed Time: 0.042 seconds (Warm-up)
## Chain 1:                0.064 seconds (Sampling)
## Chain 1:                0.106 seconds (Total)
## Chain 1: 
## 
## SAMPLING FOR MODEL '21b08133c08a566f8fd6ff88314fbde8' NOW (CHAIN 2).
## Chain 2: 
## Chain 2: Gradient evaluation took 0 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2: 
## Chain 2: 
## Chain 2: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 2: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 2: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 2: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 2: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 2: 
## Chain 2:  Elapsed Time: 0.268 seconds (Warm-up)
## Chain 2:                0.044 seconds (Sampling)
## Chain 2:                0.312 seconds (Total)
## Chain 2: 
## 
## SAMPLING FOR MODEL '21b08133c08a566f8fd6ff88314fbde8' NOW (CHAIN 3).
## Chain 3: 
## Chain 3: Gradient evaluation took 0 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3: 
## Chain 3: 
## Chain 3: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 3: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 3: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 3: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 3: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 3: 
## Chain 3:  Elapsed Time: 0.043 seconds (Warm-up)
## Chain 3:                0.047 seconds (Sampling)
## Chain 3:                0.09 seconds (Total)
## Chain 3: 
## 
## SAMPLING FOR MODEL '21b08133c08a566f8fd6ff88314fbde8' NOW (CHAIN 4).
## Chain 4: 
## Chain 4: Gradient evaluation took 0 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4: 
## Chain 4: 
## Chain 4: Iteration:    1 / 2000 [  0%]  (Warmup)
## Chain 4: Iteration:  200 / 2000 [ 10%]  (Warmup)
## Chain 4: Iteration:  400 / 2000 [ 20%]  (Warmup)
## Chain 4: Iteration:  600 / 2000 [ 30%]  (Warmup)
## Chain 4: Iteration:  800 / 2000 [ 40%]  (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%]  (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%]  (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%]  (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%]  (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%]  (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%]  (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%]  (Sampling)
## Chain 4: 
## Chain 4:  Elapsed Time: 0.043 seconds (Warm-up)
## Chain 4:                0.043 seconds (Sampling)
## Chain 4:                0.086 seconds (Total)
## Chain 4:

loo2 <- loo(mole_banova2, save_psis = TRUE, cores = 2)

# comparison
loo_compare(loo1, loo2)

## Warning: Not all models have the same y variable. ('yhash' attributes do not
## match)

##              elpd_diff se_diff
## mole_banova   0.0       0.0   
## mole_banova2 -2.5       4.1

The model with an interaction is more predictive as it has a lower standard error (0) when compared to the model without an interaction.

3.3 Compare the two castes energy expenditure at the mean level of log mass. Are they different? How would you discuss your conclusions.

# posthocs
# compare the two castes at mean level of log mass
contrast(mole_banova_em, method = "dunnett")

##  contrast                                        estimate lower.HPD upper.HPD
##  4.54009315473429 worker - 4.54009315473429 lazy    0.396    0.0621     0.675
## 
## Point estimate displayed: median 
## HPD interval probability: 0.95

# p-value correction
contrast(mole_banova_em,
        method = "tukey", adjust="bonferroni")

##  contrast                                        estimate lower.HPD upper.HPD
##  4.54009315473429 lazy - 4.54009315473429 worker   -0.396    -0.675   -0.0621
## 
## Point estimate displayed: median 
## HPD interval probability: 0.95

# We can visualize this using `tidybayes::gather_emmeans_draws`` to see the results of the contrast.
contrast(mole_banova_em, method = "tukey") %>%
  gather_emmeans_draws() %>%
  ggplot(aes(x = .value, y = contrast)) +
  stat_halfeye() +
  geom_vline(xintercept = 0, color = "red", lty = 2)

# some interesting and useful visualizations of the means themselves with some additional geoms
gather_emmeans_draws(mole_banova_em) %>%
  ggplot(aes(x = caste, y = .value)) +
  stat_lineribbon(alpha = 0.25, fill = "gray25") +
  stat_pointinterval()

From the mean estimates and contrast plots, it can be inferred that both castes have the same level of energy expenditure.

3.4 Plot the fit model. Use tidybayes and ggdist with your model to show fit and credible intervals with the raw data points on top. modelr::data.grid() might help as well.

# make a grid using modelr::data_grid()
grid <- mole_rats %>%
  data_grid(lnmass, caste) %>%
  mutate(pred = predict(mole_banova, newdata = ., type ='response'))

#show fit and credible intervals with the raw data points on top
ggplot(mole_rats, aes(lnmass, lnenergy, color = caste)) +
  geom_point(aes(y = lnenergy - 1)) +
  geom_line(data = grid, aes(y = pred[1:60])) +
  scale_y_continuous('lnenergy', breaks = 0:1, labels = levels(mole_rats$lnenergy))

GLM Practice

Uzezi Okinedo

11/23/2020