# load all required libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(rsample)
## Warning: package 'rsample' was built under R version 4.0.3
library(modelr)
library(boot)
library(AICcmodavg)
## Warning: package 'AICcmodavg' was built under R version 4.0.3
library(tidyr)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.0.3
library(ggpmisc)
## Warning: package 'ggpmisc' was built under R version 4.0.3
## 
## Attaching package: 'ggpmisc'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(palmerpenguins)
library(rgl)
## Warning: package 'rgl' was built under R version 4.0.3
library(rayshader)
## Warning: package 'rayshader' was built under R version 4.0.3

1. Create models with different polys

Let’s first look at the data. Plot it, along with a polynomial fit (remember, formula = y ~ poly(x,2) for a quadratic). Then, compare the r2 value of a linear versus fifth order fit. What do you see?

# read progesterone data
progesterone_data <- read.csv("./raw_data/all_data/chapter17/chap17q07ProgesteroneExercise.csv")

# view data
progesterone_data
##    progesterone ventilation
## 1           9.8        62.1
## 2          13.8        55.8
## 3          17.3        57.7
## 4          22.5        58.1
## 5          12.7        65.3
## 6          20.2        66.4
## 7          20.4        68.6
## 8          23.3        69.3
## 9          32.9        58.8
## 10         37.5        65.6
## 11         35.2        72.3
## 12         54.2        64.1
## 13         59.4        59.3
## 14         78.3        53.4
## 15         79.6        52.5
## 16        100.6        54.8
## 17         91.5        60.5
## 18        106.0        64.7
## 19         80.6        68.1
## 20         66.2        65.1
## 21         58.8        69.6
## 22         54.8        72.3
## 23         71.0        74.3
## 24         73.1        80.6
## 25         59.4        88.8
## 26        106.5        79.2
## 27        164.8        84.8
## 28        155.2        97.1
## 29        104.6       106.4
## 30         84.4       105.9
# Basic scatter plot using ggplot2
ggplot(progesterone_data, aes(x= progesterone, y= ventilation)) +
  geom_point() +
  # plot with a fifth-order polynomial fit: formula = y ~ poly(x, 5)
  stat_smooth(method = lm, formula = y ~ poly(x, 5)) +
  # add the R^2 of the fit to the plot with ggpmisc::stat_poly_eq()
  stat_poly_eq(formula = y ~ poly(x, 5), parse = TRUE)

#fit a linear model
progesterone_lm <- lm(ventilation ~ progesterone, data=progesterone_data)
#fit a polynomial model
progesterone_poly <- lm(ventilation ~ poly(progesterone, 5), data = progesterone_data)

# check r2 value of linear model
summary(progesterone_lm)$r.squared
## [1] 0.2371781
# check r2 value of polynomial model
summary(progesterone_poly)$r.squared
## [1] 0.2746004

The r2 value of the fifth-order fit (0.2746004) is slightly higher than that of the linear fit (0.2371781).

2. Fit each model with 5-fold CV

Does that result hold up, or is it due to overfitting? Let’s evaluate by comparing 5-fold CV scores using RMSE. Let’s do this efficiently, though!

A. Get things ready! Make a 5-fold cross validation tibble using rsample::vfold_cv() and then combine each possible fold with the polynomials 1:5 using tidyr::crossing()

# 5-Fold Cross-Validation

# make a folded data set object using rsample::vfold_cv()
progesterone_five_fold <- vfold_cv(progesterone_data, v = 5) %>%
  # combine each fold with polynomial degrees 1:5 using tidyr::crossing()
  crossing(polynomial = 1:5)

# view CV data
progesterone_five_fold
## # A tibble: 25 x 3
##    splits         id    polynomial
##    <list>         <chr>      <int>
##  1 <split [24/6]> Fold1          1
##  2 <split [24/6]> Fold1          2
##  3 <split [24/6]> Fold1          3
##  4 <split [24/6]> Fold1          4
##  5 <split [24/6]> Fold1          5
##  6 <split [24/6]> Fold2          1
##  7 <split [24/6]> Fold2          2
##  8 <split [24/6]> Fold2          3
##  9 <split [24/6]> Fold2          4
## 10 <split [24/6]> Fold2          5
## # ... with 15 more rows

B. Now you have splits and a column of coefficients. Use purrr::map2() to make a list column of fit models, where you use the splits for the data and the polynomials for your poly() call in the model.

# start with our tibble
prog_mod <- progesterone_five_fold %>%
  # create a new column with map2(),
  # iterating over all splits AND polynomials:
  # for each split, fit a model of the given polynomial degree
  # to the training (analysis) portion of that split
  mutate(mods = map2(splits,
                     polynomial,
                     ~lm(ventilation ~ poly(progesterone, .y),
                         data = analysis(.x))))

# view data
prog_mod
## # A tibble: 25 x 4
##    splits         id    polynomial mods  
##    <list>         <chr>      <int> <list>
##  1 <split [24/6]> Fold1          1 <lm>  
##  2 <split [24/6]> Fold1          2 <lm>  
##  3 <split [24/6]> Fold1          3 <lm>  
##  4 <split [24/6]> Fold1          4 <lm>  
##  5 <split [24/6]> Fold1          5 <lm>  
##  6 <split [24/6]> Fold2          1 <lm>  
##  7 <split [24/6]> Fold2          2 <lm>  
##  8 <split [24/6]> Fold2          3 <lm>  
##  9 <split [24/6]> Fold2          4 <lm>  
## 10 <split [24/6]> Fold2          5 <lm>  
## # ... with 15 more rows

C. Great! Now, calculate the rmse for each fold/polynomial combination as we did in lab.

progesterone_five_fold_rmse <- prog_mod %>%
  # create a new column with map2_dbl(),
  # iterating over all splits AND fit models:
  # .x stands for each split, .y for the model fit to its training data,
  # and the RMSE is computed on the held-out (assessment) portion
  mutate(rmse = map2_dbl(.x = splits, .y = mods,
                         ~rmse(model = .y,
                               data = assessment(.x))))

# view data
progesterone_five_fold_rmse
## # A tibble: 25 x 5
##    splits         id    polynomial mods    rmse
##    <list>         <chr>      <int> <list> <dbl>
##  1 <split [24/6]> Fold1          1 <lm>    8.10
##  2 <split [24/6]> Fold1          2 <lm>    7.78
##  3 <split [24/6]> Fold1          3 <lm>    7.75
##  4 <split [24/6]> Fold1          4 <lm>    7.09
##  5 <split [24/6]> Fold1          5 <lm>   10.1 
##  6 <split [24/6]> Fold2          1 <lm>   12.6 
##  7 <split [24/6]> Fold2          2 <lm>   13.1 
##  8 <split [24/6]> Fold2          3 <lm>   13.1 
##  9 <split [24/6]> Fold2          4 <lm>   12.9 
## 10 <split [24/6]> Fold2          5 <lm>   13.1 
## # ... with 15 more rows

D. Implications - ok, given that the 5-fold score is the average RMSE across all folds for a given polynomial, show in both a table and figure the relationship between polynomial and out-of-sample RMSE. What does this tell you?

# average the RMSE across folds for each polynomial degree
prog_4 <- progesterone_five_fold_rmse %>%
  group_by(polynomial) %>%
  summarise(avg_rmse = mean(rmse))
## `summarise()` ungrouping output (override with `.groups` argument)
# view table of average rmse for each polynomial degree
prog_4
## # A tibble: 5 x 2
##   polynomial avg_rmse
##        <int>    <dbl>
## 1          1     13.0
## 2          2     13.1
## 3          3     14.8
## 4          4     16.4
## 5          5     17.8
# plot relationship between average rmse and polynomial degree
ggplot(data = prog_4,
       mapping = aes(x = polynomial, y = avg_rmse)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = 1:5)

The out-of-sample RMSE is lowest for the linear (first-order) fit (13.0) and rises steadily with polynomial degree (up to 17.8 for the fifth order), so the slightly higher r2 of the fifth-order fit in #1 reflects overfitting rather than a real improvement.

3. Compare models and see how they differ from AIC

That was all well and good, but, how do these results compare to doing this analysis with AIC using the {AICcmodavg} package? Note, you can use dplyr and purrr to not have to fit each model manually.

# create an object for each model
prog_lm <- lm(ventilation ~ progesterone, data = progesterone_data)
poly_2 <- lm(ventilation ~ poly(progesterone, 2), data = progesterone_data)
poly_3 <- lm(ventilation ~ poly(progesterone, 3), data = progesterone_data)
poly_4 <- lm(ventilation ~ poly(progesterone, 4), data = progesterone_data)
poly_5 <- lm(ventilation ~ poly(progesterone, 5), data = progesterone_data)

# make a list of all models created
mod_list <- list(prog_lm, poly_2, poly_3, poly_4, poly_5)
# name each model
name_vec <- c("1st order", "2nd order", "3rd order", "4th order", "5th order")

# compute aic values
aic_analysis <- aictab(cand.set = mod_list, modnames = name_vec)

# view results
aic_analysis
## 
## Model selection based on AICc:
## 
##           K   AICc Delta_AICc AICcWt Cum.Wt      LL
## 1st order 3 242.86       0.00   0.72   0.72 -117.97
## 2nd order 4 245.29       2.43   0.21   0.93 -117.85
## 3rd order 5 248.16       5.31   0.05   0.98 -117.83
## 4th order 6 250.72       7.87   0.01   1.00 -117.54
## 5th order 7 253.52      10.66   0.00   1.00 -117.21

A lower AICc value suggests a “better” model. The model with the most support is the one with the highest AICc weight (AICcWt closest to 1), which here is the linear (first-order) model. This agrees with the cross-validation results above.
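
As the prompt suggests, dplyr and purrr can replace fitting each model by hand. A minimal sketch of that approach (mod_list_purrr and the modnames labels are illustrative names, not from the original):

# fit polynomial models of degree 1 through 5 in a single map() call
mod_list_purrr <- map(1:5, ~lm(ventilation ~ poly(progesterone, .x),
                               data = progesterone_data))

# name each model by its degree and pass the list straight to aictab()
aictab(cand.set = mod_list_purrr,
       modnames = paste0("poly_", 1:5))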

EC 4. boot::cv.glm()

Let’s try again, for orders 1-5, but this time, let’s do a LOOCV analysis using boot::cv.glm(). Using dplyr and purrr will make things faster and more efficient here - perhaps even with something you created in #3, if you used glm() instead of lm().

Although, if you do that, a quick note: you will need to use a map2_*() function with polys in it so that its variable can match the . variable used. This may seem like a weird sentence, but once you get the error that made me realize this, you’ll get it.

# create a function that computes the LOOCV RMSE for a specified polynomial degree
loocv_error <- function(x) {
  glm.fit <- glm(ventilation ~ poly(progesterone, x), data = progesterone_data)
  # cv.glm()$delta[1] is the LOOCV MSE; take the square root to get RMSE
  cv.glm(progesterone_data, glm.fit)$delta[1] %>% sqrt()
}

# compute LOOCV RMSE for polynomial degrees 1-5
1:5 %>% map_dbl(loocv_error)
## [1] 13.10008 13.51999 14.30377 14.37377 14.35069

5. Grid sample with Bayes

Last week, we did grid sampling with Likelihood. This week, let’s do it with Bayes!

p(H|D) = p(D|H) p(H) / p(D)

A. Let’s start with the Palmer Penguins data. Let’s look at just the Gentoo. Why don’t you plot the distribution of the average flipper length of females. We’ll use this data for the exercise. Remember to remove NAs - it will make the rest of the exercise easier. 1 EC for each thing you do to snaz the plot up.

# create an object containing flipper length measurements of all Gentoo female penguins
fem_flip <- penguins %>%
  group_by(species) %>%
  filter(species == "Gentoo", sex == "female") %>%
  summarise(flipper_length_mm, sex) %>%
  na.omit()
## `summarise()` regrouping output by 'species' (override with `.groups` argument)
# view object
fem_flip 
## # A tibble: 58 x 3
## # Groups:   species [1]
##    species flipper_length_mm sex   
##    <fct>               <int> <fct> 
##  1 Gentoo                211 female
##  2 Gentoo                210 female
##  3 Gentoo                210 female
##  4 Gentoo                211 female
##  5 Gentoo                209 female
##  6 Gentoo                214 female
##  7 Gentoo                214 female
##  8 Gentoo                210 female
##  9 Gentoo                210 female
## 10 Gentoo                209 female
## # ... with 48 more rows
# plot the distribution of flipper length
ggplot(data = fem_flip,
       mapping = aes(x = flipper_length_mm, # Setting ggplot parameters
                     fill = sex)) +
  # geom_density() has no `add` argument, so the mean is added with geom_vline() below
  geom_density(alpha = 0.5) +
  # add mean flipper length with geom_vline
  geom_vline(xintercept = mean(fem_flip$flipper_length_mm), linetype = "dotted",
             color = "black", size = 0.8) +
  labs(x = "Flipper Length (mm)", y = "Density",
       title = "Distribution of flipper length in female Gentoo penguins")

B. OK, this is pretty normal, with a mean of 212.71 and sd of 3.9. Make a grid to search a number of values around that mean and SD, just as you did for likelihood. Let’s say 100 values of each parameter.

# make a grid to search a number of values around the frequentist mean and SD
fem_flip_grid <- crossing(m = seq(212.5, 215.5, length.out = 100),
                            s = seq(1.5, 4.5, length.out = 100))
# view object
fem_flip_grid
## # A tibble: 10,000 x 2
##        m     s
##    <dbl> <dbl>
##  1  212.  1.5 
##  2  212.  1.53
##  3  212.  1.56
##  4  212.  1.59
##  5  212.  1.62
##  6  212.  1.65
##  7  212.  1.68
##  8  212.  1.71
##  9  212.  1.74
## 10  212.  1.77
## # ... with 9,990 more rows

C. Write a function that will give you the numerator for any combination of m and s! This is just the same as writing a function for likelihood, but including an additional multiplier of p(H), when putting in the likelihood. Let’s assume a prior for m of dnorm(210, 50) and for s of dunif(1,10) - so, pretty weak!

So, we want p(flipper length | m, s) p(m) p(s).

BUT - small problem. These numbers get so vanishingly small, we can no longer do calculations with them. So, for any probability density you use, add log=TRUE and take a sum instead of products or multiplication, as

log(p(D|H) p(H)) = log(p(D|H)) + log(p(H))

# a function that will give you the numerator for any combination of m and s!
num_function <- function(m, s) {
  
  # log likelihood of the data:
  dnorm(fem_flip$flipper_length_mm, m, s, log = TRUE) %>% sum() +
    # plus the log priors gives the log numerator (the unnormalized log posterior):
    dnorm(m, 210, 50, log = TRUE) %>% sum() +
    dunif(s, 1, 10, log = TRUE) %>% sum()

}

# test function
num_function(212, 4)
## [1] -168.7012

D. Great! Now use this function with your sample grid to get the numerator of the posterior, and then standardize with the p(D) - the sum of all numerators - to get a full posterior. Note, as we’re working in logs, we just subtract log(p(D)). What is the modal estimate of each parameter? How do they compare to the standard frequentist estimate?

Note: log(p(D)) = log(sum(exp(log(p(D|H)) + log(p(H)))))

# compute the (unnormalized) numerator of the posterior for every grid combination of m and s
fem_flip_grid_post <- fem_flip_grid %>%
  rowwise() %>%
  mutate(pois_numerator = num_function(m, s),
         poisterior = exp(pois_numerator)) %>%
  ungroup() %>%
  na.omit()

fem_flip_grid_post
## # A tibble: 10,000 x 4
##        m     s pois_numerator poisterior
##    <dbl> <dbl>          <dbl>      <dbl>
##  1  212.  1.5           -277.  5.86e-121
##  2  212.  1.53          -270.  3.55e-118
##  3  212.  1.56          -264.  1.43e-115
##  4  212.  1.59          -259.  3.90e-113
##  5  212.  1.62          -254.  7.51e-111
##  6  212.  1.65          -249.  1.04e-108
##  7  212.  1.68          -244.  1.07e-106
##  8  212.  1.71          -240.  8.29e-105
##  9  212.  1.74          -236.  4.96e-103
## 10  212.  1.77          -232.  2.32e-101
## # ... with 9,990 more rows
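
Note that pois_numerator above is still un-normalized: the chunk never subtracts log(p(D)) as the prompt asks. A minimal sketch of that standardization, using a log-sum-exp so the exponentials stay well behaved (fem_flip_posterior and the new column names are illustrative, not part of the original code):

# standardize: log(p(D)) is the log-sum-exp of all log numerators;
# factoring out the maximum keeps exp() from underflowing
fem_flip_posterior <- fem_flip_grid_post %>%
  mutate(log_p_d = max(pois_numerator) +
           log(sum(exp(pois_numerator - max(pois_numerator)))),
         log_posterior = pois_numerator - log_p_d,
         posterior = exp(log_posterior))

# the standardized posterior now sums to 1 over the grid
sum(fem_flip_posterior$posterior)
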
# create a function to get modal estimate of each parameter
getmode <- function(x) {
  uniq <- unique(x)
  uniq[which.max(tabulate(match(x, uniq)))]
}

# calculate estimated mode of mean and Sd using the mode function
getmode(fem_flip_grid_post$m)
## [1] 212.5
getmode(fem_flip_grid_post$s)
## [1] 1.5

The modal estimate of the mean is 212.5, while that of the SD is 1.5.

These values are lower than the frequentist estimates.
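
One caveat: every value of m and of s appears equally often in the grid, so getmode() simply returns the first value it encounters rather than the value at which the posterior peaks. A small alternative sketch is to pull the grid row with the largest posterior (the MAP estimate):

# the maximum a posteriori (MAP) estimate is the m, s pair with the
# largest (log) numerator
fem_flip_grid_post %>%
  filter(pois_numerator == max(pois_numerator)) %>%
  select(m, s)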

E.C. E. Show me ’dat surface! Make it sing!

# visualize the top of the posterior surface with a raster plot,
# keeping only grid cells within 1 log unit of the maximum
raster_plot <- ggplot(data = fem_flip_grid_post %>%
                        filter(pois_numerator > max(pois_numerator) - 1),
                      mapping = aes(x = m, y = s, fill = pois_numerator)) +
  geom_raster() +
  scale_fill_viridis_c()

raster_plot
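
Since {rgl} and {rayshader} are loaded above but never used, one way to really make the surface sing is to extrude the same ggplot into 3D. A minimal sketch using rayshader's plot_gg(); the width, height, and scale values are arbitrary starting points to tweak:

# extrude the raster plot into a 3D surface (opens an rgl window)
plot_gg(raster_plot, multicore = TRUE, width = 5, height = 5, scale = 250)

# capture a static snapshot of the rgl view
render_snapshot()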

E.C. x2 F Compare our weak prior to one with a strong prior. Note, as you progress in this, instead of doing the product of p(D|H)p(H), you might want to do log(p(D|H)) + log(p(H)) as it simplifies calculations. The nice thing is then you just subtract log(p(D)) to get log(p(H|D)) - which you can then safely exponentiate!

# create a second function using stronger priors of mean and SD
num_function_2 <- function(m, s) {
  
  # log likelihood of the data:
  dnorm(fem_flip$flipper_length_mm, m, s, log = TRUE) %>% sum() +
    # plus the (stronger) log priors gives the log numerator:
    dnorm(m, 212, 4, log = TRUE) %>% sum() +
    dunif(s, 2, 4, log = TRUE) %>% sum()

}

# get posterior 
fem_flip_grid_post_2 <- fem_flip_grid_post %>%
  rowwise() %>%
  mutate(pois_numerator_2 = num_function_2(m, s),
         poisterior_2 = exp(pois_numerator_2)) %>%
  ungroup() %>%
  na.omit()

# view posterior
fem_flip_grid_post_2
## # A tibble: 10,000 x 6
##        m     s pois_numerator poisterior pois_numerator_2 poisterior_2
##    <dbl> <dbl>          <dbl>      <dbl>            <dbl>        <dbl>
##  1  212.  1.5           -277.  5.86e-121             -Inf            0
##  2  212.  1.53          -270.  3.55e-118             -Inf            0
##  3  212.  1.56          -264.  1.43e-115             -Inf            0
##  4  212.  1.59          -259.  3.90e-113             -Inf            0
##  5  212.  1.62          -254.  7.51e-111             -Inf            0
##  6  212.  1.65          -249.  1.04e-108             -Inf            0
##  7  212.  1.68          -244.  1.07e-106             -Inf            0
##  8  212.  1.71          -240.  8.29e-105             -Inf            0
##  9  212.  1.74          -236.  4.96e-103             -Inf            0
## 10  212.  1.77          -232.  2.32e-101             -Inf            0
## # ... with 9,990 more rows
# get modal estimates
getmode(fem_flip_grid_post_2$m)
## [1] 212.5
getmode(fem_flip_grid_post_2$s)
## [1] 1.5

I still got the same modal estimates: a mean of 212.5 and an SD of 1.5.
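
To see how the stronger priors actually reshape the surface, the two log numerators can be faceted side by side. A minimal sketch (dropping the -Inf cells produced by the dunif(2, 4) prior on s):

# pivot the two log numerators into long format and facet by prior
fem_flip_grid_post_2 %>%
  pivot_longer(cols = c(pois_numerator, pois_numerator_2),
               names_to = "prior", values_to = "log_numerator") %>%
  # drop grid cells where the strong uniform prior on s gives -Inf
  filter(is.finite(log_numerator)) %>%
  ggplot(aes(x = m, y = s, fill = log_numerator)) +
  geom_raster() +
  scale_fill_viridis_c() +
  facet_wrap(~ prior)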

6. Final Project Thinking

We’re at the half-way point in the course, and after the mid-term, it’s time to start thinking about your final project. So…. I want to know a bit about what you’re thinking of!

A. What is the dataset you are thinking of working with? Tell me a bit about what’s in it, and where it comes from.

I am thinking of working on a dataset obtained from the USDA-ARS, Dale Bumpers National Rice Research Center, Stuttgart, Arkansas, Genetic Stocks Oryza Collection (www.ars.usda.gov/GSOR).

B. What question do you want to ask of that data set?

I am considering asking the following questions:

  1. What is the best model for evaluating genome-phenotype associations in African and Asian rice populations?

  2. What is the extent of genetic variation between the two rice populations?

EC C. Wanna make a quick visualization of some aspect of the data that might be provocative and interesting?

# read in the data I am interested in
rice_phenotype_data <- read.csv("raw_data/rice_phenotype_data.csv")

# view data
head(rice_phenotype_data)
##   NSFTV.ID. GSOR.ID. IRGC.ID.          Name Orignal.providing.country
## 1         1   301001   126380      Agostano                     Italy
## 2         2   301002   117637   Aichi Asahi                     Japan
## 3         3   301003   117636 Ai-Chiao-Hong                     China
## 4         4   301004   117601      NSF-TV 4                   Unknown
## 5         5   301005   117641      NSF-TV 5                   Unknown
## 6         6   301006   117603      ARC 7229                     India
##        Region Subpopulation.group..Structure.. Subpopulation.group..PCA..
## 1 West Europe                              TEJ                        TEJ
## 2   East Asia                              TEJ                        N/A
## 3   East Asia                              IND                        IND
## 4     Unknown                              AUS                        AUS
## 5     Unknown                              ARO                        ARO
## 6  South Asia                              AUS                        AUS
##   Subpopulation.group..fastStructure... AMYCN ALKDIG DHULPROTCN DTHD  PTHT
## 1                    temperate-japonica  15.6    6.1        8.4 75.1 110.9
## 2                    temperate-japonica  17.2      7        9.6   80 103.3
## 3                                indica  23.1    5.8        9.2 89.5 143.5
## 4                                   aus  22.8    4.7          8 94.5 128.1
## 5                              aromatic  19.1      6        9.6 87.5 153.8
## 6                                   aus  23.2    5.5        8.5 89.1 148.3
##   FLFLG FLFWD PNNB PNLG PBRNB FLNBPPN UNFILGRNB FILGRNB HULGRLG HULGRWD
## 1  28.4   1.3 21.5 20.5   9.3   136.3      16.5   119.8     8.1     3.7
## 2  24.6   1.3 29.7 22.4  11.4   143.7      12.4   131.2     7.6     3.4
## 3    39     1 57.5 26.8   9.2     113      28.3    84.8     7.7       3
## 4  27.7   1.5 22.8 23.5   8.7   171.8      11.1   160.7     8.2     2.9
## 5  30.4   0.9 40.3   29   6.4   113.4      21.2    92.2     9.7     2.4
## 6  36.9   1.8 17.4 30.9  11.2   291.9      37.6   254.3     7.1     3.3
##   HULGRLGWDRO HULGRVOL HHULGRWT DHULGRLG DHULGRWD DHULGRVOL CULMHAB SDSH AWNPLU
## 1         2.2     13.3      3.1      5.8      3.1       7.7       4  yes      0
## 2         2.2     10.6      2.6      5.6      2.9       6.2       3  yes      0
## 3         2.6      8.3      2.3      5.6      2.6       5.1       8  yes      0
## 4         2.8      8.6      2.2      6.1      2.5       5.1       6  yes      0
## 5         4.1        7      2.3      7.1        2       4.1       4  yes      0
## 6         2.2        9      2.2      5.1      2.8       5.6       6  yes      0
##   LFLPUBES HULCL DHULGRCL LFBLRS STRTHEAD
## 1      yes     2        2      6      4.8
## 2      yes     2        2      5      7.3
## 3      yes     2        1      3      7.8
## 4      yes     2        1    2.5        0
## 5      yes     2        2      4      8.3
## 6      yes     4        2      0      8.2
# create an object containing PNLG values for African and Asian cultivars
plot_rice_data <- rice_phenotype_data %>%
  # convert PNLG values to class numeric
  mutate(PNLG_2 = as.numeric(PNLG)) %>%
  group_by(Region) %>%
  filter(Region %in% c("Africa","East Asia", "Southeast Asia", "South Asia", "West Asia")) %>%
  summarise(PNLG_2) %>%
  na.omit()
## Warning: Problem with `mutate()` input `PNLG_2`.
## i NAs introduced by coercion
## i Input `PNLG_2` is `as.numeric(PNLG)`.
## Warning in mask$eval_all_mutate(dots[[i]]): NAs introduced by coercion
## `summarise()` regrouping output by 'Region' (override with `.groups` argument)
# plot distribution of PNLG values
ggplot(plot_rice_data,
       aes(y = PNLG_2, fill = Region)) +
  geom_boxplot() +
  labs(y ="Panicle length /Inflorescence length (PNLG)",
       title = "Distribution of panicle length in African and Asian populations of Oryza spp.")