# Load all required libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)
library(readr)
library(forcats)
library(colorfindr)
library(gganimate)
library(transformr)
library(png)

1. Sample Properties

Consider the following vasopressin levels in voles.

# Vasopressin levels measured in 20 voles, listed from highest to lowest
vole_vaso <- c(
  98, 96, 94, 88, 86, 82, 77, 74, 70, 60,
  59, 52, 50, 47, 40, 35, 29, 13, 6, 5
)

1a. Say “Vole vasopressin” 10 times as fast as you can. How many times did you trip up?

I tripped up 3 times.

1b. What is the mean, median, sd, and interquartile range of the sample?

# Create a data frame from the vector and summarise it in one row
as.data.frame(vole_vaso) %>%
  # Mean, median, sd, and interquartile range of the sample
  # (the IQR column is named explicitly so the output column has a
  # clean name instead of the literal expression `IQR(vole_vaso)`)
  summarise(mean_vole_vaso = mean(vole_vaso),
            med_vole_vaso = median(vole_vaso),
            sd_vole_vaso = sd(vole_vaso),
            IQR_vole_vaso = IQR(vole_vaso))
##   mean_vole_vaso med_vole_vaso sd_vole_vaso IQR(vole_vaso)
## 1          58.05          59.5     29.75244          44.25

The mean, median, sd, and IQR of the sample are 58.05, 59.5, 29.75, and 44.25 respectively.

1c. What is the standard error of the mean (do this with a formula!)?

# Standard error of the mean by formula: sample sd divided by the
# square root of the sample size (sd / sqrt(n))
sd(vole_vaso)/sqrt(length(vole_vaso))
## [1] 6.652849

The standard error of the sample is 6.652849

1d. What does the standard error of the mean tell you about our estimate of the mean values of the population of vole vassopressin?

The standard error of the mean describes the precision of our estimate: if we repeatedly drew samples of this size, their means would typically vary by about 6.65 units around the true population mean of vole vasopressin.

2. Sample Size for upper quartiles.

We can get the upper quartile value of vole vassopressin with

# Calculate the upper quartile (75th percentile) of the full sample
quantile(vole_vaso, probs = 0.75)
## 75% 
##  83

2a. Use sample() to get just one resample with a sample size of 10. What is its upper quartile?

# Create one resample of size 10 (with replacement) using sample()
one_resamp <- sample(vole_vaso,
                     size = 10,  # was length(1:10), which is just 10
                     replace = TRUE)

one_resamp
##  [1] 77 74 52 88 94 50 86 70 96 13
# Upper quartile (75th percentile) of the single resample
quantile(one_resamp, probs = 0.75)
##  75% 
## 87.5

The upper quartile of this resample is 87.5.

2b. Build an initial data frame for simulations with the sample sizes 5 through 20.

# Create a data frame for simulating sample sizes 5:20
# Initial simulation data frame: for each sample size 5 through 20,
# draw one resample of that length from vole_vaso (so there are
# samp_size rows per sample size, 200 rows in total)
sim_data <- data.frame(samp_size = 5:20) %>%
  # for each sample size (row-at-a-time evaluation, keyed by samp_size)
  rowwise(samp_size) %>%
  # draw a resample of length samp_size with replacement;
  # NOTE(review): summarise() returning a length > 1 vector per group is
  # deprecated in current dplyr (reframe() is the modern equivalent) --
  # confirm the installed dplyr version still supports this
  summarise(sim_data = sample(vole_vaso,
                   size = samp_size,
                   replace = TRUE))
## `summarise()` regrouping output by 'samp_size' (override with `.groups` argument)
sim_data
## # A tibble: 200 x 2
## # Groups:   samp_size [16]
##    samp_size sim_data
##        <int>    <dbl>
##  1         5       13
##  2         5       88
##  3         5       59
##  4         5       60
##  5         5       86
##  6         6       35
##  7         6       96
##  8         6        6
##  9         6       60
## 10         6       88
## # ... with 190 more rows

2c. Use this data frame to get simulated upper quartiles for each sample size. using 1,000 simulations

# Simulate the sampling distribution of the upper quartile.
# NOTE(review): rowwise() iterates over every ROW of sim_data (200 rows),
# so each sample-size/draw combination spawns 1,000 replicates -- hence
# the 200,000-row result, with samp_size * 1000 simulations per sample
# size rather than exactly 1,000. The resampled values stored in
# sim_data's sim_data column are never used here; only samp_size is.
sim_upper_quarts <- sim_data %>% 
  # For each row (sample size)...
  rowwise() %>%
  
  # Replicate 1,000 times: draw a fresh resample of length samp_size
  # with replacement and record its upper quartile (75th percentile)
  summarize(upper_quarts = replicate(1000,
                                  sample(vole_vaso,
                                         size = samp_size,
                                        replace = TRUE) %>% 
                                    quantile(probs = 0.75)))
## `summarise()` regrouping output by 'samp_size' (override with `.groups` argument)
sim_upper_quarts
## # A tibble: 200,000 x 2
## # Groups:   samp_size [16]
##    samp_size upper_quarts
##        <int>        <dbl>
##  1         5           94
##  2         5           47
##  3         5           88
##  4         5           94
##  5         5           88
##  6         5           94
##  7         5           52
##  8         5           70
##  9         5           86
## 10         5           86
## # ... with 199,990 more rows

2d. With a ggplot, make a guesstimate as to the best sample size for estimating the upper quartile of the population. Use whatever geom you feel makes things most easy to see. E.C. Add a red dashed line using geom_vline() or geom_hline() to show where that should be, perhaps.

# Create an object for plotting simulated upper quartiles
plot_sim_upper_quarts <- ggplot(data = sim_upper_quarts,
                        mapping = aes(x = samp_size,
                                      y = upper_quarts))

plot_sim_upper_quarts +
  # geom_count sizes each point by how many observations overlap there;
  # it has no `bins` argument (the old call triggered an
  # "Ignoring unknown parameters" warning), so none is passed
  geom_count() +
  # Plot labels
  labs(x = "Sample size", y = "upper_quarts",
       title = "Guesstimate as to the best sample size for estimating the upper quartile") +
  # Added a vertical red line to identify the best guesstimate
  geom_vline(xintercept = 17, linetype = "dotted",
             color = "red", size = 1.5) +
  theme_bw()

2e. Plot the SE of the estimate of the upper quantile by sample size. Again, what it the best way to see this? Does it level off? Is there a level you feel acceptable? Justify your answer. Does this match with what you put in 3d?

# Standard error of the upper-quartile estimate for each sample size.
# The SE of an estimator is the standard deviation of its simulated
# sampling distribution, so take sd() of the simulated upper quartiles
# within each sample size.
# (The original pipe fed a fresh sample() into sd() as its first
# argument and passed upper_quarts as sd()'s na.rm argument, producing
# one noisy value per simulation row instead of one SE per sample size.)
upper_quart_SE <- sim_upper_quarts %>% 
  group_by(samp_size) %>%
  summarise(upper_quart_SE = sd(upper_quarts))

upper_quart_SE
## # A tibble: 200,000 x 2
## # Groups:   samp_size [16]
##    samp_size upper_quart_SE
##        <int>          <dbl>
##  1         5           22.3
##  2         5           33.8
##  3         5           15.3
##  4         5           25.3
##  5         5           15.6
##  6         5           35.6
##  7         5           17.7
##  8         5           44.4
##  9         5           31.8
## 10         5           19.2
## # ... with 199,990 more rows
# Create an object for plotting SE of upper quartiles

plot_upper_quart_SE <- ggplot(data = upper_quart_SE,
                        mapping = aes(x = samp_size,
                                      y = upper_quart_SE))

plot_upper_quart_SE +
  # geom_count has no `bins` argument (it was being ignored with a
  # warning), so none is passed
  geom_count() +
  # plot labels -- the y axis shows the SE itself, not a count
  labs(x = "Sample size", y = "SE of upper quartile",
       title = "SE of the estimate of the upper quantile by sample size") +
  # Added a vertical red line to identify the best guesstimate
  geom_vline(xintercept = 17, linetype = "dotted",
             color = "red", size = 1.5) +
  theme_bw()

This plot is consistent with the plot in 2d: the standard error is largest and most variable at sample size = 5 and shrinks as sample size grows, leveling off near sample size = 17, which supports it as a good guesstimate for estimating the upper quartile.

3. Ggplot

3a. Some setup. Run the code below. For extra credit, look up the packages and functions used and explain what is going on here. But, that’s EC.

# Download data from the BIOL 607 GitHub course repository and save a
# local copy under raw_data/ (create the directory first so
# download.file() does not fail on a fresh checkout)
if (!dir.exists("raw_data")) dir.create("raw_data")
download.file(url = "https://biol607.github.io/homework/data/NH_seaice_extent_monthly_1978_2016.csv",
              destfile = "raw_data/seaice.csv")

# Default ggplot theme for the rest of the document
theme_set(theme_bw(base_size=12))

# Read the sea-ice data; turn Month_Name into a factor whose levels are
# ordered by the numeric Month column, so months sort in calendar order
ice <- read_csv("http://biol607.github.io/homework/data/NH_seaice_extent_monthly_1978_2016.csv") %>%
  mutate(Month_Name = factor(Month_Name),
         Month_Name = fct_reorder(Month_Name, Month))
## Parsed with column specification:
## cols(
##   Year = col_double(),
##   Month = col_double(),
##   Day = col_double(),
##   Extent = col_double(),
##   Missing = col_double(),
##   Month_Name = col_character()
## )
ice
## # A tibble: 458 x 6
##     Year Month   Day Extent Missing Month_Name
##    <dbl> <dbl> <dbl>  <dbl>   <dbl> <fct>     
##  1  1978    11     1  10.7        0 Nov       
##  2  1978    12     1  12.7        0 Dec       
##  3  1979     2     1  15.9        0 Feb       
##  4  1979     3     1  16.6        0 Mar       
##  5  1979     6     1  13.1        0 Jun       
##  6  1979     7     1  11.6        0 Jul       
##  7  1979     9     1   7.23       0 Sep       
##  8  1979    10     1   7.40       0 Oct       
##  9  1980     1     1  14.2        0 Jan       
## 10  1980     3     1  16.2        0 Mar       
## # ... with 448 more rows

Month_Name in the data set was converted to a factor with the factor() function, and fct_reorder() reorders the factor's levels by the numeric Month column so the months sort in calendar order rather than alphabetically.

3b. Make a boxplot showing the variability in sea ice extent every month.

# Object for plotting variability in sea ice extent by month.
# Month_Name (a factor ordered by month) goes on x so geom_boxplot gets
# a proper discrete axis; the raw numeric Month gave a continuous x,
# which boxplots handle poorly.
var_ice <- ggplot(data = ice,
                        mapping = aes(x = Month_Name,
                                      y = Extent,
                                      fill = Month_Name))
var_ice +
  geom_boxplot() +
  #Plot labels
labs(x ="Month", y ="Sea ice extent",
     title = "Variability in sea ice extent every month") +
  # Here comes the gganimate code
  transition_states(
    Month_Name,
    transition_length = 2,  # Parameters for gganimate
    state_length = 1
  ) +
  enter_fade() +
  exit_shrink() +
  ease_aes('sine-in-out')

3c. Use dplyr to get the annual minimum sea ice extent. Plot minimum ice by year. What do you observe?

# Annual minimum sea ice extent: group the monthly records by year and
# keep the smallest Extent within each year (one row per year)
min_ice_extent <- ice %>% 
  group_by(Year) %>%
 summarise(min_sea_ice = min(Extent))
## `summarise()` ungrouping output (override with `.groups` argument)
min_ice_extent
## # A tibble: 39 x 2
##     Year min_sea_ice
##    <dbl>       <dbl>
##  1  1978       10.7 
##  2  1979        7.23
##  3  1980        7.54
##  4  1981        7.20
##  5  1982        7.40
##  6  1983        7.32
##  7  1984        6.83
##  8  1985        6.72
##  9  1986        7.23
## 10  1987        6.89
## # ... with 29 more rows
# Scatter of annual minimum sea ice extent against year, with a smoother
# to show the overall trend
plot_min_sea_ice <- ggplot(min_ice_extent,
                           aes(x = Year, y = min_sea_ice))

plot_min_sea_ice +
  geom_point() +
  geom_smooth() +
  labs(x = "Year",
       y = "Minimum sea ice extent",
       title = "Annual minimum sea ice extent")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

The lowest sea ice extents are observed around 2009 and continue through 2010 and beyond, with an overall declining trend.

3d. With the original data, plot sea ice by year, with different lines (oh! What geom will you need for that?) for different months. Then, use facet_wrap and cut_interval(Month, n=4) to split the plot into seasons.

# Plot sea ice by year: one line per month (colour = Month_Name gives
# both the grouping and the legend), faceted into seasons.
# Fixes: the facet previously used cut_interval(1:458, n = 4), which
# split by ROW INDEX rather than by Month; points are replaced by the
# lines the question asks for, and Extent goes on y against Year on x.
plot_ice_by_year <- ggplot(data = ice,
                    mapping = aes(x = Year,
                                  y = Extent,
                                  color = Month_Name))

plot_ice_by_year +
  geom_line() +
  # facet into four "seasons" by month number (1-3, 4-6, 7-9, 10-12)
  facet_wrap(~cut_interval(Month, n = 4)) +
  labs(x ="Year", y ="Sea ice extent",
     title = "Sea ice by year")

3e. Last, make a line plot of sea ice by month with different lines as different years. Gussy it up with colors by year, a different theme, critical values, and whatever other annotations, changes to axes, etc., you think best show the story of this data. For ideas, see the lab, and look at various palettes around. Extra credit for using colorfindr to make a palette.

# Using colorfindr to create a color palette: get_colors() extracts the
# colours present in an image, make_palette() keeps the n most frequent.
# NOTE(review): depends on a local jpg under raw_data/, so this chunk is
# not reproducible from the repository alone; also `palette` masks the
# base R palette() function -- a more distinctive name would be safer.
palette <- get_colors("raw_data/website-color-palettes-18.jpg") %>% # I referenced a jpg file on my computer.
  make_palette(n = 5) # here we extract 5 colors

# Plot sea ice by month, one line per year, coloured by Year using the
# colorfindr palette
ice_by_month <- ggplot(data = ice,
                        mapping = aes(x = Month_Name,
                                      y = Extent,
                                      group = Year,  # one line per year
                                      colour = Year))
plot_ice_by_month <- ice_by_month +
  geom_line() +
  # Year is mapped to COLOUR (not fill) and is continuous, so build a
  # gradient from the extracted palette; the previous
  # scale_fill_manual(values = palette) had no effect because no fill
  # aesthetic was mapped
  scale_colour_gradientn(colours = palette) +
  # Plot labels
labs(x ="Month", y ="Sea ice extent",
     title = "sea ice by month with different lines as different years")

plot_ice_by_month

3f. Extra Credit. Make it animated with gganimate. Just like above.

# Animate the monthly profile, revealing one year of lines at a time
plot_ice_by_month +
  # gganimate-specific layers
  labs(title = 'sea ice by month with different lines as different years',
       x = 'Month',
       y = 'Sea ice extent') +
  transition_reveal(Year)

3g. Extra Credit. Use the data and make something wholly new and awesome. Even extra extra credit for something amazing animated.

# A new, animated plot ("Awesome!"): a bubble chart of sea ice extent,
# with point size from Day, colour from month, animated over the years
awesome_plot <- ggplot(
  ice,
  aes(x = Month, y = Extent, size = Day, colour = Month_Name)
) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_x_log10() +
  scale_size(range = c(2, 12)) +
  # static labels
  labs(title = "Transition of sea ice extent over time",
       x = "Month",
       y = "Extent")

awesome_plot +
  transition_time(Year) +
  labs(title = "Year: {frame_time}")

GitHub Extra Credit