# Load all required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)
library(readr)
library(forcats)
library(colorfindr)
library(gganimate)
library(transformr)
library(png)
Consider the following vasopressin levels in voles.
# Assign a variable name to vasopressin levels
vole_vaso <- c(98,96,94,88,86,82,77,74,70,60,
59,52,50,47,40,35,29,13,6,5)
I tripped up 3 times.
# Create a data frame
as.data.frame(vole_vaso) %>%
# Summarize mean, median, sd and interquartile range into a data frame
summarise(mean_vole_vaso = mean(vole_vaso), med_vole_vaso = median(vole_vaso), sd_vole_vaso = sd(vole_vaso), IQR(vole_vaso))
## mean_vole_vaso med_vole_vaso sd_vole_vaso IQR(vole_vaso)
## 1 58.05 59.5 29.75244 44.25
The mean, median and IQR of the sample are 58.05, 59.5 and 44.25 respectively.
# Calculate the standard error of sample
sd(vole_vaso)/sqrt(length(vole_vaso))
## [1] 6.652849
The standard error of the sample is 6.652849
The calculated standard error of the mean shows that the population mean values are dispersed and far from the sample mean.
We can get the upper quartile value of vole vassopressin with
# Calculate the upper qiartile of sample
quantile(vole_vaso, probs = 0.75)
## 75%
## 83
# Create one resample using sample()
one_resamp <- sample(vole_vaso,
size = length(1:10),
replace = TRUE)
one_resamp
## [1] 77 74 52 88 94 50 86 70 96 13
# Calculate upper_quartile of one resample
quantile(one_resamp, probs = 0.75)
## 75%
## 87.5
The Upper quartile is 92.5
# Create a data frame for simulating sample sizes 5:20
sim_data <- data.frame(samp_size = 5:20) %>%
# for each sample size
rowwise(samp_size) %>%
#replicate (1000 times)
summarise(sim_data = sample(vole_vaso,
size = samp_size,
replace = TRUE))
## `summarise()` regrouping output by 'samp_size' (override with `.groups` argument)
sim_data
## # A tibble: 200 x 2
## # Groups: samp_size [16]
## samp_size sim_data
## <int> <dbl>
## 1 5 13
## 2 5 88
## 3 5 59
## 4 5 60
## 5 5 86
## 6 6 35
## 7 6 96
## 8 6 6
## 9 6 60
## 10 6 88
## # ... with 190 more rows
sim_upper_quarts <- sim_data %>%
# For each sample size (set of params)...
rowwise() %>%
# Replicate calculating estimated parameters
# from a random draw
# some # of times
summarize(upper_quarts = replicate(1000,
sample(vole_vaso,
size = samp_size,
replace = TRUE) %>%
quantile(probs = 0.75)))
## `summarise()` regrouping output by 'samp_size' (override with `.groups` argument)
sim_upper_quarts
## # A tibble: 200,000 x 2
## # Groups: samp_size [16]
## samp_size upper_quarts
## <int> <dbl>
## 1 5 94
## 2 5 47
## 3 5 88
## 4 5 94
## 5 5 88
## 6 5 94
## 7 5 52
## 8 5 70
## 9 5 86
## 10 5 86
## # ... with 199,990 more rows
# Create an object for plotting simulated upper quartiles
plot_sim_upper_quarts <- ggplot(data = sim_upper_quarts,
mapping = aes(x = samp_size,
y = upper_quarts))
plot_sim_upper_quarts +
# This geom made the plot readable and quite understandable
geom_count(bins = 50) +
# Plot labels
labs(x = "Sample size", y = "upper_quarts",
title = "Guesstimate as to the best sample size for estimating the upper quartile") +
# Added a verical red line to identify the best guesstimate
geom_vline(xintercept = 17, linetype="dotted",
color = "red", size=1.5) +
theme_bw()
## Warning: Ignoring unknown parameters: bins
upper_quart_SE <- sim_upper_quarts %>%
# For each sample size (set of params)...
rowwise() %>%
# Replicate calculating estimated parameters
# from a random draw
# some # of times
summarise(upper_quart_SE = sample(vole_vaso,
size = samp_size,
replace = TRUE)%>%
sd(upper_quarts)/sqrt(length(upper_quarts)))
## `summarise()` regrouping output by 'samp_size' (override with `.groups` argument)
upper_quart_SE
## # A tibble: 200,000 x 2
## # Groups: samp_size [16]
## samp_size upper_quart_SE
## <int> <dbl>
## 1 5 22.3
## 2 5 33.8
## 3 5 15.3
## 4 5 25.3
## 5 5 15.6
## 6 5 35.6
## 7 5 17.7
## 8 5 44.4
## 9 5 31.8
## 10 5 19.2
## # ... with 199,990 more rows
# Create an object for plotting SE of upper quartiles
plot_upper_quart_SE <- ggplot(data = upper_quart_SE,
mapping = aes(x = samp_size,
y = upper_quart_SE))
plot_upper_quart_SE +
geom_count(bins = 50) +
#plot labels
labs(x = "Sample size", y = "Count",
title = "SE of the estimate of the upper quantile by sample size") +
# Added a verical red line to identify the best guesstimate
geom_vline(xintercept = 17, linetype="dotted",
color = "red", size=1.5) +
theme_bw()
## Warning: Ignoring unknown parameters: bins
This plot levels off with the plot in 2d. Sample size = 5 has more standard errors as indicated by the number of counts in the plot. On the other hand, Sample size = 17 is a good guesstimate for estimating upper quartiles and has the lowest count of standard errors.
# Downloaded data from BIOL 607 GitHub course repository and saved in a file in my project directory
download.file(url = "https://biol607.github.io/homework/data/NH_seaice_extent_monthly_1978_2016.csv",
destfile = "raw_data/seaice.csv")
theme_set(theme_bw(base_size=12))
ice <- read_csv("http://biol607.github.io/homework/data/NH_seaice_extent_monthly_1978_2016.csv") %>%
mutate(Month_Name = factor(Month_Name),
Month_Name = fct_reorder(Month_Name, Month))
## Parsed with column specification:
## cols(
## Year = col_double(),
## Month = col_double(),
## Day = col_double(),
## Extent = col_double(),
## Missing = col_double(),
## Month_Name = col_character()
## )
ice
## # A tibble: 458 x 6
## Year Month Day Extent Missing Month_Name
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 1978 11 1 10.7 0 Nov
## 2 1978 12 1 12.7 0 Dec
## 3 1979 2 1 15.9 0 Feb
## 4 1979 3 1 16.6 0 Mar
## 5 1979 6 1 13.1 0 Jun
## 6 1979 7 1 11.6 0 Jul
## 7 1979 9 1 7.23 0 Sep
## 8 1979 10 1 7.40 0 Oct
## 9 1980 1 1 14.2 0 Jan
## 10 1980 3 1 16.2 0 Mar
## # ... with 448 more rows
The Month_Name in the data set was changed to a factor with the factor function, while the fct_reorder function reorders one variable for the other.
# Made an object for plotting variability in sea ice extent
var_ice <- ggplot(data = ice,
mapping = aes(x = Month, # Setting ggplot parameters
y = Extent,
fill = Month_Name))
var_ice +
geom_boxplot() +
#Plot labels
labs(x ="Month", y ="Sea ice extent",
title = "Variability in sea ice extent every month") +
# Here comes the gganimate code
transition_states(
Month_Name,
transition_length = 2, # Parameters for gganimate
state_length = 1
) +
enter_fade() +
exit_shrink() +
ease_aes('sine-in-out')
# Create a data frame for minimum sea ice and summarized into a new data frame
min_ice_extent <- ice %>%
group_by(Year) %>%
summarise(min_sea_ice = min(Extent))
## `summarise()` ungrouping output (override with `.groups` argument)
min_ice_extent
## # A tibble: 39 x 2
## Year min_sea_ice
## <dbl> <dbl>
## 1 1978 10.7
## 2 1979 7.23
## 3 1980 7.54
## 4 1981 7.20
## 5 1982 7.40
## 6 1983 7.32
## 7 1984 6.83
## 8 1985 6.72
## 9 1986 7.23
## 10 1987 6.89
## # ... with 29 more rows
# Create a dataframe for plotting the extent of minimum sea ice and set ggplot parameters
plot_min_sea_ice <- ggplot(data = min_ice_extent,
mapping = aes(x = Year,
y = min_sea_ice))
plot_min_sea_ice +
geom_point() +
geom_smooth() +
labs(x ="Year", y ="Minimum sea ice extent",
title = "Annual minimum sea ice extent")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Lowest sea ice extent is observed in 2009 and continues through year 2010 and beyond.
# Create an object for plotting sea ice by year and set parameters
plot_ice_by_year <- ggplot(data = ice,
mapping = aes(x = Extent,
y = Year,
color = Month_Name))
plot_ice_by_year +
geom_point() +
# using facet_wrap and cut_interval(Month, n=4) to split the plot into seasons
facet_wrap(~cut_interval(1:458, n = 4)) +
labs(x ="Sea ice extent", y ="Year",
title = "Sea ice by year")
# Using colorfindr to create a color palette
palette <- get_colors("raw_data/website-color-palettes-18.jpg") %>% # I referenced a jpg file on my computer.
make_palette(n = 5) # here we extract 5 colors
# Creating an object for plotting sea by month with reference to different years
ice_by_month <- ggplot(data = ice,
mapping = aes(x = Month_Name,
y = Extent,
group = Year, # Setting ggplot parameters
colour = Year))
plot_ice_by_month <- ice_by_month +
geom_line() +
scale_fill_manual(values = palette) +
# Plot labels
labs(x ="Month", y ="Sea ice extent",
title = "sea ice by month with different lines as different years")
plot_ice_by_month
# Applying gganimate
plot_ice_by_month +
# Here comes the gganimate specific bits
labs(title = 'sea ice by month with different lines as different years', x = 'Month', y = 'Sea ice extent') +
transition_reveal(Year)
# I used the data to create a new plot called Awesome!
# The plot shows how sea ice extent transitioned over the years
awesome_plot <- ggplot(ice,
aes(x = Month, y = Extent, size = Day, colour = Month_Name)) +
geom_point(show.legend = FALSE, alpha = 0.7) +
scale_size(range = c(2, 12)) +
scale_x_log10() +
# plot labels
labs(title = "Transition of sea ice extent over time", x = "Month", y = "Extent")
awesome_plot +
transition_time(Year) +
labs(title = "Year: {frame_time}")