library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(broom)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(ggfortify)
# download all W&S data
download.file(url = "https://whitlockschluter.zoology.ubc.ca/wp-content/data/ABD_all_data.zip",
destfile = "raw_data/all_data.zip")
1. Correlation - W&S Chapter 16
# read data
michelli_data <- read.csv("./raw_data/all_data/chapter16/chap16q15LanguageGreyMatter.csv")
# view data
michelli_data
## proficiency greymatter
## 1 0.26 -0.070
## 2 0.44 -0.080
## 3 0.89 -0.008
## 4 1.26 -0.009
## 5 1.69 -0.023
## 6 1.97 -0.009
## 7 1.98 -0.036
## 8 2.24 -0.029
## 9 2.24 -0.008
## 10 2.58 -0.023
## 11 2.50 -0.006
## 12 2.75 -0.008
## 13 3.25 -0.006
## 14 3.85 0.022
## 15 3.04 0.018
## 16 2.55 0.023
## 17 2.50 0.022
## 18 3.11 0.036
## 19 3.18 0.059
## 20 3.52 0.062
## 21 3.59 0.049
## 22 3.40 0.033
a. Display the association between the two variables in a scatter plot.
# Basic scatter plot using ggplot2
ggplot(michelli_data, aes(x= proficiency, y= greymatter)) +
geom_point()
b. Calculate the correlation between second language proficiency and gray-matter density.
# use the cor() to calculate correlation
cor(michelli_data$proficiency, michelli_data$greymatter)
## [1] 0.8183134
The correlation between second language proficiency and grey matter density is 0.8183134
c. Test the null hypothesis of zero correlation.
# Fit a model
michelli_lm <- lm(greymatter ~ proficiency, data = michelli_data)
# use anova to check p-value
anova(michelli_lm) %>%
# use tidy() to summarize model statistical findings
tidy()
## # A tibble: 2 x 6
## term df sumsq meansq statistic p.value
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 proficiency 1 0.0193 0.0193 40.5 0.00000326
## 2 Residuals 20 0.00953 0.000477 NA NA
d. What are your assumptions in part (c)?
e. Does the scatter plot support these assumptions? Explain.
Yes, the scatter plot explains these assumptions. The plot shows an association between second language proficiency and gray matter density but fails to show any linear relationship between both variables as data points are not in a straight line along the plot.
f. Do the results demonstrate that second language proficiency affects gray-matter density in the brain? Why or why not?
No, these results do not demonstrate if second language proficiency affects gray-matter density because the population correlation co-efficient or p-value is close to 0. Therefore, the sample data cannot be used to infer any relationship.
2. Correlation - W&S Chapter 16
liver_data <- read.csv("./raw_data/all_data/chapter16/chap16q19LiverPreparation.csv")
liver_data
## concentration unboundFraction
## 1 2.8 0.63
## 2 5.8 0.44
## 3 12.0 0.31
## 4 23.9 0.19
## 5 47.8 0.13
a. Calculate the correlation coefficient between the taurocholate unbound fraction and the concentration.
cor(liver_data$concentration, liver_data$unboundFraction)
## [1] -0.8563765
b. Plot the relationship between the two variables in a graph.
# Basic scatter plot using ggplot2
ggplot(liver_data, aes(x= concentration, y= unboundFraction)) +
geom_point()
c. Examine the plot in part (b). The relationship appears to be maximally strong, yet the correlation coefficient you calculated in part (a) is not near the maximum possible value. Why not?
The variables do not meet the assumptions of bivariate normality and points appear to form a curve which explains the disparity between the plot and the calculated correlation.
d. What steps would you take with these data to meet the assumptions of correlation analysis?
Log transformation of varibales
Nonparametric Spearman’s rank correlation can be used as a test of zero correlation between variables that do not meet the assumption of bivariate normality, even after data transformation.
3. Correlation SE
Consider the following dataset:
cat <- c(-0.30, 0.42, 0.85, -0.45, 0.22, -0.12, 1.46, -0.79, 0.40, -0.07)
happiness_score <- c(-0.57, -0.10, -0.04, -0.29, 0.42, -0.92, 0.99, -0.62, 1.14, 0.33)
data_set <- data.frame(cat, happiness_score)
data_set
## cat happiness_score
## 1 -0.30 -0.57
## 2 0.42 -0.10
## 3 0.85 -0.04
## 4 -0.45 -0.29
## 5 0.22 0.42
## 6 -0.12 -0.92
## 7 1.46 0.99
## 8 -0.79 -0.62
## 9 0.40 1.14
## 10 -0.07 0.33
3a. Are these two variables correlated? What is the output of cor() here. What does a test show you?
# Basic scatter plot using ggplot2 to check the association between the variables
ggplot(data_set, aes(x= cat, y= happiness_score)) +
geom_point()
# using the cor() function to calculate the correlation
cor(data_set$cat, data_set$happiness_score)
## [1] 0.6758738
# testing correlation using Anova
data_set_lm <- lm(happiness_score ~ cat, data =data_set)
anova(data_set_lm) %>%
tidy()
## # A tibble: 2 x 6
## term df sumsq meansq statistic p.value
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 cat 1 1.92 1.92 6.73 0.0319
## 2 Residuals 8 2.28 0.286 NA NA
-From the scatter plot, these values do not show correlation. - The value obtained by using the cor() is 0.6758738 - The Anova test shows a p-value < 0.05 which confirms no correlation as a regression line cannot be used to model any relationship between the two variables.
3b. What is the SE of the correlation based on the info from cor.test()
# create a function that calculates the standard error from the info of cor.test
cor.test.plus <- function(x) {
list(x,
Standard.Error = unname(sqrt((1 - x$estimate^2)/x$parameter)))
}
# use function to determine SE
cor.test.plus(cor.test(data_set$cat, data_set$happiness_score))
## [[1]]
##
## Pearson's product-moment correlation
##
## data: data_set$cat and data_set$happiness_score
## t = 2.5938, df = 8, p-value = 0.03193
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.08050709 0.91578829
## sample estimates:
## cor
## 0.6758738
##
##
## $Standard.Error
## [1] 0.260575
The standard error is 0.260575
3c.Now, what is the SE via simulation? To do this, you’ll need to use cor() and get the relevant parameter from the output (remember - you get a matrix back, so, what’s the right index!), replicate(), and sample() or dplyr::sample_n() with replace=TRUE to get, let’s say, 1000 correlations. How does this compare to your value above?
# create an object for correlation values of the data_set
cor_data <- cor(data_set)
# view parameters in the created object
cor_data
## cat happiness_score
## cat 1.0000000 0.6758738
## happiness_score 0.6758738 1.0000000
# list parameters;
# N = number of replications or simulations
N = 1000
# r = correlation value
r = 0.6758738
# n = length of dataframe
n = 10
# use the replicate function to simulate SE 1000x
replicate(N, cor(mvrnorm(10, c(0,0), cor_data))[2,1], sqrt((1-r^2)/(n-2)))
## [1] 0.827889045 0.616721970 0.083306848 0.200965100 0.912855316
## [6] 0.914500457 0.710945809 0.596018749 0.409527732 0.801030229
## [11] 0.440681099 0.754917652 0.242794520 0.730735649 0.726527139
## [16] 0.722215956 0.678160410 0.618976579 0.632669607 0.617692132
## [21] 0.926462254 0.638896290 0.227504956 0.843178764 0.655329925
## [26] 0.470362515 0.869712164 0.655960969 0.880002690 0.855437029
## [31] 0.742219331 -0.597084750 0.473170026 0.572646850 0.709987814
## [36] 0.290634828 0.847428560 0.744571916 0.336942610 0.722181663
## [41] 0.710295530 0.556094275 0.532450289 0.327923086 0.783730911
## [46] 0.958750554 0.828862854 0.306799158 0.649347113 0.793931107
## [51] 0.652015664 0.724592914 0.815744254 0.893414408 0.594889380
## [56] 0.582016224 0.702529631 0.643363989 0.361973933 0.776890284
## [61] 0.786060122 0.830755136 0.509525576 0.652054089 0.257004578
## [66] 0.876864812 0.774383080 0.280434960 0.798423899 0.920446550
## [71] 0.842325995 0.509232094 0.771643146 0.891383192 0.694878653
## [76] 0.631303756 0.729456596 0.775763844 0.564249391 0.706688911
## [81] 0.689565466 0.604666871 0.458122812 0.638890102 0.631259141
## [86] 0.844626226 0.872747504 0.451297521 0.706329141 0.758823790
## [91] 0.818053374 0.489409495 0.834355219 0.545230715 0.872681518
## [96] 0.493688175 0.627815170 0.714081562 0.434703340 0.516486055
## [101] 0.816190132 0.664779072 0.792855287 -0.095641914 0.742590570
## [106] 0.423207938 0.748284990 0.804339110 0.863900465 0.812382845
## [111] 0.773213873 0.784957132 0.672638505 0.789260149 0.649277904
## [116] 0.770036279 0.288236406 0.494227119 0.744030069 0.812732070
## [121] 0.673463287 0.793321350 0.878342714 0.739666783 0.770104813
## [126] 0.797860945 0.829804624 0.445177581 0.910081408 0.362331025
## [131] 0.657854614 0.830801093 0.714429330 0.751629289 0.961242811
## [136] 0.745911257 0.784091171 0.398533020 0.725983072 0.636322021
## [141] 0.741542054 0.844695143 0.848658306 0.637200725 0.187228140
## [146] 0.855233691 0.734249571 0.650898401 0.648332708 0.944257514
## [151] 0.961367130 0.806652526 0.646239277 0.569560743 0.518852286
## [156] 0.760627352 0.541766774 0.534043946 -0.133372871 0.984803296
## [161] 0.723560434 0.765547078 0.801485472 0.792584137 0.764354356
## [166] 0.818581531 0.713735824 0.884763599 0.892084044 0.538276625
## [171] 0.276218392 0.814475352 0.713895548 0.688639236 0.809481424
## [176] 0.764968981 0.464782017 0.809056690 0.522970813 0.439264166
## [181] 0.737800477 0.680232038 0.786987727 0.912877505 0.866676384
## [186] 0.785972861 0.949566834 0.661005386 0.781800893 0.816771068
## [191] 0.060035670 0.756319779 0.096378476 -0.033608369 0.756908825
## [196] 0.389187386 0.816222927 0.757462803 0.706355399 0.843783790
## [201] 0.519755901 0.910596556 0.131383466 0.570112958 0.456541396
## [206] 0.836959228 0.641004904 0.795927575 0.663208943 0.606229454
## [211] 0.786280111 0.785849388 0.357258937 0.609587411 0.657700266
## [216] 0.580853348 0.763231031 0.601871861 0.745073302 0.650850405
## [221] 0.802623192 0.815039628 0.715760866 0.458530744 0.679396435
## [226] 0.725055816 0.601375615 0.499874524 0.395357765 0.592467512
## [231] 0.840141734 0.725069220 0.539702083 0.897010436 0.856499175
## [236] 0.853093948 0.757712243 0.540990654 0.478263885 0.826839108
## [241] 0.693574121 0.506677114 0.390094069 0.923760071 0.792092748
## [246] 0.794008550 0.651944554 0.328493044 0.357891516 0.493516344
## [251] 0.880406817 0.596982015 0.415507264 0.701586212 0.349584955
## [256] 0.689657172 0.883361157 0.699375051 0.896238335 0.809553625
## [261] 0.661130345 0.703790624 0.852146299 0.583148551 0.820654167
## [266] 0.801589153 0.580572658 0.830235153 0.707746179 0.456739672
## [271] 0.683705838 -0.509572950 0.818483740 0.838130484 0.560162731
## [276] 0.647936462 0.735282117 0.939132052 0.836243022 0.939526749
## [281] 0.646659562 0.230500465 0.891784991 0.678971865 0.884764571
## [286] 0.693717393 0.894692201 0.832883464 0.748574557 0.845172390
## [291] 0.593130600 0.705776731 0.727905213 0.737660990 0.261435628
## [296] 0.837965404 0.780325388 0.694777796 0.487826150 0.655335016
## [301] 0.397295950 0.564350649 0.756593679 0.137495818 0.245823509
## [306] 0.913552556 0.855877211 0.261219047 0.865276010 0.458907912
## [311] 0.726035475 0.714373232 0.849878015 0.838730685 0.668667273
## [316] 0.767675647 0.584847358 0.809035436 0.789088835 0.819884387
## [321] 0.714377759 0.767605085 0.586297511 0.612281877 0.783147620
## [326] 0.687649732 0.890773746 0.817598948 0.404549918 0.583932291
## [331] 0.550114048 0.886636351 0.888740697 0.453060211 0.380364495
## [336] 0.952286126 0.571153085 0.745877421 0.877462497 0.570981721
## [341] 0.444257978 0.627921296 0.919384366 0.729346509 0.603197677
## [346] 0.619801390 0.768336056 0.454901710 0.898080687 0.513588624
## [351] 0.657142805 0.677783229 0.712361580 0.863367106 0.704750719
## [356] 0.728204796 0.755898970 0.853707998 0.811277809 0.813410786
## [361] 0.284026624 0.409895648 0.811651609 0.756677761 0.859048616
## [366] 0.545407214 0.863196199 0.701130998 0.714474716 0.735520494
## [371] 0.199260965 0.548413059 0.729295601 0.341545485 0.770102547
## [376] 0.825757550 0.795407075 0.739120017 0.328299407 0.400580399
## [381] 0.460248387 0.838417841 0.869740844 0.742220817 0.779064333
## [386] 0.625672327 0.800270692 0.842468032 0.939597626 0.400925946
## [391] 0.622597646 0.683124129 0.415890615 0.844764805 0.725524102
## [396] 0.842275755 0.691340174 0.393213764 0.003221904 0.497810477
## [401] 0.370818681 0.943976164 0.675635457 0.153033542 0.673612526
## [406] 0.820955608 0.552536663 0.501512673 0.676570337 0.652293623
## [411] 0.812315371 0.623207404 0.484086278 0.724337861 0.583011932
## [416] 0.601778798 0.872195565 0.805962782 0.717476471 0.813379418
## [421] 0.192162098 0.796893460 0.693770245 0.830967406 0.487005023
## [426] 0.743003394 0.664024852 0.214799349 0.071884702 0.814662539
## [431] 0.607243277 0.858210294 0.697435221 0.641190273 0.436365436
## [436] 0.705261131 0.786467943 0.445700618 0.796140879 0.791098249
## [441] 0.809302027 0.597766016 0.746367517 0.832971830 0.589429493
## [446] 0.400345347 0.827041076 0.412696720 0.829042789 0.781246079
## [451] 0.933880345 0.685912081 0.780546119 0.750268032 0.806570966
## [456] 0.656908423 0.805322987 0.558094526 0.702827292 0.711144760
## [461] 0.424326619 0.837402945 0.112181360 0.839877139 0.726177222
## [466] 0.665163635 0.479349076 0.831040334 0.801693409 0.550578847
## [471] 0.496487203 0.865491071 0.531011534 0.631222836 0.700297777
## [476] 0.795356353 0.452231400 0.784479698 0.918135447 0.379785568
## [481] 0.706814875 0.729705064 0.882587622 0.878409336 0.842147441
## [486] 0.869920713 0.377473053 0.540764862 0.623168645 0.296798814
## [491] 0.677738608 0.632155113 0.788047596 0.886381911 0.886707030
## [496] 0.617994174 0.723552416 0.712872758 0.662096215 0.547362430
## [501] 0.830442444 0.710462252 0.873593086 0.500827074 0.848714865
## [506] 0.700769189 0.059117469 0.443648277 0.406490494 0.727365509
## [511] 0.832938146 0.669240213 0.695437182 0.670808761 0.877988317
## [516] 0.585674650 0.689440817 0.748977940 0.577887710 0.765927919
## [521] 0.829369689 0.217805491 0.681110542 0.655745229 0.804828090
## [526] 0.843008101 0.679345625 0.785447800 0.796004515 0.569401591
## [531] 0.612177529 0.822651482 0.531186808 0.718422038 0.809098832
## [536] 0.823057316 0.555318629 0.560939789 0.016328896 0.700242311
## [541] 0.470637849 0.899953970 0.694321760 0.725903845 0.453538237
## [546] 0.236711524 0.836724159 0.757961858 0.702926036 0.718580817
## [551] 0.586619145 0.735963018 0.573422915 0.690669714 0.791083296
## [556] 0.800495870 0.553630578 0.792888740 0.490946537 0.827913547
## [561] 0.916525972 0.079919725 0.769412254 0.723443199 0.652965347
## [566] 0.119691212 0.916674347 0.588949011 0.684792876 0.231986325
## [571] 0.874927101 0.668697256 0.776686632 0.026543753 0.594260436
## [576] 0.717925372 0.882511727 0.518051833 0.864405446 0.345721756
## [581] 0.815893269 0.444841940 0.728232203 0.712191356 0.716153329
## [586] 0.690651009 0.628642050 0.345555322 0.632864976 0.880225091
## [591] 0.765467053 -0.079056828 0.926598963 0.602005355 0.241849112
## [596] 0.650561744 0.963436761 0.755209505 0.860096072 0.819498102
## [601] 0.743109492 0.710237059 0.852819771 0.769370733 0.792446449
## [606] 0.825885594 0.611974643 0.504461203 0.672327997 0.808712726
## [611] 0.492890206 0.315881534 0.668242194 0.751830272 0.747970713
## [616] 0.793322154 0.758019445 0.636906742 0.763506408 0.590243395
## [621] 0.521368235 0.489273678 0.372296987 0.631561892 0.905155270
## [626] 0.620961774 0.909992063 0.588795094 0.718623189 0.766461452
## [631] 0.377200227 0.907923557 0.737112577 -0.047690204 0.874962285
## [636] 0.792071276 0.489908773 0.821047330 0.616025566 0.808312321
## [641] 0.846650360 0.755097635 0.727237533 0.452324249 0.599291922
## [646] 0.562179197 0.638982571 0.778310826 0.559704219 0.746559892
## [651] 0.463234277 0.570152868 0.793986249 0.646365950 0.718361191
## [656] 0.609897178 0.791931393 -0.184400655 0.834870069 0.778765696
## [661] 0.639497700 0.142517748 0.714515927 0.836586610 0.870024783
## [666] 0.432808021 0.747675468 0.883641680 0.822488780 0.392719149
## [671] 0.656572849 0.807324906 0.477240540 0.867643490 0.744945324
## [676] 0.630278516 0.829823911 0.632270844 0.843209611 0.468434082
## [681] 0.907925278 0.791825761 0.566404600 -0.009617033 0.651239885
## [686] 0.793686252 0.747736114 0.710301002 0.726266905 0.922121314
## [691] 0.194331389 0.739093284 0.664722017 0.642689010 0.625718219
## [696] 0.862160135 0.659572985 0.493551448 0.563279280 0.846046018
## [701] 0.880357391 0.883794355 0.466006335 0.637386415 0.683142119
## [706] 0.697308760 0.813665175 0.851041107 0.659797844 0.495566826
## [711] 0.890207947 0.691899394 0.837787174 0.793578864 0.798938536
## [716] 0.681781701 0.813777243 0.909034377 0.093769858 0.739068248
## [721] 0.750038950 0.756321764 0.930405574 0.601079526 0.311631696
## [726] 0.537561972 0.839092125 0.625561537 0.856952825 0.667592666
## [731] 0.864945954 0.388478907 0.780948407 0.668188402 0.798522194
## [736] 0.286751650 0.586224259 0.890575669 0.752955431 0.899498819
## [741] 0.798232617 0.714537277 0.645450081 0.643710925 0.729995490
## [746] 0.737067049 0.349802793 0.407628284 0.851404669 0.812373576
## [751] 0.877006574 0.836500471 0.890170627 0.757932435 0.782419104
## [756] 0.685168687 0.934012019 0.289785218 0.376503489 0.775717649
## [761] 0.791810922 0.661732071 0.519974463 0.557261070 0.937596318
## [766] 0.678730620 0.925087451 0.887301159 0.737885518 0.740739938
## [771] 0.788828629 0.769010107 0.470541006 0.152520510 0.680123173
## [776] 0.532488001 0.804109410 0.886466818 0.552509635 0.732768807
## [781] 0.261367873 0.652506162 0.850078400 0.744435948 0.712472188
## [786] 0.893419001 0.563966757 0.635667498 0.855339606 0.647424216
## [791] 0.428112674 0.859394166 0.863292238 0.546887178 0.727181350
## [796] 0.716824709 0.813669964 0.573808990 0.317099191 0.859298946
## [801] 0.869260453 0.896789121 0.697295337 0.641929956 0.619360652
## [806] 0.604330312 0.734338597 0.828351089 0.708570292 0.563680397
## [811] 0.606941174 0.857038386 0.624377148 0.611433627 0.890160006
## [816] 0.553903228 0.592673665 0.825519390 0.524244715 0.855924273
## [821] 0.684883539 0.640323975 0.850020405 0.711918854 0.755520200
## [826] 0.804692253 0.544070651 0.466991796 0.562624730 0.923384864
## [831] 0.768857372 0.807316891 0.729468752 0.614473799 0.807960304
## [836] 0.526703696 0.773642822 0.304005357 0.885852772 0.831390616
## [841] 0.874945636 0.723468799 0.377537527 0.296322015 0.853262036
## [846] -0.334178614 0.889413674 0.713284070 0.864601202 0.611317751
## [851] 0.723050530 0.851056689 0.237869887 0.778088076 0.298313739
## [856] 0.850243195 0.523226168 0.949065107 0.921501854 0.814777907
## [861] 0.788855061 0.894798542 0.776113490 0.662694195 0.656660822
## [866] 0.758721475 0.790084976 0.678045147 0.642596719 0.847114478
## [871] 0.662016985 0.779881186 0.230582205 0.489321556 0.769567132
## [876] 0.661961649 0.220501152 0.744572720 0.700238225 0.806064013
## [881] 0.480403440 0.527622375 0.427367416 0.830385238 0.846198684
## [886] 0.922128176 0.412125637 0.884158936 0.831997523 0.837825117
## [891] 0.797361523 0.726945223 0.493909926 0.169018361 0.882315468
## [896] 0.368325156 0.771259169 0.520200468 0.835692471 0.562352932
## [901] 0.652459552 0.737551816 0.198324098 0.734827219 0.554099349
## [906] 0.618021609 0.506230534 0.639382476 0.550437138 0.554633233
## [911] 0.407163568 0.883317878 0.271860933 0.703129715 0.243521167
## [916] 0.465793826 0.730866363 -0.106057913 0.613026788 0.495854500
## [921] 0.624426833 0.887744332 0.722527663 0.846150122 0.770968784
## [926] 0.619870534 0.900908578 0.802792103 0.568968943 0.672697882
## [931] 0.245147836 0.816283600 0.733470064 0.908786597 0.734414892
## [936] 0.555277097 0.838220694 0.686494047 0.946264609 0.765359412
## [941] 0.484843088 0.577791569 0.752192626 0.300672029 0.812295831
## [946] 0.886232665 0.955384076 0.566638236 0.620926479 0.593222723
## [951] 0.582436573 0.648263868 0.370676139 0.786508572 0.822392475
## [956] 0.451526102 0.763623120 0.933779835 0.873464443 0.556555855
## [961] 0.744420532 0.836421072 0.872540905 0.626611492 0.753523186
## [966] 0.293393068 0.519117172 0.619581947 0.792424756 0.871099877
## [971] 0.585597667 0.568212532 0.385290868 0.723118137 0.660212452
## [976] 0.803595070 0.661530170 0.726231596 0.848309361 0.363433810
## [981] 0.691403113 0.431603233 0.813690268 0.868408325 0.280677426
## [986] 0.753871632 0.774694696 0.850065003 0.755555568 0.633713669
## [991] 0.841117173 0.522076371 0.729400155 0.492292317 -0.015682408
## [996] 0.946356838 0.899499657 0.819535790 0.594852333 0.621158534
The simulated values of the standard error are significantly higher than the initial calculated value.
4. W&S Chapter 17
# read in nutrient data
nutrient_data <- read.csv("./raw_data/all_data/chapter17/chap17q19GrasslandNutrientsPlantSpecies.csv")
# view nutrient data
nutrient_data
## nutrients species
## 1 0 36
## 2 0 36
## 3 0 32
## 4 1 34
## 5 2 33
## 6 3 30
## 7 1 20
## 8 3 23
## 9 4 21
## 10 4 16
a. Draw a scatter plot of these data. Which variable should be the explanatory variable (X), and which should be the response variable (Y)?
# Basic scatter plot using ggplot2
ggplot(nutrient_data, aes(x= species, y= nutrients)) +
geom_point()
The explanatory variable(X) is “species” while the response variable(Y) is “nutrients”.
b. What is the rate of change in the number of plant species supported per nutrient type added? Provide a standard error for your estimate.
fit.lm <-lm(nutrients ~ species, data = nutrient_data)
slope <- coef(fit.lm)
slope
## (Intercept) species
## 6.3106539 -0.1605215
# rate=(slope)nutrients+(intercept)
fit.lm$coefficient[1] + fit.lm$coefficient[2]
## (Intercept)
## 6.150132
The rate of change is 6.150132 .
c. Add the least-squares regression line to your scatter plot. What fraction of the variation in the number of plant species is “explained” by the number of nutrients added?
# Fit a model
nutrient_lm <- lm(nutrients ~ species, data = nutrient_data)
# use anova to check p-value
anova_tb <- anova(nutrient_lm) %>%
# use tidy() to summarize model statistical findings
tidy()
anova_tb
## # A tibble: 2 x 6
## term df sumsq meansq statistic p.value
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 species 1 12.6 12.6 9.24 0.0161
## 2 Residuals 8 11.0 1.37 NA NA
# R^2 = SS(regression)/SS(total)
anova_tb[1,3]/(anova_tb[1,3] + anova_tb[2, 3])
## sumsq
## 1 0.5359785
The fraction of plant species explained by the nutrient added is 0.5359785
d. Test the null hypothesis of no treatment effect on the number of plant species.
# test the null hypothesis by checking whether regression mean square is greater than residual mean square
anova_tb[1,3] > anova_tb[2, 3]
## sumsq
## [1,] TRUE
The null hypothesis is false and hence, rejected because the regression mean square value is higher than the residual mean square value as shown in the ANOVA table in 4C above. That is; anova_tb[1,3] > anova_tb[2, 3].
5. W&S Chapter 17-25
# load beetle data
beetle <- read.csv("./raw_data/all_data/chapter17/chap17q25BeetleWingsAndHorns.csv")
# view beetle data
beetle
## hornSize wingMass
## 1 0.074 -42.8
## 2 0.079 -21.7
## 3 0.019 -18.8
## 4 0.017 -16.0
## 5 0.085 -12.8
## 6 0.081 11.6
## 7 0.011 7.6
## 8 0.023 1.6
## 9 0.005 3.7
## 10 0.007 1.1
## 11 0.004 -0.8
## 12 -0.002 -2.9
## 13 -0.065 12.1
## 14 -0.065 20.1
## 15 -0.014 21.2
## 16 -0.014 22.2
## 17 -0.132 20.1
## 18 -0.143 12.5
## 19 -0.177 7.0
a. Use these results to calculate the residuals
beetle_sims <- beetle %>%
mutate('Predicted_relative_wing_mass_mg' = c(-9.9, -10.6, -2.6, 2.4, -11.4, -10.9, -1.6, -3.2, -0.8, -1.1, -0.7, 0.1, 8.5, 8.5, 1.7, 1.7, 17.4, 18.8, 23.3))
beetle_sims
## hornSize wingMass Predicted_relative_wing_mass_mg
## 1 0.074 -42.8 -9.9
## 2 0.079 -21.7 -10.6
## 3 0.019 -18.8 -2.6
## 4 0.017 -16.0 2.4
## 5 0.085 -12.8 -11.4
## 6 0.081 11.6 -10.9
## 7 0.011 7.6 -1.6
## 8 0.023 1.6 -3.2
## 9 0.005 3.7 -0.8
## 10 0.007 1.1 -1.1
## 11 0.004 -0.8 -0.7
## 12 -0.002 -2.9 0.1
## 13 -0.065 12.1 8.5
## 14 -0.065 20.1 8.5
## 15 -0.014 21.2 1.7
## 16 -0.014 22.2 1.7
## 17 -0.132 20.1 17.4
## 18 -0.143 12.5 18.8
## 19 -0.177 7.0 23.3
# Fit a model in the simulated beetle data.
beetle_lm <- lm(Predicted_relative_wing_mass_mg ~ hornSize, data = beetle_sims)
# calculate resduals from fit
residual_data <- residuals(beetle_lm)
# view residual values
residual_data
## 1 2 3 4 5 6
## -0.31686109 -0.36069358 -0.23470372 4.50282927 -0.37329257 -0.39822658
## 7 8 9 10 11 12
## -0.28457174 -0.30976971 -0.27197275 -0.30950575 -0.30320626 -0.29060727
## 13 14 15 16 17 18
## -0.15831791 -0.15831791 -0.26540930 -0.26540930 -0.05096257 -0.09453109
## 19
## -0.05647017
b. Use your results from part (a) to produce a residual plot.
beet_new <- beetle %>%
mutate('residuals' = residual_data)
residual_plot <- ggplot(beet_new, aes(x= hornSize, y= residuals)) +
geom_point()
residual_plot
c. Use the graph provided and your residual plot to evaluate the main assumptions of linear regression.
If the assumptions of normality and equal variance of residuals are not met because of the following reasons;
For the residual plot; the cloud of points above the horizontal line at zero are asymmetric with more points below the line than above the line. On the other hand, the graph provided also violates the assumptions of normality as there are more points above the horizontal zero line than below it.
No noticeable curvature observed as we move from left to right along the x-axis in the residual plot.
No equal variance of points above and below the line at all values of X in both the residual plot and the graph provided.
d. In light of your conclusions in part (c), what steps should be taken?
e. Do any other diagnostics misbehave?
Yes. Using noticeable curvature to evaluate the assumption of linear regression is not an effective diagnostic method.
6. W&S Chapter 17-30
# load nuclear data
nuclear <- read.csv("./raw_data/all_data/chapter17/chap17q30NuclearTeeth.csv")
# view data
nuclear
## dateOfBirth deltaC14
## 1 1985.5 89
## 2 1983.5 109
## 3 1990.5 91
## 4 1987.5 127
## 5 1990.5 99
## 6 1984.5 110
## 7 1983.5 123
## 8 1989.5 105
## 9 1963.5 622
## 10 1971.7 262
## 11 1963.7 471
## 12 1990.5 112
## 13 1975.0 285
## 14 1970.2 439
## 15 1972.6 363
## 16 1971.8 391
a. What is the approximate slope of the regression line?
# fit a regression line through the data points
fit.lm <-lm( dateOfBirth ~ deltaC14, data = nuclear)
slope <- coef(fit.lm)
slope
## (Intercept) deltaC14
## 1992.26736906 -0.05325906
The approximate slope of the regression line is -0.053
b. Which pair of lines shows the confidence bands? What do these confidence bands tell us?
The confidence bands are the pair of lines closest to the regression line. It shows the 95% confidence bands for the predicted mean date of births at every amount of Carbon 14.
c. Which pair of lines shows the prediction interval? What does this prediction interval tell us?
The prediction intervals are the pair of lines farther away from the regression line. It shows the 95% prediction intervals for the predicted date of births of each cadaver. n = 16.
d. Using predict() and geom_ribbon() in ggplot2, reproduce the above plot showing data, fit, fit interval, and prediction interval.
# fit interval
fit_nuclear <- predict(fit.lm,
nuclear,
interval = "confidence") %>%
as_tibble() %>%
rename(lwr_ci = lwr,
upr_ci = upr)
nuclear_ci <- cbind(nuclear$deltaC14, fit_nuclear)
# prediction interval
predict_nuclear <- predict(fit.lm,
nuclear,
interval = "prediction") %>%
as_tibble() %>%
rename(lwr_pi = lwr,
upr_pi = upr)
# create a new nuclear dataset which includes the prediction and fit values
new_nuclear <- nuclear_ci %>%
# mutate prediction and fit values to make a new dataframe
mutate('upper_pi' = predict_nuclear$upr_pi,
'lower_pi' = predict_nuclear$lwr_pi,
'date_of_birth' = nuclear$dateOfBirth)
# view new dataframe
new_nuclear
## nuclear$deltaC14 fit lwr_ci upr_ci upper_pi lower_pi date_of_birth
## 1 89 1987.527 1985.140 1989.914 1995.031 1980.024 1985.5
## 2 109 1986.462 1984.212 1988.712 1993.923 1979.002 1983.5
## 3 91 1987.421 1985.048 1989.794 1994.919 1979.922 1990.5
## 4 127 1985.503 1983.367 1987.640 1992.931 1978.076 1987.5
## 5 99 1986.995 1984.678 1989.312 1994.476 1979.514 1990.5
## 6 110 1986.409 1984.166 1988.652 1993.867 1978.950 1984.5
## 7 123 1985.717 1983.556 1987.877 1993.151 1978.282 1983.5
## 8 105 1986.675 1984.399 1988.951 1994.144 1979.207 1989.5
## 9 622 1959.140 1954.645 1963.635 1967.555 1950.726 1963.5
## 10 262 1978.313 1976.516 1980.111 1985.650 1970.976 1971.7
## 11 471 1967.182 1964.108 1970.256 1974.932 1959.433 1963.7
## 12 112 1986.302 1984.072 1988.532 1993.757 1978.848 1990.5
## 13 285 1977.089 1975.238 1978.939 1984.439 1969.739 1975.0
## 14 439 1968.887 1966.086 1971.688 1976.532 1961.242 1970.2
## 15 363 1972.934 1970.703 1975.166 1980.390 1965.479 1972.6
## 16 391 1971.443 1969.018 1973.868 1978.958 1963.928 1971.8
ggplot(data = new_nuclear,
mapping = aes(x = nuclear$deltaC14,
y = date_of_birth)) +
#prediction interval
geom_ribbon(mapping = aes(ymin = lower_pi,
ymax = upper_pi),
color = "blue",
alpha = 0.1) +
# fit interval - just coefficient error (precision)
geom_ribbon(mapping = aes(ymin = lwr_ci,
ymax = upr_ci),
color = "blue",
alpha = 0.1) +
geom_point() +
stat_smooth(method = "lm") #shows error around our FIT
## `geom_smooth()` using formula 'y ~ x'