Below is a short demonstration of R based on an assignment for a course at University of Illinois at Chicago.
Students were given data files with inputs and asked to create plots and analyses based on the values.
The first set is composed of the following variables: - Weight: a numeric vector giving the body weight of the chick (gm). - Time: a numeric vector giving the number of days since birth when the measurement was made. - Chick: a grouping variable - Diet : different diet that chicken received
The tasks required by the assignment are as follows: 0. Make a descriptive statistic table use the package/function of your choice and output to MS Word 1. Run a t-test to determine if the mean weight of all sampled chicken is different from the population mean(mu=100) support you conclusion with a boxplot. 2. Make a scatterplot show the relationship of chicken weight and time, add a fitted line for each diet treatment 3. Make a matrix of scatter plot defined by “diet” variable, choose a grid layout whichever fits your data. 4. Add the equations and \(R^2\) to your plots
library(knitr) # Knit your markdown to Word,Pdf and etc.
library(beepr)
library(rio) # Swiss-army knife for data I/O
library(DescTools) # A collection of functions for efficiently describing data
library(ggpubr) # facilitates the creation of beautiful ggplot2-based graphs
library(car) # Some useful function for learning R
library(stargazer)
library(psych)
# And
library(tidyverse) # tidyverse
library(rstatix) # simple and intuitive pipe-friendly framework
library(janitor) # simplified data cleaning tool
library(corrr) # correlation ish in R
library(summarytools) # for reporting your results
library(ggstatsplot) # save your time on box plot here
library(table1)
library(afex)
#import data and check its integrity
chicken <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/chicken.csv")
head(chicken)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
str(chicken)
## 'data.frame': 578 obs. of 4 variables:
## $ weight: int 42 51 59 64 76 93 106 125 149 171 ...
## $ Time : int 0 2 4 6 8 10 12 14 16 18 ...
## $ Chick : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Diet : int 1 1 1 1 1 1 1 1 1 1 ...
chicken[is.na(chicken)] # how many NA in our data (hopefully 0)
## integer(0)
#Make a descriptive statistic table use the package/function
#of your choice and output to MS Word
chicken.table <- describe(chicken)
export(chicken.table,chickenStats <- tempfile(fileext = ".txt"))
#Run a t-test to determine if the mean weight of all sampled chicken
#is different from the population mean(mu=100)
t1 <- chicken %>%
t_test(weight ~ 1, #p value is less than 0.01% so it is definitely significant
mu = 100,
detailed = T)
#t1 <- t.test(chicken$weight, #in base R
# mu = 100)
t1 #Displays information from our t-test, including our p-value
## # A tibble: 1 × 12
## estimate .y. group1 group2 n statistic p df conf.low conf.high
## * <dbl> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 122. weig… 1 null … 578 7.38 5.53e-13 577 116. 128.
## # … with 2 more variables: method <chr>, alternative <chr>
#shows a small effect size
chicken %>% cohens_d(weight ~ 1, mu = 100)
## # A tibble: 1 × 6
## .y. group1 group2 effsize n magnitude
## * <chr> <chr> <chr> <dbl> <int> <ord>
## 1 weight 1 null model 0.307 578 small
#Support you conclusion with a boxplot.
ggboxplot(chicken$weight,
add = c("mean", "jitter"), width = .5,
ylab = "Weight (g)", xlab= "")+
labs(subtitle = get_test_label(t1,detailed = T),title="Weights of Chickens")
#look at all those chickens
#Make a scatterplot show the relationship of chicken weight and time
#add a fitted line for each diet treatment
chicken$Diet <- as.factor(chicken$Diet)
ggplot(chicken, aes(x = Time, y = weight, color = Diet))+
geom_point()+
geom_smooth(method = "lm",se = F)+
scale_color_manual(values = c("#0009AD", "#ADA500", "#AD002B","#00AD82"))+
labs( x = "Time (weeks)",
y = "Chicken Weight (g)",
title = "Increase in Chicken Weight over Time by Diet")+
theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))
## `geom_smooth()` using formula 'y ~ x'
# Make a matrix of scatter plot defined by "diet" variable
# Choose a grid layout that fits your data.
chicken.Diets <- split(chicken,chicken$Diet)
chicken.D1 <- as.data.frame(chicken.Diets[1])
chicken.D2 <- as.data.frame(chicken.Diets[2])
chicken.D3 <- as.data.frame(chicken.Diets[3])
chicken.D4 <- as.data.frame(chicken.Diets[4])
par(mfrow=c(2,2))
plot(chicken.D1$X1.Time, chicken.D1$X1.weight,
main = "Chicken Weight Over Time for Diet 1",
xlab = "Time (weeks)",
ylab = "Chicken Weight (g)")
abline(lm(X1.weight~X1.Time, data = chicken.D1))
plot(chicken.D2$X2.Time, chicken.D2$X2.weight,
main = "Chicken Weight Over Time for Diet 2",
xlab = "Time (weeks)",
ylab = "Chicken Weight (g)")
abline(lm(X2.weight~X2.Time, data = chicken.D2))
plot(chicken.D3$X3.Time, chicken.D3$X3.weight,
main = "Chicken Weight Over Time for Diet 3",
xlab = "Time (weeks)",
ylab = "Chicken Weight (g)")
abline(lm(X3.weight~X3.Time, data = chicken.D3))
plot(chicken.D4$X4.Time, chicken.D4$X4.weight,
main = "Chicken Weight Over Time for Diet 4",
xlab = "Time (weeks)",
ylab = "Chicken Weight (g)")
abline(lm(X4.weight~X4.Time, data = chicken.D4))
#Redoes everything so I can get that equation and R^2 on the graphs....
c1p <- ggplot(chicken.D1, aes(x = X1.Time, y = X1.weight))+
geom_point() +
geom_smooth(method = lm,
alpha = .2,
se = F) +
labs( x = "Time (in weeks)",
y = "Chicken Weight (g)",
title = "Chicken Weight Over Time for Diet 2")
lm_eqn.1 <- function(df){
m <- lm(X1.weight ~ X1.Time , df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
c1p1 <- c1p + geom_text(x = 4, y = 280, label = lm_eqn.1(chicken.D1), cex=2, parse = TRUE) +
theme(plot.title = element_text(size = 10))
c2p <- ggplot(chicken.D2, aes(x = X2.Time, y = X2.weight))+
geom_point() +
geom_smooth(method = lm,
alpha = .2,
se = F) +
labs( x = "Time (in weeks)",
y = "Chicken Weight (g)",
title = "Chicken Weight Over Time for Diet 2")
lm_eqn.2 <- function(df){
m <- lm(X2.weight ~ X2.Time , df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
c2p1 <- c2p + geom_text(x = 4, y = 280, label = lm_eqn.2(chicken.D2),cex=2, parse = TRUE) +
theme(plot.title = element_text(size = 10))
c3p <- ggplot(chicken.D3, aes(x = X3.Time, y = X3.weight))+
geom_point() +
geom_smooth(method = lm,
alpha = .2,
se = F) +
labs( x = "Time (in weeks)",
y = "Chicken Weight (g)",
title = "Chicken Weight Over Time for Diet 3")
lm_eqn.3 <- function(df){
m <- lm(X3.weight ~ X3.Time , df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
c3p1 <- c3p + geom_text(x = 4, y = 280, label = lm_eqn.3(chicken.D3), cex=2, parse = TRUE) +
theme(plot.title = element_text(size = 10))
c4p <- ggplot(chicken.D4, aes(x = X4.Time, y = X4.weight))+
geom_point() +
geom_smooth(method = lm,
alpha = .2,
se = F) +
labs( x = "Time (in weeks)",
y = "Chicken Weight (g)",
title = "Chicken Weight Over Time for Diet 4")
lm_eqn.4 <- function(df){
m <- lm(X4.weight ~ X4.Time , df);
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
c4p1 <- c4p + geom_text(x = 4, y = 280, label = lm_eqn.4(chicken.D4), cex=2, parse = TRUE) +
theme(plot.title = element_text(size = 10))
#Add the equations and R^2 to your plot ... I did not enjoy this
ggarrange(c1p1, c2p1, c3p1, c4p1, ncol = 2, nrow = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
beep(1) #beepr was actually super useful for this one for troubleshooting
The second portion of the assignment uses another data set that models tooth growth. The variables for the data set are defined as follows: len : Tooth length supp : Supplement type (VC or OJ) dose : Dose in milligrams/day
The assignment requested the following actions be performed: - Make a descriptive statistics table and output to MS Word or other destination - Make a graph display the 3 different dosages for 2 different types of supplement - Make a contingency table and test if supplement type is independent from dosage intake - Test if the proportions of all three types of supplements are equal - Compare the mean tooth growth from two supplements, write up your results - Visualize your results
Then, based on your findings, what’s the next step you would like to consider to see how different levels of dosage could have effects on the mean tooth growth rates?
#Make a descriptive statistics table and output to MS Word or other destination
TG <- ToothGrowth
head(TG)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
TG <- na.omit(TG) #removes NA rows
TG.table <- describe(TG) #Produces descriptive table to export
TG$dose <-as.factor(TG$dose)
export(TG.table,ToothSupps <- tempfile(fileext = ".txt"))
apaTables::apa.1way.table(iv=supp,
dv=len,
data=TG,
file="ToothGrowth.doc")
##
##
## Descriptive statistics for len as a function of supp.
##
## supp M SD
## OJ 20.66 6.61
## VC 16.96 8.27
##
## Note. M and SD represent mean and standard deviation, respectively.
##
ggplot(TG, aes(supp, len, fill=dose)) +
geom_bar(stat="identity", position = "dodge") +
labs(title="Tooth Length by Supplement (At 3 Dosages)", x="Supplement", y="Tooth Length")
#Make a contingency table and test if supplement type
#is independent from dosage intake
conTableTG = table(ToothGrowth)
#print(conTableTG)
#I commented this out because it is long but it is here
chisq.test(TG$dose, TG$supp, correct = TRUE) #Shows independence of variables
##
## Pearson's Chi-squared test
##
## data: TG$dose and TG$supp
## X-squared = 0, df = 2, p-value = 1
# Test if the proportions of all three dosages of supplements are equal
TG %>%
group_by(dose) %>%
summarise(
Mean = mean(len),
SD= sd(len),
count = n()
) # all dosages are equal with n = 20
## # A tibble: 3 × 4
## dose Mean SD count
## <fct> <dbl> <dbl> <int>
## 1 0.5 10.6 4.50 20
## 2 1 19.7 4.42 20
## 3 2 26.1 3.77 20
# Compare the mean tooth growth from two supplements, write up your results
TG %>%
group_by(supp) %>%
summarise(
Mean = mean(len),
SD= sd(len),
count = n()
)
## # A tibble: 2 × 4
## supp Mean SD count
## <fct> <dbl> <dbl> <int>
## 1 OJ 20.7 6.61 30
## 2 VC 17.0 8.27 30
#The results show the mean tooth length for the OJ supplement is greater than
#the mean tooth length for the VC supplement. The OJ supplement also has
# a smaller standard deviation. Both supplements have the same number of
# observations as well, which shows balance in our data set.
# Visualize your results
ggbetweenstats(
data = TG,
x = supp,
y = len,
plot.type = "box", # for boxplot
type = "parametric", # for student's t-test
var.equal = TRUE, # equal variances
centrality.plotting = F# remove mean
) +
labs(caption = NULL)
beep(2)
Based on the findings presented here, it would appear that OJ is a more effective supplement than VC. However, VC performed better at the highest dosage. I would like to compare VC and OJ at a high dosage to see if the increased effect on tooth growth is statiscally signficant. The sample size is small, so it may not be enough to warrant tests of both supplements at the higher dosage. There also may be other factors that led to the selection of the dosages used in this trial such as the risks of side effects associated with both supplements.
In any event, if the VC supplement does perform significantly better than the OJ supplement, I would likely test for the effect size of the supplement choice at this dosage. I think it would be unlikely that the effect size would warrant further research, but I would like to report the statistical results so that the researcher can make their decision based on their dental / medical expertise.
The third data set contains data for mice after a hypothetical treatment: before : Weight of the mice after treatment after: Weight of the mice after treatment
The assignment asked us to complete the following tasks: - Check your data and visualize the data after your import - Check the assumption and run a paired sample t-test - Interpret the result, and report it with a plot
mice <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/mice.csv")
head(mice)
## group weight
## 1 before 200.1
## 2 before 190.9
## 3 before 192.7
## 4 before 213.0
## 5 before 241.4
## 6 before 196.9
str(mice)
## 'data.frame': 20 obs. of 2 variables:
## $ group : chr "before" "before" "before" "before" ...
## $ weight: num 200 191 193 213 241 ...
mice[is.na(mice)] #all good
## character(0)
mice$group[mice$group == "before"] <-0
mice$group[mice$group == "after"] <-1
mice$group <- as.factor(mice$group)
mice$group <- factor(mice$group, labels = c("before", "after"),ordered=TRUE)
ggbetweenstats(
data = mice,
x = group,
y = weight,
plot.type = "box", # for boxplot
type = "parametric", # for student's t-test
var.equal = TRUE, # equal variances
centrality.plotting = F# remove mean
) +
labs(caption = NULL)
#Check the assumption and run a paired sample t-test
# Outliers
mice %>%
identify_outliers(weight)
## [1] group weight is.outlier is.extreme
## <0 rows> (or 0-length row.names)
# No outliers
# Normality
mice %>%
shapiro_test(weight)
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 weight 0.809 0.00117
# Population is not normal since p is smaller than 0.01
# 2. visualize it by Q-Q plot
ggqqplot(mice, x= "weight", facet.by = "group")
# Levene’s test for Homogeneity
# A non-significant result here (greater than .05) indicates you have met the assumption of homogeneity of variance (i.e., equal variances are assumed).
# A significant result here (less than .05) indicates you have violated the assumption of homogeneity of variance (i.e., equal variances are not assumed).
mice %>%
levene_test(weight~group)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 18 1.13 0.302
# p is .3 (greater than .05 and non-significant), the assumption of homogeneity
#of variance is met (i.e., equal variances are assumed).
rquery.t.test<-function(x, y = NULL, paired = FALSE, graph = TRUE, ...)
{
# I. Preliminary test : normality and variance tests
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
var.equal = FALSE # by default
# I.1 One sample t test
if(is.null(y)){
if(graph) par(mfrow=c(1,2))
shapiro.px<-normaTest(x, graph,
hist.title="X - Histogram",
qq.title="X - Normal Q-Q Plot")
if(shapiro.px < 0.05)
warning("x is not normally distributed :",
" Shapiro-Wilk test p-value : ", shapiro.px,
".\n Use a non-parametric test like Wilcoxon test.")
}
# I.2 Two samples t test
if(!is.null(y)){
# I.2.a unpaired t test
if(!paired){
if(graph) par(mfrow=c(2,2))
# normality test
shapiro.px<-normaTest(x, graph,
hist.title="X - Histogram",
qq.title="X - Normal Q-Q Plot")
shapiro.py<-normaTest(y, graph,
hist.title="Y - Histogram",
qq.title="Y - Normal Q-Q Plot")
if(shapiro.px < 0.05 | shapiro.py < 0.05){
warning("x or y is not normally distributed :",
" Shapiro test p-value : ", shapiro.px,
" (for x) and ", shapiro.py, " (for y)",
".\n Use a non parametric test like Wilcoxon test.")
}
# Check for equality of variances
if(var.test(x,y)$p.value >= 0.05) var.equal=TRUE
}
# I.2.b Paired t-test
else {
if(graph) par(mfrow=c(1,2))
d = x-y
shapiro.pd<-normaTest(d, graph,
hist.title="D - Histogram",
qq.title="D - Normal Q-Q Plot")
if(shapiro.pd < 0.05 )
warning("The difference d ( = x-y) is not normally distributed :",
" Shapiro-Wilk test p-value : ", shapiro.pd,
".\n Use a non-parametric test like Wilcoxon test.")
}
}
# II. Student's t-test
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
res <- t.test(x, y, paired=paired, var.equal=var.equal, ...)
return(res)
}
#+++++++++++++++++++++++
# Helper function
#+++++++++++++++++++++++
# Performs normality test using Shapiro Wilk's method
# The histogram and Q-Q plot of the data are plotted
# x : a (non-empty) numeric vector of data values.
# graph : possible values are TRUE or FALSE. If TRUE,
# the histogram and the Q-Q plot of the data are displayed
# hist.title : title of the histogram
# qq.title : title of the Q-Q plot
normaTest<-function(x, graph=TRUE,
hist.title="Histogram",
qq.title="Normal Q-Q Plot",...)
{
# Significance test
#++++++++++++++++++++++
shapiro.p<-signif(shapiro.test(x)$p.value,1)
if(graph){
# Plot : Visual inspection
#++++++++++++++++
h<-hist(x, col="lightblue", main=hist.title,
xlab="Data values", ...)
m<-round(mean(x),1)
s<-round(sd(x),1)
mtext(paste0("Mean : ", m, "; SD : ", s),
side=3, cex=0.8)
# add normal curve
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="red", lwd=2)
# qq plot
qqnorm(x, pch=19, frame.plot=FALSE,main=qq.title)
qqline(x)
mtext(paste0("Shapiro-Wilk, p-val : ", shapiro.p),
side=3, cex=0.8)
}
return(shapiro.p)
}
mice.w <- mice %>%
mutate(id = 1:20)
mice.w$id[11:20] <- mice.w$id[1:10]
mice.w <- as.data.frame(pivot_wider(
mice.w,
names_from = group,
values_from = weight))
rquery.t.test(mice.w$before,mice.w$after, paired =T)
##
## Paired t-test
##
## data: x and y
## t = -20.883, df = 9, p-value = 6.2e-09
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -215.5581 -173.4219
## sample estimates:
## mean difference
## -194.49
t.test(mice.w$before,mice.w$after, paired =T)
##
## Paired t-test
##
## data: mice.w$before and mice.w$after
## t = -20.883, df = 9, p-value = 6.2e-09
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -215.5581 -173.4219
## sample estimates:
## mean difference
## -194.49
#p is smaller than 0.01, so we reject our null hypothesis and accept our alternative
#that the true mean difference is not equal to 0.
ggwithinstats(
data = mice,
x = group,
y = weight,
effsize.type = "g",
type = "parametric", # for student's t-test
centrality.plotting = T, # remove mean point or not
title = "Mice Weight Before and After Treatment"
) + labs(caption = NULL,x="Before or After Treatment")
beep(3)
The final data set tracked air quality and wind speed to demonstrate a correlation.
The last tasks asked of us for this assignment were as follows: - Regress Ozone on the Wind speed, and then interpret out the result - Check the assumptions, does your data meet all assumptions? - Then make a scatter plot showing their relationships
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
aq.reg1 <- lm(Ozone ~Wind, data = airquality)
summary(aq.reg1)
##
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.572 -18.854 -4.868 15.234 90.000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.8729 7.2387 13.38 < 2e-16 ***
## Wind -5.5509 0.6904 -8.04 9.27e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.47 on 114 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.3619, Adjusted R-squared: 0.3563
## F-statistic: 64.64 on 1 and 114 DF, p-value: 9.272e-13
par(mfrow=c(2,2))
plot(aq.reg1)
outlierTest(aq.reg1)
## rstudent unadjusted p-value Bonferroni p
## 117 3.647406 0.0004022 0.046655
#117 is an outlier, and 48 is worth removing as an outlier as well as both have
#the same Cook's d value of .23.
#The plots do confirm our assumptions for the regression:
#the linear relationship assumptions are confirmed by the largely horizontal
#fitted vs residuals plot, the normal distribution of the variables by the
#QQ plot that is relatively fitted to a straight line; the homogeneity of
#the variance of the residuals is confirmed by the equal spread about the horizontal
#line, and the residuals vs. leverage plot identifies only two or three outliers
#that we may choose to remove from our data. We proceed to test for 48, 62 and 117
#through the outlier test function to determine which ought to be removed. We find
#only 117 as identified by the test.
augment.AQ <- augment(aq.reg1)
ggplot(airquality, aes(x = Wind, y = Ozone))+
geom_point() +
geom_smooth(method = lm,
alpha = .6,
se = F) +
labs( x = "Wind Speed MPH",
y = "Ozone Concentration BBP",
title = "Relationship Between Wind Speed and Ozone Level")+
stat_regline_equation(label.x = 15, label.y = 140, aes(label = ..eq.label..)) + # show the lm formula
stat_regline_equation(label.x = 15, label.y = 130, aes(label = ..rr.label..)) +# show the R^2
theme_linedraw()
## `geom_smooth()` using formula 'y ~ x'
ggplot(augment.AQ, aes(x = Wind, y = Ozone))+
geom_point() +
geom_smooth(method = lm,
alpha = .6,
se = F) +
geom_segment(aes(xend=Wind,yend=.fitted), color="red", size=0.3)+
labs( x = "Wind Speed MPH",
y = "Ozone Concentration BBP",
title = "Distance Between Observed Values and Fitted Regression Line")+
stat_regline_equation(label.x = 15, label.y = 140, aes(label = ..eq.label..)) + # show the lm formula
stat_regline_equation(label.x = 15, label.y = 130, aes(label = ..rr.label..)) +# show the R^2
theme_linedraw()
## `geom_smooth()` using formula 'y ~ x'
beep(5)
Load the anxiety.sav data, here are the variables of interest :
PartNum: participants ID Condition: 1=full song, 2=lyric only, 3=instrumental only, 4= no music x_anxiety: Mean Anxiety Score year: 1=freshman, 2=sophomore, 3=junior, 4=senior, 5=other
Please preprocess the dataset and: - Run a statistical test to compare the mean anxiety scores between different genders - Is there any statistical differences among the means of Anxiety Score under different treatment conditions? - Do students at different school years show statistical differences in mean Anxiety Score? - Support your conclusion with plots and test results
anxiety <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/anxiety.sav")
head(anxiety)
## PartNum Condition x_anxiety Pieces Gender Age Year
## 1 1 4 2.846154 8 2 20 3
## 2 2 2 2.076923 14 2 27 4
## 3 3 4 4.230769 9 2 20 3
## 4 4 2 2.692308 12 2 19 2
## 5 5 1 5.769231 16 1 18 2
## 6 6 3 4.153846 11 2 19 2
anxiety[is.na(anxiety)]
## numeric(0)
#Run a statistical test to compare the mean anxiety scores between different genders
anxiety$Gender <- factor(anxiety$Gender, labels = c("Gender1", "Gender2"))
anxiety$Condition <- factor(anxiety$Condition, labels = c("FullSong", "LyricsOnly","Instrumentals","NoMusic"))
anxiety$Year <- factor(anxiety$Year, labels = c("Freshman", "Sophomore","Junior","Senior"))
anxiety.g <- group_by(anxiety, Gender) %>%
summarise(
count = n(),
mean = mean(x_anxiety, na.rm = TRUE)
)
#Our samples are very skewed with a larger sample for one gender than the
#other by several orders of magnitude. The mean anxiety level of our smaller
#sample seems much higher
anxiety.g.aov <- aov(x_anxiety ~ Gender,data = anxiety)
summary(anxiety.g.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender 1 7.628 7.628 11.42 0.00169 **
## Residuals 38 25.388 0.668
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#There is a statistically significant difference in mean anxiety levels
#by gender at the 0.01 level.
ggboxplot(anxiety, x = "Gender", y = "x_anxiety",
ylab = "Anxiety Score", xlab = "Gender")
# Normality
levene_test(anxiety.g.aov)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 38 2.40 0.130
#p is larger than 0.05; Homogenity is met
par(mfrow=c(2,2))
plot(anxiety.g.aov,1)
plot(anxiety.g.aov,2) #Very fitted, looks good.
qqPlot(anxiety.g.aov) # All in CI, looks good.
## [1] 2 26
hist(anxiety.g.aov$residuals) #Hm.
shapiro_test(anxiety.g.aov$residuals) #nevermind, we're good. All normal here
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 anxiety.g.aov$residuals 0.977 0.572
#with a p value waaaaaay bigger than 0.05.
#Is there any statistical differences among the means of Anxiety Score
#under different treatment conditions?
anxiety.t <- group_by(anxiety, Condition) %>%
summarise(
count = n(),
mean = mean(x_anxiety, na.rm = TRUE)
)
anxiety.t.aov <- aov(x_anxiety ~ Condition,data = anxiety)
summary(anxiety.t.aov) #There is some significant difference at the 0.05 level
## Df Sum Sq Mean Sq F value Pr(>F)
## Condition 3 8.026 2.6755 3.854 0.0172 *
## Residuals 36 24.990 0.6942
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggline(anxiety,"Condition","x_anxiety",
add=c("mean_se","jitter"),
ylab="Anxiety Level",
main="Means Anxiety Levels by Treatment Condition")
TukeyHSD(anxiety.t.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = x_anxiety ~ Condition, data = anxiety)
##
## $Condition
## diff lwr upr p adj
## LyricsOnly-FullSong -1.16153846 -2.16504301 -0.1580339 0.0179565
## Instrumentals-FullSong -0.23076923 -1.23427378 0.7727353 0.9252264
## NoMusic-FullSong -0.21538462 -1.21888917 0.7881199 0.9380223
## Instrumentals-LyricsOnly 0.93076923 -0.07273532 1.9342738 0.0772622
## NoMusic-LyricsOnly 0.94615385 -0.05735071 1.9496584 0.0706163
## NoMusic-Instrumentals 0.01538462 -0.98811994 1.0188892 0.9999742
#Tukey's probably isn't appropriate here due to small sample size
#But when we use it, we do see a significant difference only between
#The Lyrics Only and Full Song groups
levene_test(anxiety.t.aov)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 3 36 0.532 0.663
#p is larger than 0.05; Homogeneity is met
par(mfrow=c(2,2))
plot(anxiety.t.aov,1) #2,26 and 33 flagged as outliers
plot(anxiety.t.aov,2) #Very fitted, looks good.
qqPlot(anxiety.t.aov) # Most all in CI, looks good.
## [1] 5 35
hist(anxiety.t.aov$residuals) #Hm.
shapiro_test(anxiety.t.aov$residuals) #greater than 0.05, so it
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 anxiety.t.aov$residuals 0.954 0.104
#meets our assumption of normality, which surprises me when looking at that graph tbh
#Do students at different school years show
#statistical differences in mean Anxiety Score?
anxiety.y <- group_by(anxiety, Year) %>%
summarise(
count = n(),
mean = mean(x_anxiety, na.rm = TRUE)
)
anxiety.y.aov <- aov(x_anxiety ~ Year,data = anxiety)
summary(anxiety.y.aov) #There is not a statistically significant difference by year.
## Df Sum Sq Mean Sq F value Pr(>F)
## Year 3 4.192 1.3974 1.745 0.175
## Residuals 36 28.824 0.8007
ggline(anxiety,"Year","x_anxiety",
add=c("mean_se","jitter"),
ylab="Anxiety Level",
main="Means Anxiety Levels by Year in School")
#ANOVA
levene_test(anxiety.y.aov)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 3 36 0.814 0.495
#p is larger than 0.05; Homogenity is met
par(mfrow=c(2,2))
plot(anxiety.y.aov,1) #2,26 and 33 flagged as outliers
plot(anxiety.y.aov,2) #Very fitted, looks good.
qqPlot(anxiety.y.aov) # Most all in CI, looks good.
## [1] 18 35
hist(anxiety.y.aov$residuals) #Hm.
shapiro_test(anxiety.y.aov$residuals) #greater than 0.05, so it
## # A tibble: 1 × 3
## variable statistic p.value
## <chr> <dbl> <dbl>
## 1 anxiety.y.aov$residuals 0.965 0.247
#meets our assumption, which surprises me when looking at that graph tbh
beep(4)