library(knitr) # Knit your markdown to Word,Pdf and etc.
library(beepr)
library(rio) # Swiss-army knife for data I/O
library(DescTools) # A collection of functions for efficiently describing data
library(ggpubr) # facilitates the creation of beautiful ggplot2-based graphs
library(car) # Some useful function for learning R
library(stargazer)
library(psych)
library(tidyverse) # tidyverse
library(rstatix) # simple and intuitive pipe-friendly framework
library(janitor) # simplified data cleaning tool
library(corrr) # correlation ish in R
library(summarytools) # for reporting your results
library(ggstatsplot) # save your time on box plot here
library(table1)
library(afex)
Data specification:
The dataset StudentInfo.csv contains survey results from 431 students enrolled at a university in the United States:
Gender: 0=Male 1=Female| Athlete : 0 = Non-athlete 1 = Athlete| Weight: weight in lbs| Height: height in inches| Sprint: 35-meter sprint time (seconds)| MileMinDur : Mile run time (hh:mm:ss)| English:Writing : Scores on placement test| State: Is student in-state or out-of-state resident?| LiveOnCampus: 0 = Off-campus 1 = On-campus| HowCommute: 1=Walk, 2 = Bike, 3 = Car, 4 = Public transit, 5 = Other| CommuteTime: Time in Mins per day SleepTime : Time in hours per week StudyTime : Time in hours per week
StudentInfo <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/EPSY594 Final Exam Package/StudentInfo.csv")
head(StudentInfo)
## ids Gender Athlete Height Weight Sprint MileMinDur English Reading Math
## 1 43783 0 0 72.35 NA 7.978 88.24 81.50 60.02
## 2 20278 0 0 70.66 179.20 8.004 0:06:21 89.45 85.25 70.19
## 3 22820 1 0 NA 198.34 8.473 0:12:44 74.06 88.68 55.89
## 4 24559 1 1 67.43 128.17 NA 0:06:25 82.61 77.30 65.52
## 5 28980 0 1 68.45 171.61 4.650 0:07:24 70.10 NA 61.40
## 6 33312 0 1 68.56 163.96 4.750 0:05:47 78.98 87.53 76.71
## Writing State LiveOnCampus HowCommute CommuteTime SleepTime StudyTime
## 1 81.44 In state 1 NA NA 7 1
## 2 73.27 In state 1 NA NA 5 2
## 3 73.16 In state 1 NA NA 2 6
## 4 80.45 Out of state 1 NA NA 7 3
## 5 77.48 In state 1 NA NA 8 3
## 6 70.79 In state 1 NA NA 9 9
str(StudentInfo)
## 'data.frame': 430 obs. of 17 variables:
## $ ids : int 43783 20278 22820 24559 28980 33312 40274 40390 29045 35676 ...
## $ Gender : int 0 0 1 1 0 0 0 1 0 1 ...
## $ Athlete : int 0 0 0 1 1 1 1 1 0 1 ...
## $ Height : num 72.3 70.7 NA 67.4 68.5 ...
## $ Weight : num NA 179 198 128 172 ...
## $ Sprint : num 7.98 8 8.47 NA 4.65 ...
## $ MileMinDur : chr "" "0:06:21" "0:12:44" "0:06:25" ...
## $ English : num 88.2 89.5 74.1 82.6 70.1 ...
## $ Reading : num 81.5 85.2 88.7 77.3 NA ...
## $ Math : num 60 70.2 55.9 65.5 61.4 ...
## $ Writing : num 81.4 73.3 73.2 80.5 77.5 ...
## $ State : chr "In state" "In state" "In state" "Out of state" ...
## $ LiveOnCampus: int 1 1 1 1 1 1 0 0 0 1 ...
## $ HowCommute : int NA NA NA NA NA NA NA 3 3 NA ...
## $ CommuteTime : int NA NA NA NA NA NA NA 26 20 NA ...
## $ SleepTime : int 7 5 2 7 8 9 8 NA 1 4 ...
## $ StudyTime : int 1 2 6 3 3 9 3 3 6 0 ...
sum(is.na(StudentInfo)) #679 total NA
## [1] 679
StudentInfo$Gender <- factor(StudentInfo$Gender, labels = c("Male", "Female"))
StudentInfo$Athlete <- factor(StudentInfo$Athlete, labels = c("Non-Athlete", "Athlete"))
StudentInfo$State[StudentInfo$State == "In state"] <-0
StudentInfo$State[StudentInfo$State == "Out of state"] <-1
StudentInfo$State <- factor(StudentInfo$State, labels = c("In State", "Out of State"))
StudentInfo$Athlete <- factor(StudentInfo$Athlete, labels = c("Non-Athlete", "Athlete"))
StudentInfo$Gender[is.na(StudentInfo$Gender)] #9 NAs in Gender
## [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Levels: Male Female
StudentInfo$Gender[is.na(StudentInfo$Gender)]
## [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Levels: Male Female
StudentInfo$LiveOnCampus <- factor(StudentInfo$LiveOnCampus, labels = c("Off-campus", "On-campus"))
StudentInfo.table <- describe(StudentInfo)
#Split the dataset into two subsets:
#students who are living on-campus and living off-campus
SI.split <- split(StudentInfo,StudentInfo$LiveOnCampus)
Students.OffCamp <- as.data.frame(SI.split[1])
Students.OnCamp <- as.data.frame(SI.split[2])
colnames(Students.OnCamp) <- c('ids','Gender','Athlete','Height','Weight','Sprint','MileMinDur','English','Reading','Math','Writing','State','LiveOnCampus','HowCommute','CommuteTime','SleepTime','StudyTime')
colnames(Students.OffCamp) <- c('ids','Gender','Athlete','Height','Weight','Sprint','MileMinDur','English','Reading','Math','Writing','State','LiveOnCampus','HowCommute','CommuteTime','SleepTime','StudyTime')
#Remove the original ID variable and make new ID variables for each subset you
#just created:
#For off-campus students, start the ID with 'com' and followed by a 4-digit
#random number
#For on-campus students, start with 'res' and follow by a 4-digits random
#number
NewID.On = data.frame(1:178)
NewID.Off = data.frame(1:250)
set.seed(420)
NewID.On <- NewID.On %>%
mutate(ID = paste("col",sample(1111:9999, 178,FALSE), sep = ""))
set.seed(420)
NewID.Off <- NewID.Off %>%
mutate(ID = paste("res",sample(1111:9999, 250,FALSE), sep = ""))
Students.OffCamp$ids <- NewID.Off$ID
Students.OnCamp$ids <- NewID.On$ID
#How many missing observations do the datasets contain?
#And from which column(s)?
NA.TOT.Off = sum(is.na(Students.OffCamp)) #191 NA
NA.TOT.Off.Gen = sum(is.na(Students.OffCamp$Gender)) #4 NA
NA.TOT.Off.Ath = sum(is.na(Students.OffCamp$Athlete))
NA.TOT.Off.He = sum(is.na(Students.OffCamp$Height)) #16 NA
NA.TOT.Off.We = sum(is.na(Students.OffCamp$Weight)) #31 NA
NA.TOT.Off.Spr = sum(is.na(Students.OffCamp$Sprint)) #31 NA
NA.TOT.Off.MM = sum(is.na(Students.OffCamp$MileMinDur))
NA.TOT.Off.Eng = sum(is.na(Students.OffCamp$English)) #18 NA
NA.TOT.Off.Read = sum(is.na(Students.OffCamp$Reading)) #5 NA
NA.TOT.Off.Math = sum(is.na(Students.OffCamp$Math)) #7 NA
NA.TOT.Off.Writ = sum(is.na(Students.OffCamp$Writing))#21NA
NA.TOT.Off.State = sum(is.na(Students.OffCamp$State))
NA.TOT.Off.OnC = sum(is.na(Students.OffCamp$LiveOnCampus))
NA.TOT.Off.HowC = sum(is.na(Students.OffCamp$HowCommute)) #5 NA
NA.TOT.Off.CTime = sum(is.na(Students.OffCamp$CommuteTime)) #5 N
NA.TOT.Off.SleepTime = sum(is.na(Students.OffCamp$SleepTime)) #19 NA
NA.TOT.Off.StudyTime = sum(is.na(Students.OffCamp$StudyTime)) #29 NA
NA.TOT.Off = sum(is.na(Students.OnCamp)) #480 NA
NA.TOT.Off.Gen = sum(is.na(Students.OnCamp$Gender)) #5 NA
NA.TOT.Off.Ath = sum(is.na(Students.OnCamp$Athlete))
NA.TOT.Off.He = sum(is.na(Students.OnCamp$Height)) #11 NA
NA.TOT.Off.We = sum(is.na(Students.OnCamp$Weight)) #27 NA
NA.TOT.Off.Spr = sum(is.na(Students.OnCamp$Sprint)) #28 NA
NA.TOT.Off.MM = sum(is.na(Students.OnCamp$MileMinDur))
NA.TOT.Off.Eng = sum(is.na(Students.OnCamp$English)) #8 NA
NA.TOT.Off.Read = sum(is.na(Students.OnCamp$Reading)) #5 NA
NA.TOT.Off.Math = sum(is.na(Students.OnCamp$Math)) #6 NA
NA.TOT.Off.Writ = sum(is.na(Students.OnCamp$Writing)) #10 NA
NA.TOT.Off.State = sum(is.na(Students.OnCamp$State))
NA.TOT.Off.OnC = sum(is.na(Students.OnCamp$LiveOnCampus))
NA.TOT.Off.HowC = sum(is.na(Students.OnCamp$HowCommute)) #178 NA
NA.TOT.Off.CTime = sum(is.na(Students.OnCamp$CommuteTime)) #178 NA
NA.TOT.Off.SleepTime = sum(is.na(Students.OnCamp$SleepTime)) #12 NA
NA.TOT.Off.StudyTime = sum(is.na(Students.OnCamp$StudyTime)) #12 NA
#Now, replace missing values for all continuous variables
#with group mean (mean from subset).
StudOff.HeightMean = round(mean(Students.OffCamp$Height, na.rm=TRUE),2)
Students.OffCamp$Height[is.na(Students.OffCamp$Height)] <- StudOff.HeightMean
StudOff.WeightMean = round(mean(Students.OffCamp$Weight, na.rm=TRUE),2)
Students.OffCamp$Weight[is.na(Students.OffCamp$Weight)] <- StudOff.WeightMean
StudOff.SprintMean = round(mean(Students.OffCamp$Sprint, na.rm=TRUE),3)
Students.OffCamp$Sprint[is.na(Students.OffCamp$Sprint)] <- StudOff.SprintMean
StudOff.EnglishMean = round(mean(Students.OffCamp$English, na.rm=TRUE),2)
Students.OffCamp$English[is.na(Students.OffCamp$English)] <- StudOff.EnglishMean
StudOff.ReadingMean = round(mean(Students.OffCamp$Reading, na.rm=TRUE),2)
Students.OffCamp$Reading[is.na(Students.OffCamp$Reading)] <- StudOff.ReadingMean
StudOff.MathMean = round(mean(Students.OffCamp$Math, na.rm=TRUE),2)
Students.OffCamp$Math[is.na(Students.OffCamp$Math)] <- StudOff.MathMean
StudOff.WritingMean = round(mean(Students.OffCamp$Writing, na.rm=TRUE),2)
Students.OffCamp$Writing[is.na(Students.OffCamp$Writing)] <- StudOff.WritingMean
StudOff.CommuteMean = round(mean(Students.OffCamp$CommuteTime, na.rm=TRUE),0)
Students.OffCamp$CommuteTime[is.na(Students.OffCamp$CommuteTime)] <- StudOff.CommuteMean
StudOff.SleepMean = round(mean(Students.OffCamp$SleepTime, na.rm=TRUE),0)
Students.OffCamp$SleepTime[is.na(Students.OffCamp$SleepTime)] <- StudOff.SleepMean
StudOff.StudyMean = round(mean(Students.OffCamp$StudyTime, na.rm=TRUE),0)
Students.OffCamp$StudyTime[is.na(Students.OffCamp$StudyTime)] <- StudOff.StudyMean
###
StudOn.HeightMean = round(mean(Students.OnCamp$Height, na.rm=TRUE),2)
Students.OnCamp$Height[is.na(Students.OnCamp$Height)] <- StudOn.HeightMean
StudOn.WeightMean = round(mean(Students.OnCamp$Weight, na.rm=TRUE),2)
Students.OnCamp$Weight[is.na(Students.OnCamp$Weight)] <- StudOn.WeightMean
StudOn.SprintMean = round(mean(Students.OnCamp$Sprint, na.rm=TRUE),3)
Students.OnCamp$Sprint[is.na(Students.OnCamp$Sprint)] <- StudOn.SprintMean
StudOn.EnglishMean = round(mean(Students.OnCamp$English, na.rm=TRUE),2)
Students.OnCamp$English[is.na(Students.OnCamp$English)] <- StudOn.EnglishMean
StudOn.ReadingMean = round(mean(Students.OnCamp$Reading, na.rm=TRUE),2)
Students.OnCamp$Reading[is.na(Students.OnCamp$Reading)] <- StudOn.ReadingMean
StudOn.MathMean = round(mean(Students.OnCamp$Math, na.rm=TRUE),2)
Students.OnCamp$Math[is.na(Students.OnCamp$Math)] <- StudOn.MathMean
StudOn.WritingMean = round(mean(Students.OnCamp$Writing, na.rm=TRUE),2)
Students.OnCamp$Writing[is.na(Students.OnCamp$Writing)] <- StudOn.WritingMean
StudOn.SleepMean = round(mean(Students.OnCamp$SleepTime, na.rm=TRUE),0)
Students.OnCamp$SleepTime[is.na(Students.OnCamp$SleepTime)] <- StudOn.SleepMean
StudOn.StudyMean = round(mean(Students.OnCamp$StudyTime, na.rm=TRUE),0)
Students.OnCamp$StudyTime[is.na(Students.OnCamp$StudyTime)] <- StudOn.StudyMean
#- Make a descriptive statistic summary table for each subset
#and output your result in a MS word. (choose any grouping that makes sense to you).
Students.OnCampus.Table <- describe(Students.OnCamp)
Students.OffCampus.Table <- describe(Students.OffCamp)
export(Students.OnCampus.Table,OnCampus <- tempfile(fileext = ".txt"))
export(Students.OffCampus.Table,OffCampus <- tempfile(fileext = ".txt"))
#Then copy and paste tables from .txt files to a .doc file
beep(1)
#Bind both subsets, then Z-score transform all continuous variables
Students <- rbind(Students.OnCamp,Students.OffCamp)
mHeight<- mean(Students$Height)
sdHeight<- sd(Students$Height)
Students <- Students %>%
mutate(zHeight = round(((Height-mHeight)/sdHeight),2))
mWeight<- mean(Students$Weight)
sdWeight<- sd(Students$Weight)
Students <- Students %>%
mutate(zWeight = round(((Weight-mWeight)/sdWeight),2))
mSprint <- mean(Students$Sprint)
sdSprint <- sd(Students$Sprint)
Students <- Students %>%
mutate(zSprint = round(((Sprint-mSprint)/sdSprint),2))
mEnglish <- mean(Students$English)
sdEnglish <- sd(Students$English)
Students <- Students %>%
mutate(zEnglish = round(((English-mEnglish)/sdEnglish),2))
mReading <- mean(Students$Reading)
sdReading <- sd(Students$Reading)
Students <- Students %>%
mutate(zReading = round(((Reading-mReading)/sdReading),2))
mWriting <- mean(Students$Writing)
sdWriting <- sd(Students$Writing)
Students <- Students %>%
mutate(zWriting = round(((Writing-mWriting)/sdWriting),2))
mMath <- mean(Students$Math)
sdMath <- sd(Students$Math)
Students <- Students %>%
mutate(zMath = round(((Math-mMath)/sdMath),2))
mSleepTime <- mean(Students$SleepTime)
sdSleepTime <- sd(Students$SleepTime)
Students <- Students %>%
mutate(zSleepTime = round(((SleepTime-mSleepTime)/sdSleepTime),2))
mStudyTime <- mean(Students$StudyTime)
sdStudyTime <- sd(Students$StudyTime)
Students <- Students %>%
mutate(zStudyTime = round(((StudyTime-mStudyTime)/sdStudyTime),2))
# and export an SPSS data file.
export(Students,Studentz <- tempfile(fileext = ".sav"))
beep(1)
Please choose the suitable graph(s) to complete the following tasks, and remember to label the x and y axes properly, also add title, sub-title where necessary:
#Make a plot matrix to show the distribution, histogram, and correlation
#for all the continuous variables from each subset.
Students <- Students %>%
mutate(TotScore = Writing + Reading + Math + English)
Students.con <- Students %>% #new df with only continuous variables
select(Height,Weight,Sprint,English,Reading,Writing,Math,SleepTime,StudyTime,TotScore)
panel.cor <- function(x, y, cex.cor = 0.8, method = "pearson", ...) {
options(warn = -1)
usr <- par("usr"); on.exit(par(usr)) # Saves current "usr" and resets on exit
par(usr = c(0, 1, 0, 1)) # Set plot size to 1 x 1
r <- cor(x, y, method = method, use = "pair") # correlation coef
p <- cor.test(x, y, method = method)$p.val # p-value
n <- sum(complete.cases(x, y)) # How many data pairs
txt <- format(r, digits = 3) # Format r-value
txt1 <- format(p, digits = 3) # Format p-value
txt2 <- paste0("r= ", txt, '\n', "p= ", txt1, '\n', 'n= ', n) # Make panel text
text(0.5, 0.5, txt2, cex = cex.cor, ...) # Place panel text
options(warn = 0) # Reset warning
}
pairs(Students.con, #creates a second array with pearson coefficient, etc.
upper.panel = panel.smooth, # add a smooth line
lower.panel = panel.cor,
method = "pearson",
cex.cor = 1.5)
Students.t.aov <- aov(TotScore ~ Height+Weight+Sprint+SleepTime+StudyTime,data = Students.con)
par(mfrow=c(2,2))
plot(Students.t.aov,1) #348,68 and 75 flagged as outliers
plot(Students.t.aov,2) #Very fitted, looks good.
qqPlot(Students.t.aov) # Most all in CI, looks good.
## 68 348
## 42 349
hist(Students.t.aov$residuals) #Looks normal tbh
#Make a plot to show their total placement test score grouped by their
#in-state status and gender.
Students.P2 <- Students %>% #new df with only con variables + gender and state
select(Height,Weight,Sprint,English,Reading,Writing,Math,SleepTime,StudyTime,Gender,State,TotScore)
#Graph with each student result visible
Students.P2 %>%
ggplot(aes(x=State, y=TotScore, color = Gender)) +
geom_point(alpha = 0.6) +
labs(x="State",
y="Total Score",
title = "Total Score by State and Gender") +
theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))
#Make a plot to show the relationship between study time and total placement
#test, group by on-campus status(with a fitted line, formula, and $R^2$).
#res(total~StudyTime)??????
my.formula = TotScore ~ StudyTime
sp <- Students %>%
ggplot(aes(x=StudyTime, y=TotScore, color = LiveOnCampus)) +
geom_point(alpha = 1) +
geom_smooth(method = lm,
alpha = .6,
se = F) +
labs(x="Study Time in Hours",
y="Total Score",
title = "Total Score by Time Spent Studying (for On and Off Campus Students)")+
theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))
sp2 <- sp + geom_text(aes(12,120, label=(paste(expression("On-Campus: y = 310 + 0.27x"^2*", R"^2*" = 0.0022")))),parse = TRUE)
spfinal <- sp2 + geom_text(aes(12,100, label=(paste(expression("Off-Campus: y = 310 + 0.36x"^2*", R"^2*" = 0.0074")))),parse = TRUE)
spfinal
## `geom_smooth()` using formula 'y ~ x'
beep(3) #beeps
Data specification:
The researcher collected data from 76 people who undertook one of three diets (referred to as diet A, B and C), the researcher also recorded participants’ background information.
Please load the diet.csv and complete the following tasks:
diet <- import("PATHWAY/diet.csv")
head(diet)
## id gender age height diet.type initial.weight final.weight
## 1 1 Female 22 159 A 58 54.2
## 2 2 Female 46 192 A 60 54.0
## 3 3 Female 55 170 A 64 63.3
## 4 4 Female 33 171 A 64 61.1
## 5 5 Female 50 170 A 65 62.2
## 6 6 Female 50 201 A 66 64.0
str(diet)
## 'data.frame': 76 obs. of 7 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr "Female" "Female" "Female" "Female" ...
## $ age : int 22 46 55 33 50 50 37 28 28 45 ...
## $ height : int 159 192 170 171 170 201 174 176 165 165 ...
## $ diet.type : chr "A" "A" "A" "A" ...
## $ initial.weight: int 58 60 64 64 65 66 67 69 70 70 ...
## $ final.weight : num 54.2 54 63.3 61.1 62.2 64 65 60.5 68.1 66.9 ...
sum(is.na(diet))
## [1] 0
diet$gender[diet$gender == "Female"] <-0
diet$gender[diet$gender == "Male"] <-1
diet$gender <- factor(diet$gender, labels = c("Female", "Male"))
diet$diet.type[diet$diet.type == "A"] <-0
diet$diet.type[diet$diet.type == "B"] <-1
diet$diet.type[diet$diet.type == "C"] <-2
diet$diet.type <- factor(diet$diet.type, labels = c("A", "B","C"))
diet.wc <- diet %>%
mutate(weight.change = initial.weight - final.weight)
#Graph of mean by group
diet.gdiet <- diet.wc %>%
group_by(diet.type) %>%
summarise(
weight.change = mean(weight.change),
)
ggplot(diet.gdiet, aes(diet.type,weight.change,fill=diet.type))+
geom_bar(stat="identity", position = "dodge") +
scale_fill_manual(breaks = c("A", "B", "C"),
values = c("#DE3533", "#0047AB", "#006644"))+
labs(title="Mean Weight Change by Diet Plan", x="Diet Plan", y="Mean Weight Change (kg)")+
theme(legend.position = "off")
# Test if the proportions of gender is equal
diet %>%
group_by(gender) %>%
summarise(
count = n()
) # there are 43 female participants relative to only 33 male participants
## # A tibble: 2 × 2
## gender count
## <fct> <int>
## 1 Female 43
## 2 Male 33
#Based on the result, whether there is a statistically significant difference
#in mean weight change between gender groups?
diet.wc %>%
group_by(gender) %>%
summarise(
mean = mean(weight.change)
)
## # A tibble: 2 × 2
## gender mean
## <fct> <dbl>
## 1 Female 3.89
## 2 Male 4.02
t.test(weight.change~gender, mu = 0,
alt="two.sided",conf=0.95, paired =F,data=diet.wc) #p id 0.83. No sig dif.
##
## Welch Two Sample t-test
##
## data: weight.change by gender
## t = -0.2091, df = 68.809, p-value = 0.835
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -1.287385 1.043128
## sample estimates:
## mean in group Female mean in group Male
## 3.893023 4.015152
# Normality & variance assumption test
diet.wc %>%
identify_outliers(weight.change)
## [1] id gender age height diet.type
## [6] initial.weight final.weight weight.change is.outlier is.extreme
## <0 rows> (or 0-length row.names)
diet.wc %>% # No outliers
shapiro_test(weight.change) #p is .79 so our assumption of normality is met
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 weight.change 0.990 0.790
diet.wc %>%
levene_test(weight.change~gender) #equal variances met
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 1 74 0.202 0.654
ggqqplot(diet.wc, x= "weight.change", facet.by = "gender")
#Whether there is a statistically significant differences in weight change
#with respect to their diet plan?
diet.wc %>%
shapiro_test(weight.change) #p is .79 so our assumption of normality is met
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 weight.change 0.990 0.790
diet.wc %>%
levene_test(weight.change~diet.type) #equal variances met
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 2 73 0.463 0.631
ggqqplot(diet.wc, x= "weight.change", facet.by = "diet.type")
diet.aov <- aov(weight.change ~ diet.type,data = diet.wc)
summary(diet.aov) #P smaller than 0.01, very significant difference
## Df Sum Sq Mean Sq F value Pr(>F)
## diet.type 2 60.5 30.264 5.383 0.0066 **
## Residuals 73 410.4 5.622
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
levene_test(diet.aov)
## # A tibble: 1 × 4
## df1 df2 statistic p
## <int> <int> <dbl> <dbl>
## 1 2 73 0.463 0.631
#p is larger than 0.05; Homogenity is met
par(mfrow=c(2,2))
plot(diet.aov,1)
plot(diet.aov,2) #Very fitted, looks good.
qqPlot(diet.aov) # All in CI, looks good.
## [1] 15 48
hist(diet.aov$residuals)
#Conduct a post-hoc comparison, report the result
TukeyHSD(diet.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = weight.change ~ diet.type, data = diet.wc)
##
## $diet.type
## diff lwr upr p adj
## B-A -0.032000 -1.6530850 1.589085 0.9987711
## C-A 1.848148 0.2567422 3.439554 0.0188047
## C-B 1.880148 0.3056826 3.454614 0.0152020
#Tukey's probably isn't appropriate here due to small sample size
#C seems to have a significant difference in weight change relative to A and B
#and visualize the post-hoc analysis with your interpretation.
ggline(diet.wc,"diet.type","weight.change",
add=c("mean_se","jitter"),
ylab="Weight Change",
main="Mean Weight Change by Treatment Condition")
#It appears as thought diet C causes a statistically significant weight gain
#in patients relative to diet A and diet C.
beep(5)
Data Specification:
A researcher has a dataset that consists of points of students including their study time & number of courses. And the researcher wish to build a regression model to capture all the patterns in the dataset while predicting the students’ final points.
Please load the Math.csv and complete the following tasks:
Or set row to NULL Or try new.df <- df[-rownumber,] where # is row number
Setup a model, does study time significantly predict students’ math learning outcome? Report the result with an APA style table. Formula with [output~input] Correlation coefficient, R^2 value
Generate some diagnostic plots of your fitted model.
Y = B1x1 +B2x2 lm to fit the model, determine the Betas (coefficients?) model.math = lm(Output ~input, data =“df”)
Math <- import("PATHWAYS/Math.csv")
Math.table <- describe(Math)
#Output a descriptive statistics table in a Word document to describe the data.
export(Math.table,MathTable <- tempfile(fileext = ".txt"))
#Check the distribution of target variables
Math %>%
shapiro_test(number_courses) #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 number_courses 0.883 0.000000243
Math %>%
shapiro_test(time_study) #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 time_study 0.950 0.000883
Math %>%
shapiro_test(Marks) #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 Marks 0.914 0.00000708
Math.m <- Math %>%
mutate(MeanMath = Marks/number_courses) #creates a "MeanMath" which is marks per course
par(mfrow=c(2,2)) #distribution of variables
hist(Math.m$number_courses,xlab="Number of Courses",main=paste("Frequency of Courses Taken"))
hist(Math.m$time_study,xlab="Time Studying (hrs)",main=paste("Frequency of Time Spent Studying"))
hist(Math.m$Marks,xlab="Total Marks",main=paste("Frequency of Marks"))
hist(Math.m$MeanMath,xlab="Average Score",main=paste("Frequency of Mean Marks"))
Math.m %>%
shapiro_test(MeanMath) #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
## variable statistic p
## <chr> <dbl> <dbl>
## 1 MeanMath 0.879 0.000000165
#and visualize the mean math score as per the number of courses taken.
ggline(Math.m,"number_courses","MeanMath",
add=c("mean_se","jitter"),
xlab="Number of Courses Taken",
ylab="Mean Score",
main="Mean Math Scores by Courses Taken")
#Make some plots to help this researcher understand the relationships
#between all variables.
pairs(Math.m, #creates a second array with pearson coefficient, etc.
upper.panel = panel.smooth, # add a smooth line
lower.panel = panel.cor,
method = "pearson",
cex.cor = 1.5)
#Did you see any outliers? if so remove them from the dataset.
Math.reg1 <- lm(MeanMath ~number_courses, data = Math.m)
summary(Math.reg1)
##
## Call:
## lm(formula = MeanMath ~ number_courses, data = Math.m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6586 -2.3561 -0.8911 1.6500 9.1311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.4734 0.8848 7.316 7.07e-11 ***
## number_courses -0.3151 0.1584 -1.989 0.0495 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.837 on 98 degrees of freedom
## Multiple R-squared: 0.03879, Adjusted R-squared: 0.02898
## F-statistic: 3.955 on 1 and 98 DF, p-value: 0.04953
math.aov <- aov(formula = MeanMath ~ time_study, data = Math.m)
outlierTest(Math.reg1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 36 3.437488 0.00086635 0.086635
par(mfrow=c(2,2))
plot(math.aov,1) #36,4 and 14 flagged as outliers
plot(math.aov,2) #Very fitted, looks good.
qqPlot(math.aov) # Most all in CI, looks good.
## [1] 11 36
hist(math.aov$residuals) #Hm.
Math.m %>%
identify_outliers(MeanMath) #Outliers are NOT extreme. 11 and 36 flagged
## number_courses time_study Marks MeanMath is.outlier is.extreme
## 1 3 7.353 42.036 14.01200 TRUE FALSE
## 2 3 7.543 43.978 14.65933 TRUE FALSE
#Creating a new df with outliers removed
Math.m1 <- Math.m[-36,]
Math.m1 <- Math.m1[-11,]
math.aov.1 <- aov(formula = MeanMath ~ time_study, data = Math.m1)
summary(math.aov.1) #p statistically significant at the 0.001 level
## Df Sum Sq Mean Sq F value Pr(>F)
## time_study 1 476.6 476.6 289 <2e-16 ***
## Residuals 96 158.3 1.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Setup a model, does study time significantly predict students' math learning outcome?
#Report the result with an APA style table.
math.aov.1
## Call:
## aov(formula = MeanMath ~ time_study, data = Math.m1)
##
## Terms:
## time_study Residuals
## Sum of Squares 476.5901 158.3177
## Deg. of Freedom 1 96
##
## Residual standard error: 1.28419
## Estimated effects may be unbalanced
Math.reg2 <- lm(MeanMath ~time_study, data = Math.m1) #Regression model (linear)
summary(Math.reg2) #Regression model (linear)
##
## Call:
## lm(formula = MeanMath ~ time_study, data = Math.m1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7577 -0.9503 -0.2074 0.5056 3.9758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.82636 0.25773 3.206 0.00183 **
## time_study 0.94451 0.05556 17.000 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.284 on 96 degrees of freedom
## Multiple R-squared: 0.7506, Adjusted R-squared: 0.748
## F-statistic: 289 on 1 and 96 DF, p-value: < 2.2e-16
Math.m1 %>%
ggplot(aes(x=time_study, y=MeanMath)) +
geom_point(alpha = 1) +
geom_smooth(method = lm,
alpha = .6,
se = F) +
labs(x="Study Time in Hours",
y="Total Score",
title = "Total Score by Time Spent Studying")+
theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))
## `geom_smooth()` using formula 'y ~ x'
Math.APA <- Math.m1
Math.APA$time_study <- round(Math.APA$time_study,0)
apaTables::apa.1way.table(iv=time_study,
dv=MeanMath,
data=Math.APA,
file="MathScoresbyStudy.doc")
##
##
## Descriptive statistics for MeanMath as a function of time_study.
##
## time_study M SD
## 0 1.81 0.08
## 1 2.06 0.14
## 2 2.41 0.29
## 3 3.05 0.45
## 4 3.88 0.87
## 5 6.05 1.01
## 6 7.19 2.36
## 7 7.99 1.93
## 8 7.65 0.89
##
## Note. M and SD represent mean and standard deviation, respectively.
##
#Generate some diagnostic plots of your fitted model.
ggplot(Math.m1, aes(x = time_study, y = MeanMath))+
geom_point() +
geom_smooth(method = lm,
alpha = .6,
se = F) +
labs( x = "Wind Speed MPH",
y = "Ozone Concentration BBP",
title = "Relationship Between Wind Speed and Ozone Level")+
stat_regline_equation(label.x = 5, label.y = 20, aes(label = ..eq.label..)) + # show the lm formula
stat_regline_equation(label.x = 5, label.y = 15, aes(label = ..rr.label..)) +# show the R^2
theme_linedraw()
## `geom_smooth()` using formula 'y ~ x'
#Formula with [output~input] Correlation coefficient, R^2 value
outlierTest(Math.reg2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 100 3.283435 0.0014362 0.14074
# makes a qq plot of residuals
qqPlot(Math.reg2)
## 59 100
## 57 98
# calculates influence stats for every observation
math.inf <- influence.measures(Math.reg2)
summary(math.inf)
## Potentially influential observations of
## lm(formula = MeanMath ~ time_study, data = Math.m1) :
##
## dfb.1_ dfb.tm_s dffit cov.r cook.d hat
## 7 -0.07 0.25 0.38 0.89_* 0.07 0.02
## 10 -0.09 0.28 0.41 0.87_* 0.08 0.02
## 49 -0.02 0.16 0.28 0.93_* 0.04 0.02
## 59 -0.10 0.29 0.43 0.86_* 0.08 0.02
## 83 -0.20 0.39 0.47_* 0.92_* 0.11 0.03
## 100 -0.12 0.33 0.47_* 0.84_* 0.10 0.02
# check for non-linearity
crPlots(Math.reg2)
# non-constant variance test
ncvTest(Math.reg2)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 24.07483, Df = 1, p = 9.2664e-07
# variable-level information useful for analysis.*
augment(Math.reg2)
## # A tibble: 98 × 9
## .rownames MeanMath time_study .fitted .resid .hat .sigma .cooksd
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6.40 4.51 5.08 1.32 0.0107 1.28 0.00573
## 2 2 1.93 0.096 0.917 1.02 0.0389 1.29 0.0132
## 3 3 3.45 3.13 3.79 -0.333 0.0116 1.29 0.000400
## 4 4 8.84 7.91 8.30 0.540 0.0387 1.29 0.00370
## 5 5 6.91 7.81 8.20 -1.29 0.0373 1.28 0.0203
## 6 6 2.97 3.21 3.86 -0.889 0.0114 1.29 0.00279
## 7 7 9.96 6.06 6.55 3.41 0.0181 1.24 0.0662
## 8 8 3.45 3.41 4.05 -0.597 0.0109 1.29 0.00120
## 9 9 5.09 4.41 4.99 0.0953 0.0105 1.29 0.0000296
## 10 10 10.3 6.17 6.66 3.63 0.0190 1.23 0.0788
## # … with 88 more rows, and 1 more variable: .std.resid <dbl>
# fitted values (.fitted), residuals (.resid),
# Cook’s distance (.cooksd) -a measure of outliers
# Let's take a close look
dig.M <- augment(Math.reg2)
ggplot(dig.M, aes(x = time_study, y = MeanMath))+
geom_point() +
geom_smooth(method = lm,
alpha = .6,
se = F) +
geom_segment(aes(xend = time_study, yend = .fitted), color = "red", size = 0.3)+
labs( x = "Time Spent Studying (Hours)",
y = "Mean Score in Mathematics Courses",
title = "Relation Between Time Studying and Math Marks (with Residuals)")
## `geom_smooth()` using formula 'y ~ x'
# the plot shows the residual errors between observed values and the fitted regression line,.
beep(8)
#Thank you for everything! Sorry I didn't have time to try the bonus activities!
#Hopefully I will be able to later this week or when I have a break!