Juan Madrigal

Please see my analysis and response to research questions in the code below.

library(knitr) # Knit your markdown to Word,Pdf and etc. 
library(beepr)
library(rio) # Swiss-army knife for data I/O
library(DescTools) # A collection of functions for efficiently describing data
library(ggpubr) # facilitates the creation of beautiful ggplot2-based graphs 
library(car) # Some useful function for learning R 
library(stargazer)
library(psych)
library(tidyverse) # tidyverse
library(rstatix) # simple and intuitive pipe-friendly framework
library(janitor) # simplified data cleaning tool 
library(corrr) # correlation ish in R
library(summarytools) # for reporting your results
library(ggstatsplot) # save your time on box plot here 
library(table1)
library(afex)

Task 1 Data Manipulation and visualization-10pts

Task 1A

Data specification:

The dataset StudentInfo.csv contains survey results from 431 students enrolled at a university in the United States:

Gender: 0=Male 1=Female| Athlete : 0 = Non-athlete 1 = Athlete| Weight: weight in lbs| Height: height in inches| Sprint: 35-meter sprint time (seconds)| MileMinDur : Mile run time (hh:mm:ss)| English:Writing : Scores on placement test| State: Is student in-state or out-of-state resident?| LiveOnCampus: 0 = Off-campus 1 = On-campus| HowCommute: 1=Walk, 2 = Bike, 3 = Car, 4 = Public transit, 5 = Other| CommuteTime: Time in Mins per day SleepTime : Time in hours per week StudyTime : Time in hours per week

  • Load the dataset. Investigate the descriptive statistics of all the variables.
StudentInfo <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/EPSY594 Final Exam Package/StudentInfo.csv")
head(StudentInfo)
##     ids Gender Athlete Height Weight Sprint MileMinDur English Reading  Math
## 1 43783      0       0  72.35     NA  7.978              88.24   81.50 60.02
## 2 20278      0       0  70.66 179.20  8.004    0:06:21   89.45   85.25 70.19
## 3 22820      1       0     NA 198.34  8.473    0:12:44   74.06   88.68 55.89
## 4 24559      1       1  67.43 128.17     NA    0:06:25   82.61   77.30 65.52
## 5 28980      0       1  68.45 171.61  4.650    0:07:24   70.10      NA 61.40
## 6 33312      0       1  68.56 163.96  4.750    0:05:47   78.98   87.53 76.71
##   Writing        State LiveOnCampus HowCommute CommuteTime SleepTime StudyTime
## 1   81.44     In state            1         NA          NA         7         1
## 2   73.27     In state            1         NA          NA         5         2
## 3   73.16     In state            1         NA          NA         2         6
## 4   80.45 Out of state            1         NA          NA         7         3
## 5   77.48     In state            1         NA          NA         8         3
## 6   70.79     In state            1         NA          NA         9         9
str(StudentInfo)
## 'data.frame':    430 obs. of  17 variables:
##  $ ids         : int  43783 20278 22820 24559 28980 33312 40274 40390 29045 35676 ...
##  $ Gender      : int  0 0 1 1 0 0 0 1 0 1 ...
##  $ Athlete     : int  0 0 0 1 1 1 1 1 0 1 ...
##  $ Height      : num  72.3 70.7 NA 67.4 68.5 ...
##  $ Weight      : num  NA 179 198 128 172 ...
##  $ Sprint      : num  7.98 8 8.47 NA 4.65 ...
##  $ MileMinDur  : chr  "" "0:06:21" "0:12:44" "0:06:25" ...
##  $ English     : num  88.2 89.5 74.1 82.6 70.1 ...
##  $ Reading     : num  81.5 85.2 88.7 77.3 NA ...
##  $ Math        : num  60 70.2 55.9 65.5 61.4 ...
##  $ Writing     : num  81.4 73.3 73.2 80.5 77.5 ...
##  $ State       : chr  "In state" "In state" "In state" "Out of state" ...
##  $ LiveOnCampus: int  1 1 1 1 1 1 0 0 0 1 ...
##  $ HowCommute  : int  NA NA NA NA NA NA NA 3 3 NA ...
##  $ CommuteTime : int  NA NA NA NA NA NA NA 26 20 NA ...
##  $ SleepTime   : int  7 5 2 7 8 9 8 NA 1 4 ...
##  $ StudyTime   : int  1 2 6 3 3 9 3 3 6 0 ...
sum(is.na(StudentInfo)) #679 total NA
## [1] 679
StudentInfo$Gender <- factor(StudentInfo$Gender, labels = c("Male", "Female"))
StudentInfo$Athlete <- factor(StudentInfo$Athlete, labels = c("Non-Athlete", "Athlete"))
StudentInfo$State[StudentInfo$State == "In state"] <-0
StudentInfo$State[StudentInfo$State == "Out of state"] <-1
StudentInfo$State <- factor(StudentInfo$State, labels = c("In State", "Out of State"))
StudentInfo$Athlete <- factor(StudentInfo$Athlete, labels = c("Non-Athlete", "Athlete"))
StudentInfo$Gender[is.na(StudentInfo$Gender)] #9 NAs in Gender
## [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Levels: Male Female
StudentInfo$Gender[is.na(StudentInfo$Gender)] 
## [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Levels: Male Female
StudentInfo$LiveOnCampus <- factor(StudentInfo$LiveOnCampus, labels = c("Off-campus", "On-campus"))
StudentInfo.table <- describe(StudentInfo)

#Split the dataset into two subsets:
#students who are living on-campus and living off-campus 

SI.split <- split(StudentInfo,StudentInfo$LiveOnCampus)
  
Students.OffCamp <- as.data.frame(SI.split[1])
Students.OnCamp <- as.data.frame(SI.split[2])

colnames(Students.OnCamp) <- c('ids','Gender','Athlete','Height','Weight','Sprint','MileMinDur','English','Reading','Math','Writing','State','LiveOnCampus','HowCommute','CommuteTime','SleepTime','StudyTime')

colnames(Students.OffCamp) <- c('ids','Gender','Athlete','Height','Weight','Sprint','MileMinDur','English','Reading','Math','Writing','State','LiveOnCampus','HowCommute','CommuteTime','SleepTime','StudyTime')

#Remove the original ID variable and make new ID variables for each subset you 
  #just created: 
     #For off-campus students, start the ID with 'com' and followed by a 4-digit 
        #random number 
     #For on-campus students, start with 'res' and follow by a 4-digits random
        #number

NewID.On = data.frame(1:178)
NewID.Off = data.frame(1:250)

set.seed(420)
NewID.On <- NewID.On %>% 
  mutate(ID = paste("col",sample(1111:9999, 178,FALSE), sep = ""))

set.seed(420)
NewID.Off <- NewID.Off %>% 
    mutate(ID = paste("res",sample(1111:9999, 250,FALSE), sep = ""))

Students.OffCamp$ids <- NewID.Off$ID
Students.OnCamp$ids <-  NewID.On$ID

#How many missing observations do the datasets contain?
#And from which column(s)? 
NA.TOT.Off = sum(is.na(Students.OffCamp)) #191 NA
NA.TOT.Off.Gen = sum(is.na(Students.OffCamp$Gender)) #4 NA
NA.TOT.Off.Ath = sum(is.na(Students.OffCamp$Athlete))
NA.TOT.Off.He = sum(is.na(Students.OffCamp$Height)) #16 NA
NA.TOT.Off.We = sum(is.na(Students.OffCamp$Weight)) #31 NA
NA.TOT.Off.Spr = sum(is.na(Students.OffCamp$Sprint)) #31 NA
NA.TOT.Off.MM = sum(is.na(Students.OffCamp$MileMinDur))
NA.TOT.Off.Eng = sum(is.na(Students.OffCamp$English)) #18 NA
NA.TOT.Off.Read = sum(is.na(Students.OffCamp$Reading)) #5 NA
NA.TOT.Off.Math = sum(is.na(Students.OffCamp$Math)) #7 NA
NA.TOT.Off.Writ = sum(is.na(Students.OffCamp$Writing))#21NA
NA.TOT.Off.State = sum(is.na(Students.OffCamp$State))
NA.TOT.Off.OnC = sum(is.na(Students.OffCamp$LiveOnCampus))
NA.TOT.Off.HowC = sum(is.na(Students.OffCamp$HowCommute)) #5 NA
NA.TOT.Off.CTime = sum(is.na(Students.OffCamp$CommuteTime)) #5 N
NA.TOT.Off.SleepTime = sum(is.na(Students.OffCamp$SleepTime)) #19 NA
NA.TOT.Off.StudyTime = sum(is.na(Students.OffCamp$StudyTime)) #29 NA


NA.TOT.Off  = sum(is.na(Students.OnCamp)) #480 NA
NA.TOT.Off.Gen = sum(is.na(Students.OnCamp$Gender)) #5 NA
NA.TOT.Off.Ath = sum(is.na(Students.OnCamp$Athlete))
NA.TOT.Off.He = sum(is.na(Students.OnCamp$Height)) #11 NA
NA.TOT.Off.We = sum(is.na(Students.OnCamp$Weight)) #27 NA
NA.TOT.Off.Spr = sum(is.na(Students.OnCamp$Sprint)) #28 NA
NA.TOT.Off.MM = sum(is.na(Students.OnCamp$MileMinDur))
NA.TOT.Off.Eng = sum(is.na(Students.OnCamp$English)) #8 NA
NA.TOT.Off.Read = sum(is.na(Students.OnCamp$Reading)) #5 NA
NA.TOT.Off.Math = sum(is.na(Students.OnCamp$Math)) #6 NA
NA.TOT.Off.Writ = sum(is.na(Students.OnCamp$Writing)) #10 NA
NA.TOT.Off.State = sum(is.na(Students.OnCamp$State))
NA.TOT.Off.OnC = sum(is.na(Students.OnCamp$LiveOnCampus))
NA.TOT.Off.HowC = sum(is.na(Students.OnCamp$HowCommute)) #178 NA
NA.TOT.Off.CTime = sum(is.na(Students.OnCamp$CommuteTime)) #178 NA
NA.TOT.Off.SleepTime = sum(is.na(Students.OnCamp$SleepTime)) #12 NA
NA.TOT.Off.StudyTime = sum(is.na(Students.OnCamp$StudyTime)) #12 NA

#Now, replace missing values for all continuous variables
#with group mean (mean from subset).

StudOff.HeightMean = round(mean(Students.OffCamp$Height, na.rm=TRUE),2)
Students.OffCamp$Height[is.na(Students.OffCamp$Height)] <- StudOff.HeightMean

StudOff.WeightMean = round(mean(Students.OffCamp$Weight, na.rm=TRUE),2)
Students.OffCamp$Weight[is.na(Students.OffCamp$Weight)] <- StudOff.WeightMean

StudOff.SprintMean = round(mean(Students.OffCamp$Sprint, na.rm=TRUE),3)
Students.OffCamp$Sprint[is.na(Students.OffCamp$Sprint)] <- StudOff.SprintMean

StudOff.EnglishMean = round(mean(Students.OffCamp$English, na.rm=TRUE),2)
Students.OffCamp$English[is.na(Students.OffCamp$English)] <- StudOff.EnglishMean

StudOff.ReadingMean = round(mean(Students.OffCamp$Reading, na.rm=TRUE),2)
Students.OffCamp$Reading[is.na(Students.OffCamp$Reading)] <- StudOff.ReadingMean

StudOff.MathMean = round(mean(Students.OffCamp$Math, na.rm=TRUE),2)
Students.OffCamp$Math[is.na(Students.OffCamp$Math)] <- StudOff.MathMean

StudOff.WritingMean = round(mean(Students.OffCamp$Writing, na.rm=TRUE),2)
Students.OffCamp$Writing[is.na(Students.OffCamp$Writing)] <- StudOff.WritingMean

StudOff.CommuteMean = round(mean(Students.OffCamp$CommuteTime, na.rm=TRUE),0)
Students.OffCamp$CommuteTime[is.na(Students.OffCamp$CommuteTime)] <- StudOff.CommuteMean

StudOff.SleepMean = round(mean(Students.OffCamp$SleepTime, na.rm=TRUE),0)
Students.OffCamp$SleepTime[is.na(Students.OffCamp$SleepTime)] <- StudOff.SleepMean

StudOff.StudyMean = round(mean(Students.OffCamp$StudyTime, na.rm=TRUE),0)
Students.OffCamp$StudyTime[is.na(Students.OffCamp$StudyTime)] <- StudOff.StudyMean

###

StudOn.HeightMean = round(mean(Students.OnCamp$Height, na.rm=TRUE),2)
Students.OnCamp$Height[is.na(Students.OnCamp$Height)] <- StudOn.HeightMean

StudOn.WeightMean = round(mean(Students.OnCamp$Weight, na.rm=TRUE),2)
Students.OnCamp$Weight[is.na(Students.OnCamp$Weight)] <- StudOn.WeightMean

StudOn.SprintMean = round(mean(Students.OnCamp$Sprint, na.rm=TRUE),3)
Students.OnCamp$Sprint[is.na(Students.OnCamp$Sprint)] <- StudOn.SprintMean

StudOn.EnglishMean = round(mean(Students.OnCamp$English, na.rm=TRUE),2)
Students.OnCamp$English[is.na(Students.OnCamp$English)] <- StudOn.EnglishMean

StudOn.ReadingMean = round(mean(Students.OnCamp$Reading, na.rm=TRUE),2)
Students.OnCamp$Reading[is.na(Students.OnCamp$Reading)] <- StudOn.ReadingMean

StudOn.MathMean = round(mean(Students.OnCamp$Math, na.rm=TRUE),2)
Students.OnCamp$Math[is.na(Students.OnCamp$Math)] <- StudOn.MathMean

StudOn.WritingMean = round(mean(Students.OnCamp$Writing, na.rm=TRUE),2)
Students.OnCamp$Writing[is.na(Students.OnCamp$Writing)] <- StudOn.WritingMean

StudOn.SleepMean = round(mean(Students.OnCamp$SleepTime, na.rm=TRUE),0)
Students.OnCamp$SleepTime[is.na(Students.OnCamp$SleepTime)] <- StudOn.SleepMean

StudOn.StudyMean = round(mean(Students.OnCamp$StudyTime, na.rm=TRUE),0)
Students.OnCamp$StudyTime[is.na(Students.OnCamp$StudyTime)] <- StudOn.StudyMean


#- Make a descriptive statistic summary table for each subset
#and output your result in a MS word. (choose any grouping that makes sense to you).
Students.OnCampus.Table <- describe(Students.OnCamp)
Students.OffCampus.Table <- describe(Students.OffCamp)


export(Students.OnCampus.Table,OnCampus <- tempfile(fileext = ".txt")) 
export(Students.OffCampus.Table,OffCampus <- tempfile(fileext = ".txt")) 
#Then copy and paste tables from .txt files to a .doc file
beep(1)


#Bind both subsets, then Z-score transform all continuous variables

Students <- rbind(Students.OnCamp,Students.OffCamp)

mHeight<- mean(Students$Height)
sdHeight<- sd(Students$Height)
Students <- Students %>% 
  mutate(zHeight = round(((Height-mHeight)/sdHeight),2))

mWeight<- mean(Students$Weight)
sdWeight<- sd(Students$Weight)
Students <- Students %>% 
  mutate(zWeight = round(((Weight-mWeight)/sdWeight),2))

mSprint <- mean(Students$Sprint)
sdSprint <- sd(Students$Sprint)
Students <- Students %>% 
  mutate(zSprint = round(((Sprint-mSprint)/sdSprint),2))

mEnglish <- mean(Students$English)
sdEnglish <- sd(Students$English)
Students <- Students %>% 
  mutate(zEnglish = round(((English-mEnglish)/sdEnglish),2))

mReading <- mean(Students$Reading)
sdReading <- sd(Students$Reading)
Students <- Students %>% 
  mutate(zReading = round(((Reading-mReading)/sdReading),2))

mWriting <- mean(Students$Writing)
sdWriting <- sd(Students$Writing)
Students <- Students %>% 
  mutate(zWriting = round(((Writing-mWriting)/sdWriting),2))

mMath <- mean(Students$Math)
sdMath <- sd(Students$Math)
Students <- Students %>% 
  mutate(zMath = round(((Math-mMath)/sdMath),2))

mSleepTime <- mean(Students$SleepTime)
sdSleepTime <- sd(Students$SleepTime)
Students <- Students %>% 
  mutate(zSleepTime = round(((SleepTime-mSleepTime)/sdSleepTime),2))

mStudyTime <- mean(Students$StudyTime)
sdStudyTime <- sd(Students$StudyTime)
Students <- Students %>% 
  mutate(zStudyTime = round(((StudyTime-mStudyTime)/sdStudyTime),2))


# and export an SPSS data file. 

export(Students,Studentz <- tempfile(fileext = ".sav"))
beep(1)

Task 1B

Please choose the suitable graph(s) to complete the following tasks, and remember to label the x and y axes properly, also add title, sub-title where necessary:

  • Make a plot matrix to show the distribution, histogram, and correlation for all the continuous variables from each subset.
  • Make a plot to show their total placement test score grouped by their in-state status and gender.
  • Make a plot to show the relationship between study time and total placement test, group by on-campus status(with a fitted line, formula, and \(R^2\)). #res(total~StudyTime)??????
  • *Combine all the plots you made from Task 1B, try your best to tweak it “Aesthetically pleasing”.
#Make a plot matrix to show the distribution, histogram, and correlation
#for all the continuous variables from each subset.
Students <- Students %>% 
  mutate(TotScore = Writing + Reading + Math + English)

Students.con <- Students %>%  #new df with only continuous variables
  select(Height,Weight,Sprint,English,Reading,Writing,Math,SleepTime,StudyTime,TotScore) 

panel.cor <- function(x, y, cex.cor = 0.8, method = "pearson", ...) { 
options(warn = -1)                   
usr <- par("usr"); on.exit(par(usr)) # Saves current "usr" and resets on exit
par(usr = c(0, 1, 0, 1))             # Set plot size to 1 x 1
r <- cor(x, y, method = method, use = "pair")               # correlation coef
p <- cor.test(x, y, method = method)$p.val                  # p-value
n <- sum(complete.cases(x, y))                              # How many data pairs
txt <- format(r, digits = 3)                                # Format r-value
txt1 <- format(p, digits = 3)                                 # Format p-value
txt2 <- paste0("r= ", txt, '\n', "p= ", txt1, '\n', 'n= ', n) # Make panel text
text(0.5, 0.5, txt2, cex = cex.cor, ...)                      # Place panel text
options(warn = 0)                                             # Reset warning
}

pairs(Students.con, #creates a second array with pearson coefficient, etc.
      upper.panel = panel.smooth, # add a smooth line 
      lower.panel = panel.cor,
      method = "pearson",
      cex.cor = 1.5)

Students.t.aov <- aov(TotScore ~ Height+Weight+Sprint+SleepTime+StudyTime,data = Students.con)

par(mfrow=c(2,2))
  plot(Students.t.aov,1) #348,68 and 75 flagged as outliers
  plot(Students.t.aov,2) #Very fitted, looks good.
  qqPlot(Students.t.aov) # Most all in CI, looks good.
##  68 348 
##  42 349
  hist(Students.t.aov$residuals) #Looks normal tbh

#Make a plot to show their total placement test score grouped by their 
  #in-state status and gender.
Students.P2 <- Students %>%  #new df with only con variables + gender and state
  select(Height,Weight,Sprint,English,Reading,Writing,Math,SleepTime,StudyTime,Gender,State,TotScore) 


#Graph with each student result visible
Students.P2 %>%
  ggplot(aes(x=State, y=TotScore, color = Gender)) + 
  geom_point(alpha = 0.6) +
  labs(x="State", 
       y="Total Score", 
       title = "Total Score by State and Gender") +
 theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))

#Make a plot to show the relationship between study time and total placement 
#test, group by on-campus status(with a fitted line, formula, and $R^2$). 
#res(total~StudyTime)??????



my.formula = TotScore ~ StudyTime

sp <- Students %>%
  ggplot(aes(x=StudyTime, y=TotScore, color = LiveOnCampus)) + 
  geom_point(alpha = 1) +
  geom_smooth(method = lm,
              alpha = .6,
              se = F) +
  labs(x="Study Time in Hours", 
       y="Total Score", 
       title = "Total Score by Time Spent Studying (for On and Off Campus Students)")+
 theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))




sp2 <- sp + geom_text(aes(12,120, label=(paste(expression("On-Campus: y = 310 + 0.27x"^2*", R"^2*" = 0.0022")))),parse = TRUE)
spfinal <- sp2 + geom_text(aes(12,100, label=(paste(expression("Off-Campus: y = 310 + 0.36x"^2*", R"^2*" = 0.0074")))),parse = TRUE)

spfinal
## `geom_smooth()` using formula 'y ~ x'

beep(3) #beeps

Task 2 Data Analysis

Task 2A

Data specification:

The researcher collected data from 76 people who undertook one of three diets (referred to as diet A, B and C), the researcher also recorded participants’ background information.

Please load the diet.csv and complete the following tasks:

  • Plot out the mean weight change from different diet plans.
  • Did the researcher collect an equal proportion of both genders of participants?
  • Based on the result, whether there is a statistically significant difference in mean weight change between gender groups?
  • Whether there is a statistically significant differences in weight change with respect to their diet plan? -*Also check the assumptions before you conduct the analysis. -Conduct a post-hoc comparison, report the result and visualize the post-hoc analysis with your interpretation.
diet <- import("PATHWAY/diet.csv")
head(diet)
##   id gender age height diet.type initial.weight final.weight
## 1  1 Female  22    159         A             58         54.2
## 2  2 Female  46    192         A             60         54.0
## 3  3 Female  55    170         A             64         63.3
## 4  4 Female  33    171         A             64         61.1
## 5  5 Female  50    170         A             65         62.2
## 6  6 Female  50    201         A             66         64.0
str(diet)
## 'data.frame':    76 obs. of  7 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender        : chr  "Female" "Female" "Female" "Female" ...
##  $ age           : int  22 46 55 33 50 50 37 28 28 45 ...
##  $ height        : int  159 192 170 171 170 201 174 176 165 165 ...
##  $ diet.type     : chr  "A" "A" "A" "A" ...
##  $ initial.weight: int  58 60 64 64 65 66 67 69 70 70 ...
##  $ final.weight  : num  54.2 54 63.3 61.1 62.2 64 65 60.5 68.1 66.9 ...
sum(is.na(diet))
## [1] 0
diet$gender[diet$gender == "Female"] <-0
diet$gender[diet$gender == "Male"] <-1
diet$gender <- factor(diet$gender, labels = c("Female", "Male"))

diet$diet.type[diet$diet.type == "A"] <-0
diet$diet.type[diet$diet.type == "B"] <-1
diet$diet.type[diet$diet.type == "C"] <-2
diet$diet.type <- factor(diet$diet.type, labels = c("A", "B","C"))

diet.wc <- diet %>% 
  mutate(weight.change = initial.weight - final.weight)


#Graph of mean by group
diet.gdiet <- diet.wc %>% 
        group_by(diet.type) %>% 
        summarise(
          weight.change = mean(weight.change),
        )


ggplot(diet.gdiet, aes(diet.type,weight.change,fill=diet.type))+
geom_bar(stat="identity", position = "dodge") +
  scale_fill_manual(breaks = c("A", "B", "C"),
                    values = c("#DE3533", "#0047AB", "#006644"))+
  labs(title="Mean Weight Change by Diet Plan", x="Diet Plan", y="Mean Weight Change (kg)")+
  theme(legend.position = "off")

# Test if the proportions of gender is equal
diet  %>% 
  group_by(gender) %>% 
    summarise(
      count = n()
    ) # there are 43 female participants relative to only 33 male participants
## # A tibble: 2 × 2
##   gender count
##   <fct>  <int>
## 1 Female    43
## 2 Male      33
#Based on the result, whether there is a statistically significant difference
#in mean weight change between gender groups? 
diet.wc  %>% 
  group_by(gender) %>% 
    summarise(
      mean = mean(weight.change)
    ) 
## # A tibble: 2 × 2
##   gender  mean
##   <fct>  <dbl>
## 1 Female  3.89
## 2 Male    4.02
t.test(weight.change~gender, mu = 0,
       alt="two.sided",conf=0.95, paired =F,data=diet.wc) #p id 0.83. No sig dif.
## 
##  Welch Two Sample t-test
## 
## data:  weight.change by gender
## t = -0.2091, df = 68.809, p-value = 0.835
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -1.287385  1.043128
## sample estimates:
## mean in group Female   mean in group Male 
##             3.893023             4.015152
# Normality & variance assumption test
diet.wc %>% 
    identify_outliers(weight.change) 
##  [1] id             gender         age            height         diet.type     
##  [6] initial.weight final.weight   weight.change  is.outlier     is.extreme    
## <0 rows> (or 0-length row.names)
diet.wc %>% # No outliers
  shapiro_test(weight.change) #p is .79 so our assumption of normality is met
## # A tibble: 1 × 3
##   variable      statistic     p
##   <chr>             <dbl> <dbl>
## 1 weight.change     0.990 0.790
diet.wc %>%
  levene_test(weight.change~gender) #equal variances met
## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     1    74     0.202 0.654
ggqqplot(diet.wc, x= "weight.change", facet.by = "gender")

#Whether there is a statistically significant differences in weight change
#with respect to their diet plan? 
diet.wc %>%
  shapiro_test(weight.change)  #p is .79 so our assumption of normality is met
## # A tibble: 1 × 3
##   variable      statistic     p
##   <chr>             <dbl> <dbl>
## 1 weight.change     0.990 0.790
diet.wc %>%
  levene_test(weight.change~diet.type) #equal variances met
## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     2    73     0.463 0.631
ggqqplot(diet.wc, x= "weight.change", facet.by = "diet.type")

diet.aov <- aov(weight.change ~ diet.type,data = diet.wc)
summary(diet.aov) #P smaller than 0.01, very significant difference
##             Df Sum Sq Mean Sq F value Pr(>F)   
## diet.type    2   60.5  30.264   5.383 0.0066 **
## Residuals   73  410.4   5.622                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
levene_test(diet.aov) 
## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     2    73     0.463 0.631
#p is larger than 0.05; Homogenity is met
par(mfrow=c(2,2))
plot(diet.aov,1)
plot(diet.aov,2) #Very fitted, looks good.
qqPlot(diet.aov) # All in CI, looks good.
## [1] 15 48
hist(diet.aov$residuals)

#Conduct a post-hoc comparison, report the result
  
TukeyHSD(diet.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight.change ~ diet.type, data = diet.wc)
## 
## $diet.type
##          diff        lwr      upr     p adj
## B-A -0.032000 -1.6530850 1.589085 0.9987711
## C-A  1.848148  0.2567422 3.439554 0.0188047
## C-B  1.880148  0.3056826 3.454614 0.0152020
#Tukey's probably isn't appropriate here due to small sample size
#C seems to have a significant difference in weight change relative to A and B


#and visualize the post-hoc analysis with your interpretation. 

ggline(diet.wc,"diet.type","weight.change", 
       add=c("mean_se","jitter"),
       ylab="Weight Change",
       main="Mean Weight Change by Treatment Condition")

#It appears as thought diet C causes a statistically significant weight gain
#in patients relative to diet A and diet C.
beep(5)

Task 2B

Data Specification:

A researcher has a dataset that consists of points of students including their study time & number of courses. And the researcher wish to build a regression model to capture all the patterns in the dataset while predicting the students’ final points.

Please load the Math.csv and complete the following tasks:

  • Output a descriptive statistics table in a Word document to describe the data.
  • Check the distribution of target variables, and visualize the mean math score as per the number of courses taken.
  • Make some plots to help this researcher understand the relationships between all variables.
  • Did you see any outliers? if so remove them from the dataset. Use filter() in tidyverse to remove the outliers group_by() summarize(meanScore= mean(score)) ungroup()

Or set row to NULL Or try new.df <- df[-rownumber,] where # is row number

  • Setup a model, does study time significantly predict students’ math learning outcome? Report the result with an APA style table. Formula with [output~input] Correlation coefficient, R^2 value

  • Generate some diagnostic plots of your fitted model.

Y = B1x1 +B2x2 lm to fit the model, determine the Betas (coefficients?) model.math = lm(Output ~input, data =“df”)

Math <- import("PATHWAYS/Math.csv")
Math.table <- describe(Math)


#Output a descriptive statistics table in a Word document to describe the data.
export(Math.table,MathTable <- tempfile(fileext = ".txt")) 

#Check the distribution of target variables
Math %>%
  shapiro_test(number_courses)  #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
##   variable       statistic           p
##   <chr>              <dbl>       <dbl>
## 1 number_courses     0.883 0.000000243
Math %>%
  shapiro_test(time_study) #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
##   variable   statistic        p
##   <chr>          <dbl>    <dbl>
## 1 time_study     0.950 0.000883
Math %>%
  shapiro_test(Marks) #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
##   variable statistic          p
##   <chr>        <dbl>      <dbl>
## 1 Marks        0.914 0.00000708
Math.m <- Math %>% 
  mutate(MeanMath = Marks/number_courses) #creates a "MeanMath" which is marks per course
par(mfrow=c(2,2)) #distribution of  variables
hist(Math.m$number_courses,xlab="Number of Courses",main=paste("Frequency of Courses Taken"))
hist(Math.m$time_study,xlab="Time Studying (hrs)",main=paste("Frequency of Time Spent Studying"))
hist(Math.m$Marks,xlab="Total Marks",main=paste("Frequency of Marks"))
hist(Math.m$MeanMath,xlab="Average Score",main=paste("Frequency of Mean Marks"))

Math.m %>%
  shapiro_test(MeanMath) #p is less than 0.001 NOT NORMAL
## # A tibble: 1 × 3
##   variable statistic           p
##   <chr>        <dbl>       <dbl>
## 1 MeanMath     0.879 0.000000165
#and visualize the mean math score as per the number of courses taken.

ggline(Math.m,"number_courses","MeanMath", 
       add=c("mean_se","jitter"),
       xlab="Number of Courses Taken",
       ylab="Mean Score",
       main="Mean Math Scores by Courses Taken")

#Make some plots to help this researcher understand the relationships
#between all variables. 
pairs(Math.m, #creates a second array with pearson coefficient, etc.
      upper.panel = panel.smooth, # add a smooth line 
      lower.panel = panel.cor,
      method = "pearson",
      cex.cor = 1.5)

#Did you see any outliers? if so remove them from the dataset.
Math.reg1 <- lm(MeanMath ~number_courses, data = Math.m)
summary(Math.reg1)
## 
## Call:
## lm(formula = MeanMath ~ number_courses, data = Math.m)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6586 -2.3561 -0.8911  1.6500  9.1311 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.4734     0.8848   7.316 7.07e-11 ***
## number_courses  -0.3151     0.1584  -1.989   0.0495 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.837 on 98 degrees of freedom
## Multiple R-squared:  0.03879,    Adjusted R-squared:  0.02898 
## F-statistic: 3.955 on 1 and 98 DF,  p-value: 0.04953
math.aov <- aov(formula = MeanMath ~ time_study, data = Math.m)

outlierTest(Math.reg1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##    rstudent unadjusted p-value Bonferroni p
## 36 3.437488         0.00086635     0.086635
par(mfrow=c(2,2))
  plot(math.aov,1) #36,4 and 14 flagged as outliers
  plot(math.aov,2) #Very fitted, looks good.
  qqPlot(math.aov) # Most all in CI, looks good.
## [1] 11 36
  hist(math.aov$residuals) #Hm.

Math.m %>% 
  identify_outliers(MeanMath) #Outliers are NOT extreme. 11 and 36 flagged
##   number_courses time_study  Marks MeanMath is.outlier is.extreme
## 1              3      7.353 42.036 14.01200       TRUE      FALSE
## 2              3      7.543 43.978 14.65933       TRUE      FALSE
#Creating a new df with outliers removed
Math.m1 <- Math.m[-36,]
Math.m1 <- Math.m1[-11,]


math.aov.1 <- aov(formula = MeanMath ~ time_study, data = Math.m1)
summary(math.aov.1) #p statistically significant at the 0.001 level
##             Df Sum Sq Mean Sq F value Pr(>F)    
## time_study   1  476.6   476.6     289 <2e-16 ***
## Residuals   96  158.3     1.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Setup a model, does study time significantly predict students' math learning outcome? 
#Report the result with an APA style table.

math.aov.1
## Call:
##    aov(formula = MeanMath ~ time_study, data = Math.m1)
## 
## Terms:
##                 time_study Residuals
## Sum of Squares    476.5901  158.3177
## Deg. of Freedom          1        96
## 
## Residual standard error: 1.28419
## Estimated effects may be unbalanced
Math.reg2 <- lm(MeanMath ~time_study, data = Math.m1) #Regression model (linear)
summary(Math.reg2) #Regression model (linear)
## 
## Call:
## lm(formula = MeanMath ~ time_study, data = Math.m1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7577 -0.9503 -0.2074  0.5056  3.9758 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.82636    0.25773   3.206  0.00183 ** 
## time_study   0.94451    0.05556  17.000  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.284 on 96 degrees of freedom
## Multiple R-squared:  0.7506, Adjusted R-squared:  0.748 
## F-statistic:   289 on 1 and 96 DF,  p-value: < 2.2e-16
Math.m1 %>%
  ggplot(aes(x=time_study, y=MeanMath)) + 
  geom_point(alpha = 1) +
  geom_smooth(method = lm,
              alpha = .6,
              se = F) +
  labs(x="Study Time in Hours", 
       y="Total Score", 
       title = "Total Score by Time Spent Studying")+
 theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))
## `geom_smooth()` using formula 'y ~ x'

Math.APA <- Math.m1
Math.APA$time_study <- round(Math.APA$time_study,0)

apaTables::apa.1way.table(iv=time_study,
                          dv=MeanMath,
                          data=Math.APA,
                          file="MathScoresbyStudy.doc")
## 
## 
## Descriptive statistics for MeanMath as a function of time_study.  
## 
##  time_study    M   SD
##           0 1.81 0.08
##           1 2.06 0.14
##           2 2.41 0.29
##           3 3.05 0.45
##           4 3.88 0.87
##           5 6.05 1.01
##           6 7.19 2.36
##           7 7.99 1.93
##           8 7.65 0.89
## 
## Note. M and SD represent mean and standard deviation, respectively.
## 
#Generate some diagnostic plots of your fitted model.

ggplot(Math.m1, aes(x = time_study, y = MeanMath))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .6, 
              se = F) +  
  labs( x = "Wind Speed MPH",
        y = "Ozone Concentration BBP", 
        title = "Relationship Between Wind Speed and Ozone Level")+
  stat_regline_equation(label.x = 5, label.y = 20, aes(label = ..eq.label..)) + # show the lm formula 
  stat_regline_equation(label.x = 5, label.y = 15, aes(label = ..rr.label..)) +#  show the R^2
  theme_linedraw()
## `geom_smooth()` using formula 'y ~ x'

#Formula with [output~input] Correlation coefficient, R^2 value

outlierTest(Math.reg2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 100 3.283435          0.0014362      0.14074
# makes a qq plot of residuals
qqPlot(Math.reg2)
##  59 100 
##  57  98
# calculates influence stats for every observation
math.inf <- influence.measures(Math.reg2)
summary(math.inf)
## Potentially influential observations of
##   lm(formula = MeanMath ~ time_study, data = Math.m1) :
## 
##     dfb.1_ dfb.tm_s dffit   cov.r   cook.d hat  
## 7   -0.07   0.25     0.38    0.89_*  0.07   0.02
## 10  -0.09   0.28     0.41    0.87_*  0.08   0.02
## 49  -0.02   0.16     0.28    0.93_*  0.04   0.02
## 59  -0.10   0.29     0.43    0.86_*  0.08   0.02
## 83  -0.20   0.39     0.47_*  0.92_*  0.11   0.03
## 100 -0.12   0.33     0.47_*  0.84_*  0.10   0.02
# check for non-linearity
crPlots(Math.reg2)

# non-constant variance test
ncvTest(Math.reg2)
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 24.07483, Df = 1, p = 9.2664e-07
# variable-level information useful for analysis.*
augment(Math.reg2)
## # A tibble: 98 × 9
##    .rownames MeanMath time_study .fitted  .resid   .hat .sigma   .cooksd
##    <chr>        <dbl>      <dbl>   <dbl>   <dbl>  <dbl>  <dbl>     <dbl>
##  1 1             6.40      4.51    5.08   1.32   0.0107   1.28 0.00573  
##  2 2             1.93      0.096   0.917  1.02   0.0389   1.29 0.0132   
##  3 3             3.45      3.13    3.79  -0.333  0.0116   1.29 0.000400 
##  4 4             8.84      7.91    8.30   0.540  0.0387   1.29 0.00370  
##  5 5             6.91      7.81    8.20  -1.29   0.0373   1.28 0.0203   
##  6 6             2.97      3.21    3.86  -0.889  0.0114   1.29 0.00279  
##  7 7             9.96      6.06    6.55   3.41   0.0181   1.24 0.0662   
##  8 8             3.45      3.41    4.05  -0.597  0.0109   1.29 0.00120  
##  9 9             5.09      4.41    4.99   0.0953 0.0105   1.29 0.0000296
## 10 10           10.3       6.17    6.66   3.63   0.0190   1.23 0.0788   
## # … with 88 more rows, and 1 more variable: .std.resid <dbl>
# fitted values (.fitted), residuals (.resid), 
# Cook’s distance (.cooksd) -a measure of outliers 

# Let's take a close look 
dig.M <- augment(Math.reg2)


ggplot(dig.M, aes(x = time_study, y = MeanMath))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .6, 
              se = F) + 
  geom_segment(aes(xend = time_study, yend = .fitted), color = "red", size = 0.3)+
  labs( x = "Time Spent Studying (Hours)",
        y = "Mean Score in Mathematics Courses", 
        title = "Relation Between Time Studying and Math Marks (with Residuals)")
## `geom_smooth()` using formula 'y ~ x'
# the plot shows the residual errors between observed values and the fitted regression line,. 
beep(8)


#Thank you for everything! Sorry I didn't have time to try the bonus activities!
#Hopefully I will be able to later this week or when I have a break!