Please see my analysis and response to research questions in the code below.

library(knitr) # Knit your markdown to Word,Pdf and etc. 
library(beepr)
library(rio) # Swiss-army knife for data I/O
library(DescTools) # A collection of functions for efficiently describing data
library(ggpubr) # facilitates the creation of beautiful ggplot2-based graphs 
library(car) # Some useful function for learning R 
library(stargazer)
library(psych)
library(tidyverse) # tidyverse
library(rstatix) # simple and intuitive pipe-friendly framework
library(janitor) # simplified data cleaning tool 
library(corrr) # correlation ish in R
library(summarytools) # for reporting your results
library(ggstatsplot) # save your time on box plot here 
library(table1)
library(afex)

Task 1 Data Manipulation and visualization-10pts

Task 1A

Data specification:

The dataset StudentInfo.csv contains survey results from 431 students enrolled at a university in the United States:

Gender: 0=Male 1=Female| Athlete : 0 = Non-athlete 1 = Athlete| Weight: weight in lbs| Height: height in inches| Sprint: 35-meter sprint time (seconds)| MileMinDur : Mile run time (hh:mm:ss)| English:Writing : Scores on placement test| State: Is student in-state or out-of-state resident?| LiveOnCampus: 0 = Off-campus 1 = On-campus| HowCommute: 1=Walk, 2 = Bike, 3 = Car, 4 = Public transit, 5 = Other| CommuteTime: Time in Mins per day SleepTime : Time in hours per week StudyTime : Time in hours per week

Load the dataset. Investigate the descriptive statistics of all the variables.

StudentInfo <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/EPSY594 Final Exam Package/StudentInfo.csv")
head(StudentInfo)

##     ids Gender Athlete Height Weight Sprint MileMinDur English Reading  Math
## 1 43783      0       0  72.35     NA  7.978              88.24   81.50 60.02
## 2 20278      0       0  70.66 179.20  8.004    0:06:21   89.45   85.25 70.19
## 3 22820      1       0     NA 198.34  8.473    0:12:44   74.06   88.68 55.89
## 4 24559      1       1  67.43 128.17     NA    0:06:25   82.61   77.30 65.52
## 5 28980      0       1  68.45 171.61  4.650    0:07:24   70.10      NA 61.40
## 6 33312      0       1  68.56 163.96  4.750    0:05:47   78.98   87.53 76.71
##   Writing        State LiveOnCampus HowCommute CommuteTime SleepTime StudyTime
## 1   81.44     In state            1         NA          NA         7         1
## 2   73.27     In state            1         NA          NA         5         2
## 3   73.16     In state            1         NA          NA         2         6
## 4   80.45 Out of state            1         NA          NA         7         3
## 5   77.48     In state            1         NA          NA         8         3
## 6   70.79     In state            1         NA          NA         9         9

str(StudentInfo)

## 'data.frame':    430 obs. of  17 variables:
##  $ ids         : int  43783 20278 22820 24559 28980 33312 40274 40390 29045 35676 ...
##  $ Gender      : int  0 0 1 1 0 0 0 1 0 1 ...
##  $ Athlete     : int  0 0 0 1 1 1 1 1 0 1 ...
##  $ Height      : num  72.3 70.7 NA 67.4 68.5 ...
##  $ Weight      : num  NA 179 198 128 172 ...
##  $ Sprint      : num  7.98 8 8.47 NA 4.65 ...
##  $ MileMinDur  : chr  "" "0:06:21" "0:12:44" "0:06:25" ...
##  $ English     : num  88.2 89.5 74.1 82.6 70.1 ...
##  $ Reading     : num  81.5 85.2 88.7 77.3 NA ...
##  $ Math        : num  60 70.2 55.9 65.5 61.4 ...
##  $ Writing     : num  81.4 73.3 73.2 80.5 77.5 ...
##  $ State       : chr  "In state" "In state" "In state" "Out of state" ...
##  $ LiveOnCampus: int  1 1 1 1 1 1 0 0 0 1 ...
##  $ HowCommute  : int  NA NA NA NA NA NA NA 3 3 NA ...
##  $ CommuteTime : int  NA NA NA NA NA NA NA 26 20 NA ...
##  $ SleepTime   : int  7 5 2 7 8 9 8 NA 1 4 ...
##  $ StudyTime   : int  1 2 6 3 3 9 3 3 6 0 ...

sum(is.na(StudentInfo)) #679 total NA

## [1] 679

StudentInfo$Gender <- factor(StudentInfo$Gender, labels = c("Male", "Female"))
StudentInfo$Athlete <- factor(StudentInfo$Athlete, labels = c("Non-Athlete", "Athlete"))
StudentInfo$State[StudentInfo$State == "In state"] <-0
StudentInfo$State[StudentInfo$State == "Out of state"] <-1
StudentInfo$State <- factor(StudentInfo$State, labels = c("In State", "Out of State"))
StudentInfo$Athlete <- factor(StudentInfo$Athlete, labels = c("Non-Athlete", "Athlete"))
StudentInfo$Gender[is.na(StudentInfo$Gender)] #9 NAs in Gender

## [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Levels: Male Female

StudentInfo$Gender[is.na(StudentInfo$Gender)]

## [1] <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## Levels: Male Female

StudentInfo$LiveOnCampus <- factor(StudentInfo$LiveOnCampus, labels = c("Off-campus", "On-campus"))
StudentInfo.table <- describe(StudentInfo)

#Split the dataset into two subsets:
#students who are living on-campus and living off-campus 

SI.split <- split(StudentInfo,StudentInfo$LiveOnCampus)
  
Students.OffCamp <- as.data.frame(SI.split[1])
Students.OnCamp <- as.data.frame(SI.split[2])

colnames(Students.OnCamp) <- c('ids','Gender','Athlete','Height','Weight','Sprint','MileMinDur','English','Reading','Math','Writing','State','LiveOnCampus','HowCommute','CommuteTime','SleepTime','StudyTime')

colnames(Students.OffCamp) <- c('ids','Gender','Athlete','Height','Weight','Sprint','MileMinDur','English','Reading','Math','Writing','State','LiveOnCampus','HowCommute','CommuteTime','SleepTime','StudyTime')

#Remove the original ID variable and make new ID variables for each subset you 
  #just created: 
     #For off-campus students, start the ID with 'com' and followed by a 4-digit 
        #random number 
     #For on-campus students, start with 'res' and follow by a 4-digits random
        #number

NewID.On = data.frame(1:178)
NewID.Off = data.frame(1:250)

set.seed(420)
NewID.On <- NewID.On %>% 
  mutate(ID = paste("col",sample(1111:9999, 178,FALSE), sep = ""))

set.seed(420)
NewID.Off <- NewID.Off %>% 
    mutate(ID = paste("res",sample(1111:9999, 250,FALSE), sep = ""))

Students.OffCamp$ids <- NewID.Off$ID
Students.OnCamp$ids <-  NewID.On$ID

#How many missing observations do the datasets contain?
#And from which column(s)? 
NA.TOT.Off = sum(is.na(Students.OffCamp)) #191 NA
NA.TOT.Off.Gen = sum(is.na(Students.OffCamp$Gender)) #4 NA
NA.TOT.Off.Ath = sum(is.na(Students.OffCamp$Athlete))
NA.TOT.Off.He = sum(is.na(Students.OffCamp$Height)) #16 NA
NA.TOT.Off.We = sum(is.na(Students.OffCamp$Weight)) #31 NA
NA.TOT.Off.Spr = sum(is.na(Students.OffCamp$Sprint)) #31 NA
NA.TOT.Off.MM = sum(is.na(Students.OffCamp$MileMinDur))
NA.TOT.Off.Eng = sum(is.na(Students.OffCamp$English)) #18 NA
NA.TOT.Off.Read = sum(is.na(Students.OffCamp$Reading)) #5 NA
NA.TOT.Off.Math = sum(is.na(Students.OffCamp$Math)) #7 NA
NA.TOT.Off.Writ = sum(is.na(Students.OffCamp$Writing))#21NA
NA.TOT.Off.State = sum(is.na(Students.OffCamp$State))
NA.TOT.Off.OnC = sum(is.na(Students.OffCamp$LiveOnCampus))
NA.TOT.Off.HowC = sum(is.na(Students.OffCamp$HowCommute)) #5 NA
NA.TOT.Off.CTime = sum(is.na(Students.OffCamp$CommuteTime)) #5 N
NA.TOT.Off.SleepTime = sum(is.na(Students.OffCamp$SleepTime)) #19 NA
NA.TOT.Off.StudyTime = sum(is.na(Students.OffCamp$StudyTime)) #29 NA


NA.TOT.Off  = sum(is.na(Students.OnCamp)) #480 NA
NA.TOT.Off.Gen = sum(is.na(Students.OnCamp$Gender)) #5 NA
NA.TOT.Off.Ath = sum(is.na(Students.OnCamp$Athlete))
NA.TOT.Off.He = sum(is.na(Students.OnCamp$Height)) #11 NA
NA.TOT.Off.We = sum(is.na(Students.OnCamp$Weight)) #27 NA
NA.TOT.Off.Spr = sum(is.na(Students.OnCamp$Sprint)) #28 NA
NA.TOT.Off.MM = sum(is.na(Students.OnCamp$MileMinDur))
NA.TOT.Off.Eng = sum(is.na(Students.OnCamp$English)) #8 NA
NA.TOT.Off.Read = sum(is.na(Students.OnCamp$Reading)) #5 NA
NA.TOT.Off.Math = sum(is.na(Students.OnCamp$Math)) #6 NA
NA.TOT.Off.Writ = sum(is.na(Students.OnCamp$Writing)) #10 NA
NA.TOT.Off.State = sum(is.na(Students.OnCamp$State))
NA.TOT.Off.OnC = sum(is.na(Students.OnCamp$LiveOnCampus))
NA.TOT.Off.HowC = sum(is.na(Students.OnCamp$HowCommute)) #178 NA
NA.TOT.Off.CTime = sum(is.na(Students.OnCamp$CommuteTime)) #178 NA
NA.TOT.Off.SleepTime = sum(is.na(Students.OnCamp$SleepTime)) #12 NA
NA.TOT.Off.StudyTime = sum(is.na(Students.OnCamp$StudyTime)) #12 NA

#Now, replace missing values for all continuous variables
#with group mean (mean from subset).

StudOff.HeightMean = round(mean(Students.OffCamp$Height, na.rm=TRUE),2)
Students.OffCamp$Height[is.na(Students.OffCamp$Height)] <- StudOff.HeightMean

StudOff.WeightMean = round(mean(Students.OffCamp$Weight, na.rm=TRUE),2)
Students.OffCamp$Weight[is.na(Students.OffCamp$Weight)] <- StudOff.WeightMean

StudOff.SprintMean = round(mean(Students.OffCamp$Sprint, na.rm=TRUE),3)
Students.OffCamp$Sprint[is.na(Students.OffCamp$Sprint)] <- StudOff.SprintMean

StudOff.EnglishMean = round(mean(Students.OffCamp$English, na.rm=TRUE),2)
Students.OffCamp$English[is.na(Students.OffCamp$English)] <- StudOff.EnglishMean

StudOff.ReadingMean = round(mean(Students.OffCamp$Reading, na.rm=TRUE),2)
Students.OffCamp$Reading[is.na(Students.OffCamp$Reading)] <- StudOff.ReadingMean

StudOff.MathMean = round(mean(Students.OffCamp$Math, na.rm=TRUE),2)
Students.OffCamp$Math[is.na(Students.OffCamp$Math)] <- StudOff.MathMean

StudOff.WritingMean = round(mean(Students.OffCamp$Writing, na.rm=TRUE),2)
Students.OffCamp$Writing[is.na(Students.OffCamp$Writing)] <- StudOff.WritingMean

StudOff.CommuteMean = round(mean(Students.OffCamp$CommuteTime, na.rm=TRUE),0)
Students.OffCamp$CommuteTime[is.na(Students.OffCamp$CommuteTime)] <- StudOff.CommuteMean

StudOff.SleepMean = round(mean(Students.OffCamp$SleepTime, na.rm=TRUE),0)
Students.OffCamp$SleepTime[is.na(Students.OffCamp$SleepTime)] <- StudOff.SleepMean

StudOff.StudyMean = round(mean(Students.OffCamp$StudyTime, na.rm=TRUE),0)
Students.OffCamp$StudyTime[is.na(Students.OffCamp$StudyTime)] <- StudOff.StudyMean

###

StudOn.HeightMean = round(mean(Students.OnCamp$Height, na.rm=TRUE),2)
Students.OnCamp$Height[is.na(Students.OnCamp$Height)] <- StudOn.HeightMean

StudOn.WeightMean = round(mean(Students.OnCamp$Weight, na.rm=TRUE),2)
Students.OnCamp$Weight[is.na(Students.OnCamp$Weight)] <- StudOn.WeightMean

StudOn.SprintMean = round(mean(Students.OnCamp$Sprint, na.rm=TRUE),3)
Students.OnCamp$Sprint[is.na(Students.OnCamp$Sprint)] <- StudOn.SprintMean

StudOn.EnglishMean = round(mean(Students.OnCamp$English, na.rm=TRUE),2)
Students.OnCamp$English[is.na(Students.OnCamp$English)] <- StudOn.EnglishMean

StudOn.ReadingMean = round(mean(Students.OnCamp$Reading, na.rm=TRUE),2)
Students.OnCamp$Reading[is.na(Students.OnCamp$Reading)] <- StudOn.ReadingMean

StudOn.MathMean = round(mean(Students.OnCamp$Math, na.rm=TRUE),2)
Students.OnCamp$Math[is.na(Students.OnCamp$Math)] <- StudOn.MathMean

StudOn.WritingMean = round(mean(Students.OnCamp$Writing, na.rm=TRUE),2)
Students.OnCamp$Writing[is.na(Students.OnCamp$Writing)] <- StudOn.WritingMean

StudOn.SleepMean = round(mean(Students.OnCamp$SleepTime, na.rm=TRUE),0)
Students.OnCamp$SleepTime[is.na(Students.OnCamp$SleepTime)] <- StudOn.SleepMean

StudOn.StudyMean = round(mean(Students.OnCamp$StudyTime, na.rm=TRUE),0)
Students.OnCamp$StudyTime[is.na(Students.OnCamp$StudyTime)] <- StudOn.StudyMean


#- Make a descriptive statistic summary table for each subset
#and output your result in a MS word. (choose any grouping that makes sense to you).
Students.OnCampus.Table <- describe(Students.OnCamp)
Students.OffCampus.Table <- describe(Students.OffCamp)


export(Students.OnCampus.Table,OnCampus <- tempfile(fileext = ".txt")) 
export(Students.OffCampus.Table,OffCampus <- tempfile(fileext = ".txt")) 
#Then copy and paste tables from .txt files to a .doc file
beep(1)


#Bind both subsets, then Z-score transform all continuous variables

Students <- rbind(Students.OnCamp,Students.OffCamp)

mHeight<- mean(Students$Height)
sdHeight<- sd(Students$Height)
Students <- Students %>% 
  mutate(zHeight = round(((Height-mHeight)/sdHeight),2))

mWeight<- mean(Students$Weight)
sdWeight<- sd(Students$Weight)
Students <- Students %>% 
  mutate(zWeight = round(((Weight-mWeight)/sdWeight),2))

mSprint <- mean(Students$Sprint)
sdSprint <- sd(Students$Sprint)
Students <- Students %>% 
  mutate(zSprint = round(((Sprint-mSprint)/sdSprint),2))

mEnglish <- mean(Students$English)
sdEnglish <- sd(Students$English)
Students <- Students %>% 
  mutate(zEnglish = round(((English-mEnglish)/sdEnglish),2))

mReading <- mean(Students$Reading)
sdReading <- sd(Students$Reading)
Students <- Students %>% 
  mutate(zReading = round(((Reading-mReading)/sdReading),2))

mWriting <- mean(Students$Writing)
sdWriting <- sd(Students$Writing)
Students <- Students %>% 
  mutate(zWriting = round(((Writing-mWriting)/sdWriting),2))

mMath <- mean(Students$Math)
sdMath <- sd(Students$Math)
Students <- Students %>% 
  mutate(zMath = round(((Math-mMath)/sdMath),2))

mSleepTime <- mean(Students$SleepTime)
sdSleepTime <- sd(Students$SleepTime)
Students <- Students %>% 
  mutate(zSleepTime = round(((SleepTime-mSleepTime)/sdSleepTime),2))

mStudyTime <- mean(Students$StudyTime)
sdStudyTime <- sd(Students$StudyTime)
Students <- Students %>% 
  mutate(zStudyTime = round(((StudyTime-mStudyTime)/sdStudyTime),2))


# and export an SPSS data file. 

export(Students,Studentz <- tempfile(fileext = ".sav"))
beep(1)

Task 1B

Please choose the suitable graph(s) to complete the following tasks, and remember to label the x and y axes properly, also add title, sub-title where necessary:

Make a plot matrix to show the distribution, histogram, and correlation for all the continuous variables from each subset.
Make a plot to show their total placement test score grouped by their in-state status and gender.
Make a plot to show the relationship between study time and total placement test, group by on-campus status(with a fitted line, formula, and \(R^2\)). #res(total~StudyTime)??????
*Combine all the plots you made from Task 1B, try your best to tweak it “Aesthetically pleasing”.

#Make a plot matrix to show the distribution, histogram, and correlation
#for all the continuous variables from each subset.
Students <- Students %>% 
  mutate(TotScore = Writing + Reading + Math + English)

Students.con <- Students %>%  #new df with only continuous variables
  select(Height,Weight,Sprint,English,Reading,Writing,Math,SleepTime,StudyTime,TotScore) 

panel.cor <- function(x, y, cex.cor = 0.8, method = "pearson", ...) { 
options(warn = -1)                   
usr <- par("usr"); on.exit(par(usr)) # Saves current "usr" and resets on exit
par(usr = c(0, 1, 0, 1))             # Set plot size to 1 x 1
r <- cor(x, y, method = method, use = "pair")               # correlation coef
p <- cor.test(x, y, method = method)$p.val                  # p-value
n <- sum(complete.cases(x, y))                              # How many data pairs
txt <- format(r, digits = 3)                                # Format r-value
txt1 <- format(p, digits = 3)                                 # Format p-value
txt2 <- paste0("r= ", txt, '\n', "p= ", txt1, '\n', 'n= ', n) # Make panel text
text(0.5, 0.5, txt2, cex = cex.cor, ...)                      # Place panel text
options(warn = 0)                                             # Reset warning
}

pairs(Students.con, #creates a second array with pearson coefficient, etc.
      upper.panel = panel.smooth, # add a smooth line 
      lower.panel = panel.cor,
      method = "pearson",
      cex.cor = 1.5)

Students.t.aov <- aov(TotScore ~ Height+Weight+Sprint+SleepTime+StudyTime,data = Students.con)

par(mfrow=c(2,2))
  plot(Students.t.aov,1) #348,68 and 75 flagged as outliers
  plot(Students.t.aov,2) #Very fitted, looks good.
  qqPlot(Students.t.aov) # Most all in CI, looks good.

##  68 348 
##  42 349

  hist(Students.t.aov$residuals) #Looks normal tbh

#Make a plot to show their total placement test score grouped by their 
  #in-state status and gender.
Students.P2 <- Students %>%  #new df with only con variables + gender and state
  select(Height,Weight,Sprint,English,Reading,Writing,Math,SleepTime,StudyTime,Gender,State,TotScore) 


#Graph with each student result visible
Students.P2 %>%
  ggplot(aes(x=State, y=TotScore, color = Gender)) + 
  geom_point(alpha = 0.6) +
  labs(x="State", 
       y="Total Score", 
       title = "Total Score by State and Gender") +
 theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))

#Make a plot to show the relationship between study time and total placement 
#test, group by on-campus status(with a fitted line, formula, and $R^2$). 
#res(total~StudyTime)??????



my.formula = TotScore ~ StudyTime

sp <- Students %>%
  ggplot(aes(x=StudyTime, y=TotScore, color = LiveOnCampus)) + 
  geom_point(alpha = 1) +
  geom_smooth(method = lm,
              alpha = .6,
              se = F) +
  labs(x="Study Time in Hours", 
       y="Total Score", 
       title = "Total Score by Time Spent Studying (for On and Off Campus Students)")+
 theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))




sp2 <- sp + geom_text(aes(12,120, label=(paste(expression("On-Campus: y = 310 + 0.27x"^2*", R"^2*" = 0.0022")))),parse = TRUE)
spfinal <- sp2 + geom_text(aes(12,100, label=(paste(expression("Off-Campus: y = 310 + 0.36x"^2*", R"^2*" = 0.0074")))),parse = TRUE)

spfinal

## `geom_smooth()` using formula 'y ~ x'

beep(3) #beeps

Task 2 Data Analysis

Task 2A

Data specification:

The researcher collected data from 76 people who undertook one of three diets (referred to as diet A, B and C), the researcher also recorded participants’ background information.

Please load the diet.csv and complete the following tasks:

Plot out the mean weight change from different diet plans.
Did the researcher collect an equal proportion of both genders of participants?
Based on the result, whether there is a statistically significant difference in mean weight change between gender groups?
Whether there is a statistically significant differences in weight change with respect to their diet plan? -*Also check the assumptions before you conduct the analysis. -Conduct a post-hoc comparison, report the result and visualize the post-hoc analysis with your interpretation.

diet <- import("PATHWAY/diet.csv")
head(diet)

##   id gender age height diet.type initial.weight final.weight
## 1  1 Female  22    159         A             58         54.2
## 2  2 Female  46    192         A             60         54.0
## 3  3 Female  55    170         A             64         63.3
## 4  4 Female  33    171         A             64         61.1
## 5  5 Female  50    170         A             65         62.2
## 6  6 Female  50    201         A             66         64.0

str(diet)

## 'data.frame':    76 obs. of  7 variables:
##  $ id            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender        : chr  "Female" "Female" "Female" "Female" ...
##  $ age           : int  22 46 55 33 50 50 37 28 28 45 ...
##  $ height        : int  159 192 170 171 170 201 174 176 165 165 ...
##  $ diet.type     : chr  "A" "A" "A" "A" ...
##  $ initial.weight: int  58 60 64 64 65 66 67 69 70 70 ...
##  $ final.weight  : num  54.2 54 63.3 61.1 62.2 64 65 60.5 68.1 66.9 ...

sum(is.na(diet))

## [1] 0

diet$gender[diet$gender == "Female"] <-0
diet$gender[diet$gender == "Male"] <-1
diet$gender <- factor(diet$gender, labels = c("Female", "Male"))

diet$diet.type[diet$diet.type == "A"] <-0
diet$diet.type[diet$diet.type == "B"] <-1
diet$diet.type[diet$diet.type == "C"] <-2
diet$diet.type <- factor(diet$diet.type, labels = c("A", "B","C"))

diet.wc <- diet %>% 
  mutate(weight.change = initial.weight - final.weight)


#Graph of mean by group
diet.gdiet <- diet.wc %>% 
        group_by(diet.type) %>% 
        summarise(
          weight.change = mean(weight.change),
        )


ggplot(diet.gdiet, aes(diet.type,weight.change,fill=diet.type))+
geom_bar(stat="identity", position = "dodge") +
  scale_fill_manual(breaks = c("A", "B", "C"),
                    values = c("#DE3533", "#0047AB", "#006644"))+
  labs(title="Mean Weight Change by Diet Plan", x="Diet Plan", y="Mean Weight Change (kg)")+
  theme(legend.position = "off")

# Test if the proportions of gender is equal
diet  %>% 
  group_by(gender) %>% 
    summarise(
      count = n()
    ) # there are 43 female participants relative to only 33 male participants

## # A tibble: 2 × 2
##   gender count
##   <fct>  <int>
## 1 Female    43
## 2 Male      33

#Based on the result, whether there is a statistically significant difference
#in mean weight change between gender groups? 
diet.wc  %>% 
  group_by(gender) %>% 
    summarise(
      mean = mean(weight.change)
    )

## # A tibble: 2 × 2
##   gender  mean
##   <fct>  <dbl>
## 1 Female  3.89
## 2 Male    4.02

t.test(weight.change~gender, mu = 0,
       alt="two.sided",conf=0.95, paired =F,data=diet.wc) #p id 0.83. No sig dif.

## 
##  Welch Two Sample t-test
## 
## data:  weight.change by gender
## t = -0.2091, df = 68.809, p-value = 0.835
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -1.287385  1.043128
## sample estimates:
## mean in group Female   mean in group Male 
##             3.893023             4.015152

# Normality & variance assumption test
diet.wc %>% 
    identify_outliers(weight.change)

##  [1] id             gender         age            height         diet.type     
##  [6] initial.weight final.weight   weight.change  is.outlier     is.extreme    
## <0 rows> (or 0-length row.names)

diet.wc %>% # No outliers
  shapiro_test(weight.change) #p is .79 so our assumption of normality is met

## # A tibble: 1 × 3
##   variable      statistic     p
##   <chr>             <dbl> <dbl>
## 1 weight.change     0.990 0.790

diet.wc %>%
  levene_test(weight.change~gender) #equal variances met

## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     1    74     0.202 0.654

ggqqplot(diet.wc, x= "weight.change", facet.by = "gender")

#Whether there is a statistically significant differences in weight change
#with respect to their diet plan? 
diet.wc %>%
  shapiro_test(weight.change)  #p is .79 so our assumption of normality is met

## # A tibble: 1 × 3
##   variable      statistic     p
##   <chr>             <dbl> <dbl>
## 1 weight.change     0.990 0.790

diet.wc %>%
  levene_test(weight.change~diet.type) #equal variances met

## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     2    73     0.463 0.631

ggqqplot(diet.wc, x= "weight.change", facet.by = "diet.type")

diet.aov <- aov(weight.change ~ diet.type,data = diet.wc)
summary(diet.aov) #P smaller than 0.01, very significant difference

##             Df Sum Sq Mean Sq F value Pr(>F)   
## diet.type    2   60.5  30.264   5.383 0.0066 **
## Residuals   73  410.4   5.622                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

levene_test(diet.aov)

## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     2    73     0.463 0.631

#p is larger than 0.05; Homogenity is met
par(mfrow=c(2,2))
plot(diet.aov,1)
plot(diet.aov,2) #Very fitted, looks good.
qqPlot(diet.aov) # All in CI, looks good.

## [1] 15 48

hist(diet.aov$residuals)

#Conduct a post-hoc comparison, report the result
  
TukeyHSD(diet.aov)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight.change ~ diet.type, data = diet.wc)
## 
## $diet.type
##          diff        lwr      upr     p adj
## B-A -0.032000 -1.6530850 1.589085 0.9987711
## C-A  1.848148  0.2567422 3.439554 0.0188047
## C-B  1.880148  0.3056826 3.454614 0.0152020

#Tukey's probably isn't appropriate here due to small sample size
#C seems to have a significant difference in weight change relative to A and B


#and visualize the post-hoc analysis with your interpretation. 

ggline(diet.wc,"diet.type","weight.change", 
       add=c("mean_se","jitter"),
       ylab="Weight Change",
       main="Mean Weight Change by Treatment Condition")

#It appears as thought diet C causes a statistically significant weight gain
#in patients relative to diet A and diet C.
beep(5)

Task 2B

Data Specification:

A researcher has a dataset that consists of points of students including their study time & number of courses. And the researcher wish to build a regression model to capture all the patterns in the dataset while predicting the students’ final points.

Please load the Math.csv and complete the following tasks:

Output a descriptive statistics table in a Word document to describe the data.
Check the distribution of target variables, and visualize the mean math score as per the number of courses taken.
Make some plots to help this researcher understand the relationships between all variables.
Did you see any outliers? if so remove them from the dataset. Use filter() in tidyverse to remove the outliers group_by() summarize(meanScore= mean(score)) ungroup()

Or set row to NULL Or try new.df <- df[-rownumber,] where # is row number

Setup a model, does study time significantly predict students’ math learning outcome? Report the result with an APA style table. Formula with [output~input] Correlation coefficient, R^2 value
Generate some diagnostic plots of your fitted model.

Y = B1x1 +B2x2 lm to fit the model, determine the Betas (coefficients?) model.math = lm(Output ~input, data =“df”)

Math <- import("PATHWAYS/Math.csv")
Math.table <- describe(Math)


#Output a descriptive statistics table in a Word document to describe the data.
export(Math.table,MathTable <- tempfile(fileext = ".txt")) 

#Check the distribution of target variables
Math %>%
  shapiro_test(number_courses)  #p is less than 0.001 NOT NORMAL

## # A tibble: 1 × 3
##   variable       statistic           p
##   <chr>              <dbl>       <dbl>
## 1 number_courses     0.883 0.000000243

Math %>%
  shapiro_test(time_study) #p is less than 0.001 NOT NORMAL

## # A tibble: 1 × 3
##   variable   statistic        p
##   <chr>          <dbl>    <dbl>
## 1 time_study     0.950 0.000883

Math %>%
  shapiro_test(Marks) #p is less than 0.001 NOT NORMAL

## # A tibble: 1 × 3
##   variable statistic          p
##   <chr>        <dbl>      <dbl>
## 1 Marks        0.914 0.00000708

Math.m <- Math %>% 
  mutate(MeanMath = Marks/number_courses) #creates a "MeanMath" which is marks per course
par(mfrow=c(2,2)) #distribution of  variables
hist(Math.m$number_courses,xlab="Number of Courses",main=paste("Frequency of Courses Taken"))
hist(Math.m$time_study,xlab="Time Studying (hrs)",main=paste("Frequency of Time Spent Studying"))
hist(Math.m$Marks,xlab="Total Marks",main=paste("Frequency of Marks"))
hist(Math.m$MeanMath,xlab="Average Score",main=paste("Frequency of Mean Marks"))

Math.m %>%
  shapiro_test(MeanMath) #p is less than 0.001 NOT NORMAL

## # A tibble: 1 × 3
##   variable statistic           p
##   <chr>        <dbl>       <dbl>
## 1 MeanMath     0.879 0.000000165

#and visualize the mean math score as per the number of courses taken.

ggline(Math.m,"number_courses","MeanMath", 
       add=c("mean_se","jitter"),
       xlab="Number of Courses Taken",
       ylab="Mean Score",
       main="Mean Math Scores by Courses Taken")

#Make some plots to help this researcher understand the relationships
#between all variables. 
pairs(Math.m, #creates a second array with pearson coefficient, etc.
      upper.panel = panel.smooth, # add a smooth line 
      lower.panel = panel.cor,
      method = "pearson",
      cex.cor = 1.5)

#Did you see any outliers? if so remove them from the dataset.
Math.reg1 <- lm(MeanMath ~number_courses, data = Math.m)
summary(Math.reg1)

## 
## Call:
## lm(formula = MeanMath ~ number_courses, data = Math.m)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6586 -2.3561 -0.8911  1.6500  9.1311 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.4734     0.8848   7.316 7.07e-11 ***
## number_courses  -0.3151     0.1584  -1.989   0.0495 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.837 on 98 degrees of freedom
## Multiple R-squared:  0.03879,    Adjusted R-squared:  0.02898 
## F-statistic: 3.955 on 1 and 98 DF,  p-value: 0.04953

math.aov <- aov(formula = MeanMath ~ time_study, data = Math.m)

outlierTest(Math.reg1)

## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##    rstudent unadjusted p-value Bonferroni p
## 36 3.437488         0.00086635     0.086635

par(mfrow=c(2,2))
  plot(math.aov,1) #36,4 and 14 flagged as outliers
  plot(math.aov,2) #Very fitted, looks good.
  qqPlot(math.aov) # Most all in CI, looks good.

## [1] 11 36

  hist(math.aov$residuals) #Hm.

Math.m %>% 
  identify_outliers(MeanMath) #Outliers are NOT extreme. 11 and 36 flagged

##   number_courses time_study  Marks MeanMath is.outlier is.extreme
## 1              3      7.353 42.036 14.01200       TRUE      FALSE
## 2              3      7.543 43.978 14.65933       TRUE      FALSE

#Creating a new df with outliers removed
Math.m1 <- Math.m[-36,]
Math.m1 <- Math.m1[-11,]


math.aov.1 <- aov(formula = MeanMath ~ time_study, data = Math.m1)
summary(math.aov.1) #p statistically significant at the 0.001 level

##             Df Sum Sq Mean Sq F value Pr(>F)    
## time_study   1  476.6   476.6     289 <2e-16 ***
## Residuals   96  158.3     1.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Setup a model, does study time significantly predict students' math learning outcome? 
#Report the result with an APA style table.

math.aov.1

## Call:
##    aov(formula = MeanMath ~ time_study, data = Math.m1)
## 
## Terms:
##                 time_study Residuals
## Sum of Squares    476.5901  158.3177
## Deg. of Freedom          1        96
## 
## Residual standard error: 1.28419
## Estimated effects may be unbalanced

Math.reg2 <- lm(MeanMath ~time_study, data = Math.m1) #Regression model (linear)
summary(Math.reg2) #Regression model (linear)

## 
## Call:
## lm(formula = MeanMath ~ time_study, data = Math.m1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7577 -0.9503 -0.2074  0.5056  3.9758 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.82636    0.25773   3.206  0.00183 ** 
## time_study   0.94451    0.05556  17.000  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.284 on 96 degrees of freedom
## Multiple R-squared:  0.7506, Adjusted R-squared:  0.748 
## F-statistic:   289 on 1 and 96 DF,  p-value: < 2.2e-16

Math.m1 %>%
  ggplot(aes(x=time_study, y=MeanMath)) + 
  geom_point(alpha = 1) +
  geom_smooth(method = lm,
              alpha = .6,
              se = F) +
  labs(x="Study Time in Hours", 
       y="Total Score", 
       title = "Total Score by Time Spent Studying")+
 theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))

## `geom_smooth()` using formula 'y ~ x'

Math.APA <- Math.m1
Math.APA$time_study <- round(Math.APA$time_study,0)

apaTables::apa.1way.table(iv=time_study,
                          dv=MeanMath,
                          data=Math.APA,
                          file="MathScoresbyStudy.doc")

## 
## 
## Descriptive statistics for MeanMath as a function of time_study.  
## 
##  time_study    M   SD
##           0 1.81 0.08
##           1 2.06 0.14
##           2 2.41 0.29
##           3 3.05 0.45
##           4 3.88 0.87
##           5 6.05 1.01
##           6 7.19 2.36
##           7 7.99 1.93
##           8 7.65 0.89
## 
## Note. M and SD represent mean and standard deviation, respectively.
##

#Generate some diagnostic plots of your fitted model.

ggplot(Math.m1, aes(x = time_study, y = MeanMath))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .6, 
              se = F) +  
  labs( x = "Wind Speed MPH",
        y = "Ozone Concentration BBP", 
        title = "Relationship Between Wind Speed and Ozone Level")+
  stat_regline_equation(label.x = 5, label.y = 20, aes(label = ..eq.label..)) + # show the lm formula 
  stat_regline_equation(label.x = 5, label.y = 15, aes(label = ..rr.label..)) +#  show the R^2
  theme_linedraw()

## `geom_smooth()` using formula 'y ~ x'

#Formula with [output~input] Correlation coefficient, R^2 value

outlierTest(Math.reg2)

## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 100 3.283435          0.0014362      0.14074

# makes a qq plot of residuals
qqPlot(Math.reg2)

##  59 100 
##  57  98

# calculates influence stats for every observation
math.inf <- influence.measures(Math.reg2)
summary(math.inf)

## Potentially influential observations of
##   lm(formula = MeanMath ~ time_study, data = Math.m1) :
## 
##     dfb.1_ dfb.tm_s dffit   cov.r   cook.d hat  
## 7   -0.07   0.25     0.38    0.89_*  0.07   0.02
## 10  -0.09   0.28     0.41    0.87_*  0.08   0.02
## 49  -0.02   0.16     0.28    0.93_*  0.04   0.02
## 59  -0.10   0.29     0.43    0.86_*  0.08   0.02
## 83  -0.20   0.39     0.47_*  0.92_*  0.11   0.03
## 100 -0.12   0.33     0.47_*  0.84_*  0.10   0.02

# check for non-linearity
crPlots(Math.reg2)

# non-constant variance test
ncvTest(Math.reg2)

## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 24.07483, Df = 1, p = 9.2664e-07

# variable-level information useful for analysis.*
augment(Math.reg2)

## # A tibble: 98 × 9
##    .rownames MeanMath time_study .fitted  .resid   .hat .sigma   .cooksd
##    <chr>        <dbl>      <dbl>   <dbl>   <dbl>  <dbl>  <dbl>     <dbl>
##  1 1             6.40      4.51    5.08   1.32   0.0107   1.28 0.00573  
##  2 2             1.93      0.096   0.917  1.02   0.0389   1.29 0.0132   
##  3 3             3.45      3.13    3.79  -0.333  0.0116   1.29 0.000400 
##  4 4             8.84      7.91    8.30   0.540  0.0387   1.29 0.00370  
##  5 5             6.91      7.81    8.20  -1.29   0.0373   1.28 0.0203   
##  6 6             2.97      3.21    3.86  -0.889  0.0114   1.29 0.00279  
##  7 7             9.96      6.06    6.55   3.41   0.0181   1.24 0.0662   
##  8 8             3.45      3.41    4.05  -0.597  0.0109   1.29 0.00120  
##  9 9             5.09      4.41    4.99   0.0953 0.0105   1.29 0.0000296
## 10 10           10.3       6.17    6.66   3.63   0.0190   1.23 0.0788   
## # … with 88 more rows, and 1 more variable: .std.resid <dbl>

# ﬁtted values (.fitted), residuals (.resid), 
# Cook’s distance (.cooksd) -a measure of outliers 

# Let's take a close look 
dig.M <- augment(Math.reg2)


ggplot(dig.M, aes(x = time_study, y = MeanMath))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .6, 
              se = F) + 
  geom_segment(aes(xend = time_study, yend = .fitted), color = "red", size = 0.3)+
  labs( x = "Time Spent Studying (Hours)",
        y = "Mean Score in Mathematics Courses", 
        title = "Relation Between Time Studying and Math Marks (with Residuals)")

## `geom_smooth()` using formula 'y ~ x'

# the plot shows the residual errors between observed values and the fitted regression line,. 
beep(8)


#Thank you for everything! Sorry I didn't have time to try the bonus activities!
#Hopefully I will be able to later this week or when I have a break!

Data Analysis and Visualization Sample

Juan Madrigal

2023-06-26