Juan Madrigal

Demonstration of Scatterplot Data Visualization and Analysis in R

Below is a short demonstration of R based on an assignment for a course at University of Illinois at Chicago.

Students were given data files with inputs and asked to create plots and analyses based on the values.

The first set is composed of the following variables: - Weight: a numeric vector giving the body weight of the chick (gm). - Time: a numeric vector giving the number of days since birth when the measurement was made. - Chick: a grouping variable - Diet : different diet that chicken received

The tasks required by the assignment are as follows: 0. Make a descriptive statistic table use the package/function of your choice and output to MS Word 1. Run a t-test to determine if the mean weight of all sampled chicken is different from the population mean(mu=100) support you conclusion with a boxplot. 2. Make a scatterplot show the relationship of chicken weight and time, add a fitted line for each diet treatment 3. Make a matrix of scatter plot defined by “diet” variable, choose a grid layout whichever fits your data. 4. Add the equations and \(R^2\) to your plots

library(knitr) # Knit your markdown to Word,Pdf and etc. 
library(beepr)
library(rio) # Swiss-army knife for data I/O
library(DescTools) # A collection of functions for efficiently describing data
library(ggpubr) # facilitates the creation of beautiful ggplot2-based graphs 
library(car) # Some useful function for learning R 
library(stargazer)
library(psych)
# And 
library(tidyverse) # tidyverse
library(rstatix) # simple and intuitive pipe-friendly framework
library(janitor) # simplified data cleaning tool 
library(corrr) # correlation ish in R
library(summarytools) # for reporting your results
library(ggstatsplot) # save your time on box plot here 
library(table1)
library(afex)
#import data and check its integrity
chicken <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/chicken.csv")
head(chicken)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
str(chicken)
## 'data.frame':    578 obs. of  4 variables:
##  $ weight: int  42 51 59 64 76 93 106 125 149 171 ...
##  $ Time  : int  0 2 4 6 8 10 12 14 16 18 ...
##  $ Chick : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Diet  : int  1 1 1 1 1 1 1 1 1 1 ...
chicken[is.na(chicken)] # how many NA in our data (hopefully 0)
## integer(0)
#Make a descriptive statistic table use the package/function
#of your choice and output to MS Word

chicken.table <- describe(chicken)
export(chicken.table,chickenStats <- tempfile(fileext = ".txt"))

#Run a t-test to determine if the mean weight of all sampled chicken 
#is different from the population mean(mu=100)

t1 <- chicken %>% 
  t_test(weight ~ 1, #p value is less than 0.01% so it is definitely significant 
         mu = 100,
         detailed = T)

#t1 <- t.test(chicken$weight,  #in base R
          #   mu = 100)

t1 #Displays information from our t-test, including our p-value
## # A tibble: 1 × 12
##   estimate .y.   group1 group2     n statistic        p    df conf.low conf.high
## *    <dbl> <chr> <chr>  <chr>  <int>     <dbl>    <dbl> <dbl>    <dbl>     <dbl>
## 1     122. weig… 1      null …   578      7.38 5.53e-13   577     116.      128.
## # … with 2 more variables: method <chr>, alternative <chr>
#shows a small effect size
chicken %>% cohens_d(weight ~ 1, mu = 100)
## # A tibble: 1 × 6
##   .y.    group1 group2     effsize     n magnitude
## * <chr>  <chr>  <chr>        <dbl> <int> <ord>    
## 1 weight 1      null model   0.307   578 small
#Support you conclusion with a boxplot.
ggboxplot(chicken$weight, 
                    add = c("mean", "jitter"), width = .5,
                    ylab = "Weight (g)", xlab= "")+
           labs(subtitle = get_test_label(t1,detailed = T),title="Weights of Chickens")

#look at all those chickens

#Make a scatterplot show the relationship of chicken weight and time
#add a fitted line for each diet treatment

chicken$Diet <- as.factor(chicken$Diet)

ggplot(chicken, aes(x = Time, y = weight, color = Diet))+
  geom_point()+
  geom_smooth(method = "lm",se = F)+
  scale_color_manual(values = c("#0009AD", "#ADA500", "#AD002B","#00AD82"))+
  labs( x = "Time (weeks)",
        y = "Chicken Weight (g)", 
        title = "Increase in Chicken Weight over Time by Diet")+
  theme(
plot.title = element_text(color="darkblue", size=12, face="bold"),
plot.background = element_rect(fill = "lightgray"),
axis.title.x = element_text(color="darkblue", size=10, face="bold"),
axis.title.y = element_text(color="darkblue", size=10, face="bold"))
## `geom_smooth()` using formula 'y ~ x'

# Make a matrix of scatter plot defined by "diet" variable
# Choose a grid layout that fits your data. 
chicken.Diets <- split(chicken,chicken$Diet)
chicken.D1 <- as.data.frame(chicken.Diets[1])
chicken.D2 <- as.data.frame(chicken.Diets[2])
chicken.D3 <- as.data.frame(chicken.Diets[3])
chicken.D4 <- as.data.frame(chicken.Diets[4])

par(mfrow=c(2,2))
plot(chicken.D1$X1.Time, chicken.D1$X1.weight,
     main = "Chicken Weight Over Time for Diet 1",
     xlab = "Time (weeks)",
     ylab = "Chicken Weight (g)")
abline(lm(X1.weight~X1.Time, data = chicken.D1)) 
plot(chicken.D2$X2.Time, chicken.D2$X2.weight,
     main = "Chicken Weight Over Time for Diet 2",
     xlab = "Time (weeks)",
     ylab = "Chicken Weight (g)")
abline(lm(X2.weight~X2.Time, data = chicken.D2)) 
plot(chicken.D3$X3.Time, chicken.D3$X3.weight,
     main = "Chicken Weight Over Time for Diet 3",
     xlab = "Time (weeks)",
     ylab = "Chicken Weight (g)")
abline(lm(X3.weight~X3.Time, data = chicken.D3)) 
plot(chicken.D4$X4.Time, chicken.D4$X4.weight,
     main = "Chicken Weight Over Time for Diet 4",
     xlab = "Time (weeks)",
     ylab = "Chicken Weight (g)")
abline(lm(X4.weight~X4.Time, data = chicken.D4)) 

#Redoes everything so I can get that equation and R^2 on the graphs....

c1p <- ggplot(chicken.D1, aes(x = X1.Time, y = X1.weight))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .2, 
              se = F) +
  labs( x = "Time (in weeks)",
        y = "Chicken Weight (g)", 
        title = "Chicken Weight Over Time for Diet 2")

lm_eqn.1 <- function(df){
    m <- lm(X1.weight ~ X1.Time , df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}

c1p1 <- c1p + geom_text(x = 4, y = 280, label = lm_eqn.1(chicken.D1), cex=2, parse = TRUE) +
              theme(plot.title = element_text(size = 10))

c2p <- ggplot(chicken.D2, aes(x = X2.Time, y = X2.weight))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .2, 
              se = F) +
  labs( x = "Time (in weeks)",
        y = "Chicken Weight (g)", 
        title = "Chicken Weight Over Time for Diet 2")

lm_eqn.2 <- function(df){
    m <- lm(X2.weight ~ X2.Time , df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}

c2p1 <- c2p + geom_text(x = 4, y = 280, label = lm_eqn.2(chicken.D2),cex=2, parse = TRUE) +
              theme(plot.title = element_text(size = 10))

c3p <- ggplot(chicken.D3, aes(x = X3.Time, y = X3.weight))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .2, 
              se = F) +
  labs( x = "Time (in weeks)",
        y = "Chicken Weight (g)", 
        title = "Chicken Weight Over Time for Diet 3")

lm_eqn.3 <- function(df){
    m <- lm(X3.weight ~ X3.Time , df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}

c3p1 <- c3p + geom_text(x = 4, y = 280, label = lm_eqn.3(chicken.D3), cex=2, parse = TRUE) +
              theme(plot.title = element_text(size = 10))


c4p <- ggplot(chicken.D4, aes(x = X4.Time, y = X4.weight))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .2, 
              se = F) +
  labs( x = "Time (in weeks)",
        y = "Chicken Weight (g)", 
        title = "Chicken Weight Over Time for Diet 4")

lm_eqn.4 <- function(df){
    m <- lm(X4.weight ~ X4.Time , df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}

c4p1 <- c4p + geom_text(x = 4, y = 280, label = lm_eqn.4(chicken.D4), cex=2, parse = TRUE) +
              theme(plot.title = element_text(size = 10))

#Add the equations and R^2 to your plot ... I did not enjoy this
ggarrange(c1p1, c2p1, c3p1, c4p1, ncol = 2, nrow = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

beep(1) #beepr was actually super useful for this one for troubleshooting

Statistical modeling 1

The second portion of the assignment uses another data set that models tooth growth. The variables for the data set are defined as follows: len : Tooth length supp : Supplement type (VC or OJ) dose : Dose in milligrams/day

The assignment requested the following actions be performed: - Make a descriptive statistics table and output to MS Word or other destination - Make a graph display the 3 different dosages for 2 different types of supplement - Make a contingency table and test if supplement type is independent from dosage intake - Test if the proportions of all three types of supplements are equal - Compare the mean tooth growth from two supplements, write up your results - Visualize your results

Then, based on your findings, what’s the next step you would like to consider to see how different levels of dosage could have effects on the mean tooth growth rates?

#Make a descriptive statistics table and output to MS Word or other destination

TG <- ToothGrowth
head(TG)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5
## 5  6.4   VC  0.5
## 6 10.0   VC  0.5
TG <- na.omit(TG) #removes NA rows
TG.table <- describe(TG) #Produces descriptive table to export

TG$dose <-as.factor(TG$dose)
export(TG.table,ToothSupps <- tempfile(fileext = ".txt")) 

apaTables::apa.1way.table(iv=supp,
                          dv=len,
                          data=TG,
                          file="ToothGrowth.doc")
## 
## 
## Descriptive statistics for len as a function of supp.  
## 
##  supp     M   SD
##    OJ 20.66 6.61
##    VC 16.96 8.27
## 
## Note. M and SD represent mean and standard deviation, respectively.
## 
ggplot(TG, aes(supp, len, fill=dose)) +
  geom_bar(stat="identity", position = "dodge") +
  labs(title="Tooth Length by Supplement (At 3 Dosages)", x="Supplement", y="Tooth Length")

#Make a contingency table and test if supplement type
#is independent from dosage intake 
conTableTG = table(ToothGrowth)
#print(conTableTG)
#I commented this out because it is long but it is here

chisq.test(TG$dose, TG$supp, correct = TRUE) #Shows independence of variables
## 
##  Pearson's Chi-squared test
## 
## data:  TG$dose and TG$supp
## X-squared = 0, df = 2, p-value = 1
# Test if the proportions of all three dosages of supplements are equal
TG  %>% 
  group_by(dose) %>% 
    summarise(
      Mean = mean(len),
      SD= sd(len),
      count = n()
    ) # all dosages are equal with n = 20
## # A tibble: 3 × 4
##   dose   Mean    SD count
##   <fct> <dbl> <dbl> <int>
## 1 0.5    10.6  4.50    20
## 2 1      19.7  4.42    20
## 3 2      26.1  3.77    20
# Compare the mean tooth growth from two supplements, write up your results 
TG  %>% 
  group_by(supp) %>% 
    summarise(
      Mean = mean(len),
      SD= sd(len),
      count = n()
    ) 
## # A tibble: 2 × 4
##   supp   Mean    SD count
##   <fct> <dbl> <dbl> <int>
## 1 OJ     20.7  6.61    30
## 2 VC     17.0  8.27    30
#The results show the mean tooth length for the OJ supplement is greater than
#the mean tooth length for the VC supplement. The OJ supplement also has
# a smaller standard deviation. Both supplements have the same number of
# observations as well, which shows balance in our data set.

# Visualize your results 

ggbetweenstats(
  data = TG,
  x = supp,
  y = len,
  plot.type = "box", # for boxplot
  type = "parametric", # for student's t-test
  var.equal = TRUE, # equal variances
  centrality.plotting = F# remove mean
) +
  labs(caption = NULL)

beep(2)

Based on the findings presented here, it would appear that OJ is a more effective supplement than VC. However, VC performed better at the highest dosage. I would like to compare VC and OJ at a high dosage to see if the increased effect on tooth growth is statiscally signficant. The sample size is small, so it may not be enough to warrant tests of both supplements at the higher dosage. There also may be other factors that led to the selection of the dosages used in this trial such as the risks of side effects associated with both supplements.

In any event, if the VC supplement does perform significantly better than the OJ supplement, I would likely test for the effect size of the supplement choice at this dosage. I think it would be unlikely that the effect size would warrant further research, but I would like to report the statistical results so that the researcher can make their decision based on their dental / medical expertise.

Statistical modeling 2

The third data set contains data for mice after a hypothetical treatment: before : Weight of the mice after treatment after: Weight of the mice after treatment

The assignment asked us to complete the following tasks: - Check your data and visualize the data after your import - Check the assumption and run a paired sample t-test - Interpret the result, and report it with a plot

mice <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/mice.csv")
head(mice)
##    group weight
## 1 before  200.1
## 2 before  190.9
## 3 before  192.7
## 4 before  213.0
## 5 before  241.4
## 6 before  196.9
str(mice)
## 'data.frame':    20 obs. of  2 variables:
##  $ group : chr  "before" "before" "before" "before" ...
##  $ weight: num  200 191 193 213 241 ...
mice[is.na(mice)] #all good
## character(0)
mice$group[mice$group == "before"] <-0
mice$group[mice$group == "after"] <-1
mice$group <- as.factor(mice$group)
mice$group <- factor(mice$group, labels = c("before", "after"),ordered=TRUE)

ggbetweenstats(
  data = mice,
  x = group,
  y = weight,
  plot.type = "box", # for boxplot
  type = "parametric", # for student's t-test
  var.equal = TRUE, # equal variances
  centrality.plotting = F# remove mean
) +
  labs(caption = NULL)

#Check the assumption and run a paired sample t-test 
# Outliers 
mice %>% 
  identify_outliers(weight)
## [1] group      weight     is.outlier is.extreme
## <0 rows> (or 0-length row.names)
# No outliers

# Normality 
mice %>% 
  shapiro_test(weight)
## # A tibble: 1 × 3
##   variable statistic       p
##   <chr>        <dbl>   <dbl>
## 1 weight       0.809 0.00117
# Population is not normal since p is smaller than 0.01

# 2. visualize it by Q-Q plot 

ggqqplot(mice, x= "weight", facet.by = "group")

# Levene’s test for Homogeneity

# A non-significant result here (greater than .05) indicates you have met the assumption of homogeneity of variance (i.e., equal variances are assumed). 

# A significant result here (less than .05) indicates you have violated the assumption of homogeneity of variance (i.e., equal variances are not assumed).

mice %>% 
  levene_test(weight~group)
## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     1    18      1.13 0.302
# p is .3 (greater than .05 and non-significant), the assumption of homogeneity 
#of variance is met (i.e., equal variances are assumed). 


rquery.t.test<-function(x, y = NULL, paired = FALSE, graph = TRUE, ...)
{
  # I. Preliminary test : normality and variance tests
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  var.equal = FALSE # by default
  
  # I.1 One sample t test
  if(is.null(y)){
    if(graph) par(mfrow=c(1,2))
    shapiro.px<-normaTest(x, graph, 
                          hist.title="X - Histogram",
                          qq.title="X - Normal Q-Q Plot")
    if(shapiro.px < 0.05)
      warning("x is not normally distributed :",
              " Shapiro-Wilk test p-value : ", shapiro.px, 
              ".\n Use a non-parametric test like Wilcoxon test.")
  }
  
  # I.2 Two samples t test
  if(!is.null(y)){
    
      # I.2.a unpaired t test
      if(!paired){
          if(graph) par(mfrow=c(2,2))
          # normality test
          shapiro.px<-normaTest(x, graph, 
                                hist.title="X - Histogram",
                                qq.title="X - Normal Q-Q Plot")
          shapiro.py<-normaTest(y, graph,
                                hist.title="Y - Histogram",
                                qq.title="Y - Normal Q-Q Plot")
          if(shapiro.px < 0.05 | shapiro.py < 0.05){
              warning("x or y is not normally distributed :",
                      " Shapiro test p-value : ", shapiro.px,
                      " (for x) and ", shapiro.py, " (for y)",
                      ".\n Use a non parametric test like Wilcoxon test.")
            }
          # Check for equality of variances
          if(var.test(x,y)$p.value >= 0.05) var.equal=TRUE
        } 
      
      # I.2.b Paired t-test
      else {
        if(graph) par(mfrow=c(1,2))
        d = x-y 
        shapiro.pd<-normaTest(d, graph, 
                              hist.title="D - Histogram",
                              qq.title="D - Normal Q-Q Plot")
        if(shapiro.pd < 0.05 )
          warning("The difference d ( = x-y) is not normally distributed :",
                  " Shapiro-Wilk test p-value : ", shapiro.pd, 
                  ".\n Use a non-parametric test like Wilcoxon test.")
      } 
      
   }
  
  # II. Student's t-test
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  res <- t.test(x, y, paired=paired, var.equal=var.equal, ...)
  return(res)
}
#+++++++++++++++++++++++
# Helper function
#+++++++++++++++++++++++
# Performs normality test using Shapiro Wilk's method
# The histogram and Q-Q plot of the data are plotted
# x : a (non-empty) numeric vector of data values.
# graph : possible values are TRUE or FALSE. If TRUE,
  # the histogram and the Q-Q plot of the data are displayed
# hist.title : title of the histogram
# qq.title : title of the Q-Q plot
normaTest<-function(x, graph=TRUE, 
                    hist.title="Histogram", 
                    qq.title="Normal Q-Q Plot",...)
  {  
  # Significance test
  #++++++++++++++++++++++
  shapiro.p<-signif(shapiro.test(x)$p.value,1) 
  
  if(graph){
    # Plot : Visual inspection
    #++++++++++++++++
    h<-hist(x, col="lightblue", main=hist.title, 
            xlab="Data values", ...)
    m<-round(mean(x),1)
    s<-round(sd(x),1)
    mtext(paste0("Mean : ", m, "; SD : ", s),
          side=3, cex=0.8)
    # add normal curve
    xfit<-seq(min(x),max(x),length=40)
    yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
    yfit <- yfit*diff(h$mids[1:2])*length(x)
    lines(xfit, yfit, col="red", lwd=2)
    # qq plot
    qqnorm(x, pch=19, frame.plot=FALSE,main=qq.title)
    qqline(x)
    mtext(paste0("Shapiro-Wilk, p-val : ", shapiro.p),
          side=3, cex=0.8)
  }
  return(shapiro.p)
}

mice.w <- mice %>% 
  mutate(id = 1:20)

mice.w$id[11:20] <- mice.w$id[1:10]
  
mice.w <- as.data.frame(pivot_wider(
          mice.w,
          names_from = group,
          values_from = weight))
          
  
rquery.t.test(mice.w$before,mice.w$after, paired =T)

## 
##  Paired t-test
## 
## data:  x and y
## t = -20.883, df = 9, p-value = 6.2e-09
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -215.5581 -173.4219
## sample estimates:
## mean difference 
##         -194.49
t.test(mice.w$before,mice.w$after, paired =T)
## 
##  Paired t-test
## 
## data:  mice.w$before and mice.w$after
## t = -20.883, df = 9, p-value = 6.2e-09
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -215.5581 -173.4219
## sample estimates:
## mean difference 
##         -194.49
#p is smaller than 0.01, so we reject our null hypothesis and accept our alternative
#that the true mean difference is not equal to 0.

ggwithinstats(
  data = mice,
  x = group,
  y = weight,
  effsize.type = "g",
  type = "parametric", # for student's t-test
  centrality.plotting = T, # remove mean point or not 
  title = "Mice Weight Before and After Treatment"
) + labs(caption = NULL,x="Before or After Treatment")

beep(3)

Statistical modeling 3

The final data set tracked air quality and wind speed to demonstrate a correlation.

The last tasks asked of us for this assignment were as follows: - Regress Ozone on the Wind speed, and then interpret out the result - Check the assumptions, does your data meet all assumptions? - Then make a scatter plot showing their relationships

head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
aq.reg1 <- lm(Ozone ~Wind, data = airquality)
summary(aq.reg1)
## 
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.572 -18.854  -4.868  15.234  90.000 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  96.8729     7.2387   13.38  < 2e-16 ***
## Wind         -5.5509     0.6904   -8.04 9.27e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.47 on 114 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.3619, Adjusted R-squared:  0.3563 
## F-statistic: 64.64 on 1 and 114 DF,  p-value: 9.272e-13
par(mfrow=c(2,2))
plot(aq.reg1)

outlierTest(aq.reg1)
##     rstudent unadjusted p-value Bonferroni p
## 117 3.647406          0.0004022     0.046655
#117 is an outlier, and 48 is worth removing as an outlier as well as both have
#the same Cook's d value of .23.
#The plots do confirm our assumptions for the regression:
#the linear relationship assumptions are confirmed by the largely horizontal
#fitted vs residuals plot, the normal distribution of the variables by the
#QQ plot that is relatively fitted to a straight line; the homogeneity of
#the variance of the residuals is confirmed by the equal spread about the horizontal
#line, and the residuals vs. leverage plot identifies only two or three outliers
#that we may choose to remove from our data. We proceed to test for 48, 62 and 117
#through the outlier test function to determine which ought to be removed. We find
#only 117 as identified by the test.


augment.AQ <- augment(aq.reg1)


ggplot(airquality, aes(x = Wind, y = Ozone))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .6, 
              se = F) +  
  labs( x = "Wind Speed MPH",
        y = "Ozone Concentration BBP", 
        title = "Relationship Between Wind Speed and Ozone Level")+
  stat_regline_equation(label.x = 15, label.y = 140, aes(label = ..eq.label..)) + # show the lm formula 
  stat_regline_equation(label.x = 15, label.y = 130, aes(label = ..rr.label..)) +#  show the R^2
  theme_linedraw()
## `geom_smooth()` using formula 'y ~ x'

ggplot(augment.AQ, aes(x = Wind, y = Ozone))+
  geom_point() +
  geom_smooth(method = lm,  
              alpha = .6, 
              se = F) +  
  geom_segment(aes(xend=Wind,yend=.fitted), color="red", size=0.3)+
  labs( x = "Wind Speed MPH",
        y = "Ozone Concentration BBP", 
        title = "Distance Between Observed Values and Fitted Regression Line")+
  stat_regline_equation(label.x = 15, label.y = 140, aes(label = ..eq.label..)) + # show the lm formula 
  stat_regline_equation(label.x = 15, label.y = 130, aes(label = ..rr.label..)) +#  show the R^2
  theme_linedraw()
## `geom_smooth()` using formula 'y ~ x'

beep(5)

Statistical modeling 4

Load the anxiety.sav data, here are the variables of interest :

PartNum: participants ID Condition: 1=full song, 2=lyric only, 3=instrumental only, 4= no music x_anxiety: Mean Anxiety Score year: 1=freshman, 2=sophomore, 3=junior, 4=senior, 5=other

Please preprocess the dataset and: - Run a statistical test to compare the mean anxiety scores between different genders - Is there any statistical differences among the means of Anxiety Score under different treatment conditions? - Do students at different school years show statistical differences in mean Anxiety Score? - Support your conclusion with plots and test results

anxiety <- import("/Users/juan.angel.madrigal/Desktop/Intro to R/anxiety.sav")
head(anxiety)
##   PartNum Condition x_anxiety Pieces Gender Age Year
## 1       1         4  2.846154      8      2  20    3
## 2       2         2  2.076923     14      2  27    4
## 3       3         4  4.230769      9      2  20    3
## 4       4         2  2.692308     12      2  19    2
## 5       5         1  5.769231     16      1  18    2
## 6       6         3  4.153846     11      2  19    2
anxiety[is.na(anxiety)]
## numeric(0)
#Run a statistical test to compare the mean anxiety scores between different genders
anxiety$Gender <- factor(anxiety$Gender, labels = c("Gender1", "Gender2"))
anxiety$Condition <- factor(anxiety$Condition, labels = c("FullSong", "LyricsOnly","Instrumentals","NoMusic"))
anxiety$Year <- factor(anxiety$Year, labels = c("Freshman", "Sophomore","Junior","Senior"))

anxiety.g <- group_by(anxiety, Gender) %>%
  summarise(
    count = n(),
    mean = mean(x_anxiety, na.rm = TRUE)
  ) 
#Our samples are very skewed with a larger sample for one gender than the 
#other by several orders of magnitude. The mean anxiety level of our smaller 
#sample seems much higher

anxiety.g.aov <- aov(x_anxiety ~ Gender,data = anxiety)
summary(anxiety.g.aov) 
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## Gender       1  7.628   7.628   11.42 0.00169 **
## Residuals   38 25.388   0.668                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#There is a statistically significant difference in mean anxiety levels
#by gender at the 0.01 level.

ggboxplot(anxiety, x = "Gender", y =  "x_anxiety",
        ylab = "Anxiety Score", xlab = "Gender")

# Normality 
levene_test(anxiety.g.aov) 
## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     1    38      2.40 0.130
#p is larger than 0.05; Homogenity is met
par(mfrow=c(2,2))
plot(anxiety.g.aov,1)
plot(anxiety.g.aov,2) #Very fitted, looks good.
qqPlot(anxiety.g.aov) # All in CI, looks good.
## [1]  2 26
hist(anxiety.g.aov$residuals) #Hm.

shapiro_test(anxiety.g.aov$residuals) #nevermind, we're good. All normal here
## # A tibble: 1 × 3
##   variable                statistic p.value
##   <chr>                       <dbl>   <dbl>
## 1 anxiety.g.aov$residuals     0.977   0.572
#with a p value waaaaaay bigger than 0.05.



#Is there any statistical differences among the means of Anxiety Score
#under different treatment conditions?
anxiety.t <- group_by(anxiety, Condition) %>%
  summarise(
    count = n(),
    mean = mean(x_anxiety, na.rm = TRUE)
  )


anxiety.t.aov <- aov(x_anxiety ~ Condition,data = anxiety)
summary(anxiety.t.aov) #There is some significant difference at the 0.05 level
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Condition    3  8.026  2.6755   3.854 0.0172 *
## Residuals   36 24.990  0.6942                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggline(anxiety,"Condition","x_anxiety", 
       add=c("mean_se","jitter"),
       ylab="Anxiety Level",
       main="Means Anxiety Levels by Treatment Condition")

TukeyHSD(anxiety.t.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = x_anxiety ~ Condition, data = anxiety)
## 
## $Condition
##                                 diff         lwr        upr     p adj
## LyricsOnly-FullSong      -1.16153846 -2.16504301 -0.1580339 0.0179565
## Instrumentals-FullSong   -0.23076923 -1.23427378  0.7727353 0.9252264
## NoMusic-FullSong         -0.21538462 -1.21888917  0.7881199 0.9380223
## Instrumentals-LyricsOnly  0.93076923 -0.07273532  1.9342738 0.0772622
## NoMusic-LyricsOnly        0.94615385 -0.05735071  1.9496584 0.0706163
## NoMusic-Instrumentals     0.01538462 -0.98811994  1.0188892 0.9999742
#Tukey's probably isn't appropriate here due to small sample size
#But when we use it, we do see a significant difference only between
#The Lyrics Only and Full Song groups

levene_test(anxiety.t.aov) 
## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     3    36     0.532 0.663
#p is larger than 0.05; Homogeneity is met

par(mfrow=c(2,2))
  plot(anxiety.t.aov,1) #2,26 and 33 flagged as outliers
  plot(anxiety.t.aov,2) #Very fitted, looks good.
  qqPlot(anxiety.t.aov) # Most all in CI, looks good.
## [1]  5 35
  hist(anxiety.t.aov$residuals) #Hm.

shapiro_test(anxiety.t.aov$residuals) #greater than 0.05, so it 
## # A tibble: 1 × 3
##   variable                statistic p.value
##   <chr>                       <dbl>   <dbl>
## 1 anxiety.t.aov$residuals     0.954   0.104
#meets our assumption of normality, which surprises me when looking at that graph tbh

#Do students at different school years show
#statistical differences in mean Anxiety Score? 

anxiety.y <- group_by(anxiety, Year) %>%
  summarise(
    count = n(),
    mean = mean(x_anxiety, na.rm = TRUE)
  )

anxiety.y.aov <- aov(x_anxiety ~ Year,data = anxiety)
summary(anxiety.y.aov) #There is not a statistically significant difference by year.
##             Df Sum Sq Mean Sq F value Pr(>F)
## Year         3  4.192  1.3974   1.745  0.175
## Residuals   36 28.824  0.8007
ggline(anxiety,"Year","x_anxiety", 
       add=c("mean_se","jitter"),
       ylab="Anxiety Level",
       main="Means Anxiety Levels by Year in School")

#ANOVA
levene_test(anxiety.y.aov) 
## # A tibble: 1 × 4
##     df1   df2 statistic     p
##   <int> <int>     <dbl> <dbl>
## 1     3    36     0.814 0.495
#p is larger than 0.05; Homogenity is met

par(mfrow=c(2,2))
  plot(anxiety.y.aov,1) #2,26 and 33 flagged as outliers
  plot(anxiety.y.aov,2) #Very fitted, looks good.
  qqPlot(anxiety.y.aov) # Most all in CI, looks good.
## [1] 18 35
  hist(anxiety.y.aov$residuals) #Hm.

shapiro_test(anxiety.y.aov$residuals) #greater than 0.05, so it 
## # A tibble: 1 × 3
##   variable                statistic p.value
##   <chr>                       <dbl>   <dbl>
## 1 anxiety.y.aov$residuals     0.965   0.247
#meets our assumption, which surprises me when looking at that graph tbh
beep(4)