#PROJECT 1
# =============================================================================#
# Smoking and Mortality
# The Whickham data set includes data on age, smoking, and mortality from a one-in-six survey of the
# electoral roll in Whickham, a mixed urban and rural district near Newcastle upon Tyne, in the UK. The
# survey was conducted in 1972-1974 to study heart disease and thyroid disease. A follow-up on those in
# the survey was conducted twenty years later.
# Describe the association between smoking status and mortality in this study.
# Test using a chi-square test
# to determine whether any association is statistically significant at the 95% level of significance.
# Run a binomial logit to again examine the association but this time controlling for age.
# Show the results using plot_model in the package sjPlot.
#Preliminaties
setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/StatisticalReview")
options(digits =3, scipen = 99999)
remove(list = ls())
library(scales)
library(tidyverse)
## -- Attaching packages ------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 2.0.1 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggthemes)
library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
#Load data
dir()
## [1] "~$oject 1.docx" "~WRL2466.tmp"
## [3] "DataGenerator_Project_1.R" "Project 1.docx"
## [5] "Project 1.pdf" "Project_1_AROD.html"
## [7] "Project_1_AROD.R" "Project_1_AROD.spin.R"
## [9] "Project_1_AROD.spin.Rmd" "RidingMowersLargeSet.csv"
## [11] "states_data.csv" "StatisticalReview_V1.R"
## [13] "Whickham.csv" "zombie_grit.csv"
wh = read.csv("Whickham.csv", header = TRUE)
#Exploratory Data Analysis
str(wh)
## 'data.frame': 1314 obs. of 3 variables:
## $ outcome: Factor w/ 2 levels "Alive","Dead": 1 1 2 1 1 1 1 2 1 1 ...
## $ smoker : Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 2 1 1 1 ...
## $ age : int 23 18 71 67 64 38 45 76 28 27 ...
summary(wh)
## outcome smoker age
## Alive:945 No :732 Min. :18.0
## Dead :369 Yes:582 1st Qu.:32.0
## Median :46.0
## Mean :46.9
## 3rd Qu.:61.0
## Max. :84.0
table(wh$outcome, wh$smoker)
##
## No Yes
## Alive 502 443
## Dead 230 139
prop.table(table(wh$outcome, wh$smoker))
##
## No Yes
## Alive 0.382 0.337
## Dead 0.175 0.106
#Association between smoking status and outcome
ggplot(wh, aes(x=smoker, fill=outcome)) +
geom_bar(position = "fill") +
labs(y="Proportion", x = "Smoker") +
theme_light()

#Is there a statistically significant relationship between
#smoker and outcome?
chisq.test(x = wh$smoker, y = wh$outcome)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: wh$smoker and wh$outcome
## X-squared = 9, df = 1, p-value = 0.003
#How about by age group?
wh = wh %>%
mutate(age_group = ifelse(age <= 44, "18-44",
ifelse(age <= 64, "45-64", "65+")))
wh %>%
group_by(age_group, smoker) %>%
count(age_group, smoker, outcome)
## # A tibble: 12 x 4
## # Groups: age_group, smoker [6]
## age_group smoker outcome n
## <chr> <fct> <fct> <int>
## 1 18-44 No Alive 327
## 2 18-44 No Dead 12
## 3 18-44 Yes Alive 270
## 4 18-44 Yes Dead 15
## 5 45-64 No Alive 147
## 6 45-64 No Dead 53
## 7 45-64 Yes Alive 167
## 8 45-64 Yes Dead 80
## 9 65+ No Alive 28
## 10 65+ No Dead 165
## 11 65+ Yes Alive 6
## 12 65+ Yes Dead 44
ggplot(wh, aes(x=smoker, fill=outcome)) +
geom_bar(position = "fill") +
labs(y="Proportion") +
facet_grid(. ~ age_group) +
theme_light()

ggplot(wh, aes(x=age_group, fill=outcome)) +
geom_bar(position = "fill") +
labs(y="Proportion") +
facet_grid(. ~ smoker) +
theme_light()

ggplot(wh, aes(x = outcome, y = age)) +
geom_boxplot()+
facet_grid(~smoker)+
theme_light()

ggplot(wh, aes(x = smoker, y = age)) +
geom_boxplot()+
facet_grid(~outcome)+
theme_light()

ggplot(wh, aes(x = smoker, y = age)) +
geom_boxplot()+
facet_grid(~outcome)

################################################
#Statistical Significance by Age Group
young = wh %>% filter(age_group == "18-44")
prop.table(table(young$smoker, young$outcome))
##
## Alive Dead
## No 0.5240 0.0192
## Yes 0.4327 0.0240
chisq.test(young$smoker, young$outcome)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: young$smoker and young$outcome
## X-squared = 0.7, df = 1, p-value = 0.4
middle = wh %>% filter(age_group == "45-64")
prop.table(table(middle$smoker, middle$outcome))
##
## Alive Dead
## No 0.329 0.119
## Yes 0.374 0.179
chisq.test(middle$smoker, middle$outcome)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: middle$smoker and middle$outcome
## X-squared = 2, df = 1, p-value = 0.2
old = wh %>% filter(age_group == "65+")
prop.table(table(old$smoker, old$outcome))
##
## Alive Dead
## No 0.1152 0.6790
## Yes 0.0247 0.1811
chisq.test(old$smoker, old$outcome)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: old$smoker and old$outcome
## X-squared = 0.05, df = 1, p-value = 0.8
################################################
#########################################################
#Logistic Regression
glm_wh = glm(outcome~ smoker +
age, family = "binomial", data = wh)
summary(glm_wh)
##
## Call:
## glm(formula = outcome ~ smoker + age, family = "binomial", data = wh)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.958 -0.546 -0.223 0.438 3.279
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.59922 0.44123 -17.22 <0.0000000000000002 ***
## smokerYes 0.20470 0.16842 1.22 0.22
## age 0.12368 0.00718 17.23 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1560.32 on 1313 degrees of freedom
## Residual deviance: 945.02 on 1311 degrees of freedom
## AIC: 951
##
## Number of Fisher Scoring iterations: 6
#Examine the model results
plot_model(glm_wh, value.offset = 0.2, show.values = T)

plot_model(glm_wh, type = "pred", terms = c("age", "smoker"))

plot_model(glm_wh, type = "pred", terms = c("smoker", "age"))

#########################################################
# =============================================================================#
#======================================================================#
#Zombie Grit
#read the file
dir()
## [1] "~$oject 1.docx" "~WRL2466.tmp"
## [3] "DataGenerator_Project_1.R" "Project 1.docx"
## [5] "Project 1.pdf" "Project_1_AROD.html"
## [7] "Project_1_AROD.R" "Project_1_AROD.spin.R"
## [9] "Project_1_AROD.spin.Rmd" "RidingMowersLargeSet.csv"
## [11] "states_data.csv" "StatisticalReview_V1.R"
## [13] "Whickham.csv" "zombie_grit.csv"
zombie = read.csv("zombie_grit.csv")
head(zombie)
## X Salary Grit Education Performance Job
## 1 1 10248 0.391 Medium 0.3142 Humans
## 2 2 10249 0.465 High 0.3125 Zombies
## 3 3 10248 0.590 Medium 0.4952 Zombies
## 4 4 10249 0.678 High 0.4552 Humans
## 5 5 10249 0.403 High 0.0883 Humans
## 6 6 10249 0.331 Low 0.6586 Zombies
str(zombie)
## 'data.frame': 1000 obs. of 6 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Salary : num 10248 10249 10248 10249 10249 ...
## $ Grit : num 0.391 0.465 0.59 0.678 0.403 ...
## $ Education : Factor w/ 3 levels "High","Low","Medium": 3 1 3 1 1 2 3 1 3 3 ...
## $ Performance: num 0.3142 0.3125 0.4952 0.4552 0.0883 ...
## $ Job : Factor w/ 2 levels "Humans","Zombies": 1 2 2 1 1 2 2 1 2 2 ...
summary(zombie) #note that there are NA's
## X Salary Grit Education
## Min. : 1 Min. :10248 Min. :0.00 High :248
## 1st Qu.: 251 1st Qu.:10248 1st Qu.:0.36 Low :247
## Median : 500 Median :10249 Median :0.47 Medium:505
## Mean : 500 Mean :10249 Mean :0.47
## 3rd Qu.: 750 3rd Qu.:10249 3rd Qu.:0.57
## Max. :1000 Max. :10249 Max. :1.00
## NA's :20 NA's :20
## Performance Job
## Min. :0.000 Humans :503
## 1st Qu.:0.373 Zombies:497
## Median :0.484
## Mean :0.481
## 3rd Qu.:0.591
## Max. :1.000
##
####################################
#Missing Values
#Use the function mice() from the package "mice"
#to impute the missing values; instead of omitting them.
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
imp <- mice(data = zombie, method = "norm.nob")
##
## iter imp variable
## 1 1 Salary Grit*
## 1 2 Salary Grit*
## 1 3 Salary Grit*
## 1 4 Salary Grit*
## 1 5 Salary Grit*
## 2 1 Salary Grit*
## 2 2 Salary Grit*
## 2 3 Salary Grit*
## 2 4 Salary Grit*
## 2 5 Salary Grit*
## 3 1 Salary Grit*
## 3 2 Salary Grit*
## 3 3 Salary Grit*
## 3 4 Salary Grit*
## 3 5 Salary Grit*
## 4 1 Salary Grit*
## 4 2 Salary Grit*
## 4 3 Salary Grit*
## 4 4 Salary Grit*
## 4 5 Salary Grit*
## 5 1 Salary Grit*
## 5 2 Salary Grit*
## 5 3 Salary Grit*
## 5 4 Salary Grit*
## 5 5 Salary Grit*
## * Please inspect the loggedEvents
## Warning: Number of logged events: 25
zombie = complete(imp) #
####################################
#Exploratory Data Analysis
#Scatter Plot: Grit v. Performance
zombie %>% ggplot(aes(y = Grit, x = Performance)) +
geom_smooth()+
geom_point() +
theme_light()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Scatter Plot: Grit v. Salary
zombie %>% ggplot(aes(y =Grit, Salary)) +
geom_smooth()+
geom_point() +
theme_light()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#Is there a lurking variable at play?
#What happens when you control for Education and for Job type?
#Scatter Plot: Grit v. Performance with Education parametrized
zombie %>% ggplot(aes(y = Grit, x = Performance, col = Education)) +
geom_smooth()+
geom_point() +
theme_light()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Scatter Plot: Grit v. Salary with Education parametrized
zombie %>% ggplot(aes(y =Grit, Salary, col = Education)) +
geom_smooth()+
geom_point() +
theme_light()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Scatter Plot: Grit v. Performance with Job Type parametrized
zombie %>% ggplot(aes(y = Grit, x = Performance, col = Job)) +
geom_smooth(method = "lm")+
geom_point() +
theme_light()

#Scatter Plot: Grit v. Salary with Job Type parametrized
zombie %>% ggplot(aes(y =Grit, Salary, col = Job)) +
geom_smooth(method = "lm")+
geom_point() +
theme_light()

#======================================================================#