#PROJECT 1
# =============================================================================#
# Smoking and Mortality
# The Whickham data set includes data on age, smoking, and mortality from a one-in-six survey of the
# electoral roll in Whickham, a mixed urban and rural district near Newcastle upon Tyne, in the UK. The
# survey was conducted in 1972-1974 to study heart disease and thyroid disease. A follow-up on those in
# the survey was conducted twenty years later.

# Describe the association between smoking status and mortality in this study. 
# Test using a chi-square test
# to determine whether any association is statistically significant at the 95% level of significance.
# Run a binomial logit to again examine the association but this time controlling for age.
# Show the results using plot_model in the package sjPlot.

      #Preliminaties
      setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/StatisticalReview")
      options(digits =3, scipen = 99999)
      remove(list = ls())
      
      library(scales)
      library(tidyverse)
## -- Attaching packages ------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  2.0.1     v dplyr   0.7.8
## v tidyr   0.8.2     v stringr 1.3.1
## v readr   1.3.1     v forcats 0.3.0
## -- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
      library(ggthemes)
      library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
  #Load data    
  dir()
##  [1] "~$oject 1.docx"            "~WRL2466.tmp"             
##  [3] "DataGenerator_Project_1.R" "Project 1.docx"           
##  [5] "Project 1.pdf"             "Project_1_AROD.html"      
##  [7] "Project_1_AROD.R"          "Project_1_AROD.spin.R"    
##  [9] "Project_1_AROD.spin.Rmd"   "RidingMowersLargeSet.csv" 
## [11] "states_data.csv"           "StatisticalReview_V1.R"   
## [13] "Whickham.csv"              "zombie_grit.csv"
  wh = read.csv("Whickham.csv", header = TRUE)
  
  #Exploratory Data Analysis
  str(wh)
## 'data.frame':    1314 obs. of  3 variables:
##  $ outcome: Factor w/ 2 levels "Alive","Dead": 1 1 2 1 1 1 1 2 1 1 ...
##  $ smoker : Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 2 1 1 1 ...
##  $ age    : int  23 18 71 67 64 38 45 76 28 27 ...
  summary(wh)
##   outcome    smoker         age      
##  Alive:945   No :732   Min.   :18.0  
##  Dead :369   Yes:582   1st Qu.:32.0  
##                        Median :46.0  
##                        Mean   :46.9  
##                        3rd Qu.:61.0  
##                        Max.   :84.0
        table(wh$outcome, wh$smoker)
##        
##          No Yes
##   Alive 502 443
##   Dead  230 139
        prop.table(table(wh$outcome, wh$smoker))
##        
##            No   Yes
##   Alive 0.382 0.337
##   Dead  0.175 0.106
        #Association between smoking status and outcome
        ggplot(wh, aes(x=smoker, fill=outcome)) + 
          geom_bar(position = "fill") + 
          labs(y="Proportion", x = "Smoker") + 
          theme_light()

        #Is there a statistically significant relationship between
        #smoker and outcome?
        chisq.test(x = wh$smoker, y = wh$outcome)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  wh$smoker and wh$outcome
## X-squared = 9, df = 1, p-value = 0.003
        #How about by age group?
        wh = wh %>% 
          mutate(age_group = ifelse(age <= 44, "18-44", 
                                  ifelse(age <= 64, "45-64", "65+")))
        wh %>% 
          group_by(age_group, smoker) %>%
              count(age_group, smoker, outcome) 
## # A tibble: 12 x 4
## # Groups:   age_group, smoker [6]
##    age_group smoker outcome     n
##    <chr>     <fct>  <fct>   <int>
##  1 18-44     No     Alive     327
##  2 18-44     No     Dead       12
##  3 18-44     Yes    Alive     270
##  4 18-44     Yes    Dead       15
##  5 45-64     No     Alive     147
##  6 45-64     No     Dead       53
##  7 45-64     Yes    Alive     167
##  8 45-64     Yes    Dead       80
##  9 65+       No     Alive      28
## 10 65+       No     Dead      165
## 11 65+       Yes    Alive       6
## 12 65+       Yes    Dead       44
        ggplot(wh, aes(x=smoker, fill=outcome)) + 
          geom_bar(position = "fill") + 
            labs(y="Proportion") + 
              facet_grid(. ~ age_group) + 
                theme_light()

                ggplot(wh, aes(x=age_group, fill=outcome)) + 
              geom_bar(position = "fill") + 
            labs(y="Proportion") + 
          facet_grid(. ~ smoker) + 
        theme_light()

        ggplot(wh, aes(x = outcome, y = age)) +
          geom_boxplot()+
            facet_grid(~smoker)+
              theme_light()

        ggplot(wh, aes(x = smoker, y = age)) +
          geom_boxplot()+
            facet_grid(~outcome)+
              theme_light()

        ggplot(wh, aes(x = smoker, y = age)) +
          geom_boxplot()+
          facet_grid(~outcome)

              ################################################
              #Statistical Significance by Age Group
              young = wh %>% filter(age_group == "18-44")
                prop.table(table(young$smoker, young$outcome))
##      
##        Alive   Dead
##   No  0.5240 0.0192
##   Yes 0.4327 0.0240
                  chisq.test(young$smoker, young$outcome)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  young$smoker and young$outcome
## X-squared = 0.7, df = 1, p-value = 0.4
              middle = wh %>% filter(age_group == "45-64")
                prop.table(table(middle$smoker, middle$outcome))
##      
##       Alive  Dead
##   No  0.329 0.119
##   Yes 0.374 0.179
                  chisq.test(middle$smoker, middle$outcome)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  middle$smoker and middle$outcome
## X-squared = 2, df = 1, p-value = 0.2
              old = wh %>% filter(age_group == "65+")
                prop.table(table(old$smoker, old$outcome))
##      
##        Alive   Dead
##   No  0.1152 0.6790
##   Yes 0.0247 0.1811
                  chisq.test(old$smoker, old$outcome)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  old$smoker and old$outcome
## X-squared = 0.05, df = 1, p-value = 0.8
              ################################################
                  
        #########################################################
        #Logistic Regression 
        glm_wh = glm(outcome~ smoker + 
                       age, family = "binomial", data = wh)
        summary(glm_wh)
## 
## Call:
## glm(formula = outcome ~ smoker + age, family = "binomial", data = wh)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.958  -0.546  -0.223   0.438   3.279  
## 
## Coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept) -7.59922    0.44123  -17.22 <0.0000000000000002 ***
## smokerYes    0.20470    0.16842    1.22                0.22    
## age          0.12368    0.00718   17.23 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1560.32  on 1313  degrees of freedom
## Residual deviance:  945.02  on 1311  degrees of freedom
## AIC: 951
## 
## Number of Fisher Scoring iterations: 6
        #Examine the model results
        plot_model(glm_wh, value.offset = 0.2, show.values = T)

        plot_model(glm_wh, type = "pred", terms = c("age", "smoker"))

        plot_model(glm_wh, type = "pred", terms = c("smoker", "age"))

        #########################################################
# =============================================================================#
        
        #======================================================================#
        #Zombie Grit
        #read the file
        dir()
##  [1] "~$oject 1.docx"            "~WRL2466.tmp"             
##  [3] "DataGenerator_Project_1.R" "Project 1.docx"           
##  [5] "Project 1.pdf"             "Project_1_AROD.html"      
##  [7] "Project_1_AROD.R"          "Project_1_AROD.spin.R"    
##  [9] "Project_1_AROD.spin.Rmd"   "RidingMowersLargeSet.csv" 
## [11] "states_data.csv"           "StatisticalReview_V1.R"   
## [13] "Whickham.csv"              "zombie_grit.csv"
        zombie = read.csv("zombie_grit.csv")
        head(zombie)
##   X Salary  Grit Education Performance     Job
## 1 1  10248 0.391    Medium      0.3142  Humans
## 2 2  10249 0.465      High      0.3125 Zombies
## 3 3  10248 0.590    Medium      0.4952 Zombies
## 4 4  10249 0.678      High      0.4552  Humans
## 5 5  10249 0.403      High      0.0883  Humans
## 6 6  10249 0.331       Low      0.6586 Zombies
        str(zombie)
## 'data.frame':    1000 obs. of  6 variables:
##  $ X          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Salary     : num  10248 10249 10248 10249 10249 ...
##  $ Grit       : num  0.391 0.465 0.59 0.678 0.403 ...
##  $ Education  : Factor w/ 3 levels "High","Low","Medium": 3 1 3 1 1 2 3 1 3 3 ...
##  $ Performance: num  0.3142 0.3125 0.4952 0.4552 0.0883 ...
##  $ Job        : Factor w/ 2 levels "Humans","Zombies": 1 2 2 1 1 2 2 1 2 2 ...
        summary(zombie) #note that there are NA's
##        X            Salary           Grit       Education  
##  Min.   :   1   Min.   :10248   Min.   :0.00   High  :248  
##  1st Qu.: 251   1st Qu.:10248   1st Qu.:0.36   Low   :247  
##  Median : 500   Median :10249   Median :0.47   Medium:505  
##  Mean   : 500   Mean   :10249   Mean   :0.47               
##  3rd Qu.: 750   3rd Qu.:10249   3rd Qu.:0.57               
##  Max.   :1000   Max.   :10249   Max.   :1.00               
##                 NA's   :20      NA's   :20                 
##   Performance         Job     
##  Min.   :0.000   Humans :503  
##  1st Qu.:0.373   Zombies:497  
##  Median :0.484                
##  Mean   :0.481                
##  3rd Qu.:0.591                
##  Max.   :1.000                
## 
                ####################################
                #Missing Values
                #Use the function mice() from the package "mice"
                #to impute the missing values; instead of omitting them.
                library(mice)
## Loading required package: lattice
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
                imp <- mice(data = zombie, method = "norm.nob")
## 
##  iter imp variable
##   1   1  Salary  Grit*
##   1   2  Salary  Grit*
##   1   3  Salary  Grit*
##   1   4  Salary  Grit*
##   1   5  Salary  Grit*
##   2   1  Salary  Grit*
##   2   2  Salary  Grit*
##   2   3  Salary  Grit*
##   2   4  Salary  Grit*
##   2   5  Salary  Grit*
##   3   1  Salary  Grit*
##   3   2  Salary  Grit*
##   3   3  Salary  Grit*
##   3   4  Salary  Grit*
##   3   5  Salary  Grit*
##   4   1  Salary  Grit*
##   4   2  Salary  Grit*
##   4   3  Salary  Grit*
##   4   4  Salary  Grit*
##   4   5  Salary  Grit*
##   5   1  Salary  Grit*
##   5   2  Salary  Grit*
##   5   3  Salary  Grit*
##   5   4  Salary  Grit*
##   5   5  Salary  Grit*
##  * Please inspect the loggedEvents
## Warning: Number of logged events: 25
                zombie = complete(imp)  #
                ####################################
        
        
        #Exploratory Data Analysis
                #Scatter Plot: Grit v. Performance 
                zombie %>% ggplot(aes(y = Grit, x = Performance))  +
                  geom_smooth()+
                  geom_point() +
                  theme_light()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

                #Scatter Plot: Grit v. Salary 
                zombie %>% ggplot(aes(y =Grit, Salary))  +
                  geom_smooth()+
                  geom_point() +
                  theme_light()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

        #Is there a lurking variable at play?
        #What happens when you control for Education and for Job type?
        
        #Scatter Plot: Grit v. Performance with Education parametrized
        zombie %>% ggplot(aes(y = Grit, x = Performance, col = Education))  +
          geom_smooth()+
            geom_point() +
              theme_light()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

        #Scatter Plot: Grit v. Salary with Education parametrized
        zombie %>% ggplot(aes(y =Grit, Salary, col = Education))  +
          geom_smooth()+
            geom_point() +
              theme_light()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

        #Scatter Plot: Grit v. Performance with Job Type parametrized
                zombie %>% ggplot(aes(y = Grit, x = Performance, col = Job))  +
              geom_smooth(method = "lm")+
            geom_point() +
          theme_light()

        #Scatter Plot: Grit v. Salary with Job Type parametrized
                zombie %>% ggplot(aes(y =Grit, Salary, col = Job))  +
              geom_smooth(method = "lm")+
            geom_point() +
          theme_light()

      #======================================================================#