GrandFinale_AROD.utf8.md

#THE GRAND FINALE

#Admissions Model

#Prepare an ensemble model to predict admissions.  Use Random Forests
#and logistic regression. Split into training and testing sets.  Use the data
#set binary.csv.

# 1. Show the confusion matrix for the random forest alone.
# 2. Show the confusion matrix for the logistic regression alone.
# 3. Show the confusion matrix for the ensemble model?

# Last: did the ensemble improve over the two individual models? Use Accuracy
# scores to answer the question.

    #=====================================================================#

    setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/Grand_Finale")
    options(digits = 3, scipen = 99999)
    set.seed(12345)
    
    library(caret,quietly = T)
    library(tidyverse, quietly= T)

## -- Attaching packages ------------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v tibble  2.1.1       v purrr   0.3.2  
## v tidyr   0.8.3       v dplyr   0.8.0.1
## v readr   1.3.1       v stringr 1.4.0  
## v tibble  2.1.1       v forcats 0.4.0

## -- Conflicts ---------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()

    library(randomForest, quietly = T)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

    library(SuperLearner, quietly = T)

## Super Learner

## Version: 2.0-24

## Package created on 2018-08-10

    library(dplyr, quietly = T)
    
    suppressWarnings(suppressMessages(library("SuperLearner", quietly = T)))
    #=====================================================================#
    dir()

## [1] "binary.csv"                "dna.csv"                  
## [3] "GrandFinale.R"             "GrandFinale_AROD.html"    
## [5] "GrandFinale_AROD.R"        "GrandFinale_AROD.spin.R"  
## [7] "GrandFinale_AROD.spin.Rmd"

    #remove(list = ls())
    #mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
    mydata <- read.csv("binary.csv")
    ## view the first few rows of the data
    head(mydata)

##   X admit gre  gpa rank id
## 1 1     0 380 3.61    3  1
## 2 2     1 660 3.67    3  2
## 3 3     1 800 4.00    1  3
## 4 4     1 640 3.19    4  4
## 5 5     0 520 2.93    4  5
## 6 6     1 760 3.00    2  6

    #write.csv(mydata, "binary.csv")  
    
            #train and test set
            mydata = mydata %>% dplyr::select( -X, -id)
            train = mydata %>% sample_frac(0.8)
            test = mydata %>% anti_join(train, by = c("gre", "gpa", "rank"))
            
    
    
    #Fit a random forest model ############################################
    model_rf = randomForest(admit ~., data= train)

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

    model_rf_predict = predict(model_rf, newdata = test, 
                               type = c("response"))
    
    model_rf_class = ifelse(model_rf_predict > 0.34, "1","0")
    
    cm_rf = confusionMatrix(as.factor(model_rf_class), as.factor(test$admit))
    objects(cm_rf)

## [1] "byClass"  "dots"     "mode"     "overall"  "positive" "table"

    cm_rf$overall[1]

## Accuracy 
##    0.579

              #Fit a binomial #############################################
              model_glm <- glm(as.factor(admit) ~ ., data = train, family= binomial())
              
              model_glm_predict = predict(model_glm, test, type =c("response"))
              
              model_glm_class = ifelse(model_glm_predict > 0.44, "1","0")
              
              cm_log = confusionMatrix(as.factor(model_glm_class), as.factor(test$admit))
              
              objects(cm_log)

## [1] "byClass"  "dots"     "mode"     "overall"  "positive" "table"

              cm_log$byClass[1]

## Sensitivity 
##       0.942

              cm_log$overall[1]

## Accuracy 
##     0.75

    #Fit an ensemble #####################################################
        #listWrappers()
    # generate Library and run Super Learner
    SL.library <- c("SL.randomForest", "SL.glm")
   
    SL_1 <- SuperLearner(Y = as.numeric(train$admit), X = train[-1], 
                         newX = test[-1], SL.library = SL.library,
                         verbose = FALSE)

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

    SL_1

## 
## Call:  
## SuperLearner(Y = as.numeric(train$admit), X = train[-1], newX = test[-1],  
##     SL.library = SL.library, verbose = FALSE) 
## 
## 
##                      Risk  Coef
## SL.randomForest_All 0.198 0.594
## SL.glm_All          0.200 0.406

    head(SL_1$SL.predict,3)

##    [,1]
## 1 0.142
## 2 0.429
## 3 0.653

    SL_class = ifelse(as.data.frame(SL_1$SL.predict) > 0.54, 1,0)
    
    cm_en = confusionMatrix(as.factor(SL_class), as.factor(test$admit))
    cm_en

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 52 21
##          1  0  3
##                                        
##                Accuracy : 0.724        
##                  95% CI : (0.609, 0.82)
##     No Information Rate : 0.684        
##     P-Value [Acc > NIR] : 0.272        
##                                        
##                   Kappa : 0.164        
##                                        
##  Mcnemar's Test P-Value : 0.0000127    
##                                        
##             Sensitivity : 1.000        
##             Specificity : 0.125        
##          Pos Pred Value : 0.712        
##          Neg Pred Value : 1.000        
##              Prevalence : 0.684        
##          Detection Rate : 0.684        
##    Detection Prevalence : 0.961        
##       Balanced Accuracy : 0.562        
##                                        
##        'Positive' Class : 0            
##

    cm_en$overall[1]

## Accuracy 
##    0.724

    ####### IN SUM ##############
    
          answers = data.frame( 
            Random_Forests = cm_rf$overall[1], 
            Logistic = cm_log$overall[1],
            Ensemble = cm_en$overall[1] 
                              )
    
          answers

##          Random_Forests Logistic Ensemble
## Accuracy          0.579     0.75    0.724

  #============================================================================#

GrandFinale_AROD.R

ARodriguez

2019-05-19