#THE GRAND FINALE
#Admissions Model
#Prepare an ensemble model to predict admissions. Use Random Forests
#and logistic regression. Split into training and testing sets. Use the data
#set binary.csv.
# 1. Show the confusion matrix for the random forest alone.
# 2. Show the confusion matrix for the logistic regression alone.
# 3. Show the confusion matrix for the ensemble model?
# Last: did the ensemble improve over the two individual models? Use Accuracy
# scores to answer the question.
#=====================================================================#
setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/Grand_Finale")
options(digits = 3, scipen = 99999)
set.seed(12345)
library(caret,quietly = T)
library(tidyverse, quietly= T)
## -- Attaching packages ------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
library(randomForest, quietly = T)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(SuperLearner, quietly = T)
## Super Learner
## Version: 2.0-24
## Package created on 2018-08-10
library(dplyr, quietly = T)
suppressWarnings(suppressMessages(library("SuperLearner", quietly = T)))
#=====================================================================#
dir()
## [1] "binary.csv" "dna.csv"
## [3] "GrandFinale.R" "GrandFinale_AROD.html"
## [5] "GrandFinale_AROD.R" "GrandFinale_AROD.spin.R"
## [7] "GrandFinale_AROD.spin.Rmd"
#remove(list = ls())
#mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
mydata <- read.csv("binary.csv")
## view the first few rows of the data
head(mydata)
## X admit gre gpa rank id
## 1 1 0 380 3.61 3 1
## 2 2 1 660 3.67 3 2
## 3 3 1 800 4.00 1 3
## 4 4 1 640 3.19 4 4
## 5 5 0 520 2.93 4 5
## 6 6 1 760 3.00 2 6
#write.csv(mydata, "binary.csv")
#train and test set
mydata = mydata %>% dplyr::select( -X, -id)
train = mydata %>% sample_frac(0.8)
test = mydata %>% anti_join(train, by = c("gre", "gpa", "rank"))
#Fit a random forest model ############################################
model_rf = randomForest(admit ~., data= train)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
model_rf_predict = predict(model_rf, newdata = test,
type = c("response"))
model_rf_class = ifelse(model_rf_predict > 0.34, "1","0")
cm_rf = confusionMatrix(as.factor(model_rf_class), as.factor(test$admit))
objects(cm_rf)
## [1] "byClass" "dots" "mode" "overall" "positive" "table"
cm_rf$overall[1]
## Accuracy
## 0.579
#Fit a binomial #############################################
model_glm <- glm(as.factor(admit) ~ ., data = train, family= binomial())
model_glm_predict = predict(model_glm, test, type =c("response"))
model_glm_class = ifelse(model_glm_predict > 0.44, "1","0")
cm_log = confusionMatrix(as.factor(model_glm_class), as.factor(test$admit))
objects(cm_log)
## [1] "byClass" "dots" "mode" "overall" "positive" "table"
cm_log$byClass[1]
## Sensitivity
## 0.942
cm_log$overall[1]
## Accuracy
## 0.75
#Fit an ensemble #####################################################
#listWrappers()
# generate Library and run Super Learner
SL.library <- c("SL.randomForest", "SL.glm")
SL_1 <- SuperLearner(Y = as.numeric(train$admit), X = train[-1],
newX = test[-1], SL.library = SL.library,
verbose = FALSE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
SL_1
##
## Call:
## SuperLearner(Y = as.numeric(train$admit), X = train[-1], newX = test[-1],
## SL.library = SL.library, verbose = FALSE)
##
##
## Risk Coef
## SL.randomForest_All 0.198 0.594
## SL.glm_All 0.200 0.406
head(SL_1$SL.predict,3)
## [,1]
## 1 0.142
## 2 0.429
## 3 0.653
SL_class = ifelse(as.data.frame(SL_1$SL.predict) > 0.54, 1,0)
cm_en = confusionMatrix(as.factor(SL_class), as.factor(test$admit))
cm_en
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 52 21
## 1 0 3
##
## Accuracy : 0.724
## 95% CI : (0.609, 0.82)
## No Information Rate : 0.684
## P-Value [Acc > NIR] : 0.272
##
## Kappa : 0.164
##
## Mcnemar's Test P-Value : 0.0000127
##
## Sensitivity : 1.000
## Specificity : 0.125
## Pos Pred Value : 0.712
## Neg Pred Value : 1.000
## Prevalence : 0.684
## Detection Rate : 0.684
## Detection Prevalence : 0.961
## Balanced Accuracy : 0.562
##
## 'Positive' Class : 0
##
cm_en$overall[1]
## Accuracy
## 0.724
####### IN SUM ##############
answers = data.frame(
Random_Forests = cm_rf$overall[1],
Logistic = cm_log$overall[1],
Ensemble = cm_en$overall[1]
)
answers
## Random_Forests Logistic Ensemble
## Accuracy 0.579 0.75 0.724
#============================================================================#