# Target's Pregnancy Algorithm
# https://www.youtube.com/watch?v=RC5HNTj3Dag
# Session setup ----
# NOTE(review): setwd() and wiping the global environment make the script
# machine- and session-dependent. They are kept here because code further down
# the file may rely on this working directory; consider a project-relative
# path instead.
setwd("~/workshops/challenger")
remove(list = ls())

# digits/scipen control numeric printing; knitr.table.format is read by knitr.
# NOTE(review): `length` is not a documented base R option -- `width = 120`
# may have been intended. Left unchanged; confirm before editing.
options(digits = 3, scipen = 9999, knitr.table.format = "rst", length = 120)

pacman::p_load(
  Amelia, sjPlot, knitr, readr, dplyr, ggplot2, stargazer, e1071, randomForest
)

# caret is only ever called through `caret::` later in the script and is not
# part of the p_load() list above, so check for it now rather than failing
# after the model has already been fitted.
if (!requireNamespace("caret", quietly = TRUE)) {
  stop(
    "Package 'caret' is required (used via caret::) but is not installed.",
    call. = FALSE
  )
}

# Load the data ----
# TRUE rather than T: `T` is an ordinary variable that can be reassigned.
preg <- read.csv("Pregnancy_Test.csv", header = TRUE)

# Displaying the data: plain-text table of summary statistics per column
stargazer(preg, type = "text")
##
## =====================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## ---------------------------------------------------------------------
## Pregnancy.Test 1,000 0.011 0.104 0 0 0 1
## Birth.Control 1,000 0.216 0.412 0 0 0 1
## Feminine.Hygiene 1,000 0.209 0.407 0 0 0 1
## Folic.Acid 1,000 0.020 0.140 0 0 0 1
## Prenatal.Vitamins 1,000 0.043 0.203 0 0 0 1
## Prenatal.Yoga 1,000 0.005 0.071 0 0 0 1
## Body.Pillow 1,000 0.008 0.089 0 0 0 1
## Ginger.Ale 1,000 0.032 0.176 0 0 0 1
## Sea.Bands 1,000 0.013 0.113 0 0 0 1
## Stopped.buying.ciggies 1,000 0.050 0.218 0 0 0 1
## Cigarettes 1,000 0.148 0.355 0 0 0 1
## Smoking.Cessation 1,000 0.009 0.094 0 0 0 1
## Stopped.buying.wine 1,000 0.080 0.271 0 0 0 1
## Wine 1,000 0.202 0.402 0 0 0 1
## Maternity.Clothes 1,000 0.052 0.222 0 0 0 1
## PREGNANT 1,000 0.060 0.238 0 0 0 1
## ---------------------------------------------------------------------
# Fitting a random forest model ----
# The outcome is stored as 0/1 integers; randomForest() treats a factor
# response as a classification problem, so convert it first.
preg$PREGNANT <- as.factor(preg$PREGNANT)

# The forest is built from random bootstrap samples and random candidate
# variables at each split. Fix the seed so the fit (and therefore the OOB
# error and confusion matrix) is repeatable between runs. The console output
# recorded below in this file came from an unseeded run, so figures may differ
# slightly.
set.seed(2024)

# 200 trees, 2 candidate variables per split, all other columns as predictors;
# importance = TRUE stores the variable-importance measures on the fit.
preg_rf <- randomForest::randomForest(
  PREGNANT ~ .,
  data = preg,
  ntree = 200,
  importance = TRUE,
  mtry = 2
)
preg_rf
##
## Call:
## randomForest(formula = PREGNANT ~ ., data = preg, ntree = 200, importance = TRUE, mtry = 2)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 5.5%
## Confusion matrix:
## 0 1 class.error
## 0 938 2 0.00213
## 1 53 7 0.88333
# Variable importance ----
# Store caret's importance summary for the fitted forest (not used further in
# the visible part of this script), then draw randomForest's importance plot.
preg_variables <- caret::varImp(preg_rf)
randomForest::varImpPlot(preg_rf)

# How good is the fitted model? ----
# Share of rows whose stored forest prediction matches the observed class.
# (randomForest documents `$predicted` as out-of-bag predictions.)
# The outer parentheses print the value as well as assigning it.
(accuracy_rf <- mean(preg_rf$predicted == preg$PREGNANT))
## [1] 0.945
# The class of interest is "pregnant" (level "1"). caret otherwise takes the
# first factor level ("0") as the positive class, which makes sensitivity
# describe how well NON-pregnant customers are detected.
# NOTE: the console output recorded below in this file was produced with that
# default ('Positive' Class : 0). With positive = "1" the confusion table,
# accuracy and kappa are unchanged, while Sensitivity/Specificity and
# Pos/Neg Pred Value swap places and the prevalence/detection figures refer
# to class "1".
caret::confusionMatrix(preg_rf$predicted, preg$PREGNANT, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 938 53
## 1 2 7
##
## Accuracy : 0.945
## 95% CI : (0.929, 0.958)
## No Information Rate : 0.94
## P-Value [Acc > NIR] : 0.279
##
## Kappa : 0.19
##
## Mcnemar's Test P-Value : 0.0000000000156
##
## Sensitivity : 0.998
## Specificity : 0.117
## Pos Pred Value : 0.947
## Neg Pred Value : 0.778
## Prevalence : 0.940
## Detection Rate : 0.938
## Detection Prevalence : 0.991
## Balanced Accuracy : 0.557
##
## 'Positive' Class : 0
##
#==================================================================