#Target's Pregnancy Algorithm

    # https://www.youtube.com/watch?v=RC5HNTj3Dag

  # Session setup.
  # NOTE(review): the hard-coded working directory and the workspace wipe tie
  # this script to one machine/session; both are kept unchanged here.
  setwd(dir = "~/workshops/challenger")
  rm(list = ls())

  # Printing defaults for the session.
  # NOTE(review): `length` is not a documented R option (possibly `width` was
  # intended) -- TODO confirm before changing.
  options(
    digits = 3,
    scipen = 9999,
    knitr.table.format = "rst",
    length = 120
  )

  # Load the packages used by the analysis through pacman.
  pacman::p_load(
    Amelia, sjPlot, knitr, readr, dplyr, ggplot2,
    stargazer, e1071, randomForest
  )

  # Read Pregnancy_Test.csv (with a header row) from the working directory.
  preg <- read.csv(file = "Pregnancy_Test.csv", header = TRUE)

  # Print summary statistics for every column as a plain-text table.
  stargazer::stargazer(preg, type = "text")
## 
## =====================================================================
## Statistic                N   Mean  St. Dev. Min Pctl(25) Pctl(75) Max
## ---------------------------------------------------------------------
## Pregnancy.Test         1,000 0.011  0.104    0     0        0      1 
## Birth.Control          1,000 0.216  0.412    0     0        0      1 
## Feminine.Hygiene       1,000 0.209  0.407    0     0        0      1 
## Folic.Acid             1,000 0.020  0.140    0     0        0      1 
## Prenatal.Vitamins      1,000 0.043  0.203    0     0        0      1 
## Prenatal.Yoga          1,000 0.005  0.071    0     0        0      1 
## Body.Pillow            1,000 0.008  0.089    0     0        0      1 
## Ginger.Ale             1,000 0.032  0.176    0     0        0      1 
## Sea.Bands              1,000 0.013  0.113    0     0        0      1 
## Stopped.buying.ciggies 1,000 0.050  0.218    0     0        0      1 
## Cigarettes             1,000 0.148  0.355    0     0        0      1 
## Smoking.Cessation      1,000 0.009  0.094    0     0        0      1 
## Stopped.buying.wine    1,000 0.080  0.271    0     0        0      1 
## Wine                   1,000 0.202  0.402    0     0        0      1 
## Maternity.Clothes      1,000 0.052  0.222    0     0        0      1 
## PREGNANT               1,000 0.060  0.238    0     0        0      1 
## ---------------------------------------------------------------------
  # Fit a random forest that predicts PREGNANT from every other column.
  # The response is converted from integer to factor so that randomForest()
  # builds a classification forest rather than a regression forest.
  preg$PREGNANT <- as.factor(preg$PREGNANT)

  # The forest is built from random draws, so fix the RNG seed to make the
  # fit, and every number derived from it, reproducible between runs.
  # The console output that follows was captured before a seed was fixed, so
  # its counts may differ slightly from a fresh run.
  set.seed(42)

  preg_rf <- randomForest::randomForest(
    PREGNANT ~ .,
    data = preg,
    ntree = 200,        # number of trees grown
    importance = TRUE,  # keep importance measures for the plots further down
    mtry = 2            # variables tried at each split
  )

  # Print the fitted model: OOB error rate and OOB confusion matrix.
  preg_rf
## 
## Call:
##  randomForest(formula = PREGNANT ~ ., data = preg, ntree = 200,      importance = TRUE, mtry = 2) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 5.5%
## Confusion matrix:
##     0 1 class.error
## 0 938 2     0.00213
## 1  53 7     0.88333
      # Variable importance for the fitted forest: store the scores returned by
      # caret::varImp() in preg_variables, then draw randomForest's
      # importance plot.
      preg_variables <- caret::varImp(object = preg_rf)
      randomForest::varImpPlot(x = preg_rf)

    # How good is the fitted model?
    # preg_rf$predicted holds the forest's out-of-bag predictions, so this is
    # the OOB accuracy. The outer parentheses print the value as it is
    # assigned.
    (accuracy_rf <- mean(preg_rf$predicted == preg$PREGNANT))

    # Confusion matrix and per-class statistics.
    # caret uses the FIRST factor level ("0", not pregnant) as the positive
    # class unless told otherwise, which makes Sensitivity describe the
    # majority class and hides how rarely pregnant customers are detected.
    # Setting positive = "1" reports Sensitivity/PPV for the pregnant class.
    # The console output that follows was captured with the old default
    # ('Positive' Class : 0): its Sensitivity and Specificity, and its
    # Pos/Neg Pred Value, trade places under this call.
    caret::confusionMatrix(
      data = preg_rf$predicted,
      reference = preg$PREGNANT,
      positive = "1"
    )
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 938  53
##          1   2   7
##                                          
##                Accuracy : 0.945          
##                  95% CI : (0.929, 0.958) 
##     No Information Rate : 0.94           
##     P-Value [Acc > NIR] : 0.279          
##                                          
##                   Kappa : 0.19           
##                                          
##  Mcnemar's Test P-Value : 0.0000000000156
##                                          
##             Sensitivity : 0.998          
##             Specificity : 0.117          
##          Pos Pred Value : 0.947          
##          Neg Pred Value : 0.778          
##              Prevalence : 0.940          
##          Detection Rate : 0.938          
##    Detection Prevalence : 0.991          
##       Balanced Accuracy : 0.557          
##                                          
##        'Positive' Class : 0              
## 
  #==================================================================