#
# Imbalanced Data Project
#
# The dataset: Wisconsin Breast Cancer Data - contains data on Malignant and
# Benign cancers. More detail here: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/home
#
# The data is slightly imbalanced: approximately twice as many
# instances of Benign ("B") as there are of Malignant ("M").
# 1. Fit a logistic classification model on the unadjusted data.
# Obtain the following two
# performance metrics: Accuracy (from confusionMatrix in the package caret)
# and AUC from the package ROSE.
# 2. Adjust the data by resampling with ovun.sample() and its "both"
# method (combined over- and under-sampling); ovun.sample() is in the
# package ROSE.
# Fit a logistic classification model on the adjusted sample.
#
# Obtain the same two
# performance metrics: Accuracy (from confusionMatrix in the package caret)
# and AUC from the package ROSE.
# 3. Display the results of the head-to-head comparison graphically.
# 4. Use the function colAUC() from the package caTools
# to draw both ROCs side by side,
# as follows:
#library(caTools)
#preds = cbind(pred.glm.both, logit_pred, test$diagnosis)
#colAUC(preds, test$diagnosis, plotROC = TRUE)
# Where pred.glm.both is the predicted output of the logistic
# model fit on the adjusted data,
# and logit_pred is the predicted output of the logistic
# model fit on the unadjusted data;
# test$diagnosis is the classification variable in the test subset.
# 5. Which dataset leads to a better result?
#
# ############################################################################
#Preliminaries
setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/Topics_ImbalanceClassificationImputation/imbalance_project")
options(digits = 3, scipen = 9999)
remove(list = ls())
#Load libraries
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 2.0.1 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mosaic)
## Loading required package: lattice
## Loading required package: ggformula
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
##
## dotPlot
## The following object is masked from 'package:purrr':
##
## lift
library(sjPlot)
library(ROSE)
## Loaded ROSE 0.0-3
library(caTools)
###########################################################################################
# Read the dataset
dir()
## [1] "CreditCard.csv" "creditcard.zip"
## [3] "creditcard_im.csv" "imbalance_project_1.R"
## [5] "imbalance_project_1_AROD.R" "imbalance_project_1_AROD.spin.R"
## [7] "imbalance_project_1_AROD.spin.Rmd" "wisc_bc_data.csv"
bc <- read.csv("wisc_bc_data.csv")
## familiarize yourself with the data
names(bc)
## [1] "id" "diagnosis"
## [3] "radius_mean" "texture_mean"
## [5] "perimeter_mean" "area_mean"
## [7] "smoothness_mean" "compactness_mean"
## [9] "concavity_mean" "concave.points_mean"
## [11] "symmetry_mean" "fractal_dimension_mean"
## [13] "radius_se" "texture_se"
## [15] "perimeter_se" "area_se"
## [17] "smoothness_se" "compactness_se"
## [19] "concavity_se" "concave.points_se"
## [21] "symmetry_se" "fractal_dimension_se"
## [23] "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst"
## [27] "smoothness_worst" "compactness_worst"
## [29] "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_dimension_worst"
str(bc)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
head(bc)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 18.0 10.4 122.8 1001
## 2 842517 M 20.6 17.8 132.9 1326
## 3 84300903 M 19.7 21.2 130.0 1203
## 4 84348301 M 11.4 20.4 77.6 386
## 5 84358402 M 20.3 14.3 135.1 1297
## 6 843786 M 12.4 15.7 82.6 477
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.1184 0.2776 0.3001 0.1471
## 2 0.0847 0.0786 0.0869 0.0702
## 3 0.1096 0.1599 0.1974 0.1279
## 4 0.1425 0.2839 0.2414 0.1052
## 5 0.1003 0.1328 0.1980 0.1043
## 6 0.1278 0.1700 0.1578 0.0809
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.242 0.0787 1.095 0.905 8.59
## 2 0.181 0.0567 0.543 0.734 3.40
## 3 0.207 0.0600 0.746 0.787 4.58
## 4 0.260 0.0974 0.496 1.156 3.44
## 5 0.181 0.0588 0.757 0.781 5.44
## 6 0.209 0.0761 0.335 0.890 2.22
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.4 0.00640 0.0490 0.0537 0.0159
## 2 74.1 0.00522 0.0131 0.0186 0.0134
## 3 94.0 0.00615 0.0401 0.0383 0.0206
## 4 27.2 0.00911 0.0746 0.0566 0.0187
## 5 94.4 0.01149 0.0246 0.0569 0.0188
## 6 27.2 0.00751 0.0335 0.0367 0.0114
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 1 0.0300 0.00619 25.4 17.3
## 2 0.0139 0.00353 25.0 23.4
## 3 0.0225 0.00457 23.6 25.5
## 4 0.0596 0.00921 14.9 26.5
## 5 0.0176 0.00511 22.5 16.7
## 6 0.0216 0.00508 15.5 23.8
## perimeter_worst area_worst smoothness_worst compactness_worst
## 1 184.6 2019 0.162 0.666
## 2 158.8 1956 0.124 0.187
## 3 152.5 1709 0.144 0.424
## 4 98.9 568 0.210 0.866
## 5 152.2 1575 0.137 0.205
## 6 103.4 742 0.179 0.525
## concavity_worst concave.points_worst symmetry_worst
## 1 0.712 0.265 0.460
## 2 0.242 0.186 0.275
## 3 0.450 0.243 0.361
## 4 0.687 0.258 0.664
## 5 0.400 0.163 0.236
## 6 0.535 0.174 0.399
## fractal_dimension_worst
## 1 0.1189
## 2 0.0890
## 3 0.0876
## 4 0.1730
## 5 0.0768
## 6 0.1244
tally(~diagnosis, data = bc, format = c("proportion") )
## diagnosis
## B M
## 0.627 0.373
## Exploratory Data Analysis (i.e., examine the data visually)
ggplot(bc, aes(radius_mean, col = as.factor(diagnosis))) +
geom_density()

ggplot(bc, aes(texture_mean, col = as.factor(diagnosis))) +
geom_density()

ggplot(bc, aes(x = diagnosis, y = perimeter_mean)) +
geom_boxplot() + xlab("Diagnosis")

## Fit a logistic model on the unbalanced data ##########################################
#============================================================================================
# There is an imbalance in the breast cancer data:
# perhaps too few "M"s.
#train and test set
bc$id = NULL # remove the annoying id column that comes with the data
bc = bc %>% mutate(ID = 1:nrow(bc)) # add a row index so anti_join() can split train/test
train = sample_frac(bc, 0.8)
test = bc %>% anti_join(train, by = "ID")
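# NB: no random seed is set, so this 80/20 split (and every result below)
# will change from run to run; calling e.g. set.seed(123) before sample_frac()
# would make the analysis reproducible (123 is an arbitrary example value).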
## But first fit the model on the training data set
model_logit_train= glm(diagnosis ~ radius_mean +
texture_mean +
perimeter_mean +
area_mean +
smoothness_mean +
compactness_mean +
concavity_mean,
data = train,
family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_logit_train)
##
## Call:
## glm(formula = diagnosis ~ radius_mean + texture_mean + perimeter_mean +
## area_mean + smoothness_mean + compactness_mean + concavity_mean,
## family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6604 -0.1765 -0.0465 0.0014 3.0425
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.3132 11.1741 -1.46 0.1443
## radius_mean -2.9425 4.1617 -0.71 0.4795
## texture_mean 0.3754 0.0718 5.23 0.00000017 ***
## perimeter_mean 0.0984 0.5487 0.18 0.8576
## area_mean 0.0434 0.0198 2.20 0.0279 *
## smoothness_mean 131.4152 29.4002 4.47 0.00000783 ***
## compactness_mean -2.6860 18.7126 -0.14 0.8859
## concavity_mean 16.4067 6.3129 2.60 0.0094 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 593.68 on 454 degrees of freedom
## Residual deviance: 126.39 on 447 degrees of freedom
## AIC: 142.4
##
## Number of Fisher Scoring iterations: 9
## create predictions for the test (evaluation) data set
logit_predict = predict(model_logit_train, newdata = test, type = "response")
logit_pred = as.factor(ifelse(logit_predict > 0.50, "M", "B"))
confusionMatrix(logit_pred, as.factor(test$diagnosis))
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 60 1
## M 5 48
##
## Accuracy : 0.947
## 95% CI : (0.889, 0.98)
## No Information Rate : 0.57
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.894
## Mcnemar's Test P-Value : 0.221
##
## Sensitivity : 0.923
## Specificity : 0.980
## Pos Pred Value : 0.984
## Neg Pred Value : 0.906
## Prevalence : 0.570
## Detection Rate : 0.526
## Detection Prevalence : 0.535
## Balanced Accuracy : 0.951
##
## 'Positive' Class : B
##
cm_glm = confusionMatrix(logit_pred, as.factor(test$diagnosis))
roc.curve(as.factor(test$diagnosis), logit_predict, plotit = T)

## Area under the curve (AUC): 0.988
accuracy.meas(as.factor(test$diagnosis), logit_predict, threshold = 0.50)
##
## Call:
## accuracy.meas(response = as.factor(test$diagnosis), predicted = logit_predict,
## threshold = 0.5)
##
## Examples are labelled as positive when predicted is greater than 0.5
##
## precision: 0.906
## recall: 0.980
## F: 0.471
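# NB: as printed here, F equals precision*recall/(precision + recall)
# (0.906*0.980/1.886 ~ 0.471), i.e. half the conventional F1 of ~0.94.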
auc_glm = roc.curve(test$diagnosis, logit_predict, plotit = TRUE, col = 4)

auc_glm[2]
## $auc
## [1] 0.988
# The library ROSE has several features capable of dealing with imbalance.
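# A quick sketch of the alternatives (illustrative, untuned values, with
# diagnosis ~ . standing in for the predictor formula used below):
# data_over  <- ovun.sample(diagnosis ~ ., data = train, method = "over",
#                           N = 2 * sum(train$diagnosis == "B"))$data # oversample "M"
# data_under <- ovun.sample(diagnosis ~ ., data = train, method = "under",
#                           N = 2 * sum(train$diagnosis == "M"))$data # undersample "B"
# data_rose  <- ROSE(diagnosis ~ ., data = train)$data # synthetic data via smoothed bootstrap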
#"both" sampling
data_balanced_both <- ovun.sample(diagnosis ~ radius_mean +
texture_mean +
perimeter_mean +
area_mean +
smoothness_mean +
compactness_mean +
concavity_mean,
data = train, method = "both", N = nrow(train))
data_balanced_both = data_balanced_both$data
table(data_balanced_both$diagnosis)
##
## B M
## 225 230
prop.table(table(data_balanced_both$diagnosis))
##
## B M
## 0.495 0.505
# build a logistic regression predictive model on the balanced dataset
glm.both <- glm(diagnosis ~ radius_mean +
texture_mean +
perimeter_mean +
area_mean +
smoothness_mean +
compactness_mean +
concavity_mean,
data = data_balanced_both, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
plot_model(glm.both, type = "est", value.offset = 0.15,
show.values = TRUE, vline.color = "blue", sort.est = TRUE) + ggtitle("Both")
## Warning: Removed 1 rows containing missing values (geom_errorbar).

#Confusion Matrix Result
pred.glm.both <- predict(glm.both, newdata = test, type = "response")
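# NB: the 0.8 cutoff below is a high bar for predicting "M"; with training data
# rebalanced to roughly 50/50, a 0.5 cutoff is the conventional default.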
pred.glm.both = as.factor(ifelse(pred.glm.both >=0.8, "M", "B"))
confusionMatrix(pred.glm.both, as.factor(test$diagnosis))
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 62 3
## M 6 43
##
## Accuracy : 0.921
## 95% CI : (0.855, 0.963)
## No Information Rate : 0.596
## P-Value [Acc > NIR] : 0.00000000000000575
##
## Kappa : 0.838
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.912
## Specificity : 0.935
## Pos Pred Value : 0.954
## Neg Pred Value : 0.878
## Prevalence : 0.596
## Detection Rate : 0.544
## Detection Prevalence : 0.570
## Balanced Accuracy : 0.923
##
## 'Positive' Class : B
##
cm_both = confusionMatrix(pred.glm.both, as.factor(test$diagnosis))
cm_both$overall[1]
## Accuracy
## 0.921
#AUC Result
auc_both = roc.curve(test$diagnosis, pred.glm.both,
plotit = TRUE, col = 4)

auc_both
## Area under the curve (AUC): 0.916
auc_both[2]
## $auc
## [1] 0.916
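# NB: pred.glm.both is a factor of hard class labels, so this AUC reflects the
# single 0.8 cutoff rather than the full probability ranking; the 0.988 above
# was computed from predicted probabilities, so the two are not strictly comparable.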
#########################################
#DataSet Results Comparison
model_compare <- data.frame(Model = c("Logistic", "Both"),
                            Accuracy = c(
                              round(cm_glm$overall[1], 2),
                              round(cm_both$overall[1], 2)),
                            AUC = c(auc_glm$auc,
                                    auc_both$auc)
)
model_compare
##      Model Accuracy   AUC
## 1 Logistic     0.95 0.988
## 2     Both     0.92 0.916
ggplot(aes(x=Model, y=Accuracy), data=model_compare) +
geom_bar(stat="identity", fill = 'yellow') +
ggtitle('Comparative Accuracy: Unadjusted vs. Balanced Training Data') +
xlab('Models') +
ylab('Overall Accuracy') +
geom_text(aes(label = Accuracy), size = 5, hjust = 0.5,
vjust = 3, col = "red")

ggplot(aes(x=Model, y=AUC), data=model_compare) +
geom_bar(stat='identity', fill = 'light blue') +
ggtitle('Comparative AUC: Unadjusted vs. Balanced Training Data') +
xlab('Models') +
ylab('Area Under the ROC curve') +
geom_text(aes(label = AUC), size = 5, hjust = 0.5,
vjust = 3, col = "red")

# Plot the ROC curves for the two "models" side by side
preds = cbind(pred.glm.both, logit_pred, test$diagnosis)
colAUC(preds, test$diagnosis, plotROC = TRUE)

## pred.glm.both logit_pred
## B vs. M 0.916 0.951 1
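# 5. On this run, the unadjusted data gives the better result: accuracy 0.95 vs
# 0.92 and label-based AUC 0.951 vs 0.916 (without a fixed seed, the exact
# numbers will vary from run to run).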
#===============================================================================#