#
# Imbalanced Data Project
#
# The dataset: Wisconsin Breast Cancer Data - contains data on Malignant and
# Benign cancers. More detail here: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data/home
#
# The data is slightly imbalanced: approximately twice as many
# instances of Benign ("B") as there are of Malignant ("M").
# 1. Fit a logistic classification model on the unadjusted data.
# Obtain the following two
# performance metrics: Accuracy (from confusionMatrix in the package caret)
# and AUC from the package ROSE.
# 2. Adjust the data by resampling with ovun.sample() and its "both"
# method (combined over- and under-sampling); ovun.sample() is in the
# package ROSE.
# Fit a logistic classification model on the adjusted sample.
#
# Obtain the same two
# performance metrics: Accuracy (from confusionMatrix in the package caret)
# and AUC from the package ROSE.
# 3. Display the results of the head-to-head comparison graphically.
# 4. Use the function colAUC() from the package caTools
# to draw both ROCs side by side,
# as follows:
#library(caTools)
#preds = cbind(pred.glm.both, logit_pred, test$diagnosis)
#colAUC(preds, test$diagnosis, plotROC = TRUE)
# Where pred.glm.both is the predicted output of the logistic
# model fit on the adjusted data,
# and logit_pred is the predicted output of the logistic
# model fit on the unadjusted data;
# test$diagnosis is the classification variable in the test subset.
# 5. Which dataset leads to a better result?
#
# ############################################################################
#Preliminaries
setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/Topics_ImbalanceClassificationImputation/imbalance_project")
options(digits = 3, scipen = 9999)
remove(list = ls())
#Load libraries
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 2.0.1 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mosaic)
## Loading required package: lattice
## Loading required package: ggformula
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
##
## dotPlot
## The following object is masked from 'package:purrr':
##
## lift
library(sjPlot)
library(ROSE)
## Loaded ROSE 0.0-3
library(caTools)
###########################################################################################
# Read the dataset
dir()
## [1] "CreditCard.csv" "creditcard.zip"
## [3] "creditcard_im.csv" "imbalance_project_1.R"
## [5] "imbalance_project_1_AROD.R" "imbalance_project_1_AROD.spin.R"
## [7] "imbalance_project_1_AROD.spin.Rmd" "wisc_bc_data.csv"
bc <- read.csv("wisc_bc_data.csv")
## familiarize yourself with the data
names(bc)
## [1] "id" "diagnosis"
## [3] "radius_mean" "texture_mean"
## [5] "perimeter_mean" "area_mean"
## [7] "smoothness_mean" "compactness_mean"
## [9] "concavity_mean" "concave.points_mean"
## [11] "symmetry_mean" "fractal_dimension_mean"
## [13] "radius_se" "texture_se"
## [15] "perimeter_se" "area_se"
## [17] "smoothness_se" "compactness_se"
## [19] "concavity_se" "concave.points_se"
## [21] "symmetry_se" "fractal_dimension_se"
## [23] "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst"
## [27] "smoothness_worst" "compactness_worst"
## [29] "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_dimension_worst"
str(bc)
## 'data.frame': 569 obs. of 32 variables:
## $ id : int 842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
## $ diagnosis : Factor w/ 2 levels "B","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave.points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave.points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave.points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
head(bc)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1 842302 M 18.0 10.4 122.8 1001
## 2 842517 M 20.6 17.8 132.9 1326
## 3 84300903 M 19.7 21.2 130.0 1203
## 4 84348301 M 11.4 20.4 77.6 386
## 5 84358402 M 20.3 14.3 135.1 1297
## 6 843786 M 12.4 15.7 82.6 477
## smoothness_mean compactness_mean concavity_mean concave.points_mean
## 1 0.1184 0.2776 0.3001 0.1471
## 2 0.0847 0.0786 0.0869 0.0702
## 3 0.1096 0.1599 0.1974 0.1279
## 4 0.1425 0.2839 0.2414 0.1052
## 5 0.1003 0.1328 0.1980 0.1043
## 6 0.1278 0.1700 0.1578 0.0809
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1 0.242 0.0787 1.095 0.905 8.59
## 2 0.181 0.0567 0.543 0.734 3.40
## 3 0.207 0.0600 0.746 0.787 4.58
## 4 0.260 0.0974 0.496 1.156 3.44
## 5 0.181 0.0588 0.757 0.781 5.44
## 6 0.209 0.0761 0.335 0.890 2.22
## area_se smoothness_se compactness_se concavity_se concave.points_se
## 1 153.4 0.00640 0.0490 0.0537 0.0159
## 2 74.1 0.00522 0.0131 0.0186 0.0134
## 3 94.0 0.00615 0.0401 0.0383 0.0206
## 4 27.2 0.00911 0.0746 0.0566 0.0187
## 5 94.4 0.01149 0.0246 0.0569 0.0188
## 6 27.2 0.00751 0.0335 0.0367 0.0114
## symmetry_se fractal_dimension_se radius_worst texture_worst
## 1 0.0300 0.00619 25.4 17.3
## 2 0.0139 0.00353 25.0 23.4
## 3 0.0225 0.00457 23.6 25.5
## 4 0.0596 0.00921 14.9 26.5
## 5 0.0176 0.00511 22.5 16.7
## 6 0.0216 0.00508 15.5 23.8
## perimeter_worst area_worst smoothness_worst compactness_worst
## 1 184.6 2019 0.162 0.666
## 2 158.8 1956 0.124 0.187
## 3 152.5 1709 0.144 0.424
## 4 98.9 568 0.210 0.866
## 5 152.2 1575 0.137 0.205
## 6 103.4 742 0.179 0.525
## concavity_worst concave.points_worst symmetry_worst
## 1 0.712 0.265 0.460
## 2 0.242 0.186 0.275
## 3 0.450 0.243 0.361
## 4 0.687 0.258 0.664
## 5 0.400 0.163 0.236
## 6 0.535 0.174 0.399
## fractal_dimension_worst
## 1 0.1189
## 2 0.0890
## 3 0.0876
## 4 0.1730
## 5 0.0768
## 6 0.1244
tally(~diagnosis, data = bc, format = c("proportion") )
## diagnosis
## B M
## 0.627 0.373
## Exploratory Data Analysis (i.e., examine the data visually)
ggplot(bc, aes(radius_mean, col = as.factor(diagnosis))) +
geom_density()

ggplot(bc, aes(texture_mean, col = as.factor(diagnosis))) +
geom_density()

ggplot(bc, aes(x = diagnosis, y = perimeter_mean)) +
geom_boxplot() + xlab("Diagnosis")

## Fit a logistic model on the unbalanced data ##########################################
#============================================================================================
# There is an imbalance in the breast cancer data:
# perhaps too few "M"s.
#train and test set
bc$id = NULL # remove the annoying id column that comes with the data
bc = bc %>% mutate(ID = 1:nrow(bc)) # add a row index so anti_join() can split train/test
train = sample_frac(bc, 0.8)
test = bc %>% anti_join(train, by = "ID")
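# NB: no random seed is set, so this 80/20 split (and every result below)
# will change from run to run; calling e.g. set.seed(123) before sample_frac()
# would make the analysis reproducible (123 is an arbitrary example value).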
## But first fit the model on the training data set
model_logit_train= glm(diagnosis ~ radius_mean +
texture_mean +
perimeter_mean +
area_mean +
smoothness_mean +
compactness_mean +
concavity_mean,
data = train,
family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_logit_train)
##
## Call:
## glm(formula = diagnosis ~ radius_mean + texture_mean + perimeter_mean +
## area_mean + smoothness_mean + compactness_mean + concavity_mean,
## family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6604 -0.1765 -0.0465 0.0014 3.0425
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.3132 11.1741 -1.46 0.1443
## radius_mean -2.9425 4.1617 -0.71 0.4795
## texture_mean 0.3754 0.0718 5.23 0.00000017 ***
## perimeter_mean 0.0984 0.5487 0.18 0.8576
## area_mean 0.0434 0.0198 2.20 0.0279 *
## smoothness_mean 131.4152 29.4002 4.47 0.00000783 ***
## compactness_mean -2.6860 18.7126 -0.14 0.8859
## concavity_mean 16.4067 6.3129 2.60 0.0094 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 593.68 on 454 degrees of freedom
## Residual deviance: 126.39 on 447 degrees of freedom
## AIC: 142.4
##
## Number of Fisher Scoring iterations: 9
## create predictions for the test (evaluation) data set
logit_predict = predict(model_logit_train, newdata = test, type = "response")
logit_pred = as.factor(ifelse(logit_predict > 0.50, "M", "B"))
confusionMatrix(logit_pred, as.factor(test$diagnosis))
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 60 1
## M 5 48
##
## Accuracy : 0.947
## 95% CI : (0.889, 0.98)
## No Information Rate : 0.57
## P-Value [Acc > NIR] : <0.0000000000000002
##
## Kappa : 0.894
## Mcnemar's Test P-Value : 0.221
##
## Sensitivity : 0.923
## Specificity : 0.980
## Pos Pred Value : 0.984
## Neg Pred Value : 0.906
## Prevalence : 0.570
## Detection Rate : 0.526
## Detection Prevalence : 0.535
## Balanced Accuracy : 0.951
##
## 'Positive' Class : B
##
cm_glm = confusionMatrix(logit_pred, as.factor(test$diagnosis))
roc.curve(as.factor(test$diagnosis), logit_predict, plotit = T)

## Area under the curve (AUC): 0.988
accuracy.meas(as.factor(test$diagnosis), logit_predict, threshold = 0.50)
##
## Call:
## accuracy.meas(response = as.factor(test$diagnosis), predicted = logit_predict,
## threshold = 0.5)
##
## Examples are labelled as positive when predicted is greater than 0.5
##
## precision: 0.906
## recall: 0.980
## F: 0.471
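# NB: as printed here, F equals precision*recall/(precision + recall)
# (0.906*0.980/1.886 ~ 0.471), i.e. half the conventional F1 of ~0.94.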
auc_glm = roc.curve(test$diagnosis, logit_predict, plotit = TRUE, col = 4)

auc_glm[2]
## $auc
## [1] 0.988
# The library ROSE has several features capable of dealing with imbalance.
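# A quick sketch of the alternatives (illustrative, untuned values, with
# diagnosis ~ . standing in for the predictor formula used below):
# data_over  <- ovun.sample(diagnosis ~ ., data = train, method = "over",
#                           N = 2 * sum(train$diagnosis == "B"))$data # oversample "M"
# data_under <- ovun.sample(diagnosis ~ ., data = train, method = "under",
#                           N = 2 * sum(train$diagnosis == "M"))$data # undersample "B"
# data_rose  <- ROSE(diagnosis ~ ., data = train)$data # synthetic data via smoothed bootstrap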
#"both" sampling
data_balanced_both <- ovun.sample(diagnosis ~ radius_mean +
texture_mean +
perimeter_mean +
area_mean +
smoothness_mean +
compactness_mean +
concavity_mean,
data = train, method = "both", N = nrow(train))
data_balanced_both = data_balanced_both$data
table(data_balanced_both$diagnosis)
##
## B M
## 225 230
prop.table(table(data_balanced_both$diagnosis))
##
## B M
## 0.495 0.505
# build a logistic regression predictive model on the balanced dataset
glm.both <- glm(diagnosis ~ radius_mean +
texture_mean +
perimeter_mean +
area_mean +
smoothness_mean +
compactness_mean +
concavity_mean,
data = data_balanced_both, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
plot_model(glm.both, type = "est", value.offset = 0.15,
show.values = TRUE, vline.color = "blue", sort.est = TRUE) + ggtitle("Both")
## Warning: Removed 1 rows containing missing values (geom_errorbar).

#Confusion Matrix Result
pred.glm.both <- predict(glm.both, newdata = test, type = "response")
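# NB: the 0.8 cutoff below is a high bar for predicting "M"; with training data
# rebalanced to roughly 50/50, a 0.5 cutoff is the conventional default.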
pred.glm.both = as.factor(ifelse(pred.glm.both >=0.8, "M", "B"))
confusionMatrix(pred.glm.both, as.factor(test$diagnosis))
## Confusion Matrix and Statistics
##
## Reference
## Prediction B M
## B 62 3
## M 6 43
##
## Accuracy : 0.921
## 95% CI : (0.855, 0.963)
## No Information Rate : 0.596
## P-Value [Acc > NIR] : 0.00000000000000575
##
## Kappa : 0.838
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.912
## Specificity : 0.935
## Pos Pred Value : 0.954
## Neg Pred Value : 0.878
## Prevalence : 0.596
## Detection Rate : 0.544
## Detection Prevalence : 0.570
## Balanced Accuracy : 0.923
##
## 'Positive' Class : B
##
cm_both = confusionMatrix(pred.glm.both, as.factor(test$diagnosis))
cm_both$overall[1]
## Accuracy
## 0.921
#AUC Result
auc_both = roc.curve(test$diagnosis, pred.glm.both,
plotit = TRUE, col = 4)

auc_both
## Area under the curve (AUC): 0.916
auc_both[2]
## $auc
## [1] 0.916
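# NB: pred.glm.both is a factor of hard class labels, so this AUC reflects the
# single 0.8 cutoff rather than the full probability ranking; the 0.988 above
# was computed from predicted probabilities, so the two are not strictly comparable.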
#########################################
#DataSet Results Comparison
model_compare <- data.frame(Model = c("Logistic", "Both"),
                            Accuracy = c(
                              round(cm_glm$overall[1], 2),
                              round(cm_both$overall[1], 2)),
                            AUC = c(auc_glm$auc,
                                    auc_both$auc)
)
model_compare
##      Model Accuracy   AUC
## 1 Logistic     0.95 0.988
## 2     Both     0.92 0.916
ggplot(aes(x=Model, y=Accuracy), data=model_compare) +
geom_bar(stat="identity", fill = 'yellow') +
ggtitle('Comparative Accuracy: Unadjusted vs. Balanced Training Data') +
xlab('Models') +
ylab('Overall Accuracy') +
geom_text(aes(label = Accuracy), size = 5, hjust = 0.5,
vjust = 3, col = "red")

ggplot(aes(x=Model, y=AUC), data=model_compare) +
geom_bar(stat='identity', fill = 'light blue') +
ggtitle('Comparative AUC: Unadjusted vs. Balanced Training Data') +
xlab('Models') +
ylab('Area Under the ROC curve') +
geom_text(aes(label = AUC), size = 5, hjust = 0.5,
vjust = 3, col = "red")

# Plot the ROC curves for the two "models" side by side
preds = cbind(pred.glm.both, logit_pred, test$diagnosis)
colAUC(preds, test$diagnosis, plotROC = TRUE)

## pred.glm.both logit_pred
## B vs. M 0.916 0.951 1
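# 5. On this run, the unadjusted data gives the better result: accuracy 0.95 vs
# 0.92 and label-based AUC 0.951 vs 0.916 (without a fixed seed, the exact
# numbers will vary from run to run).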
#===============================================================================#