#======================================================================
# CLUSTERING PROJECT
remove(list = ls())
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(cluster)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(NbClust)
library(ggfortify)
library(TSclust)
## Loading required package: wmtsa
## Loading required package: pdc
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
set.seed(12345)
options(digits = 3, scipen = 9999)
dir()
## [1] "16data.zip"
## [2] "2016 FE Guide for DOE-OK to release-no-sales-4-27-2017Mercedesforpublic.xlsx"
## [3] "cluster_project.R"
## [4] "cluster_project_AROD.R"
## [5] "cluster_project_AROD.spin.R"
## [6] "cluster_project_AROD.spin.Rmd"
## [7] "cluster_project_raw.R"
## [8] "Pharmaceuticals.csv"
## [9] "unempstates.csv"
## [10] "Wholesale_customers.csv"
## [11] "Wholesale_customers_ data.csv"
setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/U_ClusterAnalysis/Cluster_Project")
#======================================================================
#==========================================================================#
# An equities analyst is studying the pharmaceutical industry and would like
# help in exploring and understanding the financial data collected by her
# firm. Her main objective is to understand the structure of the
# pharmaceutical industry using some basic financial measures.
# The file "Pharmaceuticals.csv" contains financial data on 21 pharma firms.
# Use only the numerical variables to cluster the 21 firms.
# i. Interpret the clusters.
# ii. Is there a pattern in the clusters with respect to any (or several) of
# the three factor variables?
# iii. Provide appropiate "names" for each cluster using any or all the
# variables in the data set.
ph = read.csv("Pharmaceuticals.csv")
str(ph)
## 'data.frame': 21 obs. of 14 variables:
## $ ï..Symbol : Factor w/ 21 levels "ABT","AGN","AHM",..: 1 2 3 5 4 6 7 8 9 13 ...
## $ Name : Factor w/ 21 levels "Abbott Laboratories",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Market_Cap : num 68.44 7.58 6.3 67.63 47.16 ...
## $ Beta : num 0.32 0.41 0.46 0.52 0.32 1.11 0.5 0.85 1.08 0.18 ...
## $ PE_Ratio : num 24.7 82.5 20.7 21.5 20.1 27.9 13.9 26 3.6 27.9 ...
## $ ROE : num 26.4 12.9 14.9 27.4 21.8 3.9 34.8 24.1 15.1 31 ...
## $ ROA : num 11.8 5.5 7.8 15.4 7.5 1.4 15.1 4.3 5.1 13.5 ...
## $ Asset_Turnover : num 0.7 0.9 0.9 0.9 0.6 0.6 0.9 0.6 0.3 0.6 ...
## $ Leverage : num 0.42 0.6 0.27 0 0.34 0 0.57 3.51 1.07 0.53 ...
## $ Rev_Growth : num 7.54 9.16 7.05 15 26.81 ...
## $ Net_Profit_Margin : num 16.1 5.5 11.2 18 12.9 2.6 20.6 7.5 13.3 23.4 ...
## $ Median_Recommendation: Factor w/ 4 levels "Hold","Moderate Buy",..: 2 2 4 3 2 1 3 2 3 1 ...
## $ Location : Factor w/ 7 levels "CANADA","FRANCE",..: 7 1 6 6 2 3 7 7 4 7 ...
## $ Exchange : Factor w/ 3 levels "AMEX","NASDAQ",..: 3 3 3 3 3 3 3 2 3 3 ...
head(ph, 3)
## ï..Symbol Name Market_Cap Beta PE_Ratio ROE ROA
## 1 ABT Abbott Laboratories 68.44 0.32 24.7 26.4 11.8
## 2 AGN Allergan, Inc. 7.58 0.41 82.5 12.9 5.5
## 3 AHM Amersham plc 6.30 0.46 20.7 14.9 7.8
## Asset_Turnover Leverage Rev_Growth Net_Profit_Margin
## 1 0.7 0.42 7.54 16.1
## 2 0.9 0.60 9.16 5.5
## 3 0.9 0.27 7.05 11.2
## Median_Recommendation Location Exchange
## 1 Moderate Buy US NYSE
## 2 Moderate Buy CANADA NYSE
## 3 Strong Buy UK NYSE
names(ph)[1] = "Symbol"
#Remove all but numeric data and convert column "Names" to rownames
#using column_to_rownames()
ph_df = ph %>% dplyr::select(-Symbol, -Median_Recommendation, - Location, -Exchange) %>%
column_to_rownames(var = "Name")
head(ph_df, 3)
## Market_Cap Beta PE_Ratio ROE ROA Asset_Turnover
## Abbott Laboratories 68.44 0.32 24.7 26.4 11.8 0.7
## Allergan, Inc. 7.58 0.41 82.5 12.9 5.5 0.9
## Amersham plc 6.30 0.46 20.7 14.9 7.8 0.9
## Leverage Rev_Growth Net_Profit_Margin
## Abbott Laboratories 0.42 7.54 16.1
## Allergan, Inc. 0.60 9.16 5.5
## Amersham plc 0.27 7.05 11.2
#Is the dataset "clusterable"?
#Check the "hopkins statistic" using the function
#get_clust_tendency() in the package "factoextra"
ph_res <- get_clust_tendency(ph_df, 10, graph = T)
names(ph_res)
## [1] "hopkins_stat" "plot"
# Hopskin statistic
ph_res$hopkins_stat
## [1] 0.315
# Visualize the dissimilarity matrix
ph_res$plot

#Determine how may clusters?
NbClust(ph_df, method = "complete", index = 'hartigan')$Best.nc
## Number_clusters Value_Index
## 3.00 6.93
NbClust(ph_df, method = "ward.D", index = "hartigan")$Best.nc
## Number_clusters Value_Index
## 3.00 6.93
#TEST 1: Use PAM to obtain clusters
#input is the dataframe
pam_ph = pam(ph_df, k = 3)
#and plotting
clusplot(ph_df, pam_ph$clustering, color = TRUE, shade = TRUE,
labels = 2, cex = 0.7)

#Test 2: Use RandomForests to obtain proximity metric
rf_ph = randomForest(ph_df,ntree = 1000, proximity = TRUE)
#aand from there the dissimilarity matrix
rf_dis = as.dist(1 - rf_ph$proximity)
#and the use PAM to obtain the clusters but this time using
#the distance (aka dissimilarity) matrix as input
pam_ph = pam(rf_dis, k = 3, diss = FALSE)
#and plotting
clusplot(ph_df, pam_ph$clustering, color = TRUE, shade = TRUE,
labels = 2, cex = 0.7)

#Item 3: Leave the factor variables in the dataframe (except symbol)
ph_dt = ph %>% dplyr::select(-Symbol) %>%
column_to_rownames(var = "Name")
head(ph_dt, 3)
## Market_Cap Beta PE_Ratio ROE ROA Asset_Turnover
## Abbott Laboratories 68.44 0.32 24.7 26.4 11.8 0.7
## Allergan, Inc. 7.58 0.41 82.5 12.9 5.5 0.9
## Amersham plc 6.30 0.46 20.7 14.9 7.8 0.9
## Leverage Rev_Growth Net_Profit_Margin
## Abbott Laboratories 0.42 7.54 16.1
## Allergan, Inc. 0.60 9.16 5.5
## Amersham plc 0.27 7.05 11.2
## Median_Recommendation Location Exchange
## Abbott Laboratories Moderate Buy US NYSE
## Allergan, Inc. Moderate Buy CANADA NYSE
## Amersham plc Strong Buy UK NYSE
#Use RandomForests to obtain proximity metric
rf_ph = randomForest(ph_dt,ntree = 1000, proximity = TRUE)
#aand from there the dissimilarity matrix
rf_dis = as.dist(1 - rf_ph$proximity)
#and the use PAM to obtain the clusters but this time using
#the distance (aka dissimilarity) matrix as input
pam_ph = pam(rf_dis, k = 3, diss = T)
#and plotting
clusplot(ph_df, pam_ph$clustering, color = TRUE, shade = TRUE,
labels = 2, cex = 0.7)

str(ph_dt)
## 'data.frame': 21 obs. of 12 variables:
## $ Market_Cap : num 68.44 7.58 6.3 67.63 47.16 ...
## $ Beta : num 0.32 0.41 0.46 0.52 0.32 1.11 0.5 0.85 1.08 0.18 ...
## $ PE_Ratio : num 24.7 82.5 20.7 21.5 20.1 27.9 13.9 26 3.6 27.9 ...
## $ ROE : num 26.4 12.9 14.9 27.4 21.8 3.9 34.8 24.1 15.1 31 ...
## $ ROA : num 11.8 5.5 7.8 15.4 7.5 1.4 15.1 4.3 5.1 13.5 ...
## $ Asset_Turnover : num 0.7 0.9 0.9 0.9 0.6 0.6 0.9 0.6 0.3 0.6 ...
## $ Leverage : num 0.42 0.6 0.27 0 0.34 0 0.57 3.51 1.07 0.53 ...
## $ Rev_Growth : num 7.54 9.16 7.05 15 26.81 ...
## $ Net_Profit_Margin : num 16.1 5.5 11.2 18 12.9 2.6 20.6 7.5 13.3 23.4 ...
## $ Median_Recommendation: Factor w/ 4 levels "Hold","Moderate Buy",..: 2 2 4 3 2 1 3 2 3 1 ...
## $ Location : Factor w/ 7 levels "CANADA","FRANCE",..: 7 1 6 6 2 3 7 7 4 7 ...
## $ Exchange : Factor w/ 3 levels "AMEX","NASDAQ",..: 3 3 3 3 3 3 3 2 3 3 ...
pam2 = pam(ph_dt, k = 3, diss = F)
clusplot(ph_dt, pam2$clustering, color = TRUE, shade = TRUE,
labels = 2, cex = 0.7)

#Relationship to Exchange, Location, and Median Recommendation
ph_dt = data.frame(ph_dt, pam_ph$clustering)
head(ph_dt)
## Market_Cap Beta PE_Ratio ROE ROA Asset_Turnover
## Abbott Laboratories 68.44 0.32 24.7 26.4 11.8 0.7
## Allergan, Inc. 7.58 0.41 82.5 12.9 5.5 0.9
## Amersham plc 6.30 0.46 20.7 14.9 7.8 0.9
## AstraZeneca PLC 67.63 0.52 21.5 27.4 15.4 0.9
## Aventis 47.16 0.32 20.1 21.8 7.5 0.6
## Bayer AG 16.90 1.11 27.9 3.9 1.4 0.6
## Leverage Rev_Growth Net_Profit_Margin
## Abbott Laboratories 0.42 7.54 16.1
## Allergan, Inc. 0.60 9.16 5.5
## Amersham plc 0.27 7.05 11.2
## AstraZeneca PLC 0.00 15.00 18.0
## Aventis 0.34 26.81 12.9
## Bayer AG 0.00 -3.17 2.6
## Median_Recommendation Location Exchange
## Abbott Laboratories Moderate Buy US NYSE
## Allergan, Inc. Moderate Buy CANADA NYSE
## Amersham plc Strong Buy UK NYSE
## AstraZeneca PLC Moderate Sell UK NYSE
## Aventis Moderate Buy FRANCE NYSE
## Bayer AG Hold GERMANY NYSE
## pam_ph.clustering
## Abbott Laboratories 1
## Allergan, Inc. 2
## Amersham plc 3
## AstraZeneca PLC 3
## Aventis 1
## Bayer AG 2
table(ph_dt$Exchange, ph_dt$pam_ph.clustering)
##
## 1 2 3
## AMEX 0 1 0
## NASDAQ 0 1 0
## NYSE 6 5 8
table(ph_dt$Median_Recommendation, pam2$clustering)
##
## 1 2 3
## Hold 4 3 2
## Moderate Buy 2 3 2
## Moderate Sell 2 2 0
## Strong Buy 0 1 0
table(ph_dt$Location, pam2$clustering)
##
## 1 2 3
## CANADA 0 1 0
## FRANCE 1 0 0
## GERMANY 0 1 0
## IRELAND 0 1 0
## SWITZERLAND 1 0 0
## UK 1 1 1
## US 5 5 3
#=======================================================================#