#======================================================================
# CLUSTERING PROJECT

            remove(list = ls())
            
            library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
            library(cluster)
            library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
            library(NbClust)
            library(ggfortify)
            library(TSclust)
## Loading required package: wmtsa
## Loading required package: pdc
            library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
    set.seed(12345)
    options(digits = 3, scipen = 9999)
    
    dir()
##  [1] "16data.zip"                                                                  
##  [2] "2016 FE Guide for DOE-OK to release-no-sales-4-27-2017Mercedesforpublic.xlsx"
##  [3] "cluster_project.R"                                                           
##  [4] "cluster_project_AROD.R"                                                      
##  [5] "cluster_project_AROD.spin.R"                                                 
##  [6] "cluster_project_AROD.spin.Rmd"                                               
##  [7] "cluster_project_raw.R"                                                       
##  [8] "Pharmaceuticals.csv"                                                         
##  [9] "unempstates.csv"                                                             
## [10] "Wholesale_customers.csv"                                                     
## [11] "Wholesale_customers_ data.csv"
    setwd("C:/Users/arodriguez/Dropbox/classes/DataMining/U_ClusterAnalysis/Cluster_Project")

    #======================================================================
    
    #==========================================================================#
    # An equities analyst is studying the pharmaceutical industry and would like
    # help in exploring and understanding the financial data collected by her
    # firm.  Her main objective is to understand the structure of the 
    # pharmaceutical industry using some basic financial measures.
    
    # The file "Pharmaceuticals.csv" contains financial data on 21 pharma firms.
    
    # Use only the numerical variables to cluster the 21 firms.
    # i. Interpret the clusters.
    # ii. Is there a pattern in the clusters with respect to any (or several) of
    #     the three factor variables?  
    # iii. Provide appropiate "names" for each cluster using any or all the
    #      variables in the data set.
    
    ph = read.csv("Pharmaceuticals.csv")
    str(ph)
## 'data.frame':    21 obs. of  14 variables:
##  $ ï..Symbol            : Factor w/ 21 levels "ABT","AGN","AHM",..: 1 2 3 5 4 6 7 8 9 13 ...
##  $ Name                 : Factor w/ 21 levels "Abbott Laboratories",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Market_Cap           : num  68.44 7.58 6.3 67.63 47.16 ...
##  $ Beta                 : num  0.32 0.41 0.46 0.52 0.32 1.11 0.5 0.85 1.08 0.18 ...
##  $ PE_Ratio             : num  24.7 82.5 20.7 21.5 20.1 27.9 13.9 26 3.6 27.9 ...
##  $ ROE                  : num  26.4 12.9 14.9 27.4 21.8 3.9 34.8 24.1 15.1 31 ...
##  $ ROA                  : num  11.8 5.5 7.8 15.4 7.5 1.4 15.1 4.3 5.1 13.5 ...
##  $ Asset_Turnover       : num  0.7 0.9 0.9 0.9 0.6 0.6 0.9 0.6 0.3 0.6 ...
##  $ Leverage             : num  0.42 0.6 0.27 0 0.34 0 0.57 3.51 1.07 0.53 ...
##  $ Rev_Growth           : num  7.54 9.16 7.05 15 26.81 ...
##  $ Net_Profit_Margin    : num  16.1 5.5 11.2 18 12.9 2.6 20.6 7.5 13.3 23.4 ...
##  $ Median_Recommendation: Factor w/ 4 levels "Hold","Moderate Buy",..: 2 2 4 3 2 1 3 2 3 1 ...
##  $ Location             : Factor w/ 7 levels "CANADA","FRANCE",..: 7 1 6 6 2 3 7 7 4 7 ...
##  $ Exchange             : Factor w/ 3 levels "AMEX","NASDAQ",..: 3 3 3 3 3 3 3 2 3 3 ...
    head(ph, 3)
##   ï..Symbol                Name Market_Cap Beta PE_Ratio  ROE  ROA
## 1       ABT Abbott Laboratories      68.44 0.32     24.7 26.4 11.8
## 2       AGN      Allergan, Inc.       7.58 0.41     82.5 12.9  5.5
## 3       AHM        Amersham plc       6.30 0.46     20.7 14.9  7.8
##   Asset_Turnover Leverage Rev_Growth Net_Profit_Margin
## 1            0.7     0.42       7.54              16.1
## 2            0.9     0.60       9.16               5.5
## 3            0.9     0.27       7.05              11.2
##   Median_Recommendation Location Exchange
## 1          Moderate Buy       US     NYSE
## 2          Moderate Buy   CANADA     NYSE
## 3            Strong Buy       UK     NYSE
    names(ph)[1] = "Symbol"
    
    #Remove all but numeric data and convert column "Names" to rownames
    #using column_to_rownames()
    ph_df = ph %>% dplyr::select(-Symbol, -Median_Recommendation, - Location, -Exchange) %>%
      column_to_rownames(var = "Name")
    
    head(ph_df, 3)
##                     Market_Cap Beta PE_Ratio  ROE  ROA Asset_Turnover
## Abbott Laboratories      68.44 0.32     24.7 26.4 11.8            0.7
## Allergan, Inc.            7.58 0.41     82.5 12.9  5.5            0.9
## Amersham plc              6.30 0.46     20.7 14.9  7.8            0.9
##                     Leverage Rev_Growth Net_Profit_Margin
## Abbott Laboratories     0.42       7.54              16.1
## Allergan, Inc.          0.60       9.16               5.5
## Amersham plc            0.27       7.05              11.2
              #Is the dataset "clusterable"?
              #Check the "hopkins statistic" using the function
              #get_clust_tendency() in the package "factoextra"
              ph_res <- get_clust_tendency(ph_df, 10, graph = T)
              names(ph_res)
## [1] "hopkins_stat" "plot"
              # Hopskin statistic
              ph_res$hopkins_stat
## [1] 0.315
              # Visualize the dissimilarity matrix
              ph_res$plot

    #Determine how may clusters?
    NbClust(ph_df, method = "complete", index = 'hartigan')$Best.nc
## Number_clusters     Value_Index 
##            3.00            6.93
    NbClust(ph_df, method = "ward.D", index = "hartigan")$Best.nc
## Number_clusters     Value_Index 
##            3.00            6.93
        #TEST 1: Use PAM to obtain clusters
        #input is the dataframe
        pam_ph = pam(ph_df, k = 3)    
        #and plotting
        clusplot(ph_df, pam_ph$clustering, color = TRUE, shade = TRUE, 
                 labels = 2, cex = 0.7)

        #Test 2: Use RandomForests to obtain proximity metric
        rf_ph = randomForest(ph_df,ntree = 1000, proximity = TRUE)
        #aand from there the dissimilarity matrix
        rf_dis = as.dist(1 - rf_ph$proximity) 
        #and the use PAM to obtain the clusters but this time using
        #the distance (aka dissimilarity) matrix as input
        pam_ph = pam(rf_dis, k = 3, diss = FALSE)    
        #and plotting
        clusplot(ph_df, pam_ph$clustering, color = TRUE, shade = TRUE, 
                 labels = 2, cex = 0.7)

        #Item 3: Leave the factor variables in the dataframe (except symbol)
        ph_dt = ph %>% dplyr::select(-Symbol) %>%
          column_to_rownames(var = "Name")
        head(ph_dt, 3)
##                     Market_Cap Beta PE_Ratio  ROE  ROA Asset_Turnover
## Abbott Laboratories      68.44 0.32     24.7 26.4 11.8            0.7
## Allergan, Inc.            7.58 0.41     82.5 12.9  5.5            0.9
## Amersham plc              6.30 0.46     20.7 14.9  7.8            0.9
##                     Leverage Rev_Growth Net_Profit_Margin
## Abbott Laboratories     0.42       7.54              16.1
## Allergan, Inc.          0.60       9.16               5.5
## Amersham plc            0.27       7.05              11.2
##                     Median_Recommendation Location Exchange
## Abbott Laboratories          Moderate Buy       US     NYSE
## Allergan, Inc.               Moderate Buy   CANADA     NYSE
## Amersham plc                   Strong Buy       UK     NYSE
        #Use RandomForests to obtain proximity metric
        rf_ph = randomForest(ph_dt,ntree = 1000, proximity = TRUE)
        #aand from there the dissimilarity matrix
        rf_dis = as.dist(1 - rf_ph$proximity) 
        #and the use PAM to obtain the clusters but this time using
        #the distance (aka dissimilarity) matrix as input
        pam_ph = pam(rf_dis, k = 3, diss = T)    
        #and plotting
        clusplot(ph_df, pam_ph$clustering, color = TRUE, shade = TRUE, 
                 labels = 2, cex = 0.7)

        str(ph_dt)
## 'data.frame':    21 obs. of  12 variables:
##  $ Market_Cap           : num  68.44 7.58 6.3 67.63 47.16 ...
##  $ Beta                 : num  0.32 0.41 0.46 0.52 0.32 1.11 0.5 0.85 1.08 0.18 ...
##  $ PE_Ratio             : num  24.7 82.5 20.7 21.5 20.1 27.9 13.9 26 3.6 27.9 ...
##  $ ROE                  : num  26.4 12.9 14.9 27.4 21.8 3.9 34.8 24.1 15.1 31 ...
##  $ ROA                  : num  11.8 5.5 7.8 15.4 7.5 1.4 15.1 4.3 5.1 13.5 ...
##  $ Asset_Turnover       : num  0.7 0.9 0.9 0.9 0.6 0.6 0.9 0.6 0.3 0.6 ...
##  $ Leverage             : num  0.42 0.6 0.27 0 0.34 0 0.57 3.51 1.07 0.53 ...
##  $ Rev_Growth           : num  7.54 9.16 7.05 15 26.81 ...
##  $ Net_Profit_Margin    : num  16.1 5.5 11.2 18 12.9 2.6 20.6 7.5 13.3 23.4 ...
##  $ Median_Recommendation: Factor w/ 4 levels "Hold","Moderate Buy",..: 2 2 4 3 2 1 3 2 3 1 ...
##  $ Location             : Factor w/ 7 levels "CANADA","FRANCE",..: 7 1 6 6 2 3 7 7 4 7 ...
##  $ Exchange             : Factor w/ 3 levels "AMEX","NASDAQ",..: 3 3 3 3 3 3 3 2 3 3 ...
        pam2 = pam(ph_dt, k = 3, diss = F)
        clusplot(ph_dt, pam2$clustering, color = TRUE, shade = TRUE, 
                 labels = 2, cex = 0.7)

      #Relationship to Exchange, Location, and Median Recommendation
        ph_dt = data.frame(ph_dt, pam_ph$clustering)
        head(ph_dt)
##                     Market_Cap Beta PE_Ratio  ROE  ROA Asset_Turnover
## Abbott Laboratories      68.44 0.32     24.7 26.4 11.8            0.7
## Allergan, Inc.            7.58 0.41     82.5 12.9  5.5            0.9
## Amersham plc              6.30 0.46     20.7 14.9  7.8            0.9
## AstraZeneca PLC          67.63 0.52     21.5 27.4 15.4            0.9
## Aventis                  47.16 0.32     20.1 21.8  7.5            0.6
## Bayer AG                 16.90 1.11     27.9  3.9  1.4            0.6
##                     Leverage Rev_Growth Net_Profit_Margin
## Abbott Laboratories     0.42       7.54              16.1
## Allergan, Inc.          0.60       9.16               5.5
## Amersham plc            0.27       7.05              11.2
## AstraZeneca PLC         0.00      15.00              18.0
## Aventis                 0.34      26.81              12.9
## Bayer AG                0.00      -3.17               2.6
##                     Median_Recommendation Location Exchange
## Abbott Laboratories          Moderate Buy       US     NYSE
## Allergan, Inc.               Moderate Buy   CANADA     NYSE
## Amersham plc                   Strong Buy       UK     NYSE
## AstraZeneca PLC             Moderate Sell       UK     NYSE
## Aventis                      Moderate Buy   FRANCE     NYSE
## Bayer AG                             Hold  GERMANY     NYSE
##                     pam_ph.clustering
## Abbott Laboratories                 1
## Allergan, Inc.                      2
## Amersham plc                        3
## AstraZeneca PLC                     3
## Aventis                             1
## Bayer AG                            2
        table(ph_dt$Exchange, ph_dt$pam_ph.clustering)
##         
##          1 2 3
##   AMEX   0 1 0
##   NASDAQ 0 1 0
##   NYSE   6 5 8
        table(ph_dt$Median_Recommendation, pam2$clustering)
##                
##                 1 2 3
##   Hold          4 3 2
##   Moderate Buy  2 3 2
##   Moderate Sell 2 2 0
##   Strong Buy    0 1 0
        table(ph_dt$Location, pam2$clustering)
##              
##               1 2 3
##   CANADA      0 1 0
##   FRANCE      1 0 0
##   GERMANY     0 1 0
##   IRELAND     0 1 0
##   SWITZERLAND 1 0 0
##   UK          1 1 1
##   US          5 5 3
        #=======================================================================#