Below example shows the step by step implementation of nonet_ensemble and nonet_plot functions in the context of clustering. We have used Bank Note authentication data set to predict the output class variable using Cluster package because it provides the probability of the input point to be in a specific cluster. Predictions from first GMM and second GMM model are being used as inputs to the nonet_ensemble in the list form.
Let’s start:
library(caret)
library(ggplot2)
library(ClusterR)
## Loading required package: gtools
library(nonet)
set.seed(1001)
dataframe <- data.frame(banknote_authentication)
We can see above that class variable has int datatype, we need to convert it into factor.
dataframe$class <- as.factor(dataframe$class)
#Spliting training set into two parts based on outcome: 75% and 25%
index <- createDataPartition(dataframe$class, p=0.75, list=FALSE)
trainSet <- dataframe[ index,]
testSet <- dataframe[-index,]
dim(trainSet); dim(testSet)
## [1] 1030 5
## [1] 342 5
str(trainSet)
## 'data.frame': 1030 obs. of 5 variables:
## $ variance: num 4.546 3.866 3.457 0.329 4.368 ...
## $ skewness: num 8.17 -2.64 9.52 -4.46 9.67 ...
## $ curtosis: num -2.46 1.92 -4.01 4.57 -3.96 ...
## $ entropy : num -1.462 0.106 -3.594 -0.989 -3.163 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
str(testSet)
## 'data.frame': 342 obs. of 5 variables:
## $ variance: num 3.622 3.203 1.225 -1.577 0.804 ...
## $ skewness: num 8.67 5.76 8.78 10.84 2.85 ...
## $ curtosis: num -2.807 -0.753 -2.213 2.546 4.344 ...
## $ entropy : num -0.447 -0.613 -0.806 -2.936 0.602 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
control <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 3,
verbose = FALSE)
outcomeName<-'class'
predictors<-c("variance", "curtosis", "entropy")
head(trainSet[,predictors])
## variance curtosis entropy
## 2 4.54590 -2.45860 -1.46210
## 3 3.86600 1.92420 0.10645
## 4 3.45660 -4.01120 -3.59440
## 5 0.32924 4.57180 -0.98880
## 6 4.36840 -3.96060 -3.16250
## 7 3.59120 0.72888 0.56421
head(trainSet[,outcomeName])
## [1] 0 0 0 0 0 0
## Levels: 0 1
set.seed(900)
gmm_first <- GMM(trainSet[,predictors], 2, dist_mode = "maha_dist", seed_mode = "random_subset", km_iter = 10, em_iter = 10, verbose = F)
predict_clustering_first <- predict_GMM(trainSet[,predictors], gmm_first$centroids, gmm_first$covariance_matrices, gmm_first$weights)
head(predict_clustering_first$cluster_proba[, 2])
## [1] 0.9997857 0.9991832 0.9962177 0.7541448 0.9992900 0.9990455
predict_cluster_first_class <- as.factor(ifelse(predict_clustering_first$cluster_proba[, 2] >= "0.5", "1", "0"))
head(predict_cluster_first_class)
## [1] 1 1 1 1 1 1
## Levels: 0 1
head(predict_clustering_first$cluster_labels)
## [1] 1 1 1 1 1 1
index <- createDataPartition(dataframe$class, p=0.75, list=FALSE)
trainSet <- dataframe[ index,]
testSet <- dataframe[-index,]
dim(trainSet); dim(testSet)
## [1] 1030 5
## [1] 342 5
str(trainSet)
## 'data.frame': 1030 obs. of 5 variables:
## $ variance: num 3.62 4.55 3.87 3.46 4.37 ...
## $ skewness: num 8.67 8.17 -2.64 9.52 9.67 ...
## $ curtosis: num -2.81 -2.46 1.92 -4.01 -3.96 ...
## $ entropy : num -0.447 -1.462 0.106 -3.594 -3.163 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
str(testSet)
## 'data.frame': 342 obs. of 5 variables:
## $ variance: num 0.329 3.203 1.899 -1.577 4.891 ...
## $ skewness: num -4.46 5.76 7.66 10.84 -3.36 ...
## $ curtosis: num 4.572 -0.753 0.154 2.546 3.42 ...
## $ entropy : num -0.989 -0.613 -3.111 -2.936 1.091 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
control <- rfeControl(functions = rfFuncs,
method = "repeatedcv",
repeats = 3,
verbose = FALSE)
outcomeName<-'class'
predictors<-c("skewness", "curtosis", "entropy")
head(trainSet[,predictors])
## skewness curtosis entropy
## 1 8.6661 -2.80730 -0.44699
## 2 8.1674 -2.45860 -1.46210
## 3 -2.6383 1.92420 0.10645
## 4 9.5228 -4.01120 -3.59440
## 6 9.6718 -3.96060 -3.16250
## 7 3.0129 0.72888 0.56421
head(trainSet[,outcomeName])
## [1] 0 0 0 0 0 0
## Levels: 0 1
set.seed(423)
gmm_second <- GMM(trainSet[,predictors], 2, dist_mode = "maha_dist", seed_mode = "random_subset", km_iter = 10, em_iter = 10, verbose = F)
predict_clustering_Second <- predict_GMM(trainSet[,predictors], gmm_second$centroids, gmm_second$covariance_matrices, gmm_second$weights)
head(predict_clustering_Second$cluster_proba[, 2])
## [1] 0.987314759 0.995678444 0.003361081 0.999983188 0.999946316 0.393450214
predict_cluster_Second_class <- as.factor(ifelse(predict_clustering_Second$cluster_proba[, 2] >= "0.5", "1", "0"))
head(predict_cluster_Second_class)
## [1] 1 1 0 1 1 0
## Levels: 0 1
head(predict_clustering_Second$cluster_labels)
## [1] 1 1 0 1 1 0
Stack_object <- list(predict_clustering_first$cluster_proba[, 2], predict_clustering_Second$cluster_proba[, 2])
names(Stack_object) <- c("Cluster_first", "Cluster_second")
Now we need to apply the nonet_ensemble method by supplying list object and best model name as input. Note that We have not provided training or test outcome labels to compute the weights in the weighted average ensemble method, which is being used inside the none_ensemble. Thus it uses best models prediction to compute the weights in the weighted average ensemble.
prediction_nonet <- nonet_ensemble(Stack_object, "Cluster_second")
Results can be plotted using the nonet_plot function. nonet_plot is being designed to provided different plot_type options to the user so that one can plot different visualization based on their needs.
Prediction_data <- list(prediction_nonet, predict_clustering_first$cluster_proba[, 2], predict_clustering_Second$cluster_proba[, 2])
names(Prediction_data) <- c("pred_nonet", "pred_clust_first", "pred_clust_second")
Prediction_dataframe <- data.frame(Prediction_data)
head(Prediction_dataframe)
## pred_nonet pred_clust_first pred_clust_second
## 1 1.1403889 0.9997857 0.987314759
## 2 1.1486603 0.9991832 0.995678444
## 3 0.1558889 0.9962177 0.003361081
## 4 1.1154480 0.7541448 0.999983188
## 5 1.1529445 0.9992900 0.999946316
## 6 0.5464110 0.9990455 0.393450214
plot_first <- nonet_plot(Prediction_dataframe$pred_nonet, Prediction_dataframe$pred_clust_first, Prediction_dataframe, plot_type = "hist")
plot_first
plot_second <- nonet_plot(Prediction_dataframe$pred_clust_first, Prediction_dataframe$pred_clust_second, Prediction_dataframe, plot_type = "hist")
plot_second
plot_third <- nonet_plot(Prediction_dataframe$pred_clust_second, Prediction_dataframe$pred_clust_first, Prediction_dataframe, plot_type = "hist")
plot_third
Above it can be seen that nonet_ensemble and nonet_plot can serve in a way that one do not need to worry about the outcome variables labels to compute the weights of weighted average ensemble solution.