The following examples illustrate the functionality of the KernelKnn package for classification tasks. I’ll make use of the ionosphere data set,
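which ships with the KernelKnn package. The counts of unique values per column shown below can be reproduced with a chunk along these lines (a minimal sketch, assuming the ionosphere data bundled with KernelKnn),
data(ionosphere, package = 'KernelKnn')
# number of unique values per column
apply(ionosphere, 2, function(x) length(unique(x)))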
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 2 1 219 269 204 259 231 260 244 267 246 269
## V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24
## 238 266 234 270 254 280 254 266 248 265 248 264
## V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 class
## 256 273 256 281 244 266 243 263 245 263 2
When using an algorithm where the output depends on distance calculations (as is the case in k-nearest-neighbors) it is recommended to first scale the data,
# it is recommended to scale the data
X = scale(ionosphere[, -ncol(ionosphere)])
y = ionosphere[, ncol(ionosphere)]
Important note: in classification, both the KernelKnn and the KernelKnnCV function accept a numeric vector as the response variable (here y), and the unique labels should begin from 1; otherwise the internal functions do not work. Furthermore, both functions (by default) return predictions in the form of probabilities, which can be converted to labels by using either a threshold (in binary classification) or the maximum value of each column (in multiclass classification).
# labels should be numeric and begin from 1
y = c(1:length(unique(y)))[ match(ionosphere$class, sort(unique(ionosphere$class))) ]
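A quick check that the response now holds numeric labels starting from 1 (a sketch; output not shown),
table(y)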
# random split of data in train and test
spl_train = sample(1:length(y), round(length(y) * 0.75))
spl_test = setdiff(1:length(y), spl_train)
str(spl_train)
## int [1:263] 152 100 134 277 322 64 305 167 11 177 ...
str(spl_test)
## int [1:88] 1 5 14 15 16 20 21 28 29 32 ...
# evaluation metric : accuracy, computed by assigning each observation to the
# class with the largest predicted probability
acc = function (y_true, preds) {
  out = table(y_true, max.col(preds, ties.method = "random"))
  acc = sum(diag(out)) / sum(out)
  acc
}
The KernelKnn function takes a number of arguments. To read details for each one of the arguments type ?KernelKnn::KernelKnn in the console.
A simple k-nearest-neighbors model can be run with weights_function = NULL, and the ‘regression’ parameter should be set to FALSE. In classification the Levels parameter takes the unique values of the response variable,
library(KernelKnn)
preds_TEST = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 5 ,
method = 'euclidean', weights_function = NULL, regression = F,
Levels = unique(y))
head(preds_TEST)
## class_1 class_2
## [1,] 0.2 0.8
## [2,] 0.2 0.8
## [3,] 0.0 1.0
## [4,] 0.0 1.0
## [5,] 0.6 0.4
## [6,] 0.2 0.8
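As noted above, the probabilities can be converted to hard labels, for instance by picking the column with the largest value; a short sketch using the acc helper on the test split (the exact numbers depend on the random split),
# hard labels : column with the largest probability (columns correspond to class_1, class_2)
pred_labels = max.col(preds_TEST, ties.method = "random")
# accuracy on the hold-out split
acc(y[spl_test], preds_TEST)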
There are two ways to use a kernel in the KernelKnn function. The first option is to choose one of the existing kernels (uniform, triangular, epanechnikov, biweight, triweight, tricube, gaussian, cosine, logistic, silverman, inverse, gaussianSimple, exponential). Here, I use the canberra metric and the tricube kernel because they give optimal results (according to my RandomSearchR package),
preds_TEST_tric = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10 ,
method = 'canberra', weights_function = 'tricube', regression = F,
Levels = unique(y))
head(preds_TEST_tric)
## [,1] [,2]
## [1,] 0.0000000 1.0000000000
## [2,] 0.0000000 1.0000000000
## [3,] 0.5635877 0.4364123451
## [4,] 0.1441363 0.8558636754
## [5,] 0.9995187 0.0004813259
## [6,] 0.8994787 0.1005212960
The second option is to give a self-defined kernel function. Here, I’ll pick the density function of the normal distribution with mean = 0.0 and standard deviation = 1.0 (the data are scaled to have mean zero and unit variance),
norm_kernel = function(W) {
  # W holds the distances of each observation to its k nearest neighbors;
  # weight them with the standard normal density and normalize each row so
  # that the weights sum to 1
  W = dnorm(W, mean = 0, sd = 1.0)
  W = W / rowSums(W)
  return(W)
}
preds_TEST_norm = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10 ,
method = 'canberra', weights_function = norm_kernel, regression = F,
Levels = unique(y))
head(preds_TEST_norm)
## [,1] [,2]
## [1,] 0.0000000 1.00000000
## [2,] 0.0000000 1.00000000
## [3,] 0.4334149 0.56658510
## [4,] 0.1869283 0.81307169
## [5,] 0.9138637 0.08613632
## [6,] 0.8989750 0.10102495
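The three prediction sets can be compared on the hold-out split with the acc helper defined earlier (a quick sketch; the values depend on the random train-test split),
# hold-out accuracy of the unweighted knn, the tricube kernel and the self-defined normal kernel
unlist(lapply(list(simple = preds_TEST, tricube = preds_TEST_tric, norm = preds_TEST_norm),
              function(p) acc(y[spl_test], p)))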
The computations can be sped up using the threads parameter (multiple cores are run in parallel). There is also the option to exclude the extrema (the minimum and maximum distances) during the calculation of the k-nearest-neighbor distances by setting extrema = TRUE. The bandwidth of the existing kernels can be tuned using the h parameter.
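For instance (a sketch with illustrative values for h, threads and extrema),
preds_TEST_h = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10,
                         h = 0.5, method = 'canberra', weights_function = 'tricube',
                         regression = F, Levels = unique(y), threads = 4, extrema = T)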
K-nearest-neighbor calculations in the KernelKnn function can be performed using the following distance metrics : euclidean, manhattan, chebyshev, canberra, braycurtis, minkowski (by default the order ‘p’ of the minkowski parameter equals k), hamming, mahalanobis, pearson_correlation, simple_matching_coefficient, jaccard_coefficient and Rao_coefficient. The last four are similarity measures and are appropriate for binary data [0,1].
I employed my RandomSearchR package to find optimal parameters for the KernelKnn function, and the following two parameter settings give optimal accuracy,
k | method | kernel |
---|---|---|
10 | canberra | tricube |
9 | canberra | epanechnikov |
I’ll use the KernelKnnCV function to calculate the accuracy using 5-fold cross-validation for the previously mentioned parameter settings,
fit_cv_pair1 = KernelKnnCV(X, y, k = 10 , folds = 5, method = 'canberra',
weights_function = 'tricube', regression = F,
Levels = unique(y), threads = 5, seed_num = 5)
str(fit_cv_pair1)
## List of 2
## $ preds:List of 5
## ..$ : num [1:71, 1:2] 0.00648 0.25323 1 0.97341 0.92031 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 0.999 ...
## ..$ : num [1:70, 1:2] 0.353 0 0.17 0.212 0.266 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 0 ...
## ..$ : num [1:70, 1:2] 0.989 0 1 0 0 ...
## $ folds:List of 5
## ..$ fold_1: int [1:71] 5 26 233 243 30 41 237 229 19 11 ...
## ..$ fold_2: int [1:70] 262 89 257 67 58 266 253 85 275 268 ...
## ..$ fold_3: int [1:70] 127 128 295 287 134 288 130 277 125 101 ...
## ..$ fold_4: int [1:70] 313 301 317 318 316 142 175 157 146 147 ...
## ..$ fold_5: int [1:70] 195 326 225 332 342 347 206 219 218 214 ...
fit_cv_pair2 = KernelKnnCV(X, y, k = 9 , folds = 5,method = 'canberra',
weights_function = 'epanechnikov', regression = F,
Levels = unique(y), threads = 5, seed_num = 5)
str(fit_cv_pair2)
## List of 2
## $ preds:List of 5
## ..$ : num [1:71, 1:2] 0.0224 0.255 1 0.9601 0.8876 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 0.998 ...
## ..$ : num [1:70, 1:2] 0.36 0 0.164 0.185 0.202 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 0 ...
## ..$ : num [1:70, 1:2] 0.912 0 1 0 0 ...
## $ folds:List of 5
## ..$ fold_1: int [1:71] 5 26 233 243 30 41 237 229 19 11 ...
## ..$ fold_2: int [1:70] 262 89 257 67 58 266 253 85 275 268 ...
## ..$ fold_3: int [1:70] 127 128 295 287 134 288 130 277 125 101 ...
## ..$ fold_4: int [1:70] 313 301 317 318 316 142 175 157 146 147 ...
## ..$ fold_5: int [1:70] 195 326 225 332 342 347 206 219 218 214 ...
Each cross-validated object is a list of length 2: the first sublist includes the predictions for each fold, whereas the second gives the indices of the folds.
acc_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds),
function(x) acc(y[fit_cv_pair1$folds[[x]]],
fit_cv_pair1$preds[[x]])))
acc_pair1
## [1] 0.9154930 0.9142857 0.9142857 0.9285714 0.9571429
cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n')
## accuracy for params_pair1 is : 0.9259557
acc_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds),
function(x) acc(y[fit_cv_pair2$folds[[x]]],
fit_cv_pair2$preds[[x]])))
acc_pair2
## [1] 0.9014085 0.9142857 0.9000000 0.9142857 0.9571429
cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n')
## accuracy for params_pair2 is : 0.9174245
In the KernelKnn package there is also the option to combine the existing kernels, either by adding or by multiplying them. For instance, to multiply the tricube with the gaussian kernel, the character string “tricube_gaussian_MULT” is passed to weights_function; to add the same kernels, the string is “tricube_gaussian_ADD”. I experimented with my RandomSearchR package, combining the different kernels, and the following two parameter settings gave optimal results,
k | method | kernel |
---|---|---|
16 | canberra | biweight_triweight_gaussian_MULT |
5 | canberra | triangular_triweight_MULT |
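As a quick illustration of the naming scheme, a combined kernel string can also be passed directly to KernelKnn on the train-test split from above (a sketch with illustrative parameters),
preds_TEST_comb = KernelKnn(X[spl_train, ], TEST_data = X[spl_test, ], y[spl_train], k = 10,
                            method = 'canberra', weights_function = 'tricube_gaussian_ADD',
                            regression = F, Levels = unique(y))
The cross-validated accuracy of the two optimal settings is computed in the same way as before,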
fit_cv_pair1 = KernelKnnCV(X, y, k = 16, folds = 5, method = 'canberra',
weights_function = 'biweight_triweight_gaussian_MULT',
regression = F, Levels = unique(y), threads = 5,
seed_num = 5)
str(fit_cv_pair1)
## List of 2
## $ preds:List of 5
## ..$ : num [1:71, 1:2] 0.0015 0.1516 1 0.9763 0.9674 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 0.999 ...
## ..$ : num [1:70, 1:2] 0.249 0 0.113 0.252 0.27 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 0 ...
## ..$ : num [1:70, 1:2] 0.991 0 1 0 0 ...
## $ folds:List of 5
## ..$ fold_1: int [1:71] 5 26 233 243 30 41 237 229 19 11 ...
## ..$ fold_2: int [1:70] 262 89 257 67 58 266 253 85 275 268 ...
## ..$ fold_3: int [1:70] 127 128 295 287 134 288 130 277 125 101 ...
## ..$ fold_4: int [1:70] 313 301 317 318 316 142 175 157 146 147 ...
## ..$ fold_5: int [1:70] 195 326 225 332 342 347 206 219 218 214 ...
fit_cv_pair2 = KernelKnnCV(X, y, k = 5, folds = 5, method = 'canberra',
weights_function = 'triangular_triweight_MULT',
regression = F, Levels = unique(y), threads = 5,
seed_num = 5)
str(fit_cv_pair2)
## List of 2
## $ preds:List of 5
## ..$ : num [1:71, 1:2] 0 0.0273 1 1 1 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 1 ...
## ..$ : num [1:70, 1:2] 0.1161 0 0.0105 0.307 0.022 ...
## ..$ : num [1:70, 1:2] 0 0 0 0 0 ...
## ..$ : num [1:70, 1:2] 1 0 1 0 0 ...
## $ folds:List of 5
## ..$ fold_1: int [1:71] 5 26 233 243 30 41 237 229 19 11 ...
## ..$ fold_2: int [1:70] 262 89 257 67 58 266 253 85 275 268 ...
## ..$ fold_3: int [1:70] 127 128 295 287 134 288 130 277 125 101 ...
## ..$ fold_4: int [1:70] 313 301 317 318 316 142 175 157 146 147 ...
## ..$ fold_5: int [1:70] 195 326 225 332 342 347 206 219 218 214 ...
acc_pair1 = unlist(lapply(1:length(fit_cv_pair1$preds),
function(x) acc(y[fit_cv_pair1$folds[[x]]],
fit_cv_pair1$preds[[x]])))
acc_pair1
## [1] 0.9014085 0.9142857 0.9285714 0.9285714 0.9571429
cat('accuracy for params_pair1 is :', mean(acc_pair1), '\n')
## accuracy for params_pair1 is : 0.925996
acc_pair2 = unlist(lapply(1:length(fit_cv_pair2$preds),
function(x) acc(y[fit_cv_pair2$folds[[x]]],
fit_cv_pair2$preds[[x]])))
acc_pair2
## [1] 0.9014085 0.9285714 0.9285714 0.9142857 0.9714286
cat('accuracy for params_pair2 is :', mean(acc_pair2), '\n')
## accuracy for params_pair2 is : 0.9288531
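Since the folds sublist stores the indices of each fold, the out-of-fold predictions can also be re-assembled into a single vector, for example to inspect an overall confusion matrix (a sketch using fit_cv_pair2),
# place each fold's hard labels back at the positions given by the fold indices
oof = rep(NA_integer_, length(y))
for (i in seq_along(fit_cv_pair2$folds)) {
  oof[ fit_cv_pair2$folds[[i]] ] = max.col(fit_cv_pair2$preds[[i]], ties.method = "random")
}
# overall confusion matrix of the cross-validated predictions
table(observed = y, predicted = oof)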