We show examples of jointly discretizing continuous data based on grids that preserve clusters in the original data.
plot.patterns()
Below is a helper function, plot.patterns(), that visualizes a three-dimensional grid on a three-variable data set. For each pair of dimensions, it shows a scatter plot of the original data, colored by the clusters identified on the continuous data and overlaid with the grid lines, followed by the corresponding contingency table after grid discretization. We will use it to show the grids obtained in the three examples that follow.
# Package `FunChisq' must have been installed.
plot.patterns <- function(x, y, z, res)
{
k = length(unique(res$clabels))
mar = c(2.5,3,4,1)
par(mar=mar, mgp=c(3,1,0)-c(1.5,0.5,0), lwd=2)
col <- "limegreen"
labelcol <- colorRampPalette(c("black", col))
plot(x, y, main="Original data", col=labelcol(k)[res$clabels],
pch=19, cex.axis=0.8, cex=0.7)
abline(v=res$grid[[1]], h=res$grid[[2]], col="black", lty="dotted")
tab <- as.matrix(table(-res$D[, 2], res$D[, 1]))
par(xpd=TRUE)
legend("top", legend = c(min(res$clabels):max(res$clabels)), pch=19,
col = labelcol(k), bty = "n", horiz = TRUE, inset = c(0,-0.2))
par(xpd=FALSE)
FunChisq::plot_table(
tab, xlab="x discretized", ylab="y discretized",
col=col, main="Discretized data", highlight="none", mar = mar)
col <- "brown3"
labelcol <- colorRampPalette(c("black", col))
plot(y, z, main="Original data", col=labelcol(k)[res$clabels],
pch=19, cex.axis=0.8, cex=0.7)
abline(v=res$grid[[2]], h=res$grid[[3]], col="black", lty="dotted")
tab <- as.matrix(table(-res$D[, 3], res$D[, 2]))
par(xpd=TRUE)
legend("top", legend = c(min(res$clabels):max(res$clabels)), pch=19,
col = labelcol(k), bty = "n", horiz = TRUE, inset = c(0,-0.2))
par(xpd=FALSE)
FunChisq::plot_table(
tab, xlab="y discretized", ylab="z discretized",
col=col, main="Discretized data", highlight="none", mar = mar)
col <- "dodgerblue"
labelcol <- colorRampPalette(c("black", col))
plot(x, z, main="Original data", col=labelcol(k)[res$clabels],
pch=19, cex.axis=0.8, cex=0.7)
abline(v=res$grid[[1]], h=res$grid[[3]], col="black", lty="dotted")
tab <- as.matrix(table(-res$D[, 3], res$D[, 1]))
par(xpd=TRUE)
legend("top", legend = c(min(res$clabels):max(res$clabels)), pch=19,
col = labelcol(k), bty = "n", horiz = TRUE, inset = c(0,-0.2))
par(xpd=FALSE)
FunChisq::plot_table(
tab, xlab="x discretized", ylab="z discretized",
col=col, main="Discretized data", highlight="none", mar = mar)
}
# library() stops with an error if the package is missing, instead of
# returning FALSE and letting a later call fail with "could not find function".
library(GridOnClusters)

# Three variables lying on a nonlinear curve: y and z are functions of x.
x <- rnorm(50)
y <- sin(x)
z <- cos(x)
data <- cbind(x, y, z)
res <- discretize.jointly(data, k = 3) # using a specified k
plot.patterns(x, y, z, res)
Example 1. Nonlinear curves using k-means clustering with a fixed number of clusters.
# Simulate three related variables from a single standard normal sample.
x <- rnorm(100)
y <- log1p(abs(x))
# z is a noisy step function of x: 0 when x lies in [-0.5, 0.5], 1 otherwise,
# plus Gaussian noise with standard deviation 0.1.
z <- ifelse(abs(x) <= 0.5, 0, 1) + rnorm(100, mean = 0, sd = 0.1)
data <- cbind(x, y, z)
res <- discretize.jointly(data, k = 2:3) # using a range of k
plot.patterns(x, y, z, res)
Example 2. Using a range for the number of k-means clusters.
# Using a clustering method other than k-means.
x <- rnorm(100)
y <- log1p(abs(x))
z <- sin(x)
data <- cbind(x, y, z)
# Pre-cluster the data with partitioning around medoids (PAM) into four
# clusters, then pass the resulting labels to discretize.jointly().
cluster_label <- cluster::pam(
  data, k = 4, diss = FALSE, metric = "euclidean"
)[["clustering"]]
res <- discretize.jointly(data, cluster_label = cluster_label)
plot.patterns(x, y, z, res)
Example 3. Using the partitioning around medoids (PAM) clustering method.