# Function to initialize centroids by sampling k distinct data points
initialize_centroids <- function(data, k) {
  n <- nrow(data)
  indices <- sample(1:n, k)
  # drop = FALSE keeps a k x d matrix even when k = 1
  centroids <- data[indices, , drop = FALSE]
  return(centroids)
}
# Function to assign each data point to the nearest centroid
assign_clusters <- function(data, centroids) {
  n <- nrow(data)
  k <- nrow(centroids)
  distances <- matrix(NA, nrow = n, ncol = k)
  # Squared Euclidean distance from every data point to each centroid;
  # sweep() subtracts centroids[i, ] from each row of data
  for (i in 1:k) {
    distances[, i] <- rowSums(sweep(data, 2, centroids[i, ])^2)
  }
  # Assign each data point to its nearest centroid
  clusters <- apply(distances, 1, which.min)
  return(clusters)
}
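A note on the sweep() call above: when a vector is subtracted from a matrix, R recycles it column-wise, so the naive data - centroids[i, ] would mix up coordinates whenever the data have more than one column. A tiny demonstration of the pitfall (m and v are made up for this demo):
# R recycles column-major: m - v subtracts 10, 100, 10 down column 1
# and then continues into column 2, which is not row-wise subtraction
m <- matrix(1:6, nrow = 3, ncol = 2)  # three 2-D points as rows
v <- c(10, 100)                       # a "centroid" of length 2
m - v           # wrong coordinate-wise differences
sweep(m, 2, v)  # subtracts v[1] from column 1 and v[2] from column 2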
# Function to update centroids based on cluster assignments
update_centroids <- function(data, clusters, k) {
  d <- ncol(data)
  centroids <- matrix(NA, nrow = k, ncol = d)
  for (i in 1:k) {
    # drop = FALSE keeps a matrix even for a singleton cluster
    members <- data[clusters == i, , drop = FALSE]
    if (nrow(members) > 0) {
      centroids[i, ] <- colMeans(members)
    } else {
      # Simple safeguard: re-seed an empty cluster at a random data point
      centroids[i, ] <- data[sample(nrow(data), 1), ]
    }
  }
  return(centroids)
}
# Function to check if centroids have converged
centroids_converged <- function(old_centroids, new_centroids, tol = 1e-6) {
  # Total squared movement of the centroids between iterations
  sum_diff <- sum((old_centroids - new_centroids)^2)
  return(sum_diff < tol)
}
# K-means algorithm
# Note: this definition masks stats::kmeans in the current session;
# the base version remains available as stats::kmeans
kmeans <- function(data, k, max_iters = 100, plot_iters = 0) {
  # Input:
  #   data: matrix of data points (rows are observations)
  #   k: number of clusters
  #   max_iters: maximum number of iterations for k-means
  #   plot_iters: number of iterations per plot; 0 means no plotting
  #               (plotting assumes two-dimensional data)
  # Output: a list with the k-means results.
  #   centroids: coordinates of the cluster centroids
  #   clusters: cluster membership of each data point
  if (plot_iters != 0) {
    par(mar = c(2, 2, 2, 2))
  }
  # Initialize centroids
  centroids <- initialize_centroids(data, k)
  # Loop until convergence or maximum iterations reached
  for (iter in 1:max_iters) {
    old_centroids <- centroids
    # Assign data points to clusters
    clusters <- assign_clusters(data, centroids)
    # Update centroids
    centroids <- update_centroids(data, clusters, k)
    # && short-circuits, so iter %% plot_iters is never evaluated
    # when plot_iters is 0
    if (plot_iters != 0 && iter %% plot_iters == 0) {
      plot(data[, 1], data[, 2], xlab = "", ylab = "", col = clusters, pch = 20)
      points(centroids, col = 1:k, pch = 8, cex = 2)
    }
    # Check for convergence
    if (centroids_converged(old_centroids, centroids)) {
      break
    }
  }
  print(paste0("Number of iterations: ", iter))
  # Return final centroids and cluster assignments
  return(list(centroids = centroids, clusters = clusters))
}
Test example
set.seed(123)
# Three well-separated 2-D clusters of 50 points each
cluster1 <- matrix(rnorm(100, mean = 0, sd = 0.5), ncol = 2)
cluster2 <- matrix(rnorm(100, mean = 1.5, sd = 0.5), ncol = 2)
cluster3 <- matrix(rnorm(100, mean = 3, sd = 0.5), ncol = 2)
data <- rbind(cluster1, cluster2, cluster3)
# Specify number of clusters (k)
k <- 3
# Run the k-means algorithm
result <- kmeans(data = data, k = k, plot_iters = 1)
## [1] "Number of iterations:8"
#plot(data, col = result$clusters, pch = 20)
#points(result$centroids, col = 1:k, pch = 8, cex = 2)
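As a sanity check, the clustering can be compared against R's built-in implementation, which is still reachable as stats::kmeans since our definition masks the bare name. A minimal sketch (nstart = 10 is an illustrative choice, not part of the original):
# Base R's k-means; nstart = 10 tries 10 random initializations
# and keeps the best solution
builtin <- stats::kmeans(data, centers = k, nstart = 10)
# Cluster labels are arbitrary, so compare assignments with a
# contingency table rather than element-wise equality
table(ours = result$clusters, builtin = builtin$cluster)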
Application
# Generate example data
set.seed(123)
data <- matrix(rnorm(200), ncol = 2)
# Dissimilarity matrix
d <- dist(data, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete")
# Cut tree into 4 groups
hc1_groups <- cutree(hc1, k = 4)
# Number of members in each cluster
table(hc1_groups)
## hc1_groups
## 1 2 3 4
## 25 53 17 5
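cutree can also cut the dendrogram at a given height rather than into a fixed number of groups; a small sketch (the cut height of 3 is an arbitrary illustrative value):
# Cut the tree at height 3; the number of resulting groups
# depends on the shape of the dendrogram
hc1_groups_h <- cutree(hc1, h = 3)
table(hc1_groups_h)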
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
rect.hclust(hc1, k = 4, border = 2:5)
plot(data[, 1], data[, 2], col = hc1_groups, xlab = "x", ylab = "y")
# Hierarchical clustering using Average Linkage
hc2 <- hclust(d, method = "average")
# Cut tree into 4 groups
hc2_groups <- cutree(hc2, k = 4)
# Plot the obtained dendrogram
plot(hc2, cex = 0.6, hang = -1)
rect.hclust(hc2, k = 4, border = 2:5)
plot(data[, 1], data[, 2], col = hc2_groups, xlab = "x", ylab = "y")
# Hierarchical clustering using Ward's method
hc3 <- hclust(d, method = "ward.D2")
# Cut tree into 4 groups
hc3_groups <- cutree(hc3, k = 4)
# Plot the obtained dendrogram
plot(hc3, cex = 0.6, hang = -1)
rect.hclust(hc3, k = 4, border = 2:5)
plot(data[, 1], data[, 2], col = hc3_groups, xlab = "x", ylab = "y")
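To compare the three linkage choices quantitatively, one option is the cophenetic correlation, which measures how faithfully each dendrogram reproduces the original pairwise distances; a sketch (not part of the original analysis):
# Cophenetic correlation: values closer to 1 mean the dendrogram
# distances agree better with the original dissimilarities in d
cor(d, cophenetic(hc1))  # complete linkage
cor(d, cophenetic(hc2))  # average linkage
cor(d, cophenetic(hc3))  # Ward's method (ward.D2)
# Cross-tabulate memberships to see where two linkages disagree
table(complete = hc1_groups, ward = hc3_groups)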