# Function to initialize centroids by sampling k distinct data points
initialize_centroids <- function(data, k) {
  n <- nrow(data)
  indices <- sample(1:n, k)
  # drop = FALSE keeps a k x d matrix even when k = 1
  centroids <- data[indices, , drop = FALSE]
  return(centroids)
}
# Function to assign each data point to the nearest centroid
assign_clusters <- function(data, centroids) {
  n <- nrow(data)
  k <- nrow(centroids)
  distances <- matrix(NA, nrow = n, ncol = k)
  # Squared Euclidean distance from every data point to each centroid;
  # sweep() subtracts centroids[i, ] from each row of data
  for (i in 1:k) {
    distances[, i] <- rowSums(sweep(data, 2, centroids[i, ])^2)
  }
  # Assign each data point to its nearest centroid
  clusters <- apply(distances, 1, which.min)
  return(clusters)
}
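A note on the sweep() call above: when a vector is subtracted from a matrix, R recycles it column-wise, so the naive data - centroids[i, ] would mix up coordinates whenever the data have more than one column. A tiny demonstration of the pitfall (m and v are made up for this demo):
# R recycles column-major: m - v subtracts 10, 100, 10 down column 1
# and then continues into column 2, which is not row-wise subtraction
m <- matrix(1:6, nrow = 3, ncol = 2)  # three 2-D points as rows
v <- c(10, 100)                       # a "centroid" of length 2
m - v           # wrong coordinate-wise differences
sweep(m, 2, v)  # subtracts v[1] from column 1 and v[2] from column 2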
# Function to update centroids based on cluster assignments
update_centroids <- function(data, clusters, k) {
  d <- ncol(data)
  centroids <- matrix(NA, nrow = k, ncol = d)
  for (i in 1:k) {
    # drop = FALSE keeps a matrix even for a singleton cluster
    members <- data[clusters == i, , drop = FALSE]
    if (nrow(members) > 0) {
      centroids[i, ] <- colMeans(members)
    } else {
      # Simple safeguard: re-seed an empty cluster at a random data point
      centroids[i, ] <- data[sample(nrow(data), 1), ]
    }
  }
  return(centroids)
}
# Function to check if centroids have converged
centroids_converged <- function(old_centroids, new_centroids, tol = 1e-6) {
  # Total squared movement of the centroids between iterations
  sum_diff <- sum((old_centroids - new_centroids)^2)
  return(sum_diff < tol)
}
# K-means algorithm
# Note: this definition masks stats::kmeans in the current session;
# the base version remains available as stats::kmeans
kmeans <- function(data, k, max_iters = 100, plot_iters = 0) {
  # Input:
  #   data: matrix of data points (rows are observations)
  #   k: number of clusters
  #   max_iters: maximum number of iterations for k-means
  #   plot_iters: number of iterations per plot; 0 means no plotting
  #               (plotting assumes two-dimensional data)
  # Output: a list with the k-means results.
  #   centroids: coordinates of the cluster centroids
  #   clusters: cluster membership of each data point
  if (plot_iters != 0) {
    par(mar = c(2, 2, 2, 2))
  }
  # Initialize centroids
  centroids <- initialize_centroids(data, k)
  # Loop until convergence or maximum iterations reached
  for (iter in 1:max_iters) {
    old_centroids <- centroids
    # Assign data points to clusters
    clusters <- assign_clusters(data, centroids)
    # Update centroids
    centroids <- update_centroids(data, clusters, k)
    # && short-circuits, so iter %% plot_iters is never evaluated
    # when plot_iters is 0
    if (plot_iters != 0 && iter %% plot_iters == 0) {
      plot(data[, 1], data[, 2], xlab = "", ylab = "", col = clusters, pch = 20)
      points(centroids, col = 1:k, pch = 8, cex = 2)
    }
    # Check for convergence
    if (centroids_converged(old_centroids, centroids)) {
      break
    }
  }
  print(paste0("Number of iterations: ", iter))
  # Return final centroids and cluster assignments
  return(list(centroids = centroids, clusters = clusters))
}
Test example
set.seed(123)
# Three well-separated 2-D clusters of 50 points each
cluster1 <- matrix(rnorm(100, mean = 0, sd = 0.5), ncol = 2)
cluster2 <- matrix(rnorm(100, mean = 1.5, sd = 0.5), ncol = 2)
cluster3 <- matrix(rnorm(100, mean = 3, sd = 0.5), ncol = 2)
data <- rbind(cluster1, cluster2, cluster3)
# Specify number of clusters (k)
k <- 3
# Run the k-means algorithm
result <- kmeans(data = data, k = k, plot_iters = 1)
## [1] "Number of iterations:8"
#plot(data, col = result$clusters, pch = 20)
#points(result$centroids, col = 1:k, pch = 8, cex = 2)
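As a sanity check, the clustering can be compared against R's built-in implementation, which is still reachable as stats::kmeans since our definition masks the bare name. A minimal sketch (nstart = 10 is an illustrative choice, not part of the original):
# Base R's k-means; nstart = 10 tries 10 random initializations
# and keeps the best solution
builtin <- stats::kmeans(data, centers = k, nstart = 10)
# Cluster labels are arbitrary, so compare assignments with a
# contingency table rather than element-wise equality
table(ours = result$clusters, builtin = builtin$cluster)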
Application
# Generate example data
set.seed(123)
data <- matrix(rnorm(200), ncol = 2)
# Dissimilarity matrix
d <- dist(data, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "complete")
# Cut tree into 4 groups
hc1_groups <- cutree(hc1, k = 4)
# Number of members in each cluster
table(hc1_groups)
## hc1_groups
## 1 2 3 4
## 25 53 17 5
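cutree can also cut the dendrogram at a given height rather than into a fixed number of groups; a small sketch (the cut height of 3 is an arbitrary illustrative value):
# Cut the tree at height 3; the number of resulting groups
# depends on the shape of the dendrogram
hc1_groups_h <- cutree(hc1, h = 3)
table(hc1_groups_h)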
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1)
rect.hclust(hc1, k = 4, border = 2:5)
plot(data[, 1], data[, 2], col = hc1_groups, xlab = "x", ylab = "y")
# Hierarchical clustering using Average Linkage
hc2 <- hclust(d, method = "average")
# Cut tree into 4 groups
hc2_groups <- cutree(hc2, k = 4)
# Plot the obtained dendrogram
plot(hc2, cex = 0.6, hang = -1)
rect.hclust(hc2, k = 4, border = 2:5)
plot(data[, 1], data[, 2], col = hc2_groups, xlab = "x", ylab = "y")
# Hierarchical clustering using Ward's method
hc3 <- hclust(d, method = "ward.D2")
# Cut tree into 4 groups
hc3_groups <- cutree(hc3, k = 4)
# Plot the obtained dendrogram
plot(hc3, cex = 0.6, hang = -1)
rect.hclust(hc3, k = 4, border = 2:5)
plot(data[, 1], data[, 2], col = hc3_groups, xlab = "x", ylab = "y")
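To compare the three linkage choices quantitatively, one option is the cophenetic correlation, which measures how faithfully each dendrogram reproduces the original pairwise distances; a sketch (not part of the original analysis):
# Cophenetic correlation: values closer to 1 mean the dendrogram
# distances agree better with the original dissimilarities in d
cor(d, cophenetic(hc1))  # complete linkage
cor(d, cophenetic(hc2))  # average linkage
cor(d, cophenetic(hc3))  # Ward's method (ward.D2)
# Cross-tabulate memberships to see where two linkages disagree
table(complete = hc1_groups, ward = hc3_groups)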