
Introduction

This MNIST machine learning project uses the MNIST dataset of handwritten digits to build models that classify images of the numbers 0 through 9 with high accuracy, applying PCA, K-means clustering, and kernel methods.

Code & Images

K-means clustering
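
The code below assumes the data is already in memory: x is an array of flattened 28x28 digit images (one row per image) and y holds the integer labels. A minimal loading sketch, assuming scikit-learn's fetch_openml and a 6000-sample subset (the original loading code is not shown here):

from sklearn.datasets import fetch_openml
import numpy as np

# Hypothetical loading step: fetch MNIST and keep a 6000-sample subset;
# each image is a flattened 784-dimensional vector, and labels are cast
# to int so that np.bincount works later.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
x = mnist.data[:6000].astype(np.float64)
y = mnist.target[:6000].astype(int)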

import numpy as np
import matplotlib.pyplot as plt
import random

seed = 0
random.seed(seed)

loss = []  # records total centroid movement per iteration of k_means

def k_means(x, k=2, epochs=600, print_initial_center=False):
    '''
    Parameters:
        x: input data, one sample per row
        k: number of clusters
        epochs: maximum number of iterations
    '''
    # 1. Randomly initialize k centroids by sampling k distinct rows of x
    np.random.seed(2024)
    clusters = []
    # replace=False -> no duplicate centroids
    random_list = np.random.choice(x.shape[0], k, replace=False)
    cluster_center = x[random_list]
    if print_initial_center:
        # y is the global label array: show which digits the initial centroids came from
        print('the initial cluster center is', len(set(y[random_list])), ' - ', y[random_list])
    for i in range(k):
        clusters.append([])

    # 2. Training
    for _ in range(epochs):
        new_clusters_center = np.zeros((k, x.shape[1]))
        for i in range(k):
            clusters[i]=[]
        ## a. E-step: assign each point to the cluster of its nearest centroid
        ## Calculate the distance from all points to the k cluster centers
        for i in range(x.shape[0]):
            xi = x[i]
            # Distance of xi to each centroid (L2 norm)
            distances = [np.linalg.norm(xi - cluster_center[j]) for j in range(k)]
            # Index of the nearest centroid to this datapoint
            c = distances.index(min(distances))
            clusters[c].append(i)  # Append this datapoint's index to that cluster

        ## b. M-step: recalculate the location of the centroid using the mean of the clusters
        for i in range(k):
            # New centroid: mean of the points assigned to this cluster
            # (assumes no cluster ends up empty; np.mean of an empty slice is nan)
            new_clusters_center[i] = np.mean(x[clusters[i]], axis=0)

        # c. If the centroid did not change, the algorithm should be stopped, otherwise continue.
        if np.all(new_clusters_center == cluster_center):
            return clusters, new_clusters_center

        # d. Track total centroid movement, then update the centroids
        loss.append(np.sum(np.abs(new_clusters_center - cluster_center)))
        cluster_center = new_clusters_center

    return clusters, cluster_center

k = 10
clusters, cluster_center = k_means(x, k)
# Create figure
figure = plt.figure(figsize=(16, 2))

# plot the 10 cluster centers
for i in range(10):
    ax = figure.add_subplot(1, 10, i + 1)  # 1 row, 10 column layout
    pixels = cluster_center[i].reshape((28, 28))
    
    # show images
    temp = ax.imshow(pixels, cmap='gray')
    
    # turn off axis
    temp = ax.axis('off')

# plot
plt.show()

[Figure: the 10 cluster centroids rendered as 28x28 grayscale digit images]
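
The k_means call above also appends each iteration's total centroid movement to the global loss list; a quick sketch (not part of the original) to check convergence:

# Plot centroid movement per iteration; it should decay toward zero
figure = plt.figure(figsize=(8, 3))
temp = plt.plot(loss, marker='.')
temp = plt.title('Total centroid movement per iteration')
plt.show()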

def k_mean_cluster_number(clusters, y):
    # label each cluster with the most common digit among its members
    cluster_number = []
    for i in range(len(clusters)):
        current_cluster = clusters[i]
        current_y = y[current_cluster]
        # find the most common number in the cluster
        most_common = np.argmax(np.bincount(current_y))
        cluster_number.append(most_common)
    return cluster_number

k = 10
print(k_mean_cluster_number(clusters, y))

output: [0, 8, 2, 3, 1, 3, 7, 4, 8, 6]

Note that the majority digits are not unique: 3 and 8 each claim two clusters, while 5 and 9 never win one, so a plain 10-cluster K-means does not map one cluster to each digit.
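
One way to quantify how well the clusters line up with the digits (a sketch, not in the original) is the majority-vote accuracy, the fraction of points whose cluster's majority digit matches their true label:

# Majority-vote accuracy over all points
cluster_number = k_mean_cluster_number(clusters, y)
correct = sum((y[clusters[i]] == cluster_number[i]).sum() for i in range(len(clusters)))
print('majority-vote accuracy:', correct / x.shape[0])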

Find best K in K-means

# search for the best K using a simple train/test split
import time
x_train, x_test = x[:4000], x[4000:]
y_train, y_test = y[:4000], y[4000:]

# Method:
#   1. split the K range into 10 evenly spaced candidates, evaluate each,
#      and cache the accuracies in a dictionary
#   2. narrow the range to the two best candidates and split again
#   3. stop when a pass produces no new K values

def k_mean_cluster_number(clusters, y):
    # get the most common number in each cluster, as the cluster representative
    cluster_number = []
    for i in range(len(clusters)):
        current_cluster = clusters[i]
        current_y = y[current_cluster]
        # find the most common number in the cluster
        most_common = np.argmax(np.bincount(current_y))
        cluster_number.append(most_common)
    return cluster_number

def predict(clusters, cluster_center, y_train, xi):
    # predict xi's label as the majority digit of its closest cluster
    # (recomputes the cluster labels on every call; fine at this data size)
    cluster_number = k_mean_cluster_number(clusters, y_train)
    distances = [np.linalg.norm(xi - cluster_center[j]) for j in range(len(cluster_center))]
    min_dis_index = distances.index(min(distances))
    y_pred = cluster_number[min_dis_index]
    return y_pred

def accuracy(clusters, cluster_center, y_train, x_test, y_test):
    correct = 0
    for i in range(len(x_test)):
        y_pred = predict(clusters, cluster_center, y_train, x_test[i])
        if y_pred == y_test[i]:
            correct += 1
    return correct / len(x_test)

def get_n_number_from_range(start, end, n):
    # return up to n evenly spaced integers between start and end (inclusive)
    if end <= start:
        start, end = end, start
    l = np.linspace(start, end, n, dtype=int)
    l = np.unique(l)
    return list(l)
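
For example, the first sweep in the log below comes from splitting the full range [2, 3999] into 11 candidates:

print(get_n_number_from_range(2, 3999, 11))
# [2, 401, 801, 1201, 1600, 2000, 2400, 2799, 3199, 3599, 3999]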

def get_accuracy_into_dict(x_train, y_train, x_test, y_test, k_list):
    # evaluate each unseen k and cache the result in the global accuracy_dict;
    # returns how many k values were newly evaluated
    k_in_dict = []
    for k in k_list:
        if k not in accuracy_dict:
            start_time = time.time()
            clusters, cluster_center = k_means(x_train, k)
            accuracy_rate = accuracy(clusters, cluster_center, y_train, x_test, y_test)
            time_cost = time.time() - start_time
            print(f't={time_cost:6.2f}s, k={k:4} - Accuracy rate: {accuracy_rate}')
            accuracy_dict[k] = accuracy_rate
        else:
            k_in_dict.append(False)
    return len(k_list)-len(k_in_dict)
    
def get_max_2_from_dict(accuracy_dict):
    max_2 = sorted(accuracy_dict.items(), key=lambda x: x[1], reverse=True)[:2]
    result = [i[0] for i in max_2]
    result.sort()
    return result
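
For example, on a toy dictionary it returns the two best k values in ascending order, which become the next search range:

print(get_max_2_from_dict({2: 0.21, 401: 0.88, 801: 0.90}))
# [401, 801]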

# parameters
start = 2
end = len(x_train)-1
divide = 10
tolerance = 10
accuracy_dict = {}

# run the method
while tolerance > 0:
    k_list = get_n_number_from_range(start, end, divide+1)
    print(f'----------{k_list}----------')
    new_k_number = get_accuracy_into_dict(x_train, y_train, x_test, y_test, k_list)
    if new_k_number == 0:
        break
    start, end = get_max_2_from_dict(accuracy_dict)
    tolerance -= 1

# get best k
best_k = get_max_2_from_dict(accuracy_dict)[0]
best_accuracy = accuracy_dict[best_k]
print(f'The optimal k is {best_k} with an accuracy rate of {best_accuracy}')

----------[2, 401, 801, 1201, 1600, 2000, 2400, 2799, 3199, 3599, 3999]----------
t=  0.98s, k=   2 - Accuracy rate: 0.2155
t= 46.17s, k= 401 - Accuracy rate: 0.879
t= 66.29s, k= 801 - Accuracy rate: 0.899
t= 77.24s, k=1201 - Accuracy rate: 0.9055
t= 76.35s, k=1600 - Accuracy rate: 0.906
t= 78.92s, k=2000 - Accuracy rate: 0.9105
t= 94.07s, k=2400 - Accuracy rate: 0.9175
t=110.12s, k=2799 - Accuracy rate: 0.925
t=126.86s, k=3199 - Accuracy rate: 0.928
t=116.47s, k=3599 - Accuracy rate: 0.9305
t= 90.95s, k=3999 - Accuracy rate: 0.933
----------[3599, 3639, 3679, 3719, 3759, 3799, 3839, 3879, 3919, 3959, 3999]----------
t=112.00s, k=3639 - Accuracy rate: 0.9305
t=115.66s, k=3679 - Accuracy rate: 0.93
t=115.42s, k=3719 - Accuracy rate: 0.9305
t=116.64s, k=3759 - Accuracy rate: 0.9305
t=118.10s, k=3799 - Accuracy rate: 0.932
t=125.24s, k=3839 - Accuracy rate: 0.9335
t=123.49s, k=3879 - Accuracy rate: 0.935
t=120.79s, k=3919 - Accuracy rate: 0.935
t=120.68s, k=3959 - Accuracy rate: 0.9335
----------[3879, 3883, 3887, 3891, 3895, 3899, 3903, 3907, 3911, 3915, 3919]----------
t=117.72s, k=3883 - Accuracy rate: 0.935
t=118.46s, k=3887 - Accuracy rate: 0.935
t=120.62s, k=3891 - Accuracy rate: 0.935
t=120.66s, k=3895 - Accuracy rate: 0.9345
t=120.08s, k=3899 - Accuracy rate: 0.9355
t=123.20s, k=3903 - Accuracy rate: 0.9355
t=122.83s, k=3907 - Accuracy rate: 0.9365
t=122.17s, k=3911 - Accuracy rate: 0.9365
t=119.37s, k=3915 - Accuracy rate: 0.936
----------[3907, 3908, 3909, 3910, 3911]----------
t=118.79s, k=3908 - Accuracy rate: 0.9365
t=118.53s, k=3909 - Accuracy rate: 0.9365
t=119.35s, k=3910 - Accuracy rate: 0.9365
----------[3907, 3908, 3909, 3910, 3911]----------
The optimal k is 3907 with an accuracy rate of 0.9365

def plot_dict(d: dict):
    # plot accuracy vs. k, sorted by k so the line does not zigzag
    items = sorted(d.items())
    x_value = [key for key, _ in items]
    y_value = [value for _, value in items]
    figure = plt.figure(figsize=(12, 4))
    temp = plt.plot(x_value, y_value, marker='.')
    temp = plt.title('Accuracy rate vs. k')
    plt.show()
plot_dict(accuracy_dict)

[Figure: accuracy rate vs. k across the evaluated values]

K-means with RBF kernel

x_train2, x_test2 = x[:500], x[500:1000]
y_train2, y_test2 = y[:500], y[500:1000]

def cal_RBF(x):
    # Build the RBF (Gaussian) kernel matrix for the rows of x
    n = x.shape[0]
    kernel_x = np.zeros((n, n))
    sigma = 0

    # Bandwidth heuristic: sigma = root-mean-square pairwise distance
    for i in range(n):
        for j in range(n):
            sigma += (np.linalg.norm(x[i] - x[j], ord=2))**2
    sigma = np.sqrt(sigma / (n**2))

    for i in range(n):
        for j in range(n):
            kernel_x[i, j] = np.exp(-(np.linalg.norm(x[i] - x[j], ord=2))**2 / (2 * sigma**2))
    return kernel_x
kernel_x = cal_RBF(x_train2)
# Each row of the kernel matrix is used as that point's feature vector,
# so running the same k_means on kernel_x approximates kernel K-means
clusters, cluster_center = k_means(kernel_x, 5)
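
The double loops in cal_RBF call np.linalg.norm n^2 times; a vectorized sketch of the same computation (not in the original) using the identity ||xi - xj||^2 = ||xi||^2 + ||xj||^2 - 2 xi.xj:

def cal_RBF_vectorized(x):
    # pairwise squared distances from the Gram matrix
    sq = np.sum(x**2, axis=1)
    d2 = sq[:, None] + sq[None, :] - 2 * (x @ x.T)
    d2 = np.maximum(d2, 0)        # clip tiny negatives from round-off
    sigma = np.sqrt(d2.mean())    # same RMS-distance bandwidth as above
    return np.exp(-d2 / (2 * sigma**2))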
def plot_clusters(clusters, y_train):
    # x-axis: data index, y-axis: cluster id, color: true digit label
    x_value = []
    y_value = []
    c_value = []
    for i in range(len(clusters)):
        for j in clusters[i]:
            x_value.append(j)
            y_value.append(i)
            c_value.append(y_train[j])
    # scale the figure height to the number of non-empty clusters
    figure_height = max(15*len(set(y_value))//len(x_value), 3)
    figure = plt.figure(figsize=(15, figure_height))
    temp = plt.scatter(x_value, y_value, marker='o', c=c_value)
    plt.show()
plot_clusters(clusters, y_train2)

[Figure: cluster assignments of the 500 training points, colored by true digit]