This MNIST machine learning project uses the MNIST dataset of handwritten digits to build models that classify images of the digits 0 through 9 with high accuracy, covering PCA, K-means clustering, and kernel methods.
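The cells below assume a flattened image matrix x (one 784-dimensional row per 28x28 image) and an integer label vector y are already in scope. The original loading cell is not shown; a minimal sketch using scikit-learn's fetch_openml, sized to match the 4000/2000 train/test split used later, might look like:

import numpy as np
from sklearn.datasets import fetch_openml

# hypothetical loading cell -- the notebook's actual data source is not shown
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
x = mnist.data[:6000] / 255.0          # flattened 28x28 images, scaled to [0, 1]
y = mnist.target[:6000].astype(int)    # digit labels 0-9 as integers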
import random
import numpy as np

seed = 0
random.seed(seed)
loss = []  # records centroid movement across k-means iterations
def k_means(x, k=2, epochs=600, print_initial_center=False):
    '''
    Parameters:
        x: input data, shape (n_samples, n_features)
        k: number of clusters
        epochs: maximum number of iterations
        print_initial_center: if True, print the labels of the initial centroids
    '''
    # 1. Randomly initialize k centroids by picking k distinct data points
    np.random.seed(2024)
    clusters = []
    # replace=False -> no duplicate centroids
    random_list = np.random.choice(x.shape[0], k, replace=False)
    cluster_center = x[random_list]
    if print_initial_center:
        # uses the global label vector y for inspection only
        print('the initial cluster center is', len(set(y[random_list])), ' - ', y[random_list])
    for i in range(k):
        clusters.append([])
    # 2. Training
    for _ in range(epochs):
        new_clusters_center = np.zeros((k, x.shape[1]))
        for i in range(k):
            clusters[i] = []
        # a. E-step: assign every point to its nearest centroid
        for i in range(x.shape[0]):
            xi = x[i]
            # distance of xi to each centroid (L2 norm)
            distances = [np.linalg.norm(xi - cluster_center[j]) for j in range(k)]
            # c: index of the nearest centroid
            c = distances.index(min(distances))
            clusters[c].append(i)  # append this data point's index to its cluster
        # b. M-step: move each centroid to the mean of its cluster
        for i in range(k):
            if clusters[i]:
                new_clusters_center[i] = np.mean(x[clusters[i]], axis=0)
            else:
                # keep the old centroid if its cluster is empty (avoids a NaN mean)
                new_clusters_center[i] = cluster_center[i]
        # c. If the centroids did not change, stop; otherwise continue
        if np.all(new_clusters_center == cluster_center):
            return clusters, new_clusters_center
        # d. Update the centroids and record how much they moved
        loss.append(abs(np.sum(new_clusters_center - cluster_center)))
        cluster_center = new_clusters_center
    return clusters, cluster_center
k = 10
clusters, cluster_center = k_means(x, k)
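As a quick convergence check (a minimal sketch, not part of the original notebook), the global loss list populated inside k_means can be plotted against the iteration number:

import matplotlib.pyplot as plt

# sketch: visualize how much the centroids moved at each iteration of the k = 10 run
figure = plt.figure(figsize=(12, 3))
temp = plt.plot(loss, marker='.')
temp = plt.title('Centroid movement per k-means iteration')
plt.show()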
import matplotlib.pyplot as plt

# Create figure and plot each of the 10 centroids as a 28x28 grayscale image
figure = plt.figure(figsize=(16, 2))
for i in range(10):
    ax = figure.add_subplot(1, 10, i + 1)  # 1 row, 10 column layout
    pixels = cluster_center[i].reshape((28, 28))
    # show image
    temp = ax.imshow(pixels, cmap='gray')
    # turn off axis
    temp = ax.axis('off')
plt.show()
def k_mean_cluster_number(clusters, y):
    cluster_number = []
    for i in range(k):
        current_cluster = clusters[i]
        current_y = y[current_cluster]
        # find the most common digit in the cluster
        most_common = np.argmax(np.bincount(current_y))
        cluster_number.append(most_common)
    return cluster_number
k = 10
print(k_mean_cluster_number(clusters, y))
output: [0, 8, 2, 3, 1, 3, 7, 4, 8, 6]
With only 10 clusters, some digits (here 3 and 8) are the majority label of two clusters each, while 5 and 9 are never the majority label of any cluster.
# search for the best k by repeatedly narrowing the range of candidate values
import time

x_train, x_test = x[:4000], x[4000:]
y_train, y_test = y[:4000], y[4000:]
# Method:
# 1. split the k range into 10 pieces, compute the accuracy for each k, and cache it in a dictionary
# 2. take the 2 best values of k and split the range between them again
# 3. stop when no new k is evaluated
def k_mean_cluster_number(clusters, y):
    # use the most common digit in each cluster as the cluster's representative
    cluster_number = []
    for i in range(len(clusters)):
        current_cluster = clusters[i]
        current_y = y[current_cluster]
        # find the most common digit in the cluster
        most_common = np.argmax(np.bincount(current_y))
        cluster_number.append(most_common)
    return cluster_number
def predict(cluster_number, cluster_center, xi):
    # assign xi to the nearest centroid and return that cluster's representative digit
    distances = [np.linalg.norm(xi - cluster_center[j]) for j in range(len(cluster_center))]
    min_dis_index = distances.index(min(distances))
    y_pred = cluster_number[min_dis_index]
    return y_pred

def accuracy(clusters, cluster_center, y_train, x_test, y_test):
    # map each cluster to its majority label once, instead of once per test point
    cluster_number = k_mean_cluster_number(clusters, y_train)
    correct = 0
    for i in range(len(x_test)):
        y_pred = predict(cluster_number, cluster_center, x_test[i])
        if y_pred == y_test[i]:
            correct += 1
    return correct / len(x_test)
def get_n_number_from_range(start, end, n):
    # return up to n distinct, evenly spaced integers between start and end (inclusive)
    if end <= start:
        start, end = end, start
    l = np.linspace(start, end, n, dtype=int)
    l = np.unique(l)
    return list(l)
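For example, the first pass of the search below asks for 11 evenly spaced values of k between 2 and len(x_train) - 1 = 3999, which reproduces the first k_list in the log further down:

print(get_n_number_from_range(2, 3999, 11))
# [2, 401, 801, 1201, 1600, 2000, 2400, 2799, 3199, 3599, 3999]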
def get_accuracy_into_dict(x_train, y_train, x_test, y_test, k_list):
    # evaluate every k in k_list that is not yet cached in the global accuracy_dict;
    # return the number of newly evaluated values
    k_in_dict = []
    for k in k_list:
        if k not in accuracy_dict:
            start_time = time.time()
            clusters, cluster_center = k_means(x_train, k)
            accuracy_rate = accuracy(clusters, cluster_center, y_train, x_test, y_test)
            time_cost = time.time() - start_time
            print(f't={time_cost:6.2f}s, k={k:4} - Accuracy rate: {accuracy_rate}')
            accuracy_dict[k] = accuracy_rate
        else:
            k_in_dict.append(False)
    return len(k_list) - len(k_in_dict)
def get_max_2_from_dict(accuracy_dict):
    max_2 = sorted(accuracy_dict.items(), key=lambda item: item[1], reverse=True)[:2]
    result = [i[0] for i in max_2]
    result.sort()
    return result
# parameters
start = 2
end = len(x_train) - 1
divide = 10
tolerance = 10
accuracy_dict = {}

# run the search
while tolerance > 0:
    k_list = get_n_number_from_range(start, end, divide + 1)
    print(f'----------{k_list}----------')
    new_k_number = get_accuracy_into_dict(x_train, y_train, x_test, y_test, k_list)
    if new_k_number == 0:
        break
    start, end = get_max_2_from_dict(accuracy_dict)
    tolerance -= 1

# get the best k
best_k = get_max_2_from_dict(accuracy_dict)[0]
best_accuracy = accuracy_dict[best_k]
print(f'The optimal k is {best_k} with an accuracy rate of {best_accuracy}')
----------[2, 401, 801, 1201, 1600, 2000, 2400, 2799, 3199, 3599, 3999]----------
t=  0.98s, k=   2 - Accuracy rate: 0.2155
t= 46.17s, k= 401 - Accuracy rate: 0.879
t= 66.29s, k= 801 - Accuracy rate: 0.899
t= 77.24s, k=1201 - Accuracy rate: 0.9055
t= 76.35s, k=1600 - Accuracy rate: 0.906
t= 78.92s, k=2000 - Accuracy rate: 0.9105
t= 94.07s, k=2400 - Accuracy rate: 0.9175
t=110.12s, k=2799 - Accuracy rate: 0.925
t=126.86s, k=3199 - Accuracy rate: 0.928
t=116.47s, k=3599 - Accuracy rate: 0.9305
t= 90.95s, k=3999 - Accuracy rate: 0.933
----------[3599, 3639, 3679, 3719, 3759, 3799, 3839, 3879, 3919, 3959, 3999]----------
t=112.00s, k=3639 - Accuracy rate: 0.9305
t=115.66s, k=3679 - Accuracy rate: 0.93
t=115.42s, k=3719 - Accuracy rate: 0.9305
t=116.64s, k=3759 - Accuracy rate: 0.9305
t=118.10s, k=3799 - Accuracy rate: 0.932
t=125.24s, k=3839 - Accuracy rate: 0.9335
t=123.49s, k=3879 - Accuracy rate: 0.935
t=120.79s, k=3919 - Accuracy rate: 0.935
t=120.68s, k=3959 - Accuracy rate: 0.9335
----------[3879, 3883, 3887, 3891, 3895, 3899, 3903, 3907, 3911, 3915, 3919]----------
t=117.72s, k=3883 - Accuracy rate: 0.935
t=118.46s, k=3887 - Accuracy rate: 0.935
t=120.62s, k=3891 - Accuracy rate: 0.935
t=120.66s, k=3895 - Accuracy rate: 0.9345
t=120.08s, k=3899 - Accuracy rate: 0.9355
t=123.20s, k=3903 - Accuracy rate: 0.9355
t=122.83s, k=3907 - Accuracy rate: 0.9365
t=122.17s, k=3911 - Accuracy rate: 0.9365
t=119.37s, k=3915 - Accuracy rate: 0.936
----------[3907, 3908, 3909, 3910, 3911]----------
t=118.79s, k=3908 - Accuracy rate: 0.9365
t=118.53s, k=3909 - Accuracy rate: 0.9365
t=119.35s, k=3910 - Accuracy rate: 0.9365
----------[3907, 3908, 3909, 3910, 3911]----------
The optimal k is 3907 with an accuracy rate of 0.9365
def plot_dict(d: dict):
    x_value = list(d.keys())
    y_value = list(d.values())
    figure = plt.figure(figsize=(12, 4))
    temp = plt.plot(x_value, y_value, marker='.')
    temp = plt.title('Accuracy rate vs. k')
    plt.show()

plot_dict(accuracy_dict)
x_train2, x_test2 = x[:500], x[500:1000]
y_train2, y_test2 = y[:500], y[500:1000]
def cal_RBF(x):
    # RBF (Gaussian) kernel matrix: kernel_x[i, j] = exp(-||xi - xj||^2 / (2 * sigma^2))
    n = x.shape[0]
    kernel_x = np.zeros((n, n))
    # bandwidth heuristic: set sigma^2 to the mean squared pairwise distance
    sigma_sq = 0
    for i in range(n):
        for j in range(n):
            sigma_sq += np.linalg.norm(x[i] - x[j], ord=2) ** 2
    sigma_sq = sigma_sq / (n ** 2)
    for i in range(n):
        for j in range(n):
            # note: sigma_sq already holds sigma^2, so it is not squared again here
            kernel_x[i, j] = np.exp(-np.linalg.norm(x[i] - x[j], ord=2) ** 2 / (2 * sigma_sq))
    return kernel_x
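The double Python loop above costs O(n^2) interpreted iterations. The same kernel, under the same mean-squared-distance bandwidth heuristic, can be computed with vectorized NumPy; cal_RBF_vectorized below is a sketch, not part of the original notebook:

def cal_RBF_vectorized(x):
    # pairwise squared Euclidean distances via ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_norms = np.sum(x ** 2, axis=1)
    d2 = sq_norms[:, None] + sq_norms[None, :] - 2 * (x @ x.T)
    d2 = np.maximum(d2, 0)       # clip tiny negatives caused by floating-point rounding
    sigma_sq = d2.mean()         # same bandwidth heuristic: sigma^2 = mean squared distance
    return np.exp(-d2 / (2 * sigma_sq))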
kernel_x = cal_RBF(x_train2)
# run ordinary k-means on the rows of the kernel matrix: each sample is now
# represented by its similarity to every other sample
clusters, cluster_center = k_means(kernel_x, 5)
def plot_clusters(clusters, y_train):
    # scatter plot: x = sample index, y = cluster index, color = true digit label
    x_value = []
    y_value = []
    c_value = []
    for i in range(len(clusters)):
        for j in clusters[i]:
            x_value.append(j)
            y_value.append(i)
            c_value.append(y_train[j])
    figure_height = max(15 * len(set(y_value)) // len(x_value), 3)
    figure = plt.figure(figsize=(15, figure_height))
    temp = plt.scatter(x_value, y_value, marker='o', c=c_value)
    plt.show()

plot_clusters(clusters, y_train2)