In this tutorial we will look at some of the scikit-learn libraries for clustering. You can read more about clustering in scikit-learn here: http://scikit-learn.org/stable/modules/clustering.html
import numpy as np
import scipy as sp
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import sklearn.cluster as sk_cluster
import sklearn.feature_extraction.text as sk_text
import scipy.cluster.hierarchy as hr
import time
import seaborn as sns
%matplotlib inline
Generate data from Gaussian distributions.
More on data generation here: http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html
# Explicit cluster centers; only used by the commented-out alternative call below.
centers = [[1,1], [-1, -1], [1, -1]]
# Sample 500 two-dimensional points around 3 randomly placed centers.
# random_state=0 keeps the sample reproducible between runs.
X, true_labels = sk_data.make_blobs(n_samples=500, centers=3, n_features=2,
                                    center_box=(-10.0, 10.0),random_state=0)
#X, true_labels = sk_data.make_blobs(n_samples=500, centers=centers, n_features=2,center_box=(-10.0, 10.0),cluster_std = 0.4, random_state=0)

# All points in a single colour, then a quick look at the labels.
plt.scatter(X[:,0], X[:,1])
print(type(X))
print(true_labels)
# Number of points generated for class 0, 1 and 2.
print(len(true_labels[true_labels==0]),len(true_labels[true_labels==1]),len(true_labels[true_labels==2]))

# One scatter call per true class. Successive calls draw on the same axes by
# default, so the old `plt.hold` toggle (removed in matplotlib 3.0, where
# touching it raises AttributeError) is not needed.
for class_id, colour in ((1, 'r'), (2, 'b'), (0, 'g')):
    in_class = true_labels == class_id
    plt.scatter(X[in_class,0], X[in_class,1], c=colour)

# Pairwise Euclidean distances between all points, shown as a heatmap.
euclidean_dists = metrics.euclidean_distances(X)
plt.pcolor(euclidean_dists,cmap=plt.cm.coolwarm)
scikit-learn has a huge set of tools for unsupervised learning generally, and clustering specifically. These are in sklearn.cluster. http://scikit-learn.org/stable/modules/clustering.html
There are three main methods in the clustering classes:
fit() builds the model from the training data (e.g. for k-means, it finds the centroids),
predict() assigns labels to the data after building the model, and
fit_predict() does both at the same time (e.g. in k-means, it finds the centroids and assigns the labels to the dataset).
More on the k-means clustering here: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
import sklearn.cluster as sk_cluster

# k-means with k=3: k-means++ seeding, 10 restarts (n_init=10).
kmeans = sk_cluster.KMeans(n_clusters=3, init='k-means++', n_init=10)
# fit_predict() fits the model and hands back the labels stored in labels_.
kmeans_labels = kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
error = kmeans.inertia_

print("The total error of the clustering is: ", error)
print('\nCluster labels')
print(kmeans_labels)
print('\n Cluster Centroids')
print(centroids)

# Reorder the points so members of the same cluster are adjacent, then
# recompute the distance matrix on the reordered data and plot it.
idx = np.argsort(kmeans_labels)  # indices that sort the labels
rX = X[idx]
r_euclid = metrics.euclidean_distances(rX)
#r_euclid = euclidean_dists[idx,:][:,idx]
plt.pcolor(r_euclid, cmap=plt.cm.coolwarm)
Confusion matrix: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
Important: In the produced confusion matrix, the first list defines the rows and the second the columns. The matrix is always square, regardless of whether the number of classes and the number of clusters are the same. The extra rows or columns are filled with zeros.
Homogeneity and completeness: http://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness
Silhouette score: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
# Confusion matrix: true classes define the rows, k-means clusters the columns.
C = metrics.confusion_matrix(true_labels, kmeans_labels)
print(C)
#plt.pcolor(C,cmap=plt.cm.coolwarm)
plt.pcolor(C, cmap=plt.cm.Reds)

# Per-class precision and recall (average=None returns one value per class).
# The metrics assume that cluster i is mapped to class i.
p = metrics.precision_score(y_true=true_labels, y_pred=kmeans_labels, average=None)
print(p)
r = metrics.recall_score(y_true=true_labels, y_pred=kmeans_labels, average=None)
print(r)
def cluster_class_mapping(kmeans_labels,true_labels):
    """Relabel each cluster with the class that contributes most of its points.

    Args:
        kmeans_labels: cluster id assigned to every point; each id is used
            as an index into the columns of the confusion matrix.
        true_labels: ground-truth class of every point.

    Returns:
        A pair ``(mapped_labels, confusion)``: the list of per-point labels
        after replacing every cluster id by its majority class, and the
        confusion matrix of the true labels against those mapped labels.

    Note:
        Several clusters may map to the same class; nothing here prevents it,
        so callers need to be careful in that case.
    """
    confusion = metrics.confusion_matrix(true_labels, kmeans_labels)
    # Each column is one cluster; the row holding its largest count is the
    # class that cluster is mapped to.
    best_class = confusion.argmax(axis=0)
    relabelled = [best_class[cluster] for cluster in kmeans_labels]
    remapped_confusion = metrics.confusion_matrix(true_labels, relabelled)
    return relabelled, remapped_confusion
# Replace every cluster id by its majority class and show the new confusion matrix.
mapped_kmeans_labels, C = cluster_class_mapping(kmeans_labels, true_labels)
print(C)

# Homogeneity, completeness and V-measure, computed on the raw cluster ids.
h = metrics.homogeneity_score(true_labels, kmeans_labels)
print(h)
c = metrics.completeness_score(true_labels, kmeans_labels)
print(c)
v = metrics.v_measure_score(true_labels, kmeans_labels)
print(v)

# Precision and recall on the mapped labels: first one value per class
# (average=None), then a single 'weighted' average.
for averaging in (None, 'weighted'):
    p = metrics.precision_score(true_labels, mapped_kmeans_labels, average=averaging)
    print(p)
    r = metrics.recall_score(true_labels, mapped_kmeans_labels, average=averaging)
    print(r)
# Index k of each array holds the result for k clusters; slot 0 is never filled.
error = np.zeros(11)
sh_score = np.zeros(11)
for k in range(1, 11):
    kmeans = sk_cluster.KMeans(n_clusters=k, init='k-means++', n_init=10)
    kmeans.fit_predict(X)
    error[k] = kmeans.inertia_
    # The silhouette score is only computed from two clusters upwards.
    if k > 1:
        sh_score[k] = metrics.silhouette_score(X, kmeans.labels_)

# Clustering error as a function of k (k = 1..10).
plt.plot(range(1, error.size), error[1:])
plt.xlabel('Number of clusters')
plt.ylabel('Error')

# Silhouette score as a function of k (k = 2..10).
plt.plot(range(2, sh_score.size), sh_score[2:])
plt.xlabel('Number of clusters')
plt.ylabel('silhouette score')

# Colour palette indexed by cluster id. kmeans_labels still holds the k=3
# assignment computed before this sweep, not the labels of the last model.
colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
colors = np.hstack([colors] * 20)
plt.scatter(X[:, 0], X[:, 1], color=colors[kmeans_labels].tolist(), s=10, alpha=0.8)
More on Agglomerative Clustering here: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
# Complete-linkage agglomerative clustering, cut at 3 clusters.
agglo = sk_cluster.AgglomerativeClustering(n_clusters=3, linkage='complete')
agglo_labels = agglo.fit_predict(X)

# Confusion matrix of true classes against the raw cluster ids.
C_agglo = metrics.confusion_matrix(true_labels, agglo_labels)
print(C_agglo)
#plt.pcolor(C_agglo,cmap=plt.cm.coolwarm)
plt.pcolor(C_agglo, cmap=plt.cm.Reds)

# Same evaluation after mapping each cluster to its majority class.
mapped_agglo_labels, C_agglo = cluster_class_mapping(agglo_labels, true_labels)
print(C_agglo)
p = metrics.precision_score(true_labels, mapped_agglo_labels, average='weighted')
print(p)
r = metrics.recall_score(true_labels, mapped_agglo_labels, average='weighted')
print(r)
Another way to do agglomerative clustering using SciPy:
https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
import scipy.cluster.hierarchy as hr
import scipy.spatial.distance as sp_dist

# Variant 1: pass the observations and let linkage() compute the distances.
Z = hr.linkage(X, method='complete', metric='euclidean')
print(Z.shape, X.shape)

# Variant 2: compute the pairwise distances with pdist() first and pass
# those to linkage() instead of the raw observations.
D = sp_dist.pdist(X, 'euclidean')
Z = hr.linkage(D, method='complete')
print(Z.shape, X.shape)
Hierarchical clustering returns an (n-1) by 4 matrix Z. At the i-th iteration, clusters with indices Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A cluster with an index less than n corresponds to one of the n original observations. The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The fourth value Z[i, 3] represents the number of original observations in the newly formed cluster.
# Draw the dendrogram of the linkage matrix Z on a 10x10 figure.
fig = plt.figure(figsize=(10, 10))
T = hr.dendrogram(
    Z,
    color_threshold=0.4,
    leaf_font_size=4,
)
fig.show()
Another way to do agglomerative clustering (and visualizing it): http://seaborn.pydata.org/generated/seaborn.clustermap.html
# Cluster the pairwise-distance matrix with complete linkage and draw it
# as a heatmap with dendrograms.
distances = metrics.euclidean_distances(X)
cg = sns.clustermap(
    distances,
    method="complete",
    figsize=(13, 13),
    xticklabels=False,
)
# Column order after reordering by the column dendrogram.
print(cg.dendrogram_col.reordered_ind)
More on DBSCAN here: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
dbscan = sk_cluster.DBSCAN(eps=0.3)
dbscan_labels = dbscan.fit_predict(X)
print(dbscan_labels)  # label -1 corresponds to noise

# Shift every label up by one so the noise label -1 becomes 0.
renamed_dbscan_labels = [label + 1 for label in dbscan_labels]
C = metrics.confusion_matrix(true_labels, renamed_dbscan_labels)
# Print only the rows that belong to the true classes.
n_classes = max(true_labels) + 1
print(C[:n_classes, :])

# Palette indexed by the raw DBSCAN label; index -1 (noise) selects the
# last entry of the array, which is 'k'.
colors = np.array(list('bgrcmywk' * 10))
plt.scatter(X[:, 0], X[:, 1], color=colors[dbscan_labels].tolist(), s=10, alpha=0.8)
So far we have assumed that the input is in the form of numerical vectors to which we can directly apply the algorithms we have. Often the data will be more complex. For example, what if we want to cluster categorical data, itemsets, or text? Python provides libraries for processing the data and transforming them to a format that we can use.
Python offers a set of tools for extracting features:http://scikit-learn.org/stable/modules/feature_extraction.html
The DictVectorizer feature extraction: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html#sklearn.feature_extraction.DictVectorizer
The DictVectorizer takes a dictionary of attribute-value pairs and transforms them into numerical vectors. Real values are preserved, while categorical attributes are transformed into binary. The vectorizer produces a sparse representation.
# Example 1: one categorical attribute ('city') and one numeric ('temperature').
measurements = [
{'city': 'Dubai', 'temperature': 45},
{'city': 'London', 'temperature': 12},
{'city': 'San Fransisco', 'temperature': 23},
]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
# Fit once and reuse the result rather than re-fitting for every print.
features = vec.fit_transform(measurements)
print(type(features))
print(features.toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. As a bare expression this is only displayed when it is
# the last statement of a notebook cell.
vec.get_feature_names_out()

# Example 2: two categorical attributes and one numeric attribute.
measurements = [
    {'refund' : 'No','marital_status': 'married', 'income' : 100},
    {'refund' : 'Yes','marital_status': 'single', 'income' : 120},
    {'refund' : 'No','marital_status':'divorced', 'income' : 80},
]
vec = DictVectorizer()
# Printed without toarray(), i.e. in the vectorizer's sparse output format.
print(vec.fit_transform(measurements))
vec.get_feature_names_out()
Feature extraction from text: http://scikit-learn.org/stable/modules/classes.html#text-feature-extraction-ref
SciKit datasets: http://scikit-learn.org/stable/datasets/
We will use the 20-newsgroups datasets which consists of postings on 20 different newsgroups.
More information here: http://scikit-learn.org/stable/datasets/#the-20-newsgroups-text-dataset
from sklearn.datasets import fetch_20newsgroups

# The three newsgroups to load.
categories = ['comp.os.ms-windows.misc', 'sci.space','rec.sport.baseball']
#categories = ['alt.atheism', 'sci.space','rec.sport.baseball']

# Training split only, with headers, footers and quoted replies removed.
news_data = sk_data.fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Inspect what was loaded.
#print (news_data.target, len(news_data.target))
print(news_data.target_names)
print(type(news_data))
print(news_data.filenames)
print(news_data.target[:10])   # labels of the first ten posts
print(news_data.data[0])       # text of the first post
print(len(news_data.data))     # number of posts
The CountVectorizer can be used to extract features in the form of bag of words. It is typically used for text, but you could use it to represent also a collection of itemsets (where each itemset will become a word).
import sklearn.feature_extraction.text as sk_text

# Bag-of-words counts; min_df=1 keeps every term that appears in at least one document.
vectorizer = sk_text.CountVectorizer(min_df=1)
#vectorizer = sk_text.CountVectorizer(min_df=1,stop_words = 'english')
corpus = ['This is the first document.',
           'this is the second second document.',
           'And the third one.',
           'Is this the first document?',
          ]
# NOTE: X is rebound here from the blob data to the document-term matrix.
X = vectorizer.fit_transform(corpus)
print(X.toarray())
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. As a bare expression this is only displayed when it is
# the last statement of a notebook cell.
vectorizer.get_feature_names_out()
TfidfVectorizer transforms text into a sparse matrix where rows are texts, columns are words, and values are the tf-idf values. It performs tokenization and normalization, and can optionally remove stop-words. More here: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
# tf-idf on the toy corpus, with no stop-word removal and no feature cap.
vectorizer = sk_text.TfidfVectorizer(
                            #stop_words='english',
                             #max_features = 1000,
                             min_df=1)
X = vectorizer.fit_transform(corpus)
print(X.toarray())
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(vectorizer.get_feature_names_out())

# tf-idf on the newsgroup posts: English stop words removed, and only terms
# that pass the min_df=4 / max_df=0.8 document-frequency limits are kept.
vectorizer = sk_text.TfidfVectorizer(stop_words='english',
                             #max_features = 1000,
                             min_df=4, max_df=0.8)
data = vectorizer.fit_transform(news_data.data)
print(type(data))
An example of what we want to do: http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
import sklearn.cluster as sk_cluster

# --- k-means on the tf-idf matrix ---
k=3
kmeans = sk_cluster.KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
kmeans.fit_predict(data)

print("Top terms per cluster:")
# argsort() orders each centroid's term indices by ascending weight;
# reversing the columns puts the highest-weight terms first.
asc_order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1]
order_centroids = asc_order_centroids[:,::-1]
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print ("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print (' %s' % terms[ind])
    # Blank line between clusters. A bare `print` is a Python 2 statement and
    # prints nothing in Python 3, so it has to be called.
    print()

# Map every cluster to its majority newsgroup and evaluate. The unmapped
# confusion matrix that used to be computed here was overwritten right away,
# so it has been dropped.
mapped_kmeans_labels,C = cluster_class_mapping(kmeans.labels_,news_data.target)
print (C)
p = metrics.precision_score(news_data.target,mapped_kmeans_labels, average=None)
print(p)
r = metrics.recall_score(news_data.target,mapped_kmeans_labels, average = None)
print(r)

# --- agglomerative clustering ---
agglo = sk_cluster.AgglomerativeClustering(linkage = 'complete', n_clusters = 3,)
# Agglomerative clustering needs dense data. toarray() returns a plain
# ndarray; todense() returns np.matrix, which scikit-learn >= 1.2 rejects.
dense = data.toarray()
agglo_labels = agglo.fit_predict(dense)
C_agglo= metrics.confusion_matrix(news_data.target,agglo_labels)
print (C_agglo)

# --- DBSCAN ---
dbscan = sk_cluster.DBSCAN(eps=0.1)
dbscan_labels = dbscan.fit_predict(data)
# NOTE: the labels are passed as-is, so a noise label of -1 (if any point is
# marked as noise) is included among the matrix's labels.
C = metrics.confusion_matrix(news_data.target,dbscan.labels_)
print (C)