Introduction to Clustering

In this tutorial we will look at some of the clustering tools that scikit-learn provides. You can read more about clustering in scikit-learn here:

http://scikit-learn.org/stable/modules/clustering.html

In [1]:
import numpy as np
import scipy as sp
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist

import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import sklearn.cluster as sk_cluster
import sklearn.feature_extraction.text as sk_text


import scipy.cluster.hierarchy as hr

import time
import seaborn as sns

%matplotlib inline

Generate data from Gaussian distributions.

More on data generation here: http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html

In [2]:
centers = [[1,1], [-1, -1], [1, -1]]
X, true_labels = sk_data.make_blobs(n_samples=500, centers=3, n_features=2,center_box=(-10.0, 10.0),random_state=0)
#X, true_labels = sk_data.make_blobs(n_samples=500, centers=centers, n_features=2,center_box=(-10.0, 10.0),cluster_std = 0.4, random_state=0)
plt.scatter(X[:,0], X[:,1])
Out[2]:
<matplotlib.collections.PathCollection at 0x1699080>
In [3]:
print(type(X))
print(true_labels)
<class 'numpy.ndarray'>
[0 2 2 0 0 0 0 2 2 0 2 2 2 2 0 0 1 0 2 0 1 1 1 2 0 2 1 2 0 2 2 0 0 0 1 1 1
 1 1 1 2 0 0 1 1 0 1 0 2 2 0 0 0 0 2 2 1 1 0 2 1 2 1 1 1 2 0 1 2 0 2 0 0 2
 1 2 1 1 1 0 0 0 2 0 2 1 2 2 2 2 0 0 1 0 2 1 2 2 2 0 1 2 2 0 0 1 0 1 0 1 0
 0 1 2 2 1 2 1 1 2 2 0 2 2 2 0 2 0 2 0 1 0 2 1 0 1 2 2 1 0 0 2 2 1 0 0 0 1
 1 0 1 0 0 0 1 0 2 1 2 0 1 1 2 2 2 1 0 1 0 1 2 2 1 0 0 2 2 1 1 1 1 1 2 2 1
 1 0 1 0 2 2 0 1 1 0 2 1 0 2 1 2 1 0 0 2 1 1 1 2 2 0 1 1 2 2 0 2 0 2 2 1 2
 1 1 0 0 0 1 2 0 0 2 2 1 2 2 0 1 0 0 0 1 1 1 0 2 2 2 2 2 1 2 2 0 2 2 0 1 2
 1 1 0 0 1 1 0 2 1 2 1 1 2 1 0 0 1 0 1 1 1 1 2 1 0 0 0 0 2 2 1 1 2 2 0 0 1
 2 0 2 1 0 1 2 1 0 2 0 1 0 2 1 2 2 0 0 0 1 2 2 0 0 1 2 1 0 0 1 1 0 2 1 0 1
 2 1 1 0 2 0 2 1 2 1 0 0 0 1 0 0 2 1 0 2 2 2 0 1 1 1 2 0 1 2 0 0 0 2 0 2 0
 2 2 0 2 2 2 2 1 1 2 1 2 2 2 2 0 0 0 1 2 0 1 0 1 0 1 2 2 0 2 1 0 1 2 2 0 1
 2 1 2 0 0 0 1 2 0 0 1 2 2 0 2 1 0 2 0 1 0 2 0 0 1 0 0 0 0 1 0 1 2 1 1 0 2
 1 2 1 2 1 0 2 1 1 1 1 1 0 2 1 2 0 0 1 2 2 0 2 1 0 0 1 1 2 1 2 1 1 1 1 2 0
 1 1 0 1 2 2 0 1 1 2 0 2 0 0 1 1 1 0 1]
In [4]:
# successive scatter calls draw on the same axes, so the three classes are overlaid
plt.scatter(X[true_labels==1,0], X[true_labels==1,1], c='r')
plt.scatter(X[true_labels==2,0], X[true_labels==2,1], c='b')
plt.scatter(X[true_labels==0,0], X[true_labels==0,1], c='g')
Out[4]:
<matplotlib.collections.PathCollection at 0x186a2e8>
In [5]:
euclidean_dists = metrics.euclidean_distances(X)
plt.pcolor(euclidean_dists,cmap=plt.cm.coolwarm)
Out[5]:
<matplotlib.collections.PolyCollection at 0x18dd438>

Clustering

scikit-learn has a huge set of tools for unsupervised learning generally, and clustering specifically. These are in sklearn.cluster. http://scikit-learn.org/stable/modules/clustering.html

The clustering classes provide three main methods:

  • fit(),
  • predict(),
  • fit_predict().

fit() builds the model from the training data (e.g. for kmeans, it finds the centroids),

predict() assigns labels to the data after building the model, and

fit_predict() does both on the same data (e.g., in k-means, it finds the centroids and assigns the labels to the dataset). Note that some algorithms (e.g., AgglomerativeClustering and DBSCAN) only provide fit() and fit_predict(), since they do not assign labels to unseen points.
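
For example (a minimal sketch using the blobs X generated above), for k-means, fit() followed by predict() on the same data produces the labels that fit_predict() returns in one call:

km = sk_cluster.KMeans(init='k-means++', n_clusters=3, n_init=10, random_state=0)
km.fit(X)                     # learn the model (the centroids)
labels_a = km.predict(X)      # assign each point to its nearest centroid
labels_b = km.fit_predict(X)  # fit and assign in a single call
print((labels_a == labels_b).all())  # the two labelings agree (runs are deterministic with a fixed random_state)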

In [6]:
import sklearn.cluster as sk_cluster

kmeans = sk_cluster.KMeans(init='k-means++', n_clusters=3, n_init=10)
kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
kmeans_labels = kmeans.labels_
error = kmeans.inertia_

print ("The total error of the clustering is: ", error)
print ('\nCluster labels')
print(kmeans_labels)
print ('\n Cluster Centroids')
print (centroids)
The total error of the clustering is:  881.748277487

Cluster labels
[0 1 1 0 1 0 1 1 1 0 1 1 1 1 0 0 2 0 1 0 0 2 2 1 0 1 2 1 0 1 1 0 0 1 2 2 2
 2 2 2 1 0 0 2 2 0 2 0 1 1 0 0 0 0 1 1 2 2 0 2 2 1 2 2 2 1 2 2 1 0 1 0 0 1
 2 1 2 2 2 0 0 0 1 0 1 2 1 1 1 1 0 0 2 0 1 2 1 0 1 0 2 1 1 0 0 2 0 2 0 2 0
 0 2 1 1 2 1 2 2 1 1 2 0 1 1 0 1 0 1 0 2 0 1 2 0 2 1 1 2 0 0 0 1 2 0 0 0 2
 2 0 2 0 0 0 2 0 1 2 1 0 2 2 1 1 1 2 0 2 0 2 1 1 2 0 0 1 1 2 2 2 2 2 1 1 2
 2 0 2 1 1 1 0 2 2 0 1 2 0 1 2 1 2 0 0 1 2 2 2 1 1 0 2 2 1 1 0 1 0 1 1 2 1
 2 2 0 0 0 2 1 0 0 1 1 2 1 1 0 2 0 0 0 2 2 2 0 1 1 1 1 1 2 1 1 0 1 1 0 2 1
 2 2 0 0 2 2 0 1 2 1 2 2 1 2 0 1 2 0 2 2 2 2 1 2 0 0 0 0 1 1 1 2 1 1 0 0 2
 1 0 1 2 0 2 1 2 0 1 0 2 0 1 2 1 1 0 1 0 2 1 1 0 0 2 0 2 0 0 0 2 0 1 2 2 2
 1 2 2 0 1 1 1 2 1 2 0 0 0 2 0 1 1 2 0 1 1 1 2 2 2 2 1 0 2 1 0 0 0 2 0 1 0
 1 1 0 1 1 1 1 2 2 1 2 0 1 0 1 0 0 1 2 1 1 2 0 2 0 2 1 1 0 0 2 0 2 1 1 0 2
 1 2 1 0 0 1 2 2 0 0 2 1 1 0 1 2 0 1 0 2 0 1 0 0 2 0 0 0 0 2 0 2 1 2 1 0 1
 2 1 2 1 2 0 1 2 2 2 2 2 0 1 2 1 0 0 2 1 1 1 1 2 0 0 2 2 0 2 1 2 2 1 2 0 0
 2 2 0 2 1 1 0 2 2 1 0 1 0 0 2 2 2 0 2]

 Cluster Centroids
[[ 0.87564159  4.45514163]
 [-1.52371332  2.92068825]
 [ 1.96167358  0.73752985]]
In [7]:
idx = np.argsort(kmeans_labels) # returns the indices in sorted order
rX = X[idx,:]
r_euclid = metrics.euclidean_distances(rX)
#r_euclid = euclidean_dists[idx,:][:,idx]
plt.pcolor(r_euclid,cmap=plt.cm.coolwarm)
Out[7]:
<matplotlib.collections.PolyCollection at 0xe367390>
In [8]:
C= metrics.confusion_matrix(true_labels,kmeans_labels)
print (C)
#plt.pcolor(C,cmap=plt.cm.coolwarm)
plt.pcolor(C,cmap=plt.cm.Reds)
[[151  12   4]
 [  2   3 162]
 [  9 154   3]]
Out[8]:
<matplotlib.collections.PolyCollection at 0x19be5710>
In [9]:
p = metrics.precision_score(true_labels,kmeans_labels, average=None)
print(p)
r = metrics.recall_score(true_labels,kmeans_labels, average = None)
print(r)
[ 0.93209877  0.01775148  0.01775148]
[ 0.90419162  0.01796407  0.01807229]
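
The per-class precision and recall look poor only because the cluster ids assigned by k-means are arbitrary and do not match the true class ids. Below, each cluster is mapped to the true class it overlaps with the most (the argmax of the corresponding column of the confusion matrix) before recomputing the scores.
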
In [10]:
mapping = list(np.argmax(C,axis=0))
print(mapping)
mapped_kmeans_labels = [0 for i in range(len(kmeans_labels))]
for i in range(len(kmeans_labels)):
    mapped_kmeans_labels[i] = mapping[kmeans_labels[i]]
    
C2= metrics.confusion_matrix(true_labels,mapped_kmeans_labels)
print (C2)
#plt.pcolor(C,cmap=plt.cm.coolwarm)
plt.pcolor(C2,cmap=plt.cm.coolwarm) 
C = C2
[0, 2, 1]
[[151   4  12]
 [  2 162   3]
 [  9   3 154]]
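
Homogeneity, completeness, and the V-measure, computed next, compare the two labelings directly and are invariant to how the cluster ids are named, so they do not require the mapping step above.
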
In [11]:
h = metrics.homogeneity_score(true_labels,kmeans_labels)
print(h)
c = metrics.completeness_score(true_labels,kmeans_labels)
print(c)
v = metrics.v_measure_score(true_labels,kmeans_labels)
print(v)
p = metrics.precision_score(true_labels,mapped_kmeans_labels, average=None)
print(p)
r = metrics.recall_score(true_labels,mapped_kmeans_labels, average = None)
print(r)
p = metrics.precision_score(true_labels,mapped_kmeans_labels, average='weighted')
print(p)
r = metrics.recall_score(true_labels,mapped_kmeans_labels, average = 'weighted')
print(r)
0.749703757499
0.749835439427
0.749769592681
[ 0.93209877  0.95857988  0.9112426 ]
[ 0.90419162  0.97005988  0.92771084]
0.934019212506
0.934
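
To pick the number of clusters we can run k-means for a range of values of k and plot the k-means objective (inertia) against k, looking for an "elbow" where the error stops decreasing sharply.
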
In [12]:
error = np.zeros(11)
error[0] = 0
for k in range(1,11):
    kmeans = sk_cluster.KMeans(init='k-means++', n_clusters=k, n_init=10)
    kmeans.fit_predict(X)
    error[k] = kmeans.inertia_

plt.plot(range(1,len(error)),error[1:])
plt.xlabel('Number of clusters')
plt.ylabel('Error')
Out[12]:
<matplotlib.text.Text at 0x19c24780>
In [13]:
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
plt.scatter(X[:, 0], X[:, 1], color=colors[kmeans_labels].tolist(), s=10, alpha=0.8)
Out[13]:
<matplotlib.collections.PathCollection at 0x19d19518>

Agglomerative Clustering

More on Agglomerative Clustering here: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html

In [14]:
agglo = sk_cluster.AgglomerativeClustering(linkage = 'complete', n_clusters = 3)
agglo_labels = agglo.fit_predict(X)

C_agglo= metrics.confusion_matrix(true_labels,agglo_labels)
print (C_agglo)
#plt.pcolor(C_agglo,cmap=plt.cm.coolwarm)
plt.pcolor(C_agglo,cmap=plt.cm.Reds)
[[ 12  23 132]
 [159   7   1]
 [  3 149  14]]
Out[14]:
<matplotlib.collections.PolyCollection at 0x19d7bba8>

Another way to do agglomerative clustering using SciPy:

https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html

In [15]:
import scipy.cluster.hierarchy as hr

Z = hr.linkage(X, method='complete', metric='euclidean')

print (Z.shape, X.shape)
(499, 4) (500, 2)
In [16]:
import scipy.spatial.distance as sp_dist
D = sp_dist.pdist(X, 'euclidean') 
Z = hr.linkage(D, method='complete')
print (Z.shape, X.shape)
(499, 4) (500, 2)

Hierarchical clustering returns an (n-1) by 4 matrix Z. At the i-th iteration, clusters with indices Z[i, 0] and Z[i, 1] are combined to form cluster n + i. A cluster with an index less than n corresponds to one of the n original observations. The distance between clusters Z[i, 0] and Z[i, 1] is given by Z[i, 2]. The fourth value Z[i, 3] represents the number of original observations in the newly formed cluster.
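
To extract a flat clustering from the linkage matrix, SciPy's fcluster can cut the hierarchy at a desired number of clusters (a minimal sketch, assuming the Z computed above):

hr_labels = hr.fcluster(Z, t=3, criterion='maxclust')  # cut the dendrogram into 3 flat clusters (labels are 1..3)
print(hr_labels[:20])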

In [17]:
fig = plt.figure(figsize=(10,10))
T = hr.dendrogram(Z,color_threshold=0.4, leaf_font_size=4)
fig.show()

Another way to do agglomerative clustering (and visualizing it): http://seaborn.pydata.org/generated/seaborn.clustermap.html

In [18]:
distances = metrics.euclidean_distances(X)
cg = sns.clustermap(distances, method="complete", figsize=(13,13), xticklabels=False)
print (cg.dendrogram_col.reordered_ind)
[56, 199, 237, 179, 233, 358, 303, 452, 143, 478, 147, 440, 74, 223, 16, 388, 467, 267, 451, 44, 406, 290, 201, 422, 92, 100, 60, 107, 342, 356, 269, 458, 327, 393, 275, 165, 277, 135, 272, 117, 301, 227, 95, 172, 193, 37, 444, 310, 414, 279, 332, 57, 484, 21, 26, 46, 350, 184, 488, 192, 264, 340, 78, 489, 66, 178, 448, 220, 408, 160, 180, 499, 473, 130, 36, 22, 150, 34, 133, 177, 118, 181, 436, 109, 307, 380, 76, 241, 263, 455, 260, 167, 295, 446, 470, 169, 316, 400, 462, 38, 62, 242, 334, 476, 250, 417, 243, 402, 77, 321, 196, 475, 257, 85, 361, 67, 377, 378, 413, 481, 115, 323, 431, 63, 357, 206, 112, 471, 335, 105, 482, 212, 280, 205, 211, 438, 183, 96, 443, 447, 2, 55, 329, 291, 490, 379, 249, 374, 114, 120, 450, 87, 163, 268, 30, 287, 25, 119, 128, 441, 219, 311, 333, 13, 88, 302, 337, 348, 93, 479, 141, 286, 483, 363, 411, 234, 486, 170, 200, 309, 4, 457, 463, 382, 124, 373, 82, 421, 176, 228, 164, 248, 48, 288, 162, 318, 235, 459, 126, 370, 182, 190, 213, 466, 396, 11, 116, 7, 75, 376, 445, 246, 70, 353, 18, 198, 8, 341, 368, 305, 292, 298, 485, 29, 247, 339, 175, 389, 27, 254, 407, 424, 49, 216, 439, 281, 464, 102, 271, 371, 54, 123, 492, 10, 195, 23, 171, 397, 245, 428, 89, 101, 296, 137, 204, 142, 255, 354, 384, 73, 317, 474, 1, 258, 189, 251, 132, 362, 12, 218, 231, 418, 209, 40, 86, 136, 404, 352, 349, 65, 84, 208, 47, 324, 430, 149, 104, 229, 351, 45, 394, 285, 293, 224, 31, 225, 140, 313, 42, 17, 217, 381, 359, 383, 33, 412, 221, 312, 325, 442, 152, 72, 240, 108, 174, 308, 80, 367, 194, 5, 186, 0, 106, 365, 423, 344, 449, 433, 41, 91, 215, 210, 173, 491, 58, 127, 369, 297, 315, 392, 153, 437, 145, 304, 71, 168, 111, 110, 364, 410, 52, 435, 203, 32, 498, 336, 306, 469, 256, 416, 461, 81, 155, 386, 429, 129, 415, 43, 496, 187, 330, 355, 154, 497, 395, 495, 282, 331, 121, 494, 79, 300, 372, 276, 244, 487, 425, 460, 19, 103, 345, 273, 434, 283, 480, 166, 262, 253, 328, 50, 83, 69, 420, 493, 53, 226, 347, 360, 146, 265, 398, 432, 326, 24, 236, 125, 238, 51, 239, 131, 385, 151, 320, 15, 139, 159, 401, 456, 6, 322, 468, 188, 61, 98, 68, 387, 472, 294, 232, 390, 261, 319, 28, 90, 97, 427, 122, 375, 14, 314, 99, 191, 230, 134, 197, 252, 158, 214, 156, 266, 94, 274, 338, 409, 113, 465, 20, 64, 426, 343, 3, 144, 9, 202, 284, 405, 138, 453, 161, 399, 299, 148, 270, 185, 222, 259, 391, 59, 278, 157, 366, 346, 454, 289, 477, 35, 39, 419, 207, 403]
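
DBSCAN

More on DBSCAN here: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

DBSCAN does not take the number of clusters as a parameter; it finds dense regions directly, and points labeled -1 in its output are noise points that do not belong to any cluster.
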
In [19]:
dbscan = sk_cluster.DBSCAN(eps=0.3)
dbscan_labels = dbscan.fit_predict(X)
print(dbscan_labels)
C = metrics.confusion_matrix(true_labels,dbscan.labels_)
print (C)
[ 0  3  1 -1  2  0 -1  2  3  6  3  2 -1  4  0 -1 -1 -1  3  0 -1  5  5  3  6
  7  5 -1  0  3  7 -1  9 -1  5 -1  5  5  5 -1  3  0 -1  8 -1 -1  5 -1  3 -1
  0  0  0  0 -1  1 -1  5  0 -1 -1  0  5  5 -1 -1 -1  5  0  0  3 -1 -1  3 -1
  2  5  5  5 -1 -1 -1  3  0 -1 -1  3 -1  4  3  0  0 -1 -1  0  5 -1  0  0  0
 -1  3 -1  0 -1 -1  0 -1  0  5  0  0 -1  0  1  5  2  5  5  7  1 -1  0 -1  3
  6  3  0 -1 -1  5  0  3  5  0  5  3  3 -1 -1 -1  2  3 -1 -1  0  0 -1 -1 -1
  5 -1 -1  0  8 -1  0 -1  0 -1  5 -1  1  1  3  5  0  5  2  5  2  3  5  0  0
  2  3  5  5 -1  5  5  3 -1  5 -1  0  8  0  3  3  0  5  5  0  3  5  0  3  5
  2 -1  6  9  3 -1 -1 -1 -1  3  0 -1 -1  3  3  0 -1 -1  1  4  5 -1 -1 -1 -1
 -1  0  5  3 -1  0  3  0 -1  2  3  6  5  0  0 -1  5  5  5  0  3  3  3  3  1
  5  3  3  0 -1  3  9 -1  3 -1  5  0  0  5  5  0  0 -1  7  5 -1 -1  5  0  0
  5  0  5 -1  5 -1 -1 -1  0 -1 -1  2  7  3 -1 -1  1  3 -1  0  5  3  0  3 -1
 -1  5  4 -1 -1  3  9  5  0  2  5  4 -1 -1  0  0  5  3  3  0 -1  5  0  5 -1
 -1  6  5  0  1  8 -1  5  4  5 -1  9  4  0  2  5  3  5 -1  0  0 -1  0 -1 -1
  5 -1 -1  3  3 -1  5  5 -1 -1  0 -1  3  2  0  0 -1  0  3  0  3 -1 -1  3  1
  0  2  5  5  1  5 -1  3 -1  3  0 -1  0 -1  2  0 -1  0  5 -1 -1  2 -1  6 -1
  5 -1  5 -1  3 -1  5 -1  5  0  9  2 -1  5  5  0  9  5  3 -1  0  3 -1  0 -1
  0 -1  0 -1 -1 -1  5  6  0  0  0  5  0 -1 -1 -1 -1 -1 -1  5  2  5 -1 -1  0
 -1 -1 -1 -1 -1  5  0  2  5  3  0 -1 -1  2 -1  0  3 -1 -1  9  5 -1  0  5  3
 -1  5 -1 -1 -1  0  5 -1  2  5  3  2  0  5  5  1  0 -1  0 -1 -1  8  8  9  5]
[[ 0  0  0  0  0  0  0  0  0  0  0]
 [58 87  0  6  0  0  0  7  0  0  9]
 [68  0  0  0  0  0 92  1  0  6  0]
 [41 15 13 18 66  7  1  0  5  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]]
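
The all-zero rows appear because confusion_matrix uses the union of the true labels (0, 1, 2) and the DBSCAN labels (-1 through 9), so there is a row for every label that occurs in either labeling; the first column counts the points that DBSCAN marked as noise.
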
In [20]:
#colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
#colors = np.hstack([colors] * 20)
colors = np.array([x for x in 'bgrcmywk'*10])
plt.scatter(X[:, 0], X[:, 1], color=colors[dbscan_labels].tolist(), s=10, alpha=0.8)
Out[20]:
<matplotlib.collections.PathCollection at 0x26393da0>

Processing Complex Data

So far we have assumed that the input comes in the form of numerical vectors to which we can directly apply our algorithms. Often the data is more complex. For example, what if we want to cluster categorical data, itemsets, or text? Python provides libraries for processing such data and transforming it into a format we can use.

scikit-learn offers a set of tools for feature extraction: http://scikit-learn.org/stable/modules/feature_extraction.html

DictVectorizer

More on DictVectorizer feature extraction here: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html#sklearn.feature_extraction.DictVectorizer

The DictVectorizer takes a list of dictionaries of attribute-value pairs and transforms them into numerical vectors. Real-valued attributes are preserved, while categorical attributes are converted into binary indicator (one-hot) features. The vectorizer produces a sparse representation.

In [21]:
measurements = [
{'city': 'Dubai', 'temperature': 33.},
{'city': 'London', 'temperature': 12.},
{'city': 'San Fransisco', 'temperature': 18.},
]
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
print(vec.fit_transform(measurements).toarray())
vec.get_feature_names()
[[  1.   0.   0.  33.]
 [  0.   1.   0.  12.]
 [  0.   0.   1.  18.]]
Out[21]:
['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature']
In [22]:
measurements = [
    {'refund' : 'No','marital_status': 'married', 'income' : 100},
    {'refund' : 'Yes','marital_status': 'single', 'income' : 120},
    {'refund' : 'No','marital_status':'divorced', 'income' : 80},
]
vec = DictVectorizer()
print(vec.fit_transform(measurements))
vec.get_feature_names()
  (0, 0)	100.0
  (0, 2)	1.0
  (0, 4)	1.0
  (1, 0)	120.0
  (1, 3)	1.0
  (1, 5)	1.0
  (2, 0)	80.0
  (2, 1)	1.0
  (2, 4)	1.0
Out[22]:
['income',
 'marital_status=divorced',
 'marital_status=married',
 'marital_status=single',
 'refund=No',
 'refund=Yes']

scikit-learn datasets: http://scikit-learn.org/stable/datasets/

We will use the 20-newsgroups dataset, which consists of postings from 20 different newsgroups.

More information here: http://scikit-learn.org/stable/datasets/#the-20-newsgroups-text-dataset

In [23]:
categories = ['alt.atheism', 'sci.space','rec.sport.baseball']
news_data = sk_data.fetch_20newsgroups(subset='train', 
                               remove=('headers', 'footers', 'quotes'),
                               categories=categories)
#print (news_data.target, len(news_data.target))
print (news_data.target_names)
['alt.atheism', 'rec.sport.baseball', 'sci.space']
In [25]:
print (type(news_data))
print (news_data.filenames)
print (news_data.target[:10])
print (news_data.data[0])
print (len(news_data.data))
<class 'sklearn.datasets.base.Bunch'>
[ 'C:\\Users\\Panayiotis\\scikit_learn_data\\20news_home\\20news-bydate-train\\alt.atheism\\53136'
 'C:\\Users\\Panayiotis\\scikit_learn_data\\20news_home\\20news-bydate-train\\alt.atheism\\51300'
 'C:\\Users\\Panayiotis\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\61216'
 ...,
 'C:\\Users\\Panayiotis\\scikit_learn_data\\20news_home\\20news-bydate-train\\rec.sport.baseball\\104509'
 'C:\\Users\\Panayiotis\\scikit_learn_data\\20news_home\\20news-bydate-train\\rec.sport.baseball\\105103'
 'C:\\Users\\Panayiotis\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\60225']
[0 0 2 1 1 2 1 0 2 0]

That's okay:  it's what all the rest of them who come on here say...


This isn't the guy who was a lawyer was he?  Could you give more info on this
guy (never mind- I'm sure there will be PLENTY of responses to this post, and
it will appear there)


This is true.  Make sure it is true for ALL cases.
  

Why not both?  ;)
 

Why not die for a lie?  If you were poverty stricken and alunatic, sounds
perfecetly reasoable to me.  As to whether the societal dregs he had for
followers would be able to tell if he was a liar or not, not necessarily.
Even if he died for what he believed in, this still makes him completely
selfish.  Like us all.  So what's the difference.


People 

There is no historical proof of this (see earlier threads).  Besides, he (or at
least his name), have been the cause of enough deaths to make up for whatever
healing he gave.



SIEG HEIL!!



Who is David Koresh?  I am curious.

 	Therefore since he wasn't a liar or a lunatic, he must have been the 

How does this follow?  Your definition of lunatic (and "disproof" thereof seem
rather... uhhh.. SHAKY)


Good idea.


Naturally, those or not TRUE Christians, right?  ;)



Someone else handle this, I don't know if it's worth it... *sigh*

1670

CountVectorizer

The CountVectorizer extracts features in the form of a bag of words. It is typically used for text, but you could also use it to represent a collection of itemsets, where each item becomes a word and each itemset becomes a document (see the short sketch after the example below).

In [24]:
import sklearn.feature_extraction.text as sk_text
vectorizer = sk_text.CountVectorizer(min_df=1)
#vectorizer = sk_text.CountVectorizer(min_df=1,stop_words = 'english')

corpus = ['This is the first document.',
           'this is the second second document.',
           'And the third one.',
           'Is this the first document?',
          ]
X = vectorizer.fit_transform(corpus)
print(X.toarray())  
vectorizer.get_feature_names()
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
Out[24]:
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
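
As a minimal sketch of the itemset use mentioned above (the transactions here are made up for illustration), each itemset is joined into a space-separated string and then vectorized in exactly the same way:

transactions = [['bread', 'milk'],
                ['bread', 'diapers', 'beer'],
                ['milk', 'diapers', 'beer', 'cola']]
itemset_vectorizer = sk_text.CountVectorizer()
# each transaction becomes one "document"; each distinct item becomes one column
T = itemset_vectorizer.fit_transform([' '.join(t) for t in transactions])
print(T.toarray())
print(itemset_vectorizer.get_feature_names())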

TfidfVectorizer

The TfidfVectorizer transforms text into a sparse matrix where rows are documents, columns are words, and the values are the tf-idf scores. It performs tokenization and normalization, and can remove stop-words (via the stop_words parameter). More here: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

In [25]:
vectorizer = sk_text.TfidfVectorizer(
                            #stop_words='english',
                             #max_features = 1000,
                             min_df=1)
X = vectorizer.fit_transform(corpus)
print(X.toarray())  
print (vectorizer.get_feature_names())
[[ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]
 [ 0.          0.27230147  0.          0.27230147  0.          0.85322574
   0.22262429  0.          0.27230147]
 [ 0.55280532  0.          0.          0.          0.55280532  0.
   0.28847675  0.55280532  0.        ]
 [ 0.          0.43877674  0.54197657  0.43877674  0.          0.
   0.35872874  0.          0.43877674]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
In [26]:
vectorizer = sk_text.TfidfVectorizer(stop_words='english',
                             #max_features = 1000,
                             min_df=4, max_df=0.8)
data = vectorizer.fit_transform(news_data.data)
print(type(data))
<class 'scipy.sparse.csr.csr_matrix'>
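
Here stop_words='english' removes common English stop words, min_df=4 drops terms that appear in fewer than 4 documents, and max_df=0.8 drops terms that appear in more than 80% of the documents, which keeps the vocabulary and the resulting sparse matrix manageable.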

Clustering text data

An example of what we want to do: http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

In [27]:
import sklearn.cluster as sk_cluster
k=3
kmeans = sk_cluster.KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
kmeans.fit_predict(data)
Out[27]:
array([2, 2, 2, ..., 2, 2, 1])
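
The k-means centroids live in the same tf-idf space as the documents, so sorting each centroid's coordinates by weight reveals the terms that are most characteristic of each cluster:
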
In [37]:
print("Top terms per cluster:")
asc_order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1]
order_centroids = asc_order_centroids[:,::-1]
terms = vectorizer.get_feature_names()
for i in range(k):
    print ("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print (' %s' % terms[ind])
    print()
Top terms per cluster:
Cluster 0:
 don
 god
 just
 people
 think
 does
 like
 know
 say
 time
Cluster 1:
 year
 team
 game
 games
 runs
 baseball
 good
 hit
 pitching
 think
Cluster 2:
 space
 nasa
 launch
 moon
 shuttle
 earth
 orbit
 lunar
 like
 data
In [28]:
C = metrics.confusion_matrix(news_data.target,kmeans.labels_)
print (C)
p = metrics.precision_score(news_data.target,kmeans.labels_, average=None)
print(p)
r = metrics.recall_score(news_data.target,kmeans.labels_, average = None)
print(r)
[[136   1 343]
 [  2   0 595]
 [  1 343 249]]
[ 0.97841727  0.          0.20977254]
[ 0.28333333  0.          0.41989882]
In [29]:
agglo = sk_cluster.AgglomerativeClustering(linkage = 'complete', n_clusters = 3)
dense = data.todense()
agglo_labels = agglo.fit_predict(dense) # agglomerative needs dense data

C_agglo= metrics.confusion_matrix(news_data.target,agglo_labels)
print (C_agglo)
[[417   2  61]
 [564   2  31]
 [450  55  88]]
In [30]:
dbscan = sk_cluster.DBSCAN(eps=0.1)
dbscan_labels = dbscan.fit_predict(data)
C = metrics.confusion_matrix(news_data.target,dbscan.labels_)
print (C)
[[  0   0   0   0]
 [468  12   0   0]
 [568  29   0   0]
 [576  17   0   0]]
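
With eps=0.1 and the default euclidean metric, DBSCAN labels almost all of the documents as noise (-1), which suggests that eps has to be tuned to the scale of the distances between the tf-idf vectors.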