Introduction to Numpy, Scipy, SciKit

In this tutorial we will look into some functions of three very important packages:

Scipy and Numpy: http://www.numpy.org/ and http://docs.scipy.org/doc/numpy/reference/index.html

SciKit: http://scikit-learn.org/stable/

In [32]:
import numpy as np
import scipy as sp
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist

import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import scipy.sparse.linalg as linalg

import time
import seaborn as sns
%matplotlib inline

Why Numpy?

In [33]:
def trad_version():
    t1 = time.time()
    X = range(10000000)
    Y = range(10000000)
    Z = [x+y for x,y in zip(X,Y)]
    #Z=[]
    #for i in range(len(X)):
    #    Z.append(X[i] + Y[i])
    return time.time() - t1

def numpy_version():
    t1 = time.time()
    X = np.arange(10000000)
    Y = np.arange(10000000)
    Z = X + Y
    return time.time() - t1


traditional_time = trad_version()
numpy_time = numpy_version()
print ("Traditional time = "+ str(traditional_time))
print ("Numpy time       = "+ str(numpy_time))
Traditional time = 1.8354318141937256
Numpy time       = 0.0575408935546875

Arrays

In [34]:
#1-dimensional arrays
x = np.array([2,5,18,14,4])
print ("\n Deterministic 1-dimensional array \n")
print (x)

x = np.random.rand(5)
print ("\n Random 1-dimensional array \n")
print (x)

#2-dimensional arrays
x = np.array([[2,5,18,14,4], [12,15,1,2,8]])
print ("\n Deterministic 2-dimensional array \n")
print (x)

x = np.random.rand(5,5)
print ("\n Random 5x5 2-dimensional array \n")
print (x)
print (x.shape)

x = np.random.randint(10,size=(2,3))
print("\n Random 2x3 array with integers")
print(x)
print ("\n Transpose of the matrix \n")
print (x.T)
print (x.T.shape)

x = np.zeros((4,4))
print ("\n 4x4 array with zeros \n")
print(x)

x = np.ones((4,4))
print ("\n 4x4 array with ones \n")
print (x)

x = np.eye(4)
print ("\n Identity matrix of size 4\n")
print(x)
 Deterministic 1-dimensional array 

[ 2  5 18 14  4]

 Random 1-dimensional array 

[ 0.84219833  0.20151395  0.22410751  0.22395813  0.98130465]

 Deterministic 2-dimensional array 

[[ 2  5 18 14  4]
 [12 15  1  2  8]]

 Random 5x5 2-dimensional array 

[[  2.02180949e-01   1.41521752e-01   3.72795467e-01   4.60430835e-01
    1.90994923e-01]
 [  5.63205416e-01   9.12211881e-02   7.50390469e-01   9.41422921e-01
    4.69112307e-01]
 [  8.85127699e-01   6.73139892e-01   5.72399850e-01   9.74857263e-01
    3.18830866e-01]
 [  8.84261548e-01   7.78186025e-01   5.58154007e-01   1.79703106e-01
    5.47397933e-01]
 [  6.61018252e-01   7.97554086e-01   2.14215180e-01   3.94746621e-04
    9.13138886e-01]]
(5, 5)

 Random 2x3 array with integers
[[3 3 9]
 [8 2 0]]

 Transpose of the matrix 

[[3 8]
 [3 2]
 [9 0]]
(3, 2)

 4x4 array with zeros 

[[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]

 4x4 array with ones 

[[ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]]

 Identity matrix of size 4

[[ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]]
In [35]:
x = np.random.rand(2,4)
print (x)
print('\n mean value of all elements')
print (np.mean(x)) 
print('\n vector of mean values for columns')
print (np.mean(x,0)) #0 signifies the dimension meaning columns
print('\n vector of mean values for rows')
print (np.mean(x,1)) #1 signifies the dimension meaning rows
print('\n standard deviation of all elements')
print (np.std(x)) 
print('\n vector of std values for rows')
print (np.std(x,1)) #1 signifies the dimension meaning rows
print('\n median value of all elements')
print (np.median(x)) 
print('\n vector of median values for rows')
print (np.median(x,1)) 
print('\n sum of all elements')
print (np.sum(x)) 
print('\n vector of row sums')
print (np.sum(x,1)) 
print('\n product of all elements')
print (np.prod(x)) 
print('\n vector of row products')
print (np.prod(x,1)) 
[[ 0.70621322  0.88091997  0.56326543  0.54455049]
 [ 0.55623153  0.33010459  0.314333    0.00764333]]

 mean value of all elements
0.48790769575

 vector of mean values for columns
[ 0.63122237  0.60551228  0.43879922  0.27609691]

 vector of mean values for rows
[ 0.67373728  0.30207811]

 standard deviation of all elements
0.250338557851

 vector of std values for rows
[ 0.13497464  0.19507785]

 median value of all elements
0.550391007172

 vector of median values for rows
[ 0.63473933  0.3222188 ]

 sum of all elements
3.903261566

 vector of row sums
[ 2.69494912  1.20831245]

 product of all elements
8.41789061956e-05

 vector of row products
[ 0.19081985  0.00044114]

Manipulating arrays

In [36]:
x = np.random.rand(4,3)
print(x)
print("\n row zero \n")
print(x[0])
print("\n column 2 \n")
print(x[:,2])
print("\n submatrix \n")
print(x[1:3,0:2])
print("\n entries > 0.5 \n")
print(x[x>0.5])
print("\n element\n")
print(x[1,2])
[[ 0.459352    0.57397078  0.47216331]
 [ 0.57651725  0.99798448  0.7388205 ]
 [ 0.54895649  0.27064635  0.00387585]
 [ 0.35270315  0.44169122  0.0386078 ]]

 row zero 

[ 0.459352    0.57397078  0.47216331]

 column 2 

[ 0.47216331  0.7388205   0.00387585  0.0386078 ]

 submatrix 

[[ 0.57651725  0.99798448]
 [ 0.54895649  0.27064635]]

 entries > 0.5 

[ 0.57397078  0.57651725  0.99798448  0.7388205   0.54895649]

 element

0.738820504204
In [38]:
x = np.random.rand(4,3)
print(x)

x[1,2] = -5 #change an entry
x[0:2,:] += 1 #change a set of rows
x[2:4,1:3] = 0.5 #change a block
print(x)

print('\n Set entries > 0.5 to zero')
x[x>0.5] = 0
print(x)
[[ 0.50454881  0.72494837  0.23106616]
 [ 0.57059321  0.71235832  0.19069826]
 [ 0.45103577  0.77502925  0.27084125]
 [ 0.43184455  0.31648659  0.44698125]]
[[ 1.50454881  1.72494837  1.23106616]
 [ 1.57059321  1.71235832 -4.        ]
 [ 0.45103577  0.5         0.5       ]
 [ 0.43184455  0.5         0.5       ]]

 Set entries > 0.5 to zero
[[ 0.          0.          0.        ]
 [ 0.          0.         -4.        ]
 [ 0.45103577  0.5         0.5       ]
 [ 0.43184455  0.5         0.5       ]]
In [39]:
x = np.random.rand(4,3)
print(x)

#multiplication and addition with scalar value
print("\n Matrix 2x+1 \n") 
print(2*x+1)

y = np.array([2,-1,3])
z = np.array([-1,2,2])
print('\n y:',y)
print(' z:',z)
print('\n vector-vector dot product')
print(y.dot(z))
print(np.dot(y,z))

print('\n vector-vector external product')
print(np.outer(y,z))


y = np.array([1,0,0])
print("\n Matrix-vector right multiplication\n")
print(x.dot(y))
y = np.array([1,0,1,0])
print("\n Matrix-vector left multiplication\n")
print(y.dot(x))

y = np.random.rand(3,2)
z = x.dot(y)
print("\n Matrix-matrix multiplication\n")
print(x)
print(y)
print (z)
[[ 0.33110577  0.70820548  0.51309358]
 [ 0.72479867  0.5924881   0.17141884]
 [ 0.14770909  0.11466435  0.98176633]
 [ 0.56209993  0.12991073  0.68987083]]

 Matrix 2x+1 

[[ 1.66221154  2.41641097  2.02618715]
 [ 2.44959733  2.18497619  1.34283767]
 [ 1.29541818  1.22932871  2.96353267]
 [ 2.12419987  1.25982146  2.37974166]]

 y: [ 2 -1  3]
 z: [-1  2  2]

 vector-vector dot product
2
2

 vector-vector external product
[[-2  4  4]
 [ 1 -2 -2]
 [-3  6  6]]

 Matrix-vector right multiplication

[ 0.33110577  0.72479867  0.14770909  0.56209993]

 Matrix-vector left multiplication

[ 0.47881486  0.82286984  1.49485991]

 Matrix-matrix multiplication

[[ 0.33110577  0.70820548  0.51309358]
 [ 0.72479867  0.5924881   0.17141884]
 [ 0.14770909  0.11466435  0.98176633]
 [ 0.56209993  0.12991073  0.68987083]]
[[ 0.82596711  0.4173816 ]
 [ 0.75929501  0.41166315]
 [ 0.67249043  0.36199706]]
[[ 1.15626989  0.61547792]
 [ 1.16381064  0.60847626]
 [ 0.86929539  0.46425067]
 [ 1.02684816  0.53782084]]

Creating Sparse Arrays

For sparse arrays we need to use a different library: http://docs.scipy.org/doc/scipy/reference/sparse.html

In [40]:
import scipy.sparse as sp_sparse

# csr: compressed row format matrices. Allow for fast row computations 
# csc for compressed column format
d = np.array([[0, 0, 12],
              [0, 1, 1],
              [0, 5, 34],
              [1, 3, 12],
              [1, 2, 6],
              [2, 0, 23],
              [3, 4, 14],
              ])
#Constuction of matrix from triplets: row indices, column indices, values
row = d[:,0]
col = d[:,1]
data = d[:,2]
# a matrix M with M[row[i],col[i]] = data[i] will be created
M = sp_sparse.csr_matrix((data,(row,col)), shape=(5,6))
#sdata = sp_sparse.csr_matrix(d)
print(M)
print(M.toarray()) #transforms back to full matrix

#all operations work like before
o = np.ones((6,1))
print(M.dot(o))
print(M.dot(M.T))
# make the d matrix sparse
print('\n make d sparce')
A = sp_sparse.csr_matrix(d)
print(A)
  (0, 0)	12
  (0, 1)	1
  (0, 5)	34
  (1, 2)	6
  (1, 3)	12
  (2, 0)	23
  (3, 4)	14
[[12  1  0  0  0 34]
 [ 0  0  6 12  0  0]
 [23  0  0  0  0  0]
 [ 0  0  0  0 14  0]
 [ 0  0  0  0  0  0]]
[[ 47.]
 [ 18.]
 [ 23.]
 [ 14.]
 [  0.]]
  (0, 2)	276
  (0, 0)	1301
  (1, 1)	180
  (2, 2)	529
  (2, 0)	276
  (3, 3)	196

 make d sparce
  (0, 2)	12
  (1, 1)	1
  (1, 2)	1
  (2, 1)	5
  (2, 2)	34
  (3, 0)	1
  (3, 1)	3
  (3, 2)	12
  (4, 0)	1
  (4, 1)	2
  (4, 2)	6
  (5, 0)	2
  (5, 2)	23
  (6, 0)	3
  (6, 1)	4
  (6, 2)	14
In [41]:
# Creating a sparse matrix incrementally
# Use lil (list of lists) representation
A = sp_sparse.lil_matrix((10, 10))
A[0, :5] = np.random.randint(10,size = 5)
A[1, 5:10] = A[0, :5]
A.setdiag(np.random.randint(10,size = 10))
A[9,9] = 99
A[9,0]=1
print(A.toarray())
print(A.diagonal())
A = A.tocsr() # makes it a compressed column format. better for dot product.
B = A.dot(np.ones(10))
print(B)
[[  2.   5.   6.   9.   2.   0.   0.   0.   0.   0.]
 [  0.   9.   0.   0.   0.   9.   5.   6.   9.   2.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   8.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   9.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   8.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   7.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   7.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.   3.   0.]
 [  1.   0.   0.   0.   0.   0.   0.   0.   0.  99.]]
[  2.   9.   0.   8.   9.   8.   7.   7.   3.  99.]
[  24.   40.    0.    8.    9.    8.    7.    7.    3.  100.]

Singluar Value Decomposition

For the singular value decomposition we will use the libraries from Numpy and SciKit

Numpy: http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html

SciKit: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.linalg.svds.html

In [42]:
import sklearn.datasets as sk_data

data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=2, tail_strength=0.0, random_state=None)
#sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)

U, s, V = np.linalg.svd(data)
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[0:10])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
(100, 100) (50,) (50, 50)
[  1.00000000e+00   7.78800783e-01   3.67879441e-01   1.05399225e-01
   1.83156389e-02   1.93045414e-03   1.23409804e-04   4.78511739e-06
   1.12535175e-07   1.60522806e-09   1.38879467e-11   7.28750862e-14
   2.37507480e-16   1.30983700e-16   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   9.48948715e-17   9.48948715e-17   9.48948715e-17
   9.48948715e-17   3.04847135e-17]
Out[42]:
<matplotlib.text.Text at 0xe784940>
In [44]:
import scipy.sparse.linalg as sp_linalg

data2 = sp_sparse.csc_matrix(data)
print(data2.shape)
U,s,V = sp_linalg.svds(data2) #by default returns k=6 singular values
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[::-1])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
(100, 50)
(100, 6) (6,) (6, 50)
[ 0.00193045  0.01831564  0.10539922  0.36787944  0.77880078  1.        ]
Out[44]:
<matplotlib.text.Text at 0xe3e2898>
In [68]:
K = 6
U_k,s_k,V_k = sp_linalg.svds(data2, K, which = 'LM')
print (U_k.shape, s_k.shape, V_k.shape)
print(s_k)
plt.plot(s_k[::-1])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
S_k = np.diag(s_k)

for k in range(K,0,-1):
    data_k = U_k[:,k:].dot(S_k[k:,k:]).dot(V_k[k:,:])
    print(np.linalg.norm(data_k-data2,ord='fro'))
data_k = U_k.dot(S_k).dot(V_k)
print(np.linalg.norm(data_k-data2,ord='fro'))
(100, 6) (6,) (6, 50)
[ 0.00193045  0.01831564  0.10539922  0.36787944  0.77880078  1.        ]
1.32412769174
0.867936716599
0.383123327806
0.106996266627
0.018417506182
0.00193440069837
0.000123502590094
In [65]:
import scipy.spatial.distance as sp_dist
import sklearn.metrics as metrics

x = np.random.randint(2, size = 5)
y = np.random.randint(2, size = 5)
print (x)
print (y)
print (sp_dist.cosine(x,y))
print (sp_dist.euclidean(x,y))
print (sp_dist.jaccard(x,y))
print (sp_dist.hamming(x,y))
[0 0 1 1 1]
[0 1 0 0 1]
0.591751709536
1.73205080757
0.75
0.6
In [64]:
A = np.random.randint(2, size = (5,3))

# computes the matrix of all pairwise distances of rows
# returns a vector with N(N-1)/2 entries (N number of rows)
D = sp_dist.pdist(A, 'jaccard') 
print (A)
print('\n all row distances')
print (D)
# When computing jaccard similarity of 0/1 matrices, 
# 1 means that the element corresponding to the column is in the set,
# 0 that the element is not in the set
[[0 1 0]
 [1 1 1]
 [0 0 1]
 [0 0 0]
 [1 0 1]]

 all row distances
[ 0.66666667  1.          1.          1.          0.66666667  1.
  0.33333333  1.          0.5         1.        ]
In [66]:
#computes the matrix of all pairwise distances of rows
# returns a NxN matrix (N number of rows)
D2 = metrics.pairwise.pairwise_distances(A,metric = 'jaccard') 
print('\n the matrix of row distances')
print(D2)

B = np.random.randint(2, size = (3,3))
print (B)

#computes the matrix of all pairwise distances of rows of A with rows of B
# returns an NxM matrix (N rows of A, M rows of B)
D3 = metrics.pairwise.pairwise_distances(A,B,metric = 'jaccard') 
print('\n the matrix of distances between the rows of A and B')
print(D3)
 the matrix of row distances
[[ 0.          0.66666667  1.          1.          1.        ]
 [ 0.66666667  0.          0.66666667  1.          0.33333333]
 [ 1.          0.66666667  0.          1.          0.5       ]
 [ 1.          1.          1.          0.          1.        ]
 [ 1.          0.33333333  0.5         1.          0.        ]]
[[0 0 0]
 [1 1 1]
 [0 1 0]]

 the matrix of distances between the rows of A and B
[[ 1.          0.66666667  0.        ]
 [ 1.          0.          0.66666667]
 [ 1.          0.66666667  1.        ]
 [        nan  1.          1.        ]
 [ 1.          0.33333333  1.        ]]
In [67]:
#We can apply everything to sparce matrices

d = np.array([[0, 0, 12],
              [0, 1, 1],
              [0, 5, 34],
              [1, 3, 12],
              [1, 2, 6],
              [2, 0, 23],
              [3, 4, 14],
              ])
s = sp_sparse.csr_matrix((d[:,2],(d[:,0],d[:,1])), shape=(4,6))
D4 = metrics.pairwise.pairwise_distances(s,metric = 'euclidean')
print(s.toarray())
print(D4)
[[12  1  0  0  0 34]
 [ 0  0  6 12  0  0]
 [23  0  0  0  0  0]
 [ 0  0  0  0 14  0]]
[[  0.          38.48376281  35.74912586  38.69108424]
 [ 38.48376281   0.          26.62705391  19.39071943]
 [ 35.74912586  26.62705391   0.          26.92582404]
 [ 38.69108424  19.39071943  26.92582404   0.        ]]