In this tutorial we will look into some functions of three very important packages:
Scipy and Numpy: http://www.numpy.org/ and http://docs.scipy.org/doc/numpy/reference/index.html
SciKit: http://scikit-learn.org/stable/
import numpy as np
import scipy as sp
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import scipy.sparse.linalg as linalg
import time
import seaborn as sns
%matplotlib inline
def trad_version():
t1 = time.time()
X = range(10000000)
Y = range(10000000)
Z = [x+y for x,y in zip(X,Y)]
#Z=[]
#for i in range(len(X)):
# Z.append(X[i] + Y[i])
return time.time() - t1
def numpy_version():
t1 = time.time()
X = np.arange(10000000)
Y = np.arange(10000000)
Z = X + Y
return time.time() - t1
traditional_time = trad_version()
numpy_time = numpy_version()
print ("Traditional time = "+ str(traditional_time))
print ("Numpy time = "+ str(numpy_time))
Random: http://docs.scipy.org/doc/numpy-1.10.0/reference/routines.random.html
#1-dimensional arrays
x = np.array([2,5,18,14,4])
print ("\n Deterministic 1-dimensional array \n")
print (x)
x = np.random.rand(5)
print ("\n Random 1-dimensional array \n")
print (x)
#2-dimensional arrays
x = np.array([[2,5,18,14,4], [12,15,1,2,8]])
print ("\n Deterministic 2-dimensional array \n")
print (x)
x = np.random.rand(5,5)
print ("\n Random 5x5 2-dimensional array \n")
print (x)
print (x.shape)
x = np.random.randint(10,size=(2,3))
print("\n Random 2x3 array with integers")
print(x)
print ("\n Transpose of the matrix \n")
print (x.T)
print (x.T.shape)
x = np.zeros((4,4))
print ("\n 4x4 array with zeros \n")
print(x)
x = np.ones((4,4))
print ("\n 4x4 array with ones \n")
print (x)
x = np.eye(4)
print ("\n Identity matrix of size 4\n")
print(x)
x = np.random.rand(2,4)
print (x)
print('\n mean value of all elements')
print (np.mean(x))
print('\n vector of mean values for columns')
print (np.mean(x,0)) #0 signifies the dimension meaning columns
print('\n vector of mean values for rows')
print (np.mean(x,1)) #1 signifies the dimension meaning rows
print('\n standard deviation of all elements')
print (np.std(x))
print('\n vector of std values for rows')
print (np.std(x,1)) #1 signifies the dimension meaning rows
print('\n median value of all elements')
print (np.median(x))
print('\n vector of median values for rows')
print (np.median(x,1))
print('\n sum of all elements')
print (np.sum(x))
print('\n vector of row sums')
print (np.sum(x,1))
print('\n product of all elements')
print (np.prod(x))
print('\n vector of row products')
print (np.prod(x,1))
x = np.random.rand(4,3)
print(x)
print("\n row zero \n")
print(x[0])
print("\n column 2 \n")
print(x[:,2])
print("\n submatrix \n")
print(x[1:3,0:2])
print("\n entries > 0.5 \n")
print(x[x>0.5])
print("\n element\n")
print(x[1,2])
x = np.random.rand(4,3)
print(x)
x[1,2] = -5 #change an entry
x[0:2,:] += 1 #change a set of rows
x[2:4,1:3] = 0.5 #change a block
print(x)
print('\n Set entries > 0.5 to zero')
x[x>0.5] = 0
print(x)
x = np.random.rand(4,3)
print(x)
#multiplication and addition with scalar value
print("\n Matrix 2x+1 \n")
print(2*x+1)
y = np.array([2,-1,3])
z = np.array([-1,2,2])
print('\n y:',y)
print(' z:',z)
print('\n vector-vector dot product')
print(y.dot(z))
print(np.dot(y,z))
print('\n vector-vector external product')
print(np.outer(y,z))
y = np.array([1,0,0])
print("\n Matrix-vector right multiplication\n")
print(x.dot(y))
y = np.array([1,0,1,0])
print("\n Matrix-vector left multiplication\n")
print(y.dot(x))
y = np.random.rand(3,2)
z = x.dot(y)
print("\n Matrix-matrix multiplication\n")
print(x)
print(y)
print (z)
For sparse arrays we need to use a different library: http://docs.scipy.org/doc/scipy/reference/sparse.html
import scipy.sparse as sp_sparse
# csr: compressed row format matrices. Allow for fast row computations
# csc for compressed column format
d = np.array([[0, 0, 12],
[0, 1, 1],
[0, 5, 34],
[1, 3, 12],
[1, 2, 6],
[2, 0, 23],
[3, 4, 14],
])
#Constuction of matrix from triplets: row indices, column indices, values
row = d[:,0]
col = d[:,1]
data = d[:,2]
# a matrix M with M[row[i],col[i]] = data[i] will be created
M = sp_sparse.csr_matrix((data,(row,col)), shape=(5,6))
#sdata = sp_sparse.csr_matrix(d)
print(M)
print(M.toarray()) #transforms back to full matrix
#all operations work like before
o = np.ones((6,1))
print(M.dot(o))
print(M.dot(M.T))
# make the d matrix sparse
print('\n make d sparce')
A = sp_sparse.csr_matrix(d)
print(A)
# Creating a sparse matrix incrementally
# Use lil (list of lists) representation
A = sp_sparse.lil_matrix((10, 10))
A[0, :5] = np.random.randint(10,size = 5)
A[1, 5:10] = A[0, :5]
A.setdiag(np.random.randint(10,size = 10))
A[9,9] = 99
A[9,0]=1
print(A.toarray())
print(A.diagonal())
A = A.tocsr() # makes it a compressed column format. better for dot product.
B = A.dot(np.ones(10))
print(B)
For the singular value decomposition we will use the libraries from Numpy and SciKit
Numpy: http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
SciKit: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.linalg.svds.html
import sklearn.datasets as sk_data
data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=2, tail_strength=0.0, random_state=None)
#sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
U, s, V = np.linalg.svd(data)
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[0:10])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
import scipy.sparse.linalg as sp_linalg
data2 = sp_sparse.csc_matrix(data)
print(data2.shape)
U,s,V = sp_linalg.svds(data2) #by default returns k=6 singular values
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[::-1])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
K = 6
U_k,s_k,V_k = sp_linalg.svds(data2, K, which = 'LM')
print (U_k.shape, s_k.shape, V_k.shape)
print(s_k)
plt.plot(s_k[::-1])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
S_k = np.diag(s_k)
for k in range(K,0,-1):
data_k = U_k[:,k:].dot(S_k[k:,k:]).dot(V_k[k:,:])
print(np.linalg.norm(data_k-data2,ord='fro'))
data_k = U_k.dot(S_k).dot(V_k)
print(np.linalg.norm(data_k-data2,ord='fro'))
For the computation of distances there are libraries in Scipy
http://docs.scipy.org/doc/scipy-0.15.1/reference/spatial.distance.html#module-scipy.spatial.distance
but also in SciKit metrics library:
http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html
import scipy.spatial.distance as sp_dist
import sklearn.metrics as metrics
x = np.random.randint(2, size = 5)
y = np.random.randint(2, size = 5)
print (x)
print (y)
print (sp_dist.cosine(x,y))
print (sp_dist.euclidean(x,y))
print (sp_dist.jaccard(x,y))
print (sp_dist.hamming(x,y))
A = np.random.randint(2, size = (5,3))
# computes the matrix of all pairwise distances of rows
# returns a vector with N(N-1)/2 entries (N number of rows)
D = sp_dist.pdist(A, 'jaccard')
print (A)
print('\n all row distances')
print (D)
# When computing jaccard similarity of 0/1 matrices,
# 1 means that the element corresponding to the column is in the set,
# 0 that the element is not in the set
#computes the matrix of all pairwise distances of rows
# returns a NxN matrix (N number of rows)
D2 = metrics.pairwise.pairwise_distances(A,metric = 'jaccard')
print('\n the matrix of row distances')
print(D2)
B = np.random.randint(2, size = (3,3))
print (B)
#computes the matrix of all pairwise distances of rows of A with rows of B
# returns an NxM matrix (N rows of A, M rows of B)
D3 = metrics.pairwise.pairwise_distances(A,B,metric = 'jaccard')
print('\n the matrix of distances between the rows of A and B')
print(D3)
#We can apply everything to sparce matrices
d = np.array([[0, 0, 12],
[0, 1, 1],
[0, 5, 34],
[1, 3, 12],
[1, 2, 6],
[2, 0, 23],
[3, 4, 14],
])
s = sp_sparse.csr_matrix((d[:,2],(d[:,0],d[:,1])), shape=(4,6))
D4 = metrics.pairwise.pairwise_distances(s,metric = 'euclidean')
print(s.toarray())
print(D4)