In this tutorial we will look into the Numpy library: http://www.numpy.org/
Numpy is a very important library for numerical computations and matrix manipulation. It has a lot of the functionality of Matlab, and some of the functionality of Pandas
We will also use the Scipy library for scientific computation: http://docs.scipy.org/doc/numpy/reference/index.html
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import scipy.sparse.linalg as linalg
import time
def trad_version():
t1 = time.time()
X = range(10000000)
Y = range(10000000)
Z = [x+y for x,y in zip(X,Y)]
return time.time() - t1
def numpy_version():
t1 = time.time()
X = np.arange(10000000)
Y = np.arange(10000000)
Z = X + Y
return time.time() - t1
traditional_time = trad_version()
numpy_time = numpy_version()
print ("Traditional time = "+ str(traditional_time))
print ("Numpy time = "+ str(numpy_time))
In Numpy data is organized into arrays. There are many different ways to create a numpy array.
For the following we will use the random library of Numpy: http://docs.scipy.org/doc/numpy-1.10.0/reference/routines.random.html
Creating arrays from lists
#1-dimensional arrays
x = np.array([2,5,18,14,4])
print ("\n Deterministic 1-dimensional array \n")
print (x)
#2-dimensional arrays
x = np.array([[2,5,18,14,4], [12,15,1,2,8]])
print ("\n Deterministic 2-dimensional array \n")
print (x)
We can also create Numpy arrays from Pandas DataFrames
d = {'A':[1., 2., 3., 4.],
'B':[4., 3., 2., 1.]}
df = pd.DataFrame(d)
x = np.array(df)
print(x)
Creating random arrays
#1-dimensional arrays
x = np.random.rand(5)
print ("\n Random 1-dimensional array \n")
print (x)
#2-dimensional arrays
x = np.random.rand(5,5)
print ("\n Random 5x5 2-dimensional array \n")
print (x)
x = np.random.randint(10,size=(2,3))
print("\n Random 2x3 array with integers")
print(x)
Transpose and get array dimensions
print("\n Matrix Dimenasions \n")
print(x.shape)
print ("\n Transpose of the matrix \n")
print (x.T)
print (x.T.shape)
Special Arrays
x = np.zeros((4,4))
print ("\n 4x4 array with zeros \n")
print(x)
x = np.ones((4,4))
print ("\n 4x4 array with ones \n")
print (x)
x = np.eye(4)
print ("\n Identity matrix of size 4\n")
print(x)
x = np.diag([1,2,3])
print ("\n Diagonal matrix\n")
print(x)
These are very similar to what we did with Pandas
x = np.random.rand(2,4)
print (x)
print('\n mean value of all elements')
print (np.mean(x))
print('\n vector of mean values for columns')
print (np.mean(x,0)) #0 signifies the dimension meaning columns
print('\n vector of mean values for rows')
print (np.mean(x,1)) #1 signifies the dimension meaning rows
print('\n standard deviation of all elements')
print (np.std(x))
print('\n vector of std values for rows')
print (np.std(x,1)) #1 signifies the dimension meaning rows
print('\n median value of all elements')
print (np.median(x))
print('\n vector of median values for rows')
print (np.median(x,1))
print('\n sum of all elements')
print (np.sum(x))
print('\n vector of row sums')
print (np.sum(x,1))
print('\n product of all elements')
print (np.prod(x))
print('\n vector of row products')
print (np.prod(x,1))
Accessing and Slicing
x = np.random.rand(4,3)
print(x)
print("\n row zero \n")
print(x[0,:])
print("\n column 2 \n")
print(x[:,2])
print("\n submatrix \n")
print(x[1:3,0:2])
print("\n entries > 0.5 \n")
print(x[x>0.5])
print("\n element\n")
print(x[1,2])
Changing entries
x = np.random.rand(4,3)
print(x)
x[1,2] = -5 #change an entry
x[0:2,:] += 1 #change a set of rows
x[2:4,1:3] = 0.5 #change a block
print(x)
print('\n Set entries > 0.5 to zero')
x[x>0.5] = 0
print(x)
print('\n Diagonal \n')
x = np.random.rand(4,4)
print(x)
print('\n Read Diagonal \n')
print(x.diagonal())
print('\n Fill Diagonal with 1s \n')
np.fill_diagonal(x,1)
print(x)
print('\n Fill Diagonal with vector \n')
x[np.diag_indices_from(x)] = [1,2,3,4]
print(x)
Multiplication and addition with scalar
x = np.random.rand(4,3)
print(x)
#multiplication and addition with scalar value
print("\n Matrix 2x+1 \n")
print(2*x+1)
Vector-vector dot product and external product
y = np.array([2,-1,3])
z = np.array([-1,2,2])
print('\n y:',y)
print(' z:',z)
print('\n vector-vector dot product')
print(y.dot(z))
print(np.dot(y,z))
print('\n vector-vector external product')
print(np.outer(y,z))
print(y*z)
Matrix-Vector multiplication
x = np.random.rand(4,3)
print(x)
y = np.array([1,0,0])
print("\n Matrix-vector right multiplication\n")
print(x.dot(y))
y = np.array([1,0,1,0])
print("\n Matrix-vector left multiplication\n")
print(y.dot(x))
Matrix-Matrix multiplication
y = np.random.rand(3,2)
z = x.dot(y)
print("\n Matrix-matrix multiplication\n")
print(x)
print(y)
print (z)
For sparse arrays we need to use the sp_sparse library from SciPy: http://docs.scipy.org/doc/scipy/reference/sparse.html
There are three types of sparse matrices:
Triplets are of the form (row, column, value)
import scipy.sparse as sp_sparse
d = np.array([[0, 0, 12],
[0, 1, 1],
[0, 5, 34],
[1, 3, 12],
[1, 2, 6],
[2, 0, 23],
[3, 4, 14],
])
row = d[:,0]
col = d[:,1]
data = d[:,2]
# a matrix M with M[row[i],col[i]] = data[i] will be created
M = sp_sparse.csr_matrix((data,(row,col)), shape=(5,6))
print(M)
print(M.toarray()) #transforms back to full matrix
Making a full matrix sparse
x = np.random.randint(2,size = (3,4))
print(x)
print('\n make x sparce')
A = sp_sparse.csr_matrix(x)
print(A)
Creating a sparse matrix incrementally
# Use lil (list of lists) representation
A = sp_sparse.lil_matrix((10, 10))
A[0, :5] = np.random.randint(10,size = 5)
A[1, 5:10] = A[0, :5]
A.setdiag(np.random.randint(10,size = 10))
A[9,9] = 99
A[9,0]=1
print(A.toarray())
print(A.diagonal())
A = A.tocsr() # makes it a compressed column format. better for dot product.
B = A.dot(np.ones(10))
print(B)
All operations work like before
o = np.ones((6,1))
print(M.dot(o))
print(M.dot(M.T))
For the singular value decomposition we will use the libraries from Numpy and SciKit
Numpy: http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
SciKit: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.linalg.svds.html
import sklearn.datasets as sk_data
data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=2, tail_strength=0.0, random_state=None)
#sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
U, s, V = np.linalg.svd(data,full_matrices = False)
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[0:10])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
import scipy.sparse.linalg as sp_linalg
data2 = sp_sparse.csc_matrix(data)
print(data2.shape)
U,s,V = sp_linalg.svds(data2) #by default returns k=6 singular values
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[::-1])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
K = 6
U_k,s_k,V_k = sp_linalg.svds(data2, K, which = 'LM')
print (U_k.shape, s_k.shape, V_k.shape)
print(s_k)
plt.plot(s_k[::-1])
plt.ylabel('eigenvalue value')
plt.xlabel('number of eigenvalues')
S_k = np.diag(s_k)
for k in range(K,0,-1):
data_k = U_k[:,k:].dot(S_k[k:,k:]).dot(V_k[k:,:])
print(np.linalg.norm(data_k-data2,ord='fro'))
data_k = U_k.dot(S_k).dot(V_k)
print(np.linalg.norm(data_k-data2,ord='fro'))
import numpy as np
M1 = np.random.randint(1,50,(50,5))
M2 = np.random.randint(1,10,(50,5))
M3 = np.random.randint(1,10,(50,5))
M4 = np.random.randint(1,50,(50,5))
T = np.concatenate((M1,M2),axis=1)
B = np.concatenate((M3,M4),axis=1)
M = np.concatenate([T,B],axis = 0)
import scipy.stats as stats
import matplotlib.pyplot as plt
(U,S,V) = np.linalg.svd(M,full_matrices = False)
c = M.sum(0)
r = M.sum(1)
print(stats.pearsonr(c,V[0]))
print(stats.pearsonr(r,U[:,0]))
plt.scatter(r,U[:,0])
plt.figure()
plt.scatter(c,V[0])
plt.scatter(U[:,0],U[:,1])
plt.scatter(x = U[:50,0],y = U[:50,1],color='r')
plt.scatter(x = U[50:,0],y = U[50:,1], color = 'b')