# Introduction to NumPy, SciPy, and scikit-learn

In this tutorial we will look into some functions of three very important packages:

Scipy and Numpy:  http://www.numpy.org/ and http://docs.scipy.org/doc/numpy/reference/index.html

SciKit: http://scikit-learn.org/stable/

In [None]:
import numpy as np
import scipy as sp
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist

import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import scipy.sparse.linalg as linalg

import time
import seaborn as sns
%matplotlib inline

### Why NumPy? ###

In [None]:
def trad_version(n=10000000):
    """Time an element-wise sum of two integer sequences with a Python loop.

    This is the slow baseline that ``numpy_version`` is compared against.

    Parameters
    ----------
    n : int, optional
        Number of elements in each input sequence. Defaults to 10,000,000,
        the size used by the original demonstration.

    Returns
    -------
    float
        Elapsed time in seconds, including creation of the inputs.
    """
    # perf_counter is monotonic and high-resolution, so the difference is a
    # reliable duration even if the system clock is adjusted meanwhile.
    t1 = time.perf_counter()
    X = range(n)
    Y = range(n)
    Z = []
    # Deliberately an explicit Python-level loop with append: the point of
    # the demonstration is the per-element interpreter overhead.
    for a, b in zip(X, Y):
        Z.append(a + b)
    return time.perf_counter() - t1

def numpy_version(n=10000000):
    """Time an element-wise sum of two integer arrays using NumPy.

    Parameters
    ----------
    n : int, optional
        Number of elements in each input array. Defaults to 10,000,000,
        the size used by the original demonstration.

    Returns
    -------
    float
        Elapsed time in seconds, including creation of the inputs.
    """
    # perf_counter is monotonic and high-resolution, so the difference is a
    # reliable duration even if the system clock is adjusted meanwhile.
    t1 = time.perf_counter()
    X = np.arange(n)
    Y = np.arange(n)
    # One vectorized addition replaces the whole Python loop; the result is
    # discarded because only the elapsed time is of interest.
    Z = X + Y
    return time.perf_counter() - t1


# Run each implementation once and report how long it took (in seconds).
traditional_time = trad_version()
numpy_time = numpy_version()
print("Traditional time = {}".format(traditional_time))
print("Numpy time       = {}".format(numpy_time))

### Creating Arrays ###

Random: http://docs.scipy.org/doc/numpy-1.10.0/reference/routines.random.html

In [None]:
# ---- 1-dimensional arrays ----

# Built from an explicit list of values.
x = np.array([2, 5, 18, 14, 4])
print("\n Deterministic 1-dimensional array \n")
print(x)

# Five random floats.
x = np.random.rand(5)
print("\n Random 1-dimensional array \n")
print(x)

# ---- 2-dimensional arrays ----

# A nested list gives one row per inner list (here 2 rows, 5 columns).
x = np.array([
    [2, 5, 18, 14, 4],
    [12, 15, 1, 2, 8],
])
print("\n Deterministic 2-dimensional array \n")
print(x)

# rand takes one argument per dimension: a 5x5 array of random floats.
x = np.random.rand(5, 5)
print("\n Random 2-dimensional array \n")
print(x)
print(x.shape)

# Random integers below 10, arranged as 2 rows by 3 columns.
x = np.random.randint(10, size=(2, 3))
print("\n Random array with integers")
print(x)

# ---- Constant and special arrays ----

# zeros/ones take the shape as a single tuple.
x = np.zeros((4, 4))
print("\n Array with zeros \n")
print(x)

x = np.ones((4, 4))
print("\n Array with ones \n")
print(x)

# 4x4 identity: ones on the diagonal, zeros elsewhere.
x = np.eye(4)
print("\n Identity matrix \n")
print(x)

In [None]:
# Summary statistics of a random 2x4 array.
# Without an axis the reduction runs over every element; axis=0 reduces
# down the rows (one result per column) and axis=1 reduces across the
# columns (one result per row).
x = np.random.rand(2, 4)
print(x)

# Mean: overall, per column, per row.
print(np.mean(x))
print(np.mean(x, axis=0))
print(np.mean(x, axis=1))

# Standard deviation: overall, per row.
print(np.std(x))
print(np.std(x, axis=1))

# Median: overall, per row.
print(np.median(x))
print(np.median(x, axis=1))

# Sum: overall, per row.
print(np.sum(x))
print(np.sum(x, axis=1))

# Product: overall, per row.
print(np.prod(x))
print(np.prod(x, axis=1))

### Manipulating arrays ###

In [None]:
# Indexing, slicing and arithmetic on a random 4x3 array.
x = np.random.rand(4, 3)
print(x)

# A single index selects a whole row.
print("\n row zero \n")
print(x[0])

# ':' keeps every row; the second index picks the column (0-based).
print("\n column 2 \n")
print(x[:, 2])

# Slices are half-open: rows 1-2 and columns 0-1.
print("\n submatrix \n")
print(x[1:3, 0:2])

# A boolean mask returns the matching entries as a 1-D array.
# The label now states the threshold that the mask actually applies.
print("\n entries > 0.5 \n")
print(x[x > 0.5])

# Row 1, column 2.
print("\n element\n")
print(x[1, 2])

# Scalar arithmetic is applied element-wise.
print("\n Matrix 2x+1 \n")
print(2 * x + 1)


# A 1-D array serves as the vector; transposing a 1-D array is a no-op,
# so no .T is needed. Multiplying by [1, 0, 0] extracts column 0 of x.
y = np.array([1, 0, 0])
print("\n Matrix-vector multiplication\n")
print(x.dot(y))

# (4x3) times (3x2) gives a 4x2 result.
y = np.random.rand(3, 2)
z = np.dot(x, y)
print("\n Matrix-matrix multiplication\n")
print(x)
print(y)
print(z)

### Creating Sparse Arrays ###

For sparse arrays we need to use a different library:
http://docs.scipy.org/doc/scipy/reference/sparse.html

In [None]:
import scipy.sparse as sp_sparse

# CSR (compressed sparse row) matrices allow fast row computations;
# CSC (compressed sparse column) is the column-oriented counterpart.

# Each row of `data` is a (row index, column index, value) triplet
# describing one non-zero entry of the matrix.
data = np.array([[0, 0, 12],
                 [0, 1, 1],
                 [0, 5, 34],
                 [1, 3, 12],
                 [1, 2, 6],
                 [2, 0, 23],
                 [3, 4, 14],
                 ])

# Build the 4x6 sparse matrix from the triplets: (values, (rows, cols)).
# The triplets must be read from `data`, the array defined just above.
sdata = sp_sparse.csr_matrix((data[:, 2], (data[:, 0], data[:, 1])), shape=(4, 6))
print(sdata)
print(sdata.toarray())

# Multiplying by a column of ones yields the row sums.
o = np.ones((6, 1))
print(sdata.dot(o))

# Product of the matrix with its own transpose (a 4x4 result).
print(sdata.dot(sdata.T))

In [None]:
# Creating a sparse matrix incrementally.
# LIL (list of lists) is the format meant for building a matrix piece by
# piece; item and slice assignment are cheap in this format.
A = sp_sparse.lil_matrix((100, 100))
A[0, :10] = np.random.rand(10)
A[1, 10:20] = A[0, :10]
# setdiag overwrites the main diagonal, including A[0, 0] set above.
A.setdiag(np.random.randint(100, size=100))
A[99, 99] = 99
print(A.diagonal())

# Construction is finished: convert to CSR *before* doing arithmetic,
# since LIL is slow for matrix-vector products.
A = A.tocsr()
# Multiplying by a vector of ones gives the row sums.
B = A.dot(np.ones(100))

### Computing distances ###

For the computation of distances there are libraries in Scipy

http://docs.scipy.org/doc/scipy-0.15.1/reference/spatial.distance.html#module-scipy.spatial.distance

but also in SciKit metrics library:

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances.html


In [None]:
import scipy.spatial.distance as sp_dist
import sklearn.metrics as metrics

# Two random binary (0/1) vectors of length 5.
x = np.random.randint(2, size=5)
y = np.random.randint(2, size=5)
print(x)
print(y)

# Distance between x and y under four metrics, printed in this order:
# cosine, Euclidean, Jaccard, Hamming.
for dist_fn in (sp_dist.cosine, sp_dist.euclidean, sp_dist.jaccard, sp_dist.hamming):
    print(dist_fn(x, y))

# Two small random binary data sets; each row is one point.
A = np.random.randint(2, size=(5, 3))
B = np.random.randint(2, size=(3, 3))

# SciPy: pairwise Jaccard distances among the rows of A, returned in
# condensed (flat) form with one entry per unordered pair of rows.
D = sp_dist.pdist(A, metric='jaccard')
print(A)
print(D)

# scikit-learn: the same distances as a full square matrix.
D2 = metrics.pairwise_distances(A, metric='jaccard')
print(D2)

# Distances from every row of A to every row of B.
print(B)
D3 = metrics.pairwise_distances(A, B, metric='jaccard')
print(D3)

# (row index, column index, value) triplets for the non-zero entries.
d = np.array([
    [0, 0, 12],
    [0, 1, 1],
    [0, 5, 34],
    [1, 3, 12],
    [1, 2, 6],
    [2, 0, 23],
    [3, 4, 14],
])
row_idx, col_idx, values = d[:, 0], d[:, 1], d[:, 2]

# Assemble the 4x6 CSR matrix and compute Euclidean distances between its
# rows; the sparse matrix is passed to pairwise_distances directly.
s = sp_sparse.csr_matrix((values, (row_idx, col_idx)), shape=(4, 6))
D4 = metrics.pairwise_distances(s, metric='euclidean')
print(s.toarray())
print(D4)

### Singular Value Decomposition ###

For the singular value decomposition we will use the libraries from NumPy and SciPy

Numpy: http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html

SciPy: http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.linalg.svds.html

In [None]:
import sklearn.datasets as sk_data

# Random 100x50 matrix generated with effective_rank=2, so only a couple
# of singular values are expected to be large.
data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=2,
                                    tail_strength=0.0, random_state=None)
# Optional visualisation of the matrix:
#sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)

# Full SVD of the dense matrix. np.linalg.svd returns the third factor
# already transposed, so V holds the right singular vectors as rows.
U, s, V = np.linalg.svd(data)
print(U.shape, s.shape, V.shape)
print(s)

# Plot the six leading singular values. The axis labels say "singular
# value": s holds singular values of data, not eigenvalues.
plt.plot(s[0:6])
plt.ylabel('singular value')
plt.xlabel('singular value index')

In [None]:
import scipy.sparse.linalg as linalg

# Store the same matrix in compressed sparse column format.
data2 = sp_sparse.csc_matrix(data)

# Partial SVD with the sparse solver. No k is given, so svds uses its
# default number of singular values (6 in the SciPy documentation).
U, s, V = linalg.svds(data2)
print(U.shape, s.shape, V.shape)
print(s)

# The axis labels say "singular value": s holds singular values of
# data2, not eigenvalues.
plt.plot(s)
plt.ylabel('singular value')
plt.xlabel('singular value index')

In [None]:
# Rank-k truncated SVD: compute only the k largest-magnitude ('LM')
# singular values and their singular vectors.
k = 3
U_k, s_k, V_k = linalg.svds(data2, k=k, which='LM')
print(U_k.shape, s_k.shape, V_k.shape)
print(s_k)

# The axis labels say "singular value": s_k holds singular values of
# data2, not eigenvalues.
plt.plot(s_k)
plt.ylabel('singular value')
plt.xlabel('singular value index')

# Rank-k approximation of the data: U_k * diag(s_k) * V_k.
# V_k is multiplied as returned, without transposing it.
S_k = np.diag(s_k)
data_k = U_k.dot(S_k).dot(V_k)