import numpy as np
import scipy as sp
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns
%matplotlib inline

import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import scipy.sparse.linalg as linalg

import time

def trad_version(n=10000000):
    """Time element-wise addition of two sequences using plain Python.

    Args:
        n: Number of elements to add. Defaults to the original
            10,000,000 so existing zero-argument calls are unchanged.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and meant for interval timing,
    # unlike time.time(), which can jump if the system clock changes.
    t1 = time.perf_counter()
    X = range(n)
    Y = range(n)
    Z = [x + y for x, y in zip(X, Y)]  # result is discarded; only the cost matters
    return time.perf_counter() - t1

def naive_numpy_version(n=10000000):
    """Time element-wise addition of two NumPy arrays using a Python loop.

    The arrays are indexed one element at a time from Python instead of
    using a vectorized expression, which is the point of the comparison.

    Args:
        n: Number of elements to add. Defaults to the original
            10,000,000 so existing zero-argument calls are unchanged.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and meant for interval timing.
    t1 = time.perf_counter()
    X = np.arange(n)
    Y = np.arange(n)
    Z = np.zeros(n)
    for i in range(n):
        Z[i] = X[i] + Y[i]
    return time.perf_counter() - t1

def numpy_version(n=10000000):
    """Time element-wise addition of two NumPy arrays, vectorized.

    Args:
        n: Number of elements to add. Defaults to the original
            10,000,000 so existing zero-argument calls are unchanged.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and meant for interval timing.
    t1 = time.perf_counter()
    X = np.arange(n)
    Y = np.arange(n)
    Z = X + Y  # single vectorized addition; result is discarded
    return time.perf_counter() - t1


# Run each implementation once, then report the elapsed seconds.
traditional_time = trad_version()
naive_numpy_time = naive_numpy_version()
numpy_time = numpy_version()

for label, elapsed in (
    ("Traditional time = ", traditional_time),
    ("Naive numpy time = ", naive_numpy_time),
    ("Numpy time       = ", numpy_time),
):
    print(label + str(elapsed))

Traditional time = 1.245307207107544
Naive numpy time = 5.399410963058472
Numpy time       = 0.09222745895385742

# Build a 1-D array from a flat list, then a 2-D array from a list of rows,
# printing a heading and the array each time.
for heading, x in (
    ("\n Deterministic 1-dimensional array \n",
     np.array([2, 5, 18, 14, 4])),
    ("\n Deterministic 2-dimensional array \n",
     np.array([[2, 5, 18, 14, 4], [12, 15, 1, 2, 8]])),
):
    print(heading)
    print(x)

 Deterministic 1-dimensional array 

[ 2  5 18 14  4]

 Deterministic 2-dimensional array 

[[ 2  5 18 14  4]
 [12 15  1  2  8]]

# Convert a two-column DataFrame into a NumPy array (one row per record).
d = dict(A=[1., 2., 3., 4.], B=[4., 3., 2., 1.])
df = pd.DataFrame(d)
x = np.array(df)
print(x)

[[1. 4.]
 [2. 3.]
 [3. 2.]
 [4. 1.]]

# Draw, in order: 5 uniform floats, a 5x5 uniform matrix, and a 2x3 matrix
# of integers in [0, 10). Each is printed under its heading.
for heading, x in (
    ("\n Random 1-dimensional array \n", np.random.rand(5)),
    ("\n Random 5x5 2-dimensional array \n", np.random.rand(5, 5)),
    ("\n Random 2x3 array with integers", np.random.randint(10, size=(2, 3))),
):
    print(heading)
    print(x)

 Random 1-dimensional array 

[0.49099602 0.91064379 0.71043661 0.13940421 0.29914724]

 Random 5x5 2-dimensional array 

[[0.00541619 0.73529302 0.18650485 0.41589755 0.11189038]
 [0.40646779 0.12266741 0.81782302 0.40342583 0.07142324]
 [0.24911343 0.99484185 0.71630743 0.19996326 0.28899052]
 [0.519806   0.40881907 0.65696129 0.57753493 0.34645183]
 [0.78938307 0.56299184 0.53884058 0.22560672 0.56223818]]

 Random 2x3 array with integers
[[7 1 5]
 [7 8 0]]

# Show the shape of x, its transpose, and the transpose's shape.
print("\n Matrix Dimensions \n")
print(np.shape(x))
print("\n Transpose of the matrix \n")
print(np.transpose(x))
print("\n Transpose matrix Dimensions \n")
print(np.transpose(x).shape)

 Matrix Dimensions 

(2, 3)

 Transpose of the matrix 

[[7 7]
 [1 8]
 [5 0]]

 Transpose matrix Dimensions 

(3, 2)

# Standard constructors: all zeros, all ones, identity, and a diagonal
# matrix built from a vector.
for heading, x in (
    ("\n 4x4 array with zeros \n", np.zeros((4, 4))),
    ("\n 4x4 array with ones \n", np.ones((4, 4))),
    ("\n Identity matrix of size 4\n", np.eye(4)),
    ("\n Diagonal matrix\n", np.diag([1, 2, 3])),
):
    print(heading)
    print(x)

 4x4 array with zeros 

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

 4x4 array with ones 

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]

 Identity matrix of size 4

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

 Diagonal matrix

[[1 0 0]
 [0 2 0]
 [0 0 3]]

# 2x3 matrix of random integers in [0, 10); echoed as the cell result.
A = np.random.randint(10, size=(2, 3))
A

array([[0, 1, 4],
       [0, 9, 7]])

# Left-multiplying by a diagonal matrix scales row i of A by v[i].
v = np.array([4, 5])
D = np.diag(v)
print(np.matmul(D, A))

[[ 0  4 16]
 [ 0 45 35]]

x = np.random.randint(10, size=(2, 4))
print(x)
print('\n mean value of all elements')
print(x.mean())
print('\n vector of mean values for columns')
print(x.mean(axis=0))  # axis 0: one mean per column, like axis=0 in pandas
print('\n vector of mean values for rows')
print(x.mean(axis=1))  # axis 1: one mean per row, like axis=1 in pandas

[[7 7 7 0]
 [7 3 1 8]]

 mean value of all elements
5.0

 vector of mean values for columns
[7. 5. 4. 4.]

 vector of mean values for rows
[5.25 4.75]

# Summary statistics of x: over all elements, or per row (axis=1) /
# per column (axis=0).
for heading, value in (
    ('\n standard deviation of all elements', x.std()),
    ('\n vector of std values for rows', x.std(axis=1)),
    ('\n median value of all elements', np.median(x)),
    ('\n vector of median values for rows', np.median(x, axis=1)),
    ('\n sum of all elements', x.sum()),
    ('\n vector of column sums', x.sum(axis=0)),
    ('\n product of all elements', x.prod()),
    ('\n vector of row products', x.prod(axis=1)),
):
    print(heading)
    print(value)

 standard deviation of all elements
2.958039891549808

 vector of std values for rows
[3.03108891 2.86138079]

 median value of all elements
7.0

 vector of median values for rows
[7. 5.]

 sum of all elements
40

 vector of column sums
[14 10  8  8]

 product of all elements
0

 vector of row products
[  0 168]

# Echo the current value of x as the cell result.
x

array([[7, 7, 7, 0],
       [7, 3, 1, 8]])

# Normalize each row of x to sum to 1: left-multiply by the diagonal matrix
# of reciprocal row sums. The product is echoed as the cell result.
s = x.sum(axis=1)
s = np.diag(1 / s)
np.matmul(s, x)

array([[0.33333333, 0.33333333, 0.33333333, 0.        ],
       [0.36842105, 0.15789474, 0.05263158, 0.42105263]])

x = np.random.rand(4, 3)
print(x)
print("\n element\n")
print(x[1][2])               # row 1, column 2
print("\n row zero \n")
print(x[0, ...])             # explicit form
print(x[0])                  # shorthand for the same row
print('\nfirst two rows\n')
print(x[:2])
print("\n column 2 \n")
print(x[..., 2])
print("\n submatrix \n")
print(x[1:3, :2])            # rows 1-2, columns 0-1
print("\n entries > 0.5 \n")
print(x[np.greater(x, 0.5)])  # boolean mask flattens the selection to 1-D

[[0.81890423 0.92385671 0.69106527]
 [0.0119096  0.63765518 0.0782286 ]
 [0.36878059 0.64664594 0.27896897]
 [0.49344704 0.55123512 0.46926471]]

 element

0.07822859705613627

 row zero 

[0.81890423 0.92385671 0.69106527]
[0.81890423 0.92385671 0.69106527]

first two rows

[[0.81890423 0.92385671 0.69106527]
 [0.0119096  0.63765518 0.0782286 ]]

 column 2 

[0.69106527 0.0782286  0.27896897 0.46926471]

 submatrix 

[[0.0119096  0.63765518]
 [0.36878059 0.64664594]]

 entries > 0.5 

[0.81890423 0.92385671 0.69106527 0.63765518 0.64664594 0.55123512]

x = np.random.rand(4, 3)
print(x)

# Three kinds of in-place assignment: one entry, whole rows, a block.
x[2, 0] = -5
x[:2] += 1         # add 1 to every element of the first two rows
x[2:, 1:] = 0.5    # rows 2-3, columns 1-2
print('\n')
print(x)

print('\n Set entries > 0.5 to zero')
x[np.greater(x, 0.5)] = 0
print(x)

[[0.17265396 0.37410032 0.88153288]
 [0.26991096 0.58855742 0.41793555]
 [0.27296378 0.19710328 0.35454239]
 [0.36349169 0.6119733  0.92001733]]


[[ 1.17265396  1.37410032  1.88153288]
 [ 1.26991096  1.58855742  1.41793555]
 [-5.          0.5         0.5       ]
 [ 0.36349169  0.5         0.5       ]]

 Set entries > 0.5 to zero
[[ 0.          0.          0.        ]
 [ 0.          0.          0.        ]
 [-5.          0.5         0.5       ]
 [ 0.36349169  0.5         0.5       ]]

x = np.random.rand(4, 4)
print(x)
print('\n Read Diagonal \n')
print(np.diagonal(x))
print('\n Fill Diagonal with 1s \n')
np.fill_diagonal(x, 1)
print(x)
print('\n Fill Diagonal with vector \n')
# fill_diagonal also accepts a sequence and writes it along the diagonal
np.fill_diagonal(x, [1, 2, 3, 4])
print(x)

[[0.48323675 0.61275566 0.58744828 0.84647565]
 [0.73637488 0.29894126 0.40728027 0.43329871]
 [0.01232427 0.82412163 0.22230007 0.37504653]
 [0.35625428 0.9281761  0.79315754 0.65680669]]

 Read Diagonal 

[0.48323675 0.29894126 0.22230007 0.65680669]

 Fill Diagonal with 1s 

[[1.         0.61275566 0.58744828 0.84647565]
 [0.73637488 1.         0.40728027 0.43329871]
 [0.01232427 0.82412163 1.         0.37504653]
 [0.35625428 0.9281761  0.79315754 1.        ]]

 Fill Diagonal with vector 

[[1.         0.61275566 0.58744828 0.84647565]
 [0.73637488 2.         0.40728027 0.43329871]
 [0.01232427 0.82412163 3.         0.37504653]
 [0.35625428 0.9281761  0.79315754 4.        ]]

# Random 0/1 matrix: an entry is 1 where the uniform draw fell below 0.3,
# and 0 everywhere else. Echoed as the cell result.
x = np.random.rand(10, 5)
x = np.where(x < 0.3, 1.0, 0.0)
x

array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0.]])

# Random 0/1 matrix by thresholding uniform draws at 0.7:
# draws >= 0.7 (30% chance) become 1, the rest (70% chance) become 0.
# An equivalent variant is to set draws <= 0.3 to 1 and everything else to 0.
D = np.random.rand(10, 5)
print(D)
D = np.where(D >= 0.7, 1.0, 0.0)
D

[[9.42232100e-01 3.58425633e-01 8.12084228e-02 7.51008492e-01
  7.58442267e-01]
 [9.67184463e-01 2.05657043e-02 4.41283704e-01 2.63702534e-01
  6.41301101e-01]
 [3.62625425e-01 8.80997243e-01 2.57877426e-01 4.57516425e-01
  9.49020558e-01]
 [4.75770577e-01 1.94059407e-01 5.38914226e-01 8.07661611e-01
  5.76720613e-01]
 [5.33703898e-01 9.39634586e-01 3.00815283e-01 4.17611434e-01
  2.55799710e-01]
 [8.53317070e-01 8.89848496e-04 5.40931044e-01 9.19885347e-01
  7.22101884e-01]
 [7.84254762e-01 3.52064496e-01 9.81325338e-01 4.36361615e-01
  8.95081513e-01]
 [7.15024069e-01 5.41803194e-01 9.71342622e-01 1.69731407e-01
  9.22796382e-01]
 [9.67642596e-01 9.79062265e-01 6.20071364e-01 6.04244640e-01
  5.80263800e-02]
 [2.84286306e-01 8.63639561e-02 6.48245848e-01 4.47694870e-01
  7.34820971e-01]]

array([[1., 0., 0., 1., 1.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 1., 0., 1.],
       [1., 0., 1., 0., 1.],
       [1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.]])

x = np.random.rand(4, 3)
print(x)

# Scalars broadcast over every element: double each entry, then add 1.
print("\n Matrix 2x+1 \n")
print(2 * x + 1)

[[0.70237892 0.48148116 0.93862939]
 [0.95091672 0.70373017 0.687826  ]
 [0.48222859 0.49723812 0.06310646]
 [0.47443025 0.88864703 0.07837167]]

 Matrix 2x+1 

[[2.40475783 1.96296233 2.87725877]
 [2.90183344 2.40746035 2.37565199]
 [1.96445718 1.99447623 1.12621293]
 [1.94886049 2.77729407 1.15674335]]

y = np.array([2, -1, 3])
z = np.array([-1, 2, 2])
print('\n y:', y)
print(' z:', z)
print('\n vector-vector dot product')
# Three equivalent spellings: method, function, and the @ operator.
for product in (y.dot(z), np.dot(y, z), y @ z):
    print(product)

 y: [ 2 -1  3]
 z: [-1  2  2]

 vector-vector dot product
2
2
2

# Outer product: entry (i, j) is y[i] * z[j].
print('\n y:', y)
print(' z:', z)
print('\n vector-vector external product')
outer_product = np.outer(y, z)
print(outer_product)

 y: [ 2 -1  3]
 z: [-1  2  2]

 vector-vector external product
[[-2  4  4]
 [ 1 -2 -2]
 [-3  6  6]]

print('\n y:', y)
print(' z:', z)
# Arithmetic operators on arrays act element by element.
for heading, result in (
    ('\n element-wise addition', y + z),
    ('\n element-wise product', y * z),
    ('\n element-wise division', y / z),
    ('\n element-wise inversion', 1 / y),
):
    print(heading)
    print(result)

 y: [ 2 -1  3]
 z: [-1  2  2]

 element-wise addition
[1 1 5]

 element-wise product
[-2 -2  6]

 element-wise division
[-2.  -0.5  1.5]

 element-wise inversion
[ 0.5        -1.          0.33333333]

X = np.random.randint(10, size=(4, 3))
print('Matrix X:\n', X)

# Right multiplication X y: y needs one entry per column of X.
y = np.array([1, 0, 0])
print("\n Matrix-vector right multiplication with", y, "\n")
for product in (X.dot(y), np.dot(X, y), X @ y):
    print(product)

# Left multiplication y X: y needs one entry per row of X.
y = np.array([1, 0, 1, 0])
print("\n Matrix-vector left multiplication with", y, "\n")
for product in (y.dot(X), np.dot(y, X), y @ X):
    print(product, '\n')
print(y.shape)

Matrix X:
 [[0 7 4]
 [3 3 0]
 [6 7 7]
 [8 3 1]]

 Matrix-vector right multiplication with [1 0 0] 

[0 3 6 8]
[0 3 6 8]
[0 3 6 8]

 Matrix-vector left multiplication with [1 0 1 0] 

[ 6 14 11] 

[ 6 14 11] 

[ 6 14 11] 

(4,)

Y = np.random.randint(10, size=(3, 2))
print("\n Matrix-matrix multiplication\n")
print('Matrix X:\n', X)
print('Matrix Y:\n', Y)
# Three equivalent spellings of the same matrix product.
for product in (X.dot(Y), np.dot(X, Y), X @ Y):
    print('Product:\n', product)

 Matrix-matrix multiplication

Matrix X:
 [[0 7 4]
 [3 3 0]
 [6 7 7]
 [8 3 1]]
Matrix Y:
 [[2 4]
 [0 5]
 [0 3]]
Product:
 [[ 0 47]
 [ 6 27]
 [12 80]
 [16 50]]
Product:
 [[ 0 47]
 [ 6 27]
 [12 80]
 [16 50]]
Product:
 [[ 0 47]
 [ 6 27]
 [12 80]
 [16 50]]

# The +1 shifts the entries of Z to [1, 10], so the division below never
# divides by zero.
Z = np.random.randint(10, size=(3, 2)) + 1
print('Matrix Y:\n', Y)
print('Matrix Z:\n', Z)
for heading, result in (
    ("\n Matrix-matrix element-wise addition\n", Y + Z),
    ("\n Matrix-matrix element-wise multiplication\n", Y * Z),
    ("\n Matrix-matrix element-wise division\n", Y / Z),
):
    print(heading)
    print(result)

Matrix Y:
 [[2 4]
 [0 5]
 [0 3]]
Matrix Z:
 [[ 6 10]
 [ 3  6]
 [ 3  9]]

 Matrix-matrix element-wise addition

[[ 8 14]
 [ 3 11]
 [ 3 12]]

 Matrix-matrix element-wise multiplication

[[12 40]
 [ 0 30]
 [ 0 27]]

 Matrix-matrix element-wise division

[[0.33333333 0.4       ]
 [0.         0.83333333]
 [0.         0.33333333]]

M = np.random.randint(10, size=(3, 3))
print(M)
print('\nInversion\n')
# np.invert is the element-wise bitwise NOT (~M), not the matrix inverse;
# the matrix inverse is np.linalg.inv.
if np.linalg.matrix_rank(M) == M.shape[0]:
    M_inv = np.linalg.inv(M)
    print(M_inv)
else:
    # A random integer matrix is occasionally singular and has no inverse.
    M_inv = None
    print('M is singular and has no inverse')

[[9 3 3]
 [1 5 1]
 [3 5 0]]

Inversion

[[-10  -4  -4]
 [ -2  -6  -2]
 [ -4  -6  -1]]

import scipy.sparse as sp_sparse

# Each row of d is a (row index, column index, value) triplet.
d = np.array([
    [0, 0, 12],
    [0, 1, 1],
    [0, 5, 34],
    [1, 3, 12],
    [1, 2, 6],
    [2, 0, 23],
    [3, 4, 14],
])
row, col, data = d.T
# Build a 5x6 matrix M with M[row[i], col[i]] = data[i]; all else is zero.
M = sp_sparse.csr_matrix((data, (row, col)), shape=(5, 6))
print(M)
print('\n')
print(M.toarray())  # back to a dense array

  (0, 0)	12
  (0, 1)	1
  (0, 5)	34
  (1, 2)	6
  (1, 3)	12
  (2, 0)	23
  (3, 4)	14


[[12  1  0  0  0 34]
 [ 0  0  6 12  0  0]
 [23  0  0  0  0  0]
 [ 0  0  0  0 14  0]
 [ 0  0  0  0  0  0]]

# Random 0/1 dense matrix, converted to CSR so only nonzeros are stored.
x = np.random.randint(2, size=(3, 4))
print(x)
print('\n make x sparce')
A = sp_sparse.csr_matrix(x)
print(A)

[[1 0 0 1]
 [0 1 1 1]
 [0 1 1 1]]

 make x sparce
  (0, 0)	1
  (0, 3)	1
  (1, 1)	1
  (1, 2)	1
  (1, 3)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1

# Build the matrix incrementally in LIL (list of lists) format; COO
# (coordinates) is an alternative representation.
A = sp_sparse.lil_matrix((10, 10))
A[0, :5] = np.random.randint(10, size=5)
A[1, 5:10] = A[0, :5]                      # copy the first half of row 0
A.setdiag(np.random.randint(10, size=10))
A[9, 9] = 99
A[9, 0] = 1
print(A)
print('\n')
print(A.toarray())
print('\n')
print(A.diagonal())
A = sp_sparse.csr_matrix(A)  # compressed sparse row format; better for dot products
B = A @ np.ones(10)          # multiplying by the ones vector gives the row sums
print('\n')
print(B)

  (0, 0)	7.0
  (0, 1)	2.0
  (0, 2)	2.0
  (0, 3)	8.0
  (0, 4)	7.0
  (1, 1)	7.0
  (1, 5)	7.0
  (1, 6)	2.0
  (1, 7)	2.0
  (1, 8)	8.0
  (1, 9)	7.0
  (2, 2)	5.0
  (3, 3)	1.0
  (4, 4)	7.0
  (6, 6)	3.0
  (7, 7)	5.0
  (8, 8)	7.0
  (9, 0)	1.0
  (9, 9)	99.0


[[ 7.  2.  2.  8.  7.  0.  0.  0.  0.  0.]
 [ 0.  7.  0.  0.  0.  7.  2.  2.  8.  7.]
 [ 0.  0.  5.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  7.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  3.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  5.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  7.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0. 99.]]


[ 7.  7.  5.  1.  7.  0.  3.  5.  7. 99.]


[ 26.  33.   5.   1.   7.   0.   3.   5.   7. 100.]

# Product of the sparse matrix A with its own transpose; the result prints as
# (row, col) value entries.
print(A.dot(A.T))

  (0, 4)	49.0
  (0, 3)	8.0
  (0, 2)	10.0
  (0, 1)	14.0
  (0, 9)	7.0
  (0, 0)	170.0
  (1, 9)	693.0
  (1, 8)	56.0
  (1, 7)	10.0
  (1, 6)	6.0
  (1, 1)	219.0
  (1, 0)	14.0
  (2, 2)	25.0
  (2, 0)	10.0
  (3, 3)	1.0
  (3, 0)	8.0
  (4, 4)	49.0
  (4, 0)	49.0
  (6, 6)	9.0
  (6, 1)	6.0
  (7, 7)	25.0
  (7, 1)	10.0
  (8, 8)	49.0
  (8, 1)	56.0
  (9, 1)	693.0
  (9, 9)	9802.0
  (9, 0)	7.0

# Mean over all 10 entries of row 0, implicit zeros included.
A[0].mean()

2.6

import sklearn.datasets as sk_data

# Synthetic 100x50 matrix whose singular values decay quickly, drawn as a
# heatmap without tick labels or cell borders.
data = sk_data.make_low_rank_matrix(
    n_samples=100,
    n_features=50,
    effective_rank=2,
    tail_strength=0.0,
    random_state=None,
)
sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)

<Axes: >

# Reduced SVD of the dense data matrix; s holds the singular values.
U, s, V = np.linalg.svd(data, full_matrices=False)
print(U.shape, s.shape, V.shape)
print('\n')
print(s)
print('\n')
# Print and plot only the ten largest singular values.
print(s[:10])
plt.plot(s[:10])
plt.ylabel('singular value')
plt.xlabel('number of singular values')

(100, 50) (50,) (50, 50)


[1.00000000e+00 7.78800783e-01 3.67879441e-01 1.05399225e-01
 1.83156389e-02 1.93045414e-03 1.23409804e-04 4.78511739e-06
 1.12535175e-07 1.60522805e-09 1.38879469e-11 7.28775707e-14
 2.30977233e-16 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 8.80606104e-17 8.80606104e-17 8.80606104e-17
 8.80606104e-17 2.79258251e-17]


[1.00000000e+00 7.78800783e-01 3.67879441e-01 1.05399225e-01
 1.83156389e-02 1.93045414e-03 1.23409804e-04 4.78511739e-06
 1.12535175e-07 1.60522805e-09]

Text(0.5, 0, 'number of singular values')

import scipy.sparse.linalg as sp_linalg

data2 = sp_sparse.csc_matrix(data)     # convert the dense matrix to sparse CSC format
print(data2.shape)
U, s, V = sp_linalg.svds(data2, k=10)  # truncated SVD; k defaults to 6 when omitted
print(U.shape, s.shape, V.shape)       # every factor now has 10 components
print(s)
plt.plot(s[::-1])  # reverse so the largest singular value is plotted first
# These are singular values, not eigenvalues, so label the axes accordingly.
plt.ylabel('singular value')
plt.xlabel('number of singular values')

(100, 50)
(100, 10) (10,) (10, 50)
[7.19243037e-11 1.12535172e-07 4.78511739e-06 1.23409804e-04
 1.93045414e-03 1.83156389e-02 1.05399225e-01 3.67879441e-01
 7.78800783e-01 1.00000000e+00]

Text(0.5, 0, 'number of eigenvalues')

from sklearn.decomposition import TruncatedSVD

# Rank-K truncated SVD fitted on the sparse matrix data2.
K = 10
svd = TruncatedSVD(n_components=K)
svd.fit(data2)
print(svd.components_.shape) # rows are the right singular vectors (V^T): K x n_features
print(svd.transform(data2).shape) # data projected onto the components, i.e. U scaled by the singular values (not U itself)
print(svd.singular_values_)

(10, 50)
(100, 10)
[1.00000000e+00 7.78800783e-01 3.67879441e-01 1.05399225e-01
 1.83156389e-02 1.93045414e-03 1.23409804e-04 4.78511739e-06
 1.12535175e-07 1.60522806e-09]

K = 6
U_k, s_k, V_k = sp_linalg.svds(data2, k=K, which='LM')  # LM = Largest Magnitude
print(U_k.shape, s_k.shape, V_k.shape)
print(s_k)
plt.plot(s_k[::-1])  # reverse so the largest singular value is plotted first
# These are singular values, not eigenvalues, so label the axes accordingly.
plt.ylabel('singular value')
plt.xlabel('number of singular values')
S_k = np.diag(s_k)
k = K
# With k == K this slice is past the end, so the echoed result is an empty array.
S_k[k:, k:]

(100, 6) (6,) (6, 50)
[0.00193045 0.01831564 0.10539922 0.36787944 0.77880078 1.        ]

array([], shape=(0, 0), dtype=float64)

# Frobenius-norm error of the rank-r approximation for r = 1..K. The trailing
# slice [k:] holds the r components that are largest in the ascending order
# printed for s_k.
reconstruction_error = []
for k in range(K - 1, -1, -1):  # iterate from end to start
    r = K - k
    data_k = U_k[:, k:].dot(S_k[k:, k:]).dot(V_k[k:, :])  # rank-r matrix
    error = np.linalg.norm(data_k - data2, ord='fro')
    reconstruction_error.append(error)
    print(r, error)
# Using all K components must reproduce the last error printed above.
data_k = U_k.dot(S_k).dot(V_k)
print(np.linalg.norm(data_k - data2, ord='fro'))

# Take the x-axis from the number of errors computed rather than a
# hard-coded 6, so the plot stays correct when K changes.
plt.plot(np.arange(1, len(reconstruction_error) + 1), reconstruction_error)
plt.ylabel('rank-k reconstruction error')
plt.xlabel('rank')

1 0.867936716599461
2 0.38312332780557673
3 0.10699626662742331
4 0.018417506182009283
5 0.001934400698365924
6 0.0001235025900939399
0.0001235025900939399

Text(0.5, 0, 'rank')

import numpy as np

# 100x40 block matrix: large values (1-49) in the top-left and bottom-right
# blocks, small values (1-9) in the other two.
M1 = np.random.randint(1, 50, (50, 20))
M2 = np.random.randint(1, 10, (50, 20))
M3 = np.random.randint(1, 10, (50, 20))
M4 = np.random.randint(1, 50, (50, 20))

T = np.hstack((M1, M2))  # top half
B = np.hstack((M3, M4))  # bottom half
M = np.vstack((T, B))

plt.imshow(M, cmap='hot')
plt.show()

import scipy.stats as stats
import matplotlib.pyplot as plt

# Compare the first left/right singular vectors of M with its row and
# column sums: print the Pearson correlation and scatter-plot each pair.
(U, S, V) = np.linalg.svd(M, full_matrices=False)
c = M.sum(axis=0)  # column sums
r = M.sum(axis=1)  # row sums
print(stats.pearsonr(r, U[:, 0]))
print(stats.pearsonr(c, V[0]))
plt.scatter(r, U[:, 0])
plt.figure()
plt.scatter(c, V[0])

PearsonRResult(statistic=-0.9970663433878887, pvalue=3.378751487702053e-111)
PearsonRResult(statistic=-0.9974163998658231, pvalue=4.4817009076071554e-45)

<matplotlib.collections.PathCollection at 0x7abb75e03490>

# Plot the rows of M by their entries in the first two left singular vectors.
plt.scatter(U[:,0],U[:,1])

<matplotlib.collections.PathCollection at 0x7abb7602c1c0>

# Same plot, colored by half: the first 50 rows of M in red, the last 50 in blue.
plt.scatter(x = U[:50,0],y = U[:50,1], color = 'r')
plt.scatter(x = U[50:,0],y = U[50:,1], color = 'b')

<matplotlib.collections.PathCollection at 0x7abb7ac45030>

# Plot the singular values of M as returned by np.linalg.svd.
plt.plot(S)

[<matplotlib.lines.Line2D at 0x7abb752bc610>]

from sklearn.decomposition import PCA
# Fit a 2-component PCA on the block matrix M.
pca = PCA(n_components=2)
pca.fit(M)

PCA(n_components=2)

PCA(n_components=2)

# Echo the fitted components: one row per component, one column per feature of M.
pca.components_

array([[ 0.14306541,  0.14487534,  0.18067289,  0.17552506,  0.15029528,
         0.17138462,  0.14508232,  0.15626947,  0.17196136,  0.14519418,
         0.17969753,  0.14678201,  0.1441037 ,  0.1345826 ,  0.18730451,
         0.15489532,  0.17924409,  0.13505082,  0.1368683 ,  0.16500907,
        -0.15827515, -0.16218406, -0.13480385, -0.13759941, -0.13540814,
        -0.14028941, -0.16462207, -0.14624149, -0.16677923, -0.15058234,
        -0.17726763, -0.17309603, -0.16824696, -0.13713112, -0.16492346,
        -0.14675898, -0.17549603, -0.17515781, -0.17248733, -0.15801875],
       [ 0.11803008, -0.20279266, -0.18448435,  0.00786422,  0.13712627,
         0.14498563,  0.04517913, -0.14512324,  0.07083928,  0.16662868,
        -0.09869157, -0.14462244,  0.01899539,  0.35485067, -0.18172827,
         0.34138432, -0.17229436,  0.26509572, -0.10632316, -0.307703  ,
        -0.03928568, -0.08798147,  0.0138625 ,  0.12876915, -0.05114553,
         0.12582067, -0.09659234, -0.07648754, -0.06089009, -0.04394221,
        -0.09978427,  0.13250641,  0.09411536,  0.01104493,  0.23778638,
         0.02127305, -0.11018378,  0.20352861, -0.05370611, -0.2956784 ]])

# Plot each feature of M by its weight in the first vs. second component.
plt.scatter(pca.components_[0],pca.components_[1])

<matplotlib.collections.PathCollection at 0x7abb752fa770>

# Project the rows of M onto the two fitted components.
MPCA = pca.transform(M)
print(MPCA.shape)

(100, 2)

# Plot the rows of M in the 2-D PCA projection.
plt.scatter(MPCA[:,0],MPCA[:,1])

<matplotlib.collections.PathCollection at 0x7abb751c27d0>

from sklearn import datasets

# Iris has 4 features: sepal length/width and petal length/width.
iris = datasets.load_iris()
X, y = iris.data, iris.target  # y holds the class label of each sample

# Fit a 3-component PCA, then replace X by its projection.
pca = PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)
pca.components_

array([[ 0.36138659, -0.08452251,  0.85667061,  0.3582892 ],
       [ 0.65658877,  0.73016143, -0.17337266, -0.07548102],
       [-0.58202985,  0.59791083,  0.07623608,  0.54583143]])

# Echo the variance explained by each of the three fitted components.
pca.explained_variance_

array([4.22824171, 0.24267075, 0.0782095 ])

# Plot the projected samples on the first two principal components.
plt.scatter(X[:,0],X[:,1])

<matplotlib.collections.PathCollection at 0x7abb743f9750>

# Same plot, one color per class label (0 blue, 1 red, 2 green).
plt.scatter(X[y==0,0],X[y==0,1], color='b')
plt.scatter(X[y==1,0],X[y==1,1], color='r')
plt.scatter(X[y==2,0],X[y==2,1], color='g')

<matplotlib.collections.PathCollection at 0x7abb74790d00>

# 3-D scatter of the three principal components, one color per class label.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X[y==0,0],X[y==0,1], X[y==0,2], color='b')
ax.scatter(X[y==1,0],X[y==1,1], X[y==1,2], color='r')
ax.scatter(X[y==2,0],X[y==2,1], X[y==2,2], color='g')

import plotly.express as px
import pandas as pd

# Plotly wants a DataFrame: one column per principal component plus the label.
df = pd.DataFrame(X, columns=['PC1', 'PC2', 'PC3']).assign(label=y)

# Interactive 3-D scatter, colored by class.
fig = px.scatter_3d(
    df, x='PC1', y='PC2', z='PC3',
    color='label',
    title='3D PCA Plot',
    labels={'label': 'Class'},
)

fig.update_traces(marker=dict(size=5))
fig.show()

from sklearn.datasets import fetch_20newsgroups

# Load the training split, restricted to three newsgroups.
categories = [
    'comp.os.ms-windows.misc',
    'sci.space',
    'rec.sport.baseball',
]
news_data = fetch_20newsgroups(subset='train', categories=categories)

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF document-term matrix of the raw messages: English stop words removed,
# min_df=4 and max_df=0.8 bound how rare/common a kept term may be.
vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(news_data.data)

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

# Build the stemmer once; constructing a new one for every token is needless
# repeated work.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

# Each message is split into sentences, then words; every word is stemmed and
# the stems are joined back into one space-separated string per message.
stemmed_data = [
    " ".join(
        stemmer.stem(word)
        for sent in sent_tokenize(message)
        for word in word_tokenize(sent)
    )
    for message in news_data.data
]

# Refit the vectorizer on the stemmed text; this replaces the earlier dtm.
dtm = vectorizer.fit_transform(stemmed_data)
# get_feature_names() was removed from scikit-learn; prefer the replacement
# and fall back only on old releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
print(terms)

# Densify the document-term matrix (todense() yields an np.matrix) and
# subtract each column's mean.
dtm_dense = dtm.todense()
centered_dtm = dtm_dense - np.mean(dtm_dense, axis=0)
# Sanity check: the first 10 column sums of the centered matrix should be ~0.
np.sum(centered_dtm,axis=0)[:,:10]

matrix([[ 9.68843061e-16, -5.26054894e-15,  2.83258660e-15,
          5.48389459e-16,  4.95209342e-15,  2.94935517e-15,
          1.01882478e-15, -6.34345007e-15,  1.35613092e-14,
          6.73457600e-15]])

# Reduced SVD: the default full_matrices=True allocates square u and vt, but
# only their leading columns/rows are used afterwards.
u, s, vt = np.linalg.svd(centered_dtm, full_matrices=False)

# Plot the singular values against their 1-based index, showing only the
# x-range 0 to 50.
plt.xlim([0,50])
plt.plot(range(1,len(s)+1),s)

# 2-D document coordinates: the first k left singular vectors scaled by their
# singular values, colored by newsgroup name.
k=2
vectorsk = np.array(u[:,:k] @ np.diag(s[:k]))
labels = [news_data.target_names[i] for i in news_data.target]
sns.scatterplot(x=vectorsk[:,0], y=vectorsk[:, 1], hue=labels)

<AxesSubplot:>

import seaborn as sns
# Pairwise scatter plots of the first 5 scaled components.
k = 5
Xk = u[:,:k] @ np.diag(s[:k])
X_df = pd.DataFrame(Xk)
g = sns.PairGrid(X_df)
g.map(plt.scatter)

<seaborn.axisgrid.PairGrid at 0x15c297b2be0>

# get_feature_names() was removed from scikit-learn; prefer the replacement
# and fall back only on old releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
for i in range(6):
    # vt may be an np.matrix, whose row slices stay 2-D, so flatten first.
    loadings = np.abs(np.asarray(vt[i]).ravel())
    # argsort is ascending, so reverse it: the top terms are the ones with the
    # largest absolute weight in component i, not the smallest.
    top = np.argsort(loadings)[::-1][:12]
    topterms = [terms[f] for f in top]
    print (i, topterms)

Introduction to Numpy, Scipy, SciKit-Learn¶

Why Numpy?¶

Arrays¶

Creating Arrays¶

Operations on arrays.¶

Manipulating arrays¶

Accessing and Slicing¶

Changing entries¶

Manipulating the diagonal¶

Quiz¶

Operations with Arrays¶

Multiplication and addition with scalar¶

Inverting a matrix¶

Creating Sparse Arrays¶

Singular Value Decomposition¶

Obtaining a low rank approximation of the data¶

An example¶

PCA using SciKit Learn¶