So far we have assumed that the input is in the form of numerical vectors to which we can directly apply our algorithms. Often the data are more complex: for example, what if we want to cluster categorical data, itemsets, or text? Python provides libraries for processing such data and transforming them into a format that we can use.
Python offers a set of tools for extracting features: http://scikit-learn.org/stable/modules/feature_extraction.html
import numpy as np
import scipy as sp
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist
import matplotlib.pyplot as plt
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import sklearn.cluster as sk_cluster
import sklearn.feature_extraction.text as sk_text
import scipy.cluster.hierarchy as hr
import time
import seaborn as sns
%matplotlib inline
The OrdinalEncoder enables us to encode categorical attributes as numerical values. You can read more here: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html
from sklearn.preprocessing import OrdinalEncoder
X = [['married','Yes','Athens'],
['single','No', 'Ioannina'],
['married','No', 'Thessaloniki'],
['divorced', 'Yes', 'Athens']]
enc = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)
enc.fit(X)
print(enc.categories_)
print(enc.transform(X))
Y = [['married','No','Athens'],
['single','Yes', 'Ioannina'],
['single','Yes', 'Patras']
]
enc.transform(Y)
[array(['divorced', 'married', 'single'], dtype=object), array(['No', 'Yes'], dtype=object), array(['Athens', 'Ioannina', 'Thessaloniki'], dtype=object)]
[[1. 1. 0.]
 [2. 0. 1.]
 [1. 0. 2.]
 [0. 1. 0.]]
array([[ 1.,  0.,  0.],
       [ 2.,  1.,  1.],
       [ 2.,  1., nan]])
X = [['married','Yes',30000],
['single','No', 24000],
['divorced', 'Yes', 50000]]
enc = OrdinalEncoder(handle_unknown = 'use_encoded_value',unknown_value = -1)
enc.fit(X)
print(enc.categories_)
print(enc.transform(X))
Y = [['married','No',10000],
['single','Yes', 24000]]
print(enc.transform(Y))
[array(['divorced', 'married', 'single'], dtype=object), array(['No', 'Yes'], dtype=object), array([24000, 30000, 50000], dtype=object)]
[[1. 1. 1.]
 [2. 0. 0.]
 [0. 1. 2.]]
[[ 1.  0. -1.]
 [ 2.  1.  0.]]
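The encoder can also map the numerical codes back to the original categories. The following is a small sketch (not part of the original notebook) that applies inverse_transform to the encoding produced above.
# sketch: inverse_transform maps the numerical codes back to the original attribute values
codes = enc.transform(X)
print(enc.inverse_transform(codes))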
The DictVectorizer feature extraction: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html#sklearn.feature_extraction.DictVectorizer
The DictVectorizer takes a dictionary of attribute-value pairs and transforms them into numerical vectors. Real values are preserved, while categorical attributes are transformed into binary. The vectorizer produces a sparse representation.
from sklearn.feature_extraction import DictVectorizer
measurements = [
{'city': 'Dubai', 'temperature': 45},
{'city': 'London', 'temperature': 12},
{'city': 'San Fransisco', 'temperature': 23},
]
vec = DictVectorizer()
print(type(vec.fit_transform(measurements)))
print(vec.fit_transform(measurements).toarray())
vec.get_feature_names_out()
<class 'scipy.sparse.csr.csr_matrix'>
[[ 1.  0.  0. 45.]
 [ 0.  1.  0. 12.]
 [ 0.  0.  1. 23.]]
array(['city=Dubai', 'city=London', 'city=San Fransisco', 'temperature'], dtype=object)
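If you prefer a dense array directly, the DictVectorizer also accepts sparse=False. A minimal sketch (not in the original notebook), reusing the measurements list defined above:
# sketch: with sparse=False, fit_transform returns a dense numpy array, no toarray() needed
vec_dense = DictVectorizer(sparse=False)
print(vec_dense.fit_transform(measurements))
print(vec_dense.get_feature_names_out())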
from sklearn.feature_extraction import DictVectorizer
measurements = [
{'city': 'Dubai', 'temperature': 45, 'dummy': 3},
{'city': 'London', 'temperature': 12},
{'city': 'San Fransisco', 'temperature': 23},
]
vec = DictVectorizer()
vec.fit(measurements)
print(vec.get_feature_names_out())
print(vec.transform(measurements).toarray())
x = {'city': 'Athens', 'temperature': 32, 'dummy2': 2}
print(vec.transform(x).toarray())
['city=Dubai' 'city=London' 'city=San Fransisco' 'dummy' 'temperature']
[[ 1.  0.  0.  3. 45.]
 [ 0.  1.  0.  0. 12.]
 [ 0.  0.  1.  0. 23.]]
[[ 0.  0.  0.  0. 32.]]
measurements = [
{'refund' : 'No','marital_status': 'married', 'income' : 100},
{'refund' : 'Yes','marital_status': 'single', 'income' : 120},
{'refund' : 'No','marital_status':'divorced', 'income' : 80},
]
vec = DictVectorizer()
print(vec.fit_transform(measurements))
vec.get_feature_names_out()
  (0, 0)    100.0
  (0, 2)    1.0
  (0, 4)    1.0
  (1, 0)    120.0
  (1, 3)    1.0
  (1, 5)    1.0
  (2, 0)    80.0
  (2, 1)    1.0
  (2, 4)    1.0
array(['income', 'marital_status=divorced', 'marital_status=married', 'marital_status=single', 'refund=No', 'refund=Yes'], dtype=object)
The OneHotEncoder transforms categorical data into binary features: each distinct attribute value becomes a separate column, which is 1 or 0 depending on whether that value appears in the feature vector. It also works with numerical categorical values.
You can read more about it here: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
X = [[0,1,2],
[1,2,3],
[0,1,4]]
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc.fit(X)
enc.transform([[0,2,4],[1,1,2]]).toarray()
enc.transform([[2,2,4],[1,1,2]]).toarray()
array([[0., 0., 0., 1., 0., 0., 1.],
       [0., 1., 1., 0., 1., 0., 0.]])
In this example, every distinct number in every column defines a separate feature.
enc.categories_
[array([0, 1]), array([1, 2]), array([2, 3, 4])]
X = [['married','Yes',30000],
['single','No', 24000],
['divorced', 'Yes', 50000]]
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc.fit_transform(X).toarray()
array([[0., 1., 0., 0., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1., 0., 0., 1.]])
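To interpret the columns of the one-hot matrix above, we can ask the encoder for the generated feature names. A small sketch (not in the original notebook); the exact naming depends on the scikit-learn version.
# sketch: get_feature_names_out() reports which attribute value each column encodes,
# e.g. something like ['x0_divorced' 'x0_married' 'x0_single' 'x1_No' 'x1_Yes' ...]
print(enc.get_feature_names_out())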
You can keep binary categories as a single binary column. In the following example, note that only a single column is used for each of the first two attributes.
X = [[0,1,2],
[1,2,3],
[0,1,4]]
enc = preprocessing.OneHotEncoder(drop = 'if_binary')
enc.fit(X)
print(enc.categories_)
print(enc.transform([[0,2,4],[1,1,2]]).toarray())
[array([0, 1]), array([1, 2]), array([2, 3, 4])]
[[0. 1. 0. 0. 1.]
 [1. 0. 1. 0. 0.]]
Feature extraction from text: http://scikit-learn.org/stable/modules/classes.html#text-feature-extraction-ref
The CountVectorizer can be used to extract features in the form of a bag of words. It is typically used for text, but you could also use it to represent a collection of itemsets, where each itemset becomes a document and each item a word (a sketch is given after the examples below).
import sklearn.feature_extraction.text as sk_text
corpus = ['This is the first document.',
'this is the second second document.',
'And the third one.',
'Is this the first document?',
]
vectorizer = sk_text.CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
print(X.toarray())
vectorizer.get_feature_names_out()
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'], dtype=object)
import sklearn.feature_extraction.text as sk_text
corpus = ['This is the first document.',
'this is the second second document.',
'And the third one.',
'Is this the first document?',
]
vectorizer = sk_text.CountVectorizer(min_df=2)
X = vectorizer.fit_transform(corpus)
print(X.toarray())
vectorizer.get_feature_names_out()
[[1 1 1 1 1]
 [1 0 1 1 1]
 [0 0 0 1 0]
 [1 1 1 1 1]]
array(['document', 'first', 'is', 'the', 'this'], dtype=object)
vectorizer = sk_text.CountVectorizer(min_df=1,stop_words = 'english')
X2 = vectorizer.fit_transform(corpus)
print(X2.toarray())
vectorizer.get_feature_names_out()
[[1 0]
 [1 2]
 [0 0]
 [1 0]]
array(['document', 'second'], dtype=object)
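As mentioned above, the CountVectorizer can also represent a collection of itemsets. A hedged sketch (the transactions below are made up for illustration): join the items of each transaction into a space-separated string, so that each transaction plays the role of a document and each item the role of a word.
# sketch: representing itemsets with CountVectorizer
transactions = [['bread', 'milk'],
                ['bread', 'diapers', 'beer'],
                ['milk', 'diapers', 'beer', 'cola']]
docs = [' '.join(t) for t in transactions]
itemset_vectorizer = sk_text.CountVectorizer()
T = itemset_vectorizer.fit_transform(docs)
print(itemset_vectorizer.get_feature_names_out())
print(T.toarray())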
The TfidfVectorizer transforms text into a sparse matrix where rows are documents, columns are words, and the values are the tf-idf values. It performs tokenization and normalization, and can also remove stop-words. More here: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
vectorizer = sk_text.TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
print(X.toarray())
print (vectorizer.get_feature_names_out())
[[0.         0.43877674 0.54197657 0.43877674 0.         0.         0.35872874 0.         0.43877674]
 [0.         0.27230147 0.         0.27230147 0.         0.85322574 0.22262429 0.         0.27230147]
 [0.55280532 0.         0.         0.         0.55280532 0.         0.28847675 0.55280532 0.        ]
 [0.         0.43877674 0.54197657 0.43877674 0.         0.         0.35872874 0.         0.43877674]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
Removing stop-words
vectorizer = sk_text.TfidfVectorizer(stop_words = 'english',min_df=1)
X = vectorizer.fit_transform(corpus)
print(X.toarray())
print (vectorizer.get_feature_names_out())
[[1.         0.        ]
 [0.30403549 0.9526607 ]
 [0.         0.        ]
 [1.         0.        ]]
['document', 'second']
SciKit datasets: http://scikit-learn.org/stable/datasets/
We will use the 20-newsgroups dataset, which consists of postings on 20 different newsgroups.
More information here: http://scikit-learn.org/stable/datasets/#the-20-newsgroups-text-dataset
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.os.ms-windows.misc', 'sci.space','rec.sport.baseball']
#categories = ['alt.atheism', 'sci.space','rec.sport.baseball']
news_data = sk_data.fetch_20newsgroups(subset='train',
remove=('headers', 'footers', 'quotes'),
categories=categories)
print (news_data.target)
print (len(news_data.target))
[2 0 0 ... 2 1 2]
1781
print (type(news_data))
print (news_data.filenames)
print (news_data.target[:10])
print (news_data.data[1])
print (len(news_data.data))
<class 'sklearn.utils.Bunch'>
['C:\\Users\\tsapa\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\60940'
 'C:\\Users\\tsapa\\scikit_learn_data\\20news_home\\20news-bydate-train\\comp.os.ms-windows.misc\\9955'
 'C:\\Users\\tsapa\\scikit_learn_data\\20news_home\\20news-bydate-train\\comp.os.ms-windows.misc\\9846'
 ...
 'C:\\Users\\tsapa\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\60891'
 'C:\\Users\\tsapa\\scikit_learn_data\\20news_home\\20news-bydate-train\\rec.sport.baseball\\104484'
 'C:\\Users\\tsapa\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\61110']
[2 0 0 2 0 0 1 2 2 1]
Recently the following problem has arrisen. The first time I turn on my computer when windows starts (from my autoexec) after the win31 title screen the computer reboots on its own. Usually the second time (after reboot) or from the DOS prompt everything works fine. s far as I remember I have not changed my config.sys or autoxec.bat or win.ini. I can't remember whether this problem occured before I optimized/defragmented my disk and created a larger swap file (Thank you MathCAD 4 :( ) System 386sx, 4MB, stacker 2.0, win31, DOS 5 --- ---------------------------------------------------------------------
1781
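The numeric labels in news_data.target index into news_data.target_names, so we can recover the newsgroup of each posting. A small sketch (not in the original notebook):
# sketch: map the numeric targets back to the newsgroup names
print(news_data.target_names)
for t in news_data.target[:10]:
    print(t, news_data.target_names[t])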
vectorizer = sk_text.TfidfVectorizer(stop_words='english',
#max_features = 100,
min_df=4, max_df=0.8)
data = vectorizer.fit_transform(news_data.data)
print(type(data))
print(vectorizer.get_feature_names_out())
print(data[0])
<class 'scipy.sparse.csr.csr_matrix'>
['00' '02' '04' '0d' '0t' '10' '14' '145' '17' '1d9' '1t' '2di' '2tm' '34'
 '34u' '3t' '45' '5u' '6ei' '6um' '75u' '7ey' '7u' '9v' 'a86' 'ah' 'air'
 'available' 'ax' 'b8f' 'better' 'bhj' 'bj' 'bxn' 'c_' 'chz' 'ck' 'cx' 'd9'
 'data' 'did' 'does' 'don' 'dos' 'earth' 'edu' 'file' 'files' 'g9v' 'game'
 'giz' 'gk' 'good' 'got' 'information' 'just' 'know' 'launch' 'like' 'lk'
 'll' 'lunar' 'make' 'max' 'mq' 'mv' 'nasa' 'need' 'new' 'orbit' 'people'
 'pl' 'problem' 'program' 'qq' 'really' 'right' 'run' 'satellite' 'shuttle'
 'sl' 'space' 't7' 'team' 'thanks' 'think' 'time' 'use' 'used' 'using' 'uw'
 've' 'w7' 'way' 'win' 'windows' 'wm' 'work' 'year' 'years']
  (0, 86)   0.20827202576766465
  (0, 52)   0.07120433322618305
  (0, 68)   0.9733274956243896
  (0, 85)   0.06470156469515784
Python provides some functionality for normalizing and standardizing the data. Be careful though: some operations work only with dense data.
http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing
Use the function preprocessing.scale to normalize by removing the mean and dividing by the standard deviation. This is done per feature, that is, per column of the dataset.
from sklearn import preprocessing
X = np.array([[ 1., -1., 2.],
[ 2., 0., 1.],
[ 0., 1., -1.]])
print("column means: ",X.mean(axis = 0))
print("column std: ",X.std(axis = 0))
X_scaled = preprocessing.scale(X)
print("after feature normalization")
print(X_scaled)
print("normalized column means: ",X_scaled.mean(axis=0))
print("normalized column std: ",X_scaled.var(axis = 0))
column means:  [1.         0.         0.66666667]
column std:  [0.81649658 0.81649658 1.24721913]
after feature normalization
[[ 0.         -1.22474487  1.06904497]
 [ 1.22474487  0.          0.26726124]
 [-1.22474487  1.22474487 -1.33630621]]
normalized column means:  [0.00000000e+00 0.00000000e+00 1.48029737e-16]
normalized column std:  [1. 1. 1.]
print("row means: ",X.mean(axis = 1))
print("row std: ",X.std(axis = 1))
X_scaled = preprocessing.scale(X, axis = 1)
print("after row normalization")
print(X_scaled)
print("normalized row means: ",X_scaled.mean(axis=1))
print("normalized row std: ",X_scaled.var(axis = 1))
row means:  [0.66666667 1.         0.        ]
row std:  [1.24721913 0.81649658 0.81649658]
after row normalization
[[ 0.26726124 -1.33630621  1.06904497]
 [ 1.22474487 -1.22474487  0.        ]
 [ 0.          1.22474487 -1.22474487]]
normalized row means:  [1.48029737e-16 0.00000000e+00 0.00000000e+00]
normalized row std:  [1. 1. 1.]
Feature normalization will not work with sparse data: centering treats the zeros as values, so the sparse matrix would become non-sparse after normalization. scikit-learn therefore refuses to center sparse matrices, as the error below shows; a workaround is sketched after the traceback.
import scipy.sparse
cX = scipy.sparse.csc_matrix(X)
cX_scaled = preprocessing.scale(cX)
print(cX_scaled)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-62-961e7864f1cd> in <module>
      1 import scipy.sparse
      2 cX = scipy.sparse.csc_matrix(X)
----> 3 cX_scaled = preprocessing.scale(cX)
      4 print(cX_scaled)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in scale(X, axis, with_mean, with_std, copy)
    204     if with_mean:
    205         raise ValueError(
--> 206             "Cannot center sparse matrices: pass `with_mean=False` instead"
    207             " See docstring for motivation and alternatives."
    208         )

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead See docstring for motivation and alternatives.
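Following the hint in the error message, a sparse matrix can still be scaled if we skip the centering step. A minimal sketch (not in the original notebook), dividing each column by its standard deviation only:
# sketch: scale a sparse matrix without centering, so the zeros stay zero
cX_scaled = preprocessing.scale(cX, with_mean=False)
print(cX_scaled)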
The same can be done with the StandardScaler from sklearn's preprocessing module.
The fit() function computes the parameters for scaling, and transform() applies the scaling.
from sklearn import preprocessing
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(X)
print(std_scaler.mean_)
print(std_scaler.scale_)
X_std = std_scaler.transform(X)
print("scaled data:")
print(X_std)
[1.         0.         0.66666667]
[0.81649658 0.81649658 1.24721913]
scaled data:
[[ 0.         -1.22474487  1.06904497]
 [ 1.22474487  0.          0.26726124]
 [-1.22474487  1.22474487 -1.33630621]]
The advantage is that we can now apply the transformation to new data.
For example, we compute the parameters for the training data and we apply the scaling to the test data.
y = np.array([[2.,3.,1.],
[1.,2.,1.]])
print(std_scaler.transform(y))
[[1.22474487 3.67423461 0.26726124]
 [0.         2.44948974 0.26726124]]
The MinMaxScaler subtracts from each column its minimum value and then divides by the difference (max - min). Note that values outside the range seen during fitting may be mapped outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X)
print(X_minmax)
print(min_max_scaler.transform(y))
[[0.5        0.         1.        ]
 [1.         0.5        0.66666667]
 [0.         1.         0.        ]]
[[1.         2.         0.66666667]
 [0.5        1.5        0.66666667]]
The MaxAbsScaler divides each column by its maximum absolute value.
The MaxAbsScaler can work with sparse data, since it does not destroy the sparseness of the data. For the other scalers, removing the mean (or the min) can destroy the sparseness of the data.
Sometimes we may choose to normalize only the non-zero values. This should be done manually; a sketch is given after the sparse example below.
max_abs_scaler = preprocessing.MaxAbsScaler()
X_maxabs = max_abs_scaler.fit_transform(X)
X_maxabs
array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0.5],
       [ 0. ,  1. , -0.5]])
# works with sparse data
cX_scaled = max_abs_scaler.transform(cX)
print(cX_scaled)
  (0, 0)    0.5
  (1, 0)    1.0
  (0, 1)    -1.0
  (2, 1)    1.0
  (0, 2)    1.0
  (1, 2)    0.5
  (2, 2)    -0.5
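The manual normalization of only the non-zero values mentioned above can be done by operating directly on the .data array of the sparse matrix. A rough sketch (not in the original notebook) that standardizes all stored non-zero values globally, leaving the zero entries and the sparsity pattern untouched:
# sketch: standardize only the stored non-zero entries of a sparse matrix
cZ = scipy.sparse.csr_matrix(X)
nz_mean, nz_std = cZ.data.mean(), cZ.data.std()
cZ.data = (cZ.data - nz_mean) / nz_std
print(cZ)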
The normalize function normalizes the rows so that they become unit vectors in some norm that we specify. It can be applied to sparse matrices without destroying the sparsity.
#works with sparse data
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized
array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 0.89442719,  0.        ,  0.4472136 ],
       [ 0.        ,  0.70710678, -0.70710678]])
crX = scipy.sparse.csr_matrix(X)
crX_scaled = preprocessing.normalize(crX,norm='l1')
print(crX_scaled)
  (0, 0)    0.25
  (0, 1)    -0.25
  (0, 2)    0.5
  (1, 0)    0.6666666666666666
  (1, 2)    0.3333333333333333
  (2, 1)    0.5
  (2, 2)    -0.5