The main library for plotting is matplotlib, which emulates the MATLAB plotting interface.
We can also use the seaborn library on top of it to produce visually nicer plots.
import pandas as pd
import pandas_datareader.data as web # For accessing web data
from datetime import datetime #For handling dates
import os
import matplotlib.pyplot as plt #main plotting tool for python
import matplotlib as mpl
import seaborn as sns #A fancier plotting library
#For presenting plots inline
%matplotlib inline
os.environ["IEX_API_KEY"] = "pk_4f1eb9a770e04d2ebc44123e297618bb"#"pk_******************************"
stocks = 'FB'
data_source = 'iex'
start = datetime(2018,1,1)
end = datetime(2018,12,31)
stocks_data = web.DataReader(stocks, data_source, start, end)
#If you want to load only some of the attributes:
#stocks_data = web.DataReader(stocks, data_source, start, end)[['open','close']]
df = stocks_data
df = df.rename(columns = {'volume':'vol'})
df['profit'] = (df.close - df.open)
for idx, row in df.iterrows():
    if row.close < row.open:
        df.loc[idx,'gain'] = 'negative'
    elif (row.close - row.open) < 1:
        df.loc[idx,'gain'] = 'small_gain'
    elif (row.close - row.open) < 3:
        df.loc[idx,'gain'] = 'medium_gain'
    else:
        df.loc[idx,'gain'] = 'large_gain'
for idx, row in df.iterrows():
    if row.vol < df.vol.mean():
        df.loc[idx,'size'] = 'small'
    else:
        df.loc[idx,'size'] = 'large'
df.head()
| date | open | high | low | close | vol | profit | gain | size |
|---|---|---|---|---|---|---|---|---|
| 2018-01-02 | 177.68 | 181.58 | 177.5500 | 181.42 | 18151903 | 3.74 | large_gain | small |
| 2018-01-03 | 181.88 | 184.78 | 181.3300 | 184.67 | 16886563 | 2.79 | medium_gain | small |
| 2018-01-04 | 184.90 | 186.21 | 184.0996 | 184.33 | 13880896 | -0.57 | negative | small |
| 2018-01-05 | 185.59 | 186.90 | 184.9300 | 186.85 | 13574535 | 1.26 | medium_gain | small |
| 2018-01-08 | 187.20 | 188.90 | 186.3300 | 188.28 | 17994726 | 1.08 | medium_gain | small |
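Note that iterating with iterrows, as above, is slow for large dataframes. The same labels can be computed in a vectorized way; here is a minimal sketch using np.select, assuming the same thresholds as above:

import numpy as np

#Vectorized alternative to the iterrows loops above (a sketch)
diff = df.close - df.open
conditions = [diff < 0, diff < 1, diff < 3] #checked in order, like if/elif
labels = ['negative', 'small_gain', 'medium_gain']
df['gain'] = np.select(conditions, labels, default='large_gain')
df['size'] = np.where(df.vol < df.vol.mean(), 'small', 'large')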
gain_groups = df.groupby('gain')
gdf= df[['open','low','high','close','vol','gain']].groupby('gain').mean()
gdf = gdf.reset_index()
gdf
| | gain | open | low | high | close | vol |
|---|---|---|---|---|---|---|
| 0 | large_gain | 170.459459 | 169.941454 | 175.660722 | 174.990811 | 3.034571e+07 |
| 1 | medium_gain | 172.305504 | 171.410923 | 175.321108 | 174.185577 | 2.795407e+07 |
| 2 | negative | 171.473133 | 168.024464 | 172.441342 | 169.233636 | 2.771124e+07 |
| 3 | small_gain | 171.217688 | 169.827283 | 173.070561 | 171.699146 | 2.488339e+07 |
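The groupby object is not limited to mean(); several aggregates can be computed at once with the agg method. A quick sketch:

#Compute several aggregates of the volume per gain group (a sketch)
df.groupby('gain')['vol'].agg(['mean', 'std', 'count'])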
The documentation for the plot function for data frames can be found here: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html
#plot a column of the dataframe against the index
df.high.plot()
<AxesSubplot:xlabel='date'>
df.high.plot()
df.low.plot(label='low values')
plt.legend(loc='best') #puts the legend in the best possible position
<matplotlib.legend.Legend at 0x1f9de86e430>
#histogram for the values of a dataframe column
df.close.hist(bins=20)
<AxesSubplot:>
#histogram with the kernel density estimation (a smoothed function over the histogram)
sns.histplot(df.close,bins=20,kde=True)
<AxesSubplot:xlabel='close', ylabel='Count'>
sns.displot(df.close,bins=50,kde=True)
<seaborn.axisgrid.FacetGrid at 0x1f9dea9bd90>
dff = pd.read_csv('example-functions.csv')
dfs = dff.sort_values(by='A', ascending = True) #Sorting in data frames
Plot columns B, C, D against A.
The plt.figure() command creates a new (empty) figure; note that each dfs.plot call below also creates its own figure, which is why empty figures appear in the output.
plt.figure();
dfs.plot(x = 'A', y = 'B');
plt.figure();
dfs.plot(x = 'A', y = 'C');
plt.figure();
dfs.plot(x = 'A', y = 'D');
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
#plt.figure();
fig, ax = plt.subplots(1, 3,figsize=(20,5))
dfs.plot(x = 'A', y = 'B',ax = ax[0]);
dfs.plot(x = 'A', y = 'C',ax = ax[1]);
dfs.plot(x = 'A', y = 'D',ax = ax[2]);
Plot all columns together against A.
Clearly they are different functions.
plt.figure(); dfs.plot(x = 'A', y = ['B','C','D']);
<Figure size 640x480 with 0 Axes>
Plot all columns against A in log-log scale (take the logarithm of the values on both axes).
We observe straight lines for B and C, while D drops more steeply. A straight line in log-log scale means $\log y = k\log x + c$, i.e., $y = e^c x^k$, so B and C are polynomial functions of A.
plt.figure(); dfs.plot(x = 'A', y = ['B','C','D'], loglog=True);
<Figure size 640x480 with 0 Axes>
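To estimate the degree of the polynomial we can fit a line to the points in log-log space; the slope of the line estimates the exponent. A minimal sketch, assuming all values of A and B are positive so the logarithms are defined:

import numpy as np

#Fit a line in log-log space; the slope estimates the polynomial degree
slope, intercept = np.polyfit(np.log(dfs['A']), np.log(dfs['B']), 1)
print(slope)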
Plot with log scale only on y-axis (log-linear plot).
The plot of D becomes a line, indicating that D is an exponential function of A
plt.figure(); dfs.plot(x = 'A', y = ['B','C','D'], logy=True);
<Figure size 640x480 with 0 Axes>
We also show how to put two plots in a 1x2 grid.
plt.figure(figsize = (15,5)) #defines the size of the figure
plt.subplot(121) #plot with 1 row, 2 columns, 1st plot
plt.plot(dfs['A'],dfs['B'],'bo-',dfs['A'],dfs['C'],'g*-',dfs['A'],dfs['D'],'rs-')
plt.subplot(122) #plot with 1 row, 2 columns, 2nd plot
plt.loglog(dfs['A'],dfs['B'],'bo-',dfs['A'],dfs['C'],'g*-',dfs['A'],dfs['D'],'rs-')
[<matplotlib.lines.Line2D at 0x1f9e098fd00>, <matplotlib.lines.Line2D at 0x1f9e098fca0>, <matplotlib.lines.Line2D at 0x1f9e098feb0>]
Using seaborn
sns.lineplot(x= 'A', y='B',data = dfs,marker='o')
<AxesSubplot:xlabel='A', ylabel='B'>
Scatter plots take as input two series X and Y and plot the points (x, y).
We will redo the same plots as before as scatter plots, using the dataframe functions.
fig, ax = plt.subplots(1, 2, figsize=(15,5))
dff.plot(kind ='scatter', x='A', y='B', ax = ax[0])
dff.plot(kind ='scatter', x='A', y='B', loglog = True,ax = ax[1])
<AxesSubplot:xlabel='A', ylabel='B'>
plt.scatter(dff.A, dff.B)
<matplotlib.collections.PathCollection at 0x1f9e0d6ba90>
fig = plt.figure()
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('log')
plt.scatter([1,2,3],[3,2,1])
<matplotlib.collections.PathCollection at 0x1f9e047bb50>
Putting many scatter plots into the same plot
t = dff.plot(kind='scatter', x='A', y='B', color='DarkBlue', label='B curve', loglog=True);
dff.plot(kind='scatter', x='A', y='C',color='DarkGreen', label='C curve', ax=t, loglog = True);
dff.plot(kind='scatter', x='A', y='D',color='Red', label='D curve', ax=t, loglog = True);
Using seaborn
sns.scatterplot(x='A',y='B', data = dff)
<AxesSubplot:xlabel='A', ylabel='B'>
In log-log scale (for some reason it seems to throw away small values)
splot = sns.scatterplot(x='A',y='B', data = dff)
#splot.set(xscale="log", yscale="log")
splot.loglog()
[]
Recall the dataframe we obtained when grouping by gain
gdf
gain | open | low | high | close | vol | |
---|---|---|---|---|---|---|
0 | large_gain | 170.459459 | 169.941454 | 175.660722 | 174.990811 | 3.034571e+07 |
1 | medium_gain | 172.305504 | 171.410923 | 175.321108 | 174.185577 | 2.795407e+07 |
2 | negative | 171.473133 | 168.024464 | 172.441342 | 169.233636 | 2.771124e+07 |
3 | small_gain | 171.217688 | 169.827283 | 173.070561 | 171.699146 | 2.488339e+07 |
We see that there are differences in the volume of trading depending on the gain. But are these differences statistically significant? We can test that using the Student's t-test. The Student's t-test gives us the difference between the means in units of standard error, and a p-value that tells us how significant this difference is. Usually we require the p-value to be less than 0.05 (or 0.01 if we want to be more strict). Note that for the test we need to use all the values in each group.
To compute the t-test we will use the SciPy library, a Python library for scientific computing.
import scipy as sp #library for scientific computations
from scipy import stats #The statistics part of the library
The t-test statistic is:

$$t = \frac{\bar{x}_1-\bar{x}_2}{\sqrt{\frac{\sigma_1^2}{n_1}+\frac{\sigma_2^2}{n_2}}}$$

where $\bar x_i$ is the mean value of the $i$-th dataset, $\sigma_i^2$ is its variance, and $n_i$ its size.
#Test statistical significance of the difference in the mean volume numbers
sm = gain_groups.get_group('small_gain').vol
lg = gain_groups.get_group('large_gain').vol
med = gain_groups.get_group('medium_gain').vol
neg = gain_groups.get_group('negative').vol
print(stats.ttest_ind(sm,neg,equal_var = False))
print(stats.ttest_ind(sm,med, equal_var = False))
print(stats.ttest_ind(sm,lg, equal_var = False))
print(stats.ttest_ind(neg,med,equal_var = False))
print(stats.ttest_ind(neg,lg,equal_var = False))
print(stats.ttest_ind(med,lg, equal_var = False))
Ttest_indResult(statistic=-0.795639498508195, pvalue=0.429417750163685)
Ttest_indResult(statistic=-0.6701399815165451, pvalue=0.5044832095805989)
Ttest_indResult(statistic=-1.2311419812548245, pvalue=0.22206628199791936)
Ttest_indResult(statistic=-0.06722743349643102, pvalue=0.9465813743143181)
Ttest_indResult(statistic=-0.7690284467674666, pvalue=0.44515731685000526)
Ttest_indResult(statistic=-0.5334654665318223, pvalue=0.5950877691078408)
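As a sanity check, we can compute the statistic directly from the formula above. A minimal sketch for the first pair (small gain vs. negative):

import numpy as np

def welch_t(x, y):
    #difference of the means in units of the (unpooled) standard error
    x, y = np.asarray(x), np.asarray(y)
    se = np.sqrt(x.var(ddof=1)/len(x) + y.var(ddof=1)/len(y))
    return (x.mean() - y.mean()) / se

print(welch_t(sm, neg)) #should match the first statistic printed above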
Test if the data for small and large gain come from the same distribution. A p-value > 0.1 indicates that we cannot reject the null hypothesis that they come from the same distribution.
Use scipy.stats.ks_2samp to test whether two samples come from the same distribution.
If you want to test a single sample against a fixed distribution (e.g., the normal distribution), use scipy.stats.kstest.
import numpy as np
stats.ks_2samp(np.array(sm), np.array(lg), alternative='two-sided')
KstestResult(statistic=0.26170072511535925, pvalue=0.10972343109925745)
stats.kstest(np.array(sm), np.array(lg), alternative='two-sided')
KstestResult(statistic=0.26170072511535925, pvalue=0.10972343109925745)
stats.kstest(np.array(sm), 'norm')
KstestResult(statistic=1.0, pvalue=0.0)
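The p-value of 0 in the last test is not surprising: kstest compares against a standard normal distribution, while the raw volume values are on a completely different scale. Standardizing the values first makes the comparison meaningful; a sketch:

#Standardize before comparing against the standard normal (a sketch)
z = (sm - sm.mean()) / sm.std()
print(stats.kstest(z, 'norm'))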
We use the $\chi^2$-test to test if two random variables are independent. The larger the value of the test statistic, the farther the variables are from independence. The p-value tells us whether the observed value is statistically significant.
# The crosstab method creates the contingency table for the two attributes.
cdf = pd.crosstab(df['gain'],df['size'])
cdf
| gain \ size | large | small |
|---|---|---|
| large_gain | 15 | 22 |
| medium_gain | 11 | 41 |
| negative | 41 | 80 |
| small_gain | 6 | 35 |
We will use the chi2_contingency function, which compares the contingency table against the expected values produced by the marginal distributions (see also the chisquare function, which assumes uniform marginals).
The chi2_contingency function returns the test statistic, the p-value, the degrees of freedom, and the table of expected counts:
stats.chi2_contingency(cdf)
(9.442582630336563, 0.02395009556491976, 3, array([[10.76095618, 26.23904382],
       [15.12350598, 36.87649402],
       [35.19123506, 85.80876494],
       [11.92430279, 29.07569721]]))
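For readability we can unpack the returned tuple into named values; a minimal sketch:

#Unpack the chi2_contingency output (a sketch)
chi2, p, dof, expected = stats.chi2_contingency(cdf)
print('chi2 =', chi2, ', p-value =', p, ', degrees of freedom =', dof)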
We can compute the standard error of the mean using the stats.sem method of SciPy, which can also be called directly on a pandas Series or DataFrame.
print(sm.sem())
print(neg.sem())
print(stats.sem(med))
print(stats.sem(lg))
3207950.267667195
1530132.8120272094
3271861.2395884297
3064988.17806777
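Recall that the standard error of the mean is just the standard deviation divided by the square root of the sample size; a quick check:

import numpy as np

#sem = std/sqrt(n); pandas uses ddof=1 by default, matching .sem()
print(sm.std() / np.sqrt(len(sm)))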
Computing confidence intervals
conf = 0.95
t = stats.t.ppf((1+conf)/2., len(sm)-1)
print(sm.mean()-sm.sem()*t, ",", sm.mean()+sm.sem()*t)
18399882.586583555 , 31366901.218294494
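SciPy can also compute the interval directly with stats.t.interval, which should agree with the manual computation above; a sketch:

#Equivalent confidence interval computed by scipy (a sketch)
print(stats.t.interval(conf, len(sm)-1, loc=sm.mean(), scale=sm.sem()))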
We can also visualize the mean and the standard error in a bar-plot, using the barplot function of seaborn. Note that we need to apply this to the original data. The averaging is done automatically.
#sns.barplot(x='gain',y='vol', data = df, ci=95) #for older seaborn versions
sns.barplot(x='gain',y='vol', data = df, errorbar=('ci', 95))
<AxesSubplot:xlabel='gain', ylabel='vol'>
sns.pointplot(x='gain',y='vol', data = df,join = False, errorbar=('ci', 95), capsize = 0.1)
<AxesSubplot:xlabel='gain', ylabel='vol'>
We can also visualize the distribution using a box plot. In the box plot, the box shows the quartiles of the dataset (the range between the 25th and 75th percentiles), while the whiskers extend to show the rest of the distribution, except for points that are determined to be "outliers". The line inside the box shows the median.
sns.boxplot(x='gain',y='vol', data = df)
<AxesSubplot:xlabel='gain', ylabel='vol'>
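The quartiles that the boxes are drawn from can also be computed directly; a quick sketch:

#The quartiles underlying the box plot (a sketch)
df.groupby('gain')['vol'].quantile([0.25, 0.5, 0.75])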
#Hiding the outliers in the display
sns.boxplot(x='gain',y='vol', data = df, showfliers = False)
<AxesSubplot:xlabel='gain', ylabel='vol'>
We can also use a violin plot to visualize the distributions
sns.violinplot(x='gain',y='vol', data = df)
<AxesSubplot:xlabel='gain', ylabel='vol'>
df['all'] = ''
sns.violinplot(x = 'all', y='profit',hue='size', split=True, data = df)
<AxesSubplot:xlabel='all', ylabel='profit'>
sns.violinplot(x='gain',y='profit', hue='size', data = df, split=True)
<AxesSubplot:xlabel='gain', ylabel='profit'>
Plot the average volume over the different months
df = df.reset_index()
df.date = df.date.apply(lambda d: datetime.strptime(d, "%Y-%m-%d"))
def get_month(row):
    return row.date.month
df['month'] = df.apply(get_month,axis = 1)
#sns.lineplot(x='month', y = 'vol', data = df, ci=95)
sns.lineplot(x='month', y = 'vol', data = df, errorbar=('ci', 95))
<AxesSubplot:xlabel='month', ylabel='vol'>
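As an aside, the date parsing and month extraction above can also be done in a vectorized way, without apply; a sketch:

#Vectorized alternative (a sketch): parse the dates and extract the month
df.date = pd.to_datetime(df.date)
df['month'] = df.date.dt.month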
df['positive_profit'] = (df.profit>0)
sns.lineplot(x='month', y = 'vol', hue='positive_profit', data = df)
<AxesSubplot:xlabel='month', ylabel='vol'>
df.drop('date',axis=1)
| | open | high | low | close | vol | profit | gain | size | all | month | positive_profit |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 177.68 | 181.58 | 177.5500 | 181.42 | 18151903 | 3.74 | large_gain | small | | 1 | True |
| 1 | 181.88 | 184.78 | 181.3300 | 184.67 | 16886563 | 2.79 | medium_gain | small | | 1 | True |
| 2 | 184.90 | 186.21 | 184.0996 | 184.33 | 13880896 | -0.57 | negative | small | | 1 | False |
| 3 | 185.59 | 186.90 | 184.9300 | 186.85 | 13574535 | 1.26 | medium_gain | small | | 1 | True |
| 4 | 187.20 | 188.90 | 186.3300 | 188.28 | 17994726 | 1.08 | medium_gain | small | | 1 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 246 | 123.10 | 129.74 | 123.0200 | 124.06 | 22066002 | 0.96 | small_gain | small | | 12 | True |
| 247 | 126.00 | 134.24 | 125.8900 | 134.18 | 39723370 | 8.18 | large_gain | large | | 12 | True |
| 248 | 132.44 | 134.99 | 129.6700 | 134.52 | 31202509 | 2.08 | medium_gain | large | | 12 | True |
| 249 | 135.34 | 135.92 | 132.2000 | 133.20 | 22627569 | -2.14 | negative | small | | 12 | False |
| 250 | 134.45 | 134.64 | 129.9500 | 131.09 | 24625308 | -3.36 | negative | small | | 12 | False |

251 rows × 11 columns
As a last task, we will use the experience we obtained so far -- and learn some new things -- in order to compare the performance of different stocks obtained from the IEX data source.
stocks = ['FB','GOOG','TSLA', 'MSFT','NFLX']
attr = 'close'
dfmany = web.DataReader(stocks,
data_source,
start=datetime(2018, 1, 1),
end=datetime(2018, 12, 31))[attr]
dfmany.head()
| date | FB | GOOG | TSLA | MSFT | NFLX |
|---|---|---|---|---|---|
| 2018-01-02 | 181.42 | 53.2500 | 21.3687 | 85.95 | 201.07 |
| 2018-01-03 | 184.67 | 54.1240 | 21.1500 | 86.35 | 205.05 |
| 2018-01-04 | 184.33 | 54.3200 | 20.9747 | 87.11 | 205.63 |
| 2018-01-05 | 186.85 | 55.1115 | 21.1053 | 88.19 | 209.99 |
| 2018-01-08 | 188.28 | 55.3470 | 22.4273 | 88.28 | 212.05 |
dfmany.FB.plot(label = 'facebook')
dfmany.GOOG.plot(label = 'google')
dfmany.TSLA.plot(label = 'tesla')
dfmany.MSFT.plot(label = 'microsoft')
dfmany.NFLX.plot(label = 'netflix')
_ = plt.legend(loc='best')
Next, we will calculate returns over a period of length $T$, defined as:

$$r(t) = \frac{f(t)-f(t-T)}{f(t-T)}$$

The returns can be computed with the simple DataFrame method pct_change(). Note that for the first $T$ timesteps this value is not defined (of course):
rets = dfmany.pct_change(30)
rets.iloc[25:35]
| date | FB | GOOG | TSLA | MSFT | NFLX |
|---|---|---|---|---|---|
| 2018-02-07 | NaN | NaN | NaN | NaN | NaN |
| 2018-02-08 | NaN | NaN | NaN | NaN | NaN |
| 2018-02-09 | NaN | NaN | NaN | NaN | NaN |
| 2018-02-12 | NaN | NaN | NaN | NaN | NaN |
| 2018-02-13 | NaN | NaN | NaN | NaN | NaN |
| 2018-02-14 | -0.010473 | 0.004413 | 0.005550 | 0.056545 | 0.322922 |
| 2018-02-15 | -0.025505 | 0.006504 | 0.053002 | 0.073075 | 0.366837 |
| 2018-02-16 | -0.037813 | 0.007732 | 0.066332 | 0.056136 | 0.354472 |
| 2018-02-20 | -0.058014 | 0.000209 | 0.057460 | 0.051366 | 0.326492 |
| 2018-02-21 | -0.055078 | 0.003975 | -0.009243 | 0.036362 | 0.325348 |
Now we'll plot the time series of the returns of the different stocks. Notice that the NaN values are gracefully dropped by the plotting function.
rets.FB.plot(label = 'facebook')
rets.GOOG.plot(label = 'google')
rets.TSLA.plot(label = 'tesla')
rets.MSFT.plot(label = 'microsoft')
rets.NFLX.plot(label = 'netflix')
_ = plt.legend(loc='best')
plt.scatter(rets.TSLA, rets.GOOG)
plt.xlabel('TESLA 30-day returns')
_ = plt.ylabel('GOOGLE 30-day returns')
We can also use the seaborn library for the scatter plot. Note that this method returns an object which we can use to set different parameters of the plot. In the example below we use it to set the x and y labels of the plot. Read the online documentation for more options.
data_source = 'iex'
start = datetime(2018,1,1)
end = datetime(2018,12,31)
dfb = web.DataReader('FB', data_source, start, end)
dgoog = web.DataReader('GOOG', data_source, start, end)
print(dfb.head())
print(dgoog.head())
              open    high       low   close    volume
date
2018-01-02  177.68  181.58  177.5500  181.42  18151903
2018-01-03  181.88  184.78  181.3300  184.67  16886563
2018-01-04  184.90  186.21  184.0996  184.33  13880896
2018-01-05  185.59  186.90  184.9300  186.85  13574535
2018-01-08  187.20  188.90  186.3300  188.28  17994726
               open       high        low    close      volume
date
2018-01-02  52.4170  53.347000  52.261500  53.2500  24751280.0
2018-01-03  53.2155  54.314500  53.160500  54.1240  28603400.0
2018-01-04  54.4000  54.678495  54.200085  54.3200  20092100.0
2018-01-05  54.7000  55.212500  54.600000  55.1115  25582460.0
2018-01-08  55.1115  55.563500  55.081000  55.3470  20952060.0
def gainrow(row):
    if row.close < row.open:
        return 'negative'
    elif (row.close - row.open) < 1:
        return 'small_gain'
    elif (row.close - row.open) < 3:
        return 'medium_gain'
    else:
        return 'large_gain'
dfb['gain'] = dfb.apply(gainrow, axis = 1)
dgoog['gain'] = dgoog.apply(gainrow, axis = 1)
dfb['profit'] = dfb.close-dfb.open
dgoog['profit'] = dgoog.close-dgoog.open
#Also using seaborn
fig = sns.scatterplot(x = dfb.profit, y = dgoog.profit)
fig.set_xlabel('FB profit')
fig.set_ylabel('GOOG profit')
Text(0, 0.5, 'GOOG profit')
Get all pairwise correlations in a single plot
sns.pairplot(rets.iloc[30:])
<seaborn.axisgrid.PairGrid at 0x1f9e0c2e9a0>
There appears to be some correlation between the movements of some pairs of stocks (for example, GOOG and MSFT). Let's measure this.
The correlation coefficient between variables $X$ and $Y$ is defined as follows:

$$\text{Corr}(X,Y) = \frac{E\left[(X-\mu_X)(Y-\mu_Y)\right]}{\sigma_X\sigma_Y}$$

Pandas provides the DataFrame method corr() to compute the correlation coefficient for all pairs of columns:
rets.corr()
| | FB | GOOG | TSLA | MSFT | NFLX |
|---|---|---|---|---|---|
| FB | 1.000000 | 0.598774 | 0.226680 | 0.470696 | 0.546996 |
| GOOG | 0.598774 | 1.000000 | 0.210444 | 0.790085 | 0.348008 |
| TSLA | 0.226680 | 0.210444 | 1.000000 | -0.041910 | -0.120764 |
| MSFT | 0.470696 | 0.790085 | -0.041910 | 1.000000 | 0.489569 |
| NFLX | 0.546996 | 0.348008 | -0.120764 | 0.489569 | 1.000000 |
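A single pairwise coefficient can also be computed directly on two columns; a quick sketch:

#Correlation of a single pair of columns (a sketch)
print(rets.FB.corr(rets.GOOG))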
rets.corr(method='spearman')
| | FB | GOOG | TSLA | MSFT | NFLX |
|---|---|---|---|---|---|
| FB | 1.000000 | 0.540949 | 0.271608 | 0.457852 | 0.641344 |
| GOOG | 0.540949 | 1.000000 | 0.288135 | 0.803731 | 0.382466 |
| TSLA | 0.271608 | 0.288135 | 1.000000 | 0.042190 | -0.065939 |
| MSFT | 0.457852 | 0.803731 | 0.042190 | 1.000000 | 0.456912 |
| NFLX | 0.641344 | 0.382466 | -0.065939 | 0.456912 | 1.000000 |
It takes a bit of time to examine that table and draw conclusions.
To speed that process up it helps to visualize the table using a heatmap.
_ = sns.heatmap(rets.corr(), annot=True)
Use the scipy.stats library to obtain the p-values for the Pearson and Spearman rank correlations.
print(stats.pearsonr(rets.iloc[30:].NFLX, rets.iloc[30:].TSLA))
print(stats.spearmanr(rets.iloc[30:].NFLX, rets.iloc[30:].TSLA))
print(stats.pearsonr(rets.iloc[30:].GOOG, rets.iloc[30:].FB))
print(stats.spearmanr(rets.iloc[30:].GOOG, rets.iloc[30:].FB))
PearsonRResult(statistic=-0.12076398569612257, pvalue=0.0731862052389361)
SpearmanrResult(correlation=-0.065938830644713, pvalue=0.32918605296193537)
PearsonRResult(statistic=0.5987743268934734, pvalue=6.85914166684247e-23)
SpearmanrResult(correlation=0.5409485585956174, pvalue=3.388893335195231e-18)
print(stats.pearsonr(dfb.profit, dgoog.profit))
print(stats.spearmanr(dfb.profit, dgoog.profit))
PearsonRResult(statistic=0.750317985789302, pvalue=1.1738178432513165e-46)
SpearmanrResult(correlation=0.7189444847093646, pvalue=3.2346313802209346e-41)
Finally, it is important to know that the plotting performed by Pandas is just a layer on top of matplotlib (i.e., the plt package). So Pandas' plots can (and should) be replaced or improved by using additional functions from matplotlib.
For example, suppose we want to know both the returns as well as the standard deviation of the returns of a stock (i.e., its risk). Here is a visualization of the result of such an analysis, constructed using only functions from matplotlib.
_ = plt.scatter(rets.mean(), rets.std())
plt.xlabel('Expected returns')
plt.ylabel('Standard Deviation (Risk)')
for label, x, y in zip(rets.columns, rets.mean(), rets.std()):
    plt.annotate(
        label,
        xy = (x, y), xytext = (20, -20),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
To understand what these functions are doing (especially the annotate function), you will need to consult the online documentation for matplotlib. Just use Google to find it.