import pandas as pd
from datetime import datetime #For handling dates
import os

import matplotlib.pyplot as plt #main plotting tool for python
import matplotlib as mpl

import seaborn as sns #A more fancy plotting library

#For presenting plots inline
%matplotlib inline


# The Tiingo API key is a credential, so it is read from the environment
# (set TIINGO_API_KEY before starting the notebook) instead of being embedded
# in the source.
# NOTE(review): a key was previously committed in this file; it should be
# revoked and replaced.
if not os.environ.get("TIINGO_API_KEY"):
    raise RuntimeError(
        "Set the TIINGO_API_KEY environment variable before running this notebook."
    )


from tiingo import TiingoClient

# Authenticated client used for the Tiingo requests in this notebook.
client = TiingoClient({'api_key': os.environ["TIINGO_API_KEY"]})

# Analysis window: the calendar year 2018.
start = datetime(2018, 1, 1)
end = datetime(2018, 12, 31)

# Daily data for META, restricted to the columns used by the analysis.
stocks_data = client.get_dataframe('META', frequency='daily', startDate=start, endDate=end)
stocks_data = stocks_data[['open', 'close', 'low', 'high', 'volume']]


stocks_data.head()


#Alternative without the client
import pandas_datareader.data as web # For accessing web data

# Download through pandas-datareader, drop the 'symbol' index level and keep
# only the columns of interest, all in one chained expression.
stocks_data = (
    web.get_data_tiingo('META', start, end, api_key=os.getenv('TIINGO_API_KEY'))
    .reset_index(level='symbol', drop=True)
    [['open', 'close', 'low', 'high', 'volume']]
)

C:\ProgramData\Anaconda3\lib\site-packages\pandas_datareader\tiingo.py:234: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only.
  return pd.concat(dfs, self._concat_axis)


# Work on a renamed copy: 'volume' becomes 'vol' for the rest of the notebook.
df = stocks_data.rename(columns={'volume': 'vol'})

# Open-to-close move of each row.
df['profit'] = df.close - df.open

# Label each row by the size of its open-to-close move. Every row starts as
# 'large_gain'; the masks are then applied from the loosest to the strictest
# condition so the strictest matching label wins, which reproduces the
# if/elif chain (negative -> small -> medium -> large) without a Python loop.
df['gain'] = (
    pd.Series('large_gain', index=df.index)
    .mask(df.profit < 3, 'medium_gain')
    .mask(df.profit < 1, 'small_gain')
    .mask(df.close < df.open, 'negative')
)

# Rows whose volume is below the mean volume are 'small', the rest 'large'.
# The mean is computed once instead of once per row.
df['size'] = pd.Series('large', index=df.index).mask(df.vol < df.vol.mean(), 'small')

df.head()


# Rows grouped by their gain label.
gain_groups = df.groupby('gain')

# Per-label mean of the price and volume columns, with 'gain' turned back
# into an ordinary column.
gdf = (
    df[['open', 'low', 'high', 'close', 'vol', 'gain']]
    .groupby('gain')
    .mean()
    .reset_index()
)

gdf


#plot a column of the dataframe against the index
df.high.plot()

<AxesSubplot:xlabel='date'>


# Plot the 'high' and 'low' columns one after the other, then add a legend.
df.high.plot()
df.low.plot(label='low values')
plt.legend(loc='best') #puts the legend in the best possible position

<matplotlib.legend.Legend at 0x1f9de86e430>


#histogram for the values of a dataframe column
df.close.hist(bins=20)

<AxesSubplot:>


#histogram with the kernel density estimation (a smoothed function over the histogram)
sns.histplot(df.close,bins=20,kde=True)

<AxesSubplot:xlabel='close', ylabel='Count'>


sns.displot(df.close,bins=50,kde=True)

<seaborn.axisgrid.FacetGrid at 0x1f9dea9bd90>


dff = pd.read_csv('example-functions.csv')
dfs = dff.sort_values(by='A', ascending = True) #Sorting in data frames


# DataFrame.plot opens its own figure when no `ax` is passed, so a preceding
# plt.figure() only leaves an empty "Figure ... with 0 Axes" behind.
# One figure per dependent column is produced by the plot calls themselves.
dfs.plot(x = 'A', y = 'B');
dfs.plot(x = 'A', y = 'C');
dfs.plot(x = 'A', y = 'D');

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>


# One row of three panels; each panel shows one column plotted against 'A'.
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for panel, column in zip(ax, ['B', 'C', 'D']):
    dfs.plot(x='A', y=column, ax=panel);


# B, C and D against A on shared axes. No plt.figure() beforehand:
# DataFrame.plot opens its own figure, so the extra call only left an empty one.
dfs.plot(x = 'A', y = ['B','C','D']);

<Figure size 640x480 with 0 Axes>


# Same curves with both axes logarithmic. No plt.figure() beforehand:
# DataFrame.plot opens its own figure, so the extra call only left an empty one.
dfs.plot(x = 'A', y = ['B','C','D'], loglog=True);

<Figure size 640x480 with 0 Axes>


# Same curves with only the y axis logarithmic. No plt.figure() beforehand:
# DataFrame.plot opens its own figure, so the extra call only left an empty one.
dfs.plot(x = 'A', y = ['B','C','D'], logy=True);

<Figure size 640x480 with 0 Axes>


# Two side-by-side panels drawn with the pyplot interface directly.
plt.figure(figsize = (15,5)) #defines the size of figure
plt.subplot(121) #plot with 1 row, 2 columns, 1st plot
# B, C and D against A on linear axes; the format strings select
# blue circles, green stars and red squares, each joined by a solid line.
plt.plot(dfs['A'],dfs['B'],'bo-',dfs['A'],dfs['C'],'g*-',dfs['A'],dfs['D'],'rs-')
plt.subplot(122)  #plot with 1 row, 2 columns, 2nd plot
# The same three curves with both axes on a logarithmic scale.
plt.loglog(dfs['A'],dfs['B'],'bo-',dfs['A'],dfs['C'],'g*-',dfs['A'],dfs['D'],'rs-')

[<matplotlib.lines.Line2D at 0x1f9e098fd00>,
 <matplotlib.lines.Line2D at 0x1f9e098fca0>,
 <matplotlib.lines.Line2D at 0x1f9e098feb0>]


sns.lineplot(x= 'A', y='B',data = dfs,marker='o')

<AxesSubplot:xlabel='A', ylabel='B'>


fig, ax = plt.subplots(1, 2, figsize=(15,5))
dff.plot(kind ='scatter', x='A', y='B', ax = ax[0])
dff.plot(kind ='scatter', x='A', y='B', loglog = True,ax = ax[1])

<AxesSubplot:xlabel='A', ylabel='B'>


plt.scatter(dff.A, dff.B)

<matplotlib.collections.PathCollection at 0x1f9e0d6ba90>


fig = plt.figure()
ax = plt.gca()
ax.set_xscale('log')
ax.set_yscale('log')
plt.scatter([1,2,3],[3,2,1])

<matplotlib.collections.PathCollection at 0x1f9e047bb50>


# Overlay the B, C and D scatter plots on one set of log-log axes.
# The first call (ax=None) creates the axes; later calls draw onto them.
t = None
for column, colour in (('B', 'DarkBlue'), ('C', 'DarkGreen'), ('D', 'Red')):
    t = dff.plot(kind='scatter', x='A', y=column, color=colour,
                 label=f'{column} curve', ax=t, loglog=True);


sns.scatterplot(x='A',y='B', data = dff)

<AxesSubplot:xlabel='A', ylabel='B'>


splot = sns.scatterplot(x='A',y='B', data = dff)
#splot.set(xscale="log", yscale="log")
splot.loglog()

[]

gdf


import scipy as sp #library for scientific computations 
from scipy import stats #The statistics part of the library


#Test statistical significance of the difference in the mean volume numbers

# Volume series of each gain label.
sm = gain_groups.get_group('small_gain').vol
lg = gain_groups.get_group('large_gain').vol
med = gain_groups.get_group('medium_gain').vol
neg = gain_groups.get_group('negative').vol

# t-test without the equal-variance assumption for every unordered pair,
# printed in the order sm-neg, sm-med, sm-lg, neg-med, neg-lg, med-lg.
volume_samples = [sm, neg, med, lg]
for position, first in enumerate(volume_samples):
    for second in volume_samples[position + 1:]:
        print(stats.ttest_ind(first, second, equal_var=False))

Ttest_indResult(statistic=-0.795639498508195, pvalue=0.429417750163685)
Ttest_indResult(statistic=-0.6701399815165451, pvalue=0.5044832095805989)
Ttest_indResult(statistic=-1.2311419812548245, pvalue=0.22206628199791936)
Ttest_indResult(statistic=-0.06722743349643102, pvalue=0.9465813743143181)
Ttest_indResult(statistic=-0.7690284467674666, pvalue=0.44515731685000526)
Ttest_indResult(statistic=-0.5334654665318223, pvalue=0.5950877691078408)


import numpy as np
stats.ks_2samp(np.array(sm), np.array(lg), alternative='two-sided')

KstestResult(statistic=0.26170072511535925, pvalue=0.10972343109925745)


stats.kstest(np.array(sm), np.array(lg), alternative='two-sided')

KstestResult(statistic=0.26170072511535925, pvalue=0.10972343109925745)


# kstest(..., 'norm') with no extra arguments compares against the standard
# normal N(0, 1). Raw volumes are in the tens of millions, so that comparison
# trivially returns statistic=1.0 and p=0. Compare against a normal
# distribution with the sample's own mean and standard deviation instead.
# NOTE: because the parameters are estimated from the same sample, the
# reported p-value is only approximate.
stats.kstest(np.array(sm), 'norm', args=(sm.mean(), sm.std()))

KstestResult(statistic=1.0, pvalue=0.0)


# The crosstab method creates the contingency table for the two attributes.
cdf = pd.crosstab(df['gain'],df['size'])
cdf


stats.chi2_contingency(cdf)

(9.442582630336563,
 0.02395009556491976,
 3,
 array([[10.76095618, 26.23904382],
        [15.12350598, 36.87649402],
        [35.19123506, 85.80876494],
        [11.92430279, 29.07569721]]))


print(sm.sem())
print(neg.sem())
print(stats.sem(med))
print(stats.sem(lg))

3207950.267667195
1530132.8120272094
3271861.2395884297
3064988.17806777


#confidence interval
# Two-sided confidence interval for the mean volume of the small-gain rows.
conf = 0.95
# The degrees of freedom come from the sample being summarised (sm),
# not from the whole data frame.
t = stats.t.ppf((1+conf)/2.0, len(sm)-1)
low = sm.mean()-sm.sem()*t
high = sm.mean()+sm.sem()*t
print(low,  ",", high)


#sns.barplot(x='gain',y='vol', data = df, ci=95) #for older seaborn versions
sns.barplot(x='gain',y='vol', data = df, errorbar=('ci', 95))

<AxesSubplot:xlabel='gain', ylabel='vol'>


sns.pointplot(x='gain',y='vol', data = df,join = False, errorbar=('ci', 95), capsize = 0.1)

<AxesSubplot:xlabel='gain', ylabel='vol'>


sns.boxplot(x='gain',y='vol', data = df)

<AxesSubplot:xlabel='gain', ylabel='vol'>


#Removing outliers
sns.boxplot(x='gain',y='vol', data = df, showfliers = False)

<AxesSubplot:xlabel='gain', ylabel='vol'>


sns.violinplot(x='gain',y='vol', data = df)

<AxesSubplot:xlabel='gain', ylabel='vol'>


df['all'] = ''
sns.violinplot(x = 'all', y='profit',hue='size', split=True, data = df)

<AxesSubplot:xlabel='all', ylabel='profit'>


sns.violinplot(x='gain',y='profit', hue='size', data = df, split=True)

<AxesSubplot:xlabel='gain', ylabel='profit'>


# Turn the date index into a regular column and make sure it holds datetimes.
# pd.to_datetime accepts 'YYYY-MM-DD' strings as well as values that are
# already timestamps, whereas datetime.strptime only accepts strings in that
# exact format.
df = df.reset_index()
df['date'] = pd.to_datetime(df['date'])


def get_month(row):
    """Return the month number of the row's ``date`` value."""
    date_value = row.date
    return date_value.month

df['month'] = df.apply(get_month,axis = 1)


#sns.lineplot(x='month', y = 'vol', data = df, ci=95)
sns.lineplot(x='month', y = 'vol', data = df, errorbar=('ci', 95))

<AxesSubplot:xlabel='month', ylabel='vol'>


df['positive_profit'] = (df.profit>0)
sns.lineplot(x='month', y = 'vol', hue='positive_profit', data = df)

<AxesSubplot:xlabel='month', ylabel='vol'>


df.drop('date',axis=1)


from tiingo import TiingoClient
# The API key is read from the environment rather than written into the source.
client = TiingoClient({'api_key': os.environ['TIINGO_API_KEY']})
# Daily closing prices of GOOGL and AAPL, one column per ticker.
ticker_history = client.get_dataframe(['GOOGL', 'AAPL'],
                                      frequency='daily',
                                      metric_name='close',
                                      startDate='2017-01-01',
                                      endDate='2018-05-31')


ticker_history.head()


# Tickers compared in the remainder of the notebook.
stocks = ['META','GOOG','TSLA', 'MSFT','NFLX']
from tiingo import TiingoClient
# The API key is read from the environment rather than written into the source.
client = TiingoClient({'api_key': os.environ['TIINGO_API_KEY']})
# Daily closing prices over [start, end], one column per ticker.
dfmany = client.get_dataframe(stocks,
                              frequency='daily',
                              metric_name='close',
                              startDate=start,
                              endDate=end)
dfmany.head()


# One line per ticker, labelled with the company name for the legend.
for symbol, company in [('META', 'meta'), ('GOOG', 'google'), ('TSLA', 'tesla'),
                        ('MSFT', 'microsoft'), ('NFLX', 'netflix')]:
    dfmany[symbol].plot(label=company)
_ = plt.legend(loc='best')


rets = dfmany.pct_change(30)
rets.iloc[25:35]


# One line of 30-day returns per ticker, labelled with the company name.
for symbol, company in [('META', 'meta'), ('GOOG', 'google'), ('TSLA', 'tesla'),
                        ('MSFT', 'microsoft'), ('NFLX', 'netflix')]:
    rets[symbol].plot(label=company)
_ = plt.legend(loc='best')


plt.scatter(rets.TSLA, rets.GOOG)
plt.xlabel('TESLA 30-day returns')
_ = plt.ylabel('GOOGLE 30-day returns')


dfb = client.get_dataframe('META',frequency='daily', startDate=start, endDate=end)[['open','high','low','close','volume']]
dgoog = client.get_dataframe('GOOG',frequency='daily', startDate=start, endDate=end)[['open','high','low','close','volume']]


start = datetime(2018,1,1)
end = datetime(2018,12,31)

dfb = client.get_dataframe('META',frequency='daily',startDate=start,endDate=end)
dfb = dfb[['open','close','low','high','volume']]

dgoog = client.get_dataframe('GOOG',frequency='daily',startDate=start,endDate=end)
dgoog = dgoog[['open','close','low','high','volume']]


print(dfb.head())
print(dgoog.head())

                             open   close     low    high    volume
date                                                               
2018-01-02 00:00:00+00:00  177.68  181.42  177.55  181.58  17694891
2018-01-03 00:00:00+00:00  181.88  184.67  181.33  184.78  16595495
2018-01-04 00:00:00+00:00  184.90  184.33  184.10  186.21  13554357
2018-01-05 00:00:00+00:00  185.59  186.85  184.93  186.90  13042388
2018-01-08 00:00:00+00:00  187.20  188.28  186.33  188.90  14719216
                              open    close       low     high   volume
date                                                                   
2018-01-02 00:00:00+00:00  1048.34  1065.00  1045.230  1066.94  1223114
2018-01-03 00:00:00+00:00  1064.31  1082.48  1063.210  1086.29  1416093
2018-01-04 00:00:00+00:00  1088.00  1086.40  1084.002  1093.57   990510
2018-01-05 00:00:00+00:00  1094.00  1102.23  1092.000  1104.25  1210974
2018-01-08 00:00:00+00:00  1102.23  1106.94  1101.620  1111.27  1003098


def gainrow(row):
    """Classify a row by its open-to-close move.

    Returns 'negative' when close is below open; otherwise 'small_gain' for a
    move below 1, 'medium_gain' for a move below 3, and 'large_gain' for
    anything else.
    """
    if row.close < row.open:
        return 'negative'

    change = row.close - row.open
    if change < 1:
        return 'small_gain'
    if change < 3:
        return 'medium_gain'
    return 'large_gain'
    
dfb['gain'] = dfb.apply(gainrow, axis = 1)
dgoog['gain'] = dgoog.apply(gainrow, axis = 1)
dfb['profit'] = dfb.close-dfb.open
dgoog['profit'] = dgoog.close-dgoog.open


#Also using seaborn
fig = sns.scatterplot(x = dfb.profit, y = dgoog.profit)
fig.set_xlabel('FB profit')
fig.set_ylabel('GOOG profit')

Text(0, 0.5, 'GOOG profit')


sns.pairplot(rets.iloc[30:])

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In [48], line 1
----> 1 sns.pairplot(rets.iloc[30:])

NameError: name 'rets' is not defined


rets.corr()


rets.corr(method='spearman')


_ = sns.heatmap(rets.corr(), annot=True)


# rets holds 30-period percentage changes, so its first 30 rows are NaN and
# are skipped before computing the correlations and their p-values.
rets_valid = rets.iloc[30:]
print(stats.pearsonr(rets_valid.NFLX, rets_valid.TSLA))
print(stats.spearmanr(rets_valid.NFLX, rets_valid.TSLA))
# The frame is built from the `stocks` list, whose ticker is 'META';
# there is no 'FB' column to read.
print(stats.pearsonr(rets_valid.GOOG, rets_valid.META))
print(stats.spearmanr(rets_valid.GOOG, rets_valid.META))

PearsonRResult(statistic=-0.12076398569612257, pvalue=0.0731862052389361)
SpearmanrResult(correlation=-0.065938830644713, pvalue=0.32918605296193537)
PearsonRResult(statistic=0.5987743268934734, pvalue=6.85914166684247e-23)
SpearmanrResult(correlation=0.5409485585956174, pvalue=3.388893335195231e-18)


print(stats.pearsonr(dfb.profit, dgoog.profit))
print(stats.spearmanr(dfb.profit, dgoog.profit))

PearsonRResult(statistic=0.750317985789302, pvalue=1.1738178432513165e-46)
SpearmanrResult(correlation=0.7189444847093646, pvalue=3.2346313802209346e-41)


# Mean return against standard deviation, one point per ticker.
_ = plt.scatter(rets.mean(), rets.std())
plt.xlabel('Expected returns')
plt.ylabel('Standard Deviation (Risk)')

# Attach a boxed label with an arrow to every point.
for label, x, y in zip(rets.columns, rets.mean(), rets.std()):
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(20, -20),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox={'boxstyle': 'round,pad=0.5', 'fc': 'yellow', 'alpha': 0.5},
        arrowprops={'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'},
    )

	gain	open	low	high	close	vol
0	large_gain	170.459459	169.941454	175.660722	174.990811	3.034571e+07
1	medium_gain	172.305504	171.410923	175.321108	174.185577	2.795407e+07
2	negative	171.473133	168.024464	172.441342	169.233636	2.771124e+07
3	small_gain	171.217688	169.827283	173.070561	171.699146	2.488339e+07

	gain	open	low	high	close	vol
0	large_gain	170.459459	169.941454	175.660722	174.990811	3.034571e+07
1	medium_gain	172.305504	171.410923	175.321108	174.185577	2.795407e+07
2	negative	171.473133	168.024464	172.441342	169.233636	2.771124e+07
3	small_gain	171.217688	169.827283	173.070561	171.699146	2.488339e+07

	GOOGL	AAPL
2017-01-03 00:00:00+00:00	808.01	116.15
2017-01-04 00:00:00+00:00	807.77	116.02
2017-01-05 00:00:00+00:00	813.02	116.61
2017-01-06 00:00:00+00:00	825.21	117.91
2017-01-09 00:00:00+00:00	827.18	118.99

Symbols	FB	GOOG	TSLA	MSFT	NFLX
date
2018-02-07	NaN	NaN	NaN	NaN	NaN
2018-02-08	NaN	NaN	NaN	NaN	NaN
2018-02-09	NaN	NaN	NaN	NaN	NaN
2018-02-12	NaN	NaN	NaN	NaN	NaN
2018-02-13	NaN	NaN	NaN	NaN	NaN
2018-02-14	-0.010473	0.004413	0.005550	0.056545	0.322922
2018-02-15	-0.025505	0.006504	0.053002	0.073075	0.366837
2018-02-16	-0.037813	0.007732	0.066332	0.056136	0.354472
2018-02-20	-0.058014	0.000209	0.057460	0.051366	0.326492
2018-02-21	-0.055078	0.003975	-0.009243	0.036362	0.325348

Symbols	FB	GOOG	TSLA	MSFT	NFLX
Symbols
FB	1.000000	0.598774	0.226680	0.470696	0.546996
GOOG	0.598774	1.000000	0.210444	0.790085	0.348008
TSLA	0.226680	0.210444	1.000000	-0.041910	-0.120764
MSFT	0.470696	0.790085	-0.041910	1.000000	0.489569
NFLX	0.546996	0.348008	-0.120764	0.489569	1.000000

Plotting - Statistical Tests¶

Simple plots¶

Histograms¶

Plotting columns against each other¶

Grid of plots¶

Plotting using matplotlib¶

Scatter plots¶

Statistical Significance¶

The Student t-test¶

Kolmogorov-Smirnov Test¶

$\chi^2$-test¶

Error bars¶

Visualizing distributions¶

Seaborn lineplot¶

Comparing multiple stocks¶

Correlation Coefficients¶

Computing p-values¶

Matplotlib¶

	open	close	low	high	volume
date
2018-01-02 00:00:00+00:00	177.68	181.42	177.55	181.58	17694891
2018-01-03 00:00:00+00:00	181.88	184.67	181.33	184.78	16595495
2018-01-04 00:00:00+00:00	184.90	184.33	184.10	186.21	13554357
2018-01-05 00:00:00+00:00	185.59	186.85	184.93	186.90	13042388
2018-01-08 00:00:00+00:00	187.20	188.28	186.33	188.90	14719216

	open	high	low	close	vol	profit	gain	size
date
2018-01-02	177.68	181.58	177.5500	181.42	18151903	3.74	large_gain	small
2018-01-03	181.88	184.78	181.3300	184.67	16886563	2.79	medium_gain	small
2018-01-04	184.90	186.21	184.0996	184.33	13880896	-0.57	negative	small
2018-01-05	185.59	186.90	184.9300	186.85	13574535	1.26	medium_gain	small
2018-01-08	187.20	188.90	186.3300	188.28	17994726	1.08	medium_gain	small

	META	GOOG	TSLA	MSFT	NFLX
2018-01-02 00:00:00+00:00	181.42	1065.00	320.53	85.95	201.07
2018-01-03 00:00:00+00:00	184.67	1082.48	317.25	86.35	205.05
2018-01-04 00:00:00+00:00	184.33	1086.40	314.62	87.11	205.63
2018-01-05 00:00:00+00:00	186.85	1102.23	316.58	88.19	209.99
2018-01-08 00:00:00+00:00	188.28	1106.94	336.41	88.28	212.05

Symbols	FB	GOOG	TSLA	MSFT	NFLX
Symbols
FB	1.000000	0.540949	0.271608	0.457852	0.641344
GOOG	0.540949	1.000000	0.288135	0.803731	0.382466
TSLA	0.271608	0.288135	1.000000	0.042190	-0.065939
MSFT	0.457852	0.803731	0.042190	1.000000	0.456912
NFLX	0.641344	0.382466	-0.065939	0.456912	1.000000

	open	high	low	close	vol	profit	gain	size	all	month	positive_profit
0	177.68	181.58	177.5500	181.42	18151903	3.74	large_gain	small		1	True
1	181.88	184.78	181.3300	184.67	16886563	2.79	medium_gain	small		1	True
2	184.90	186.21	184.0996	184.33	13880896	-0.57	negative	small		1	False
3	185.59	186.90	184.9300	186.85	13574535	1.26	medium_gain	small		1	True
4	187.20	188.90	186.3300	188.28	17994726	1.08	medium_gain	small		1	True
...	...	...	...	...	...	...	...	...	...	...	...
246	123.10	129.74	123.0200	124.06	22066002	0.96	small_gain	small		12	True
247	126.00	134.24	125.8900	134.18	39723370	8.18	large_gain	large		12	True
248	132.44	134.99	129.6700	134.52	31202509	2.08	medium_gain	large		12	True
249	135.34	135.92	132.2000	133.20	22627569	-2.14	negative	small		12	False
250	134.45	134.64	129.9500	131.09	24625308	-3.36	negative	small		12	False