import pandas as pd
from datetime import datetime #For handling dates
import os

import matplotlib.pyplot as plt #main plotting tool for python
import matplotlib as mpl

import seaborn as sns #A more fancy plotting library

#For presenting plots inline
%matplotlib inline
!pip install tiingo

Collecting tiingo
  Downloading tiingo-0.15.6-py2.py3-none-any.whl.metadata (15 kB)
Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from tiingo) (2.32.3)
Requirement already satisfied: websocket-client in /usr/local/lib/python3.10/dist-packages (from tiingo) (1.8.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->tiingo) (3.4.0)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->tiingo) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->tiingo) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->tiingo) (2024.8.30)
Downloading tiingo-0.15.6-py2.py3-none-any.whl (15 kB)
Installing collected packages: tiingo
Successfully installed tiingo-0.15.6

# SECURITY: an API key must never be committed with the notebook. The key is
# taken from the TIINGO_API_KEY environment variable; if it is not set, the
# user is prompted once and the value is kept only for this session.
# NOTE(review): the key that used to be hard-coded here is exposed in version
# history and should be revoked.
if not os.environ.get("TIINGO_API_KEY"):
    from getpass import getpass
    os.environ["TIINGO_API_KEY"] = getpass("Tiingo API key: ")

from tiingo import TiingoClient
client = TiingoClient({'api_key': os.environ["TIINGO_API_KEY"]})

# Daily META prices for calendar year 2018.
start = datetime(2018, 1, 1)
end = datetime(2018, 12, 31)
stocks_data = client.get_dataframe('META', frequency='daily', startDate=start, endDate=end)

# Keep only the price and volume columns used in the rest of the notebook.
stocks_data = stocks_data[['open', 'close', 'low', 'high', 'volume']]

stocks_data.head()

# Work on a renamed copy so that `stocks_data` keeps its original column names.
df = stocks_data.rename(columns={'volume': 'vol'})

# Daily profit: closing price minus opening price.
df['profit'] = df.close - df.open

# Label each day by the size of its open-to-close change. The masks are
# applied from the weakest condition to the strongest, so a stricter label
# overwrites a looser one. This reproduces the if/elif priority
# negative > small_gain > medium_gain > large_gain without a Python-level loop.
df['gain'] = 'large_gain'
df.loc[df.profit < 3, 'gain'] = 'medium_gain'
df.loc[df.profit < 1, 'gain'] = 'small_gain'
df.loc[df.close < df.open, 'gain'] = 'negative'

# Label each day by traded volume relative to the mean volume. The mean is
# computed once instead of once per row.
mean_vol = df.vol.mean()
df['size'] = 'large'
df.loc[df.vol < mean_vol, 'size'] = 'small'

df.head()

# Days grouped by their gain label; the grouped object is reused further down
# to pull out the per-label volume samples.
gain_groups = df.groupby('gain')

# Mean price and volume per gain label, with 'gain' restored as a regular column.
gdf = (
    df[['open', 'low', 'high', 'close', 'vol', 'gain']]
    .groupby('gain')
    .mean()
    .reset_index()
)

gdf

#plot a column of the dataframe against the index
df.high.plot()

<Axes: xlabel='date'>

# Draw the daily high and low series one after the other, then add a legend.
df.high.plot()
df.low.plot(label='low values')
plt.legend(loc='best') # puts the legend in the best possible position

<matplotlib.legend.Legend at 0x794387df1cf0>

#histogram for the values of a dataframe column
df.close.hist(bins=20)

<Axes: >

#histogram with the kernel density estimation (a smoothed function over the histogram)
sns.histplot(df.close,bins=20,kde=True)

<Axes: xlabel='close', ylabel='Count'>

sns.displot(df.close,bins=50,kde=True)

<seaborn.axisgrid.FacetGrid at 0x7943812a4910>

dff = pd.read_csv('example-functions.csv')
dfs = dff.sort_values(by='A', ascending = True) #Sorting in data frames
dfs

# One line plot per column (B, C, D) against A.
# DataFrame.plot opens its own figure when no `ax` is passed, so calling
# plt.figure() first only leaves an empty "0 Axes" figure behind.
for column in ['B', 'C', 'D']:
    dfs.plot(x='A', y=column)

<Axes: xlabel='A'>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

# One row of three axes; each column (B, C, D) is drawn against A on its own axes.
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for axis, column in zip(ax, ['B', 'C', 'D']):
    dfs.plot(x='A', y=column, ax=axis)

<Axes: xlabel='A'>

# B, C and D against A on shared axes. No plt.figure() call is needed:
# DataFrame.plot creates its own figure, and an extra one would stay empty.
dfs.plot(x='A', y=['B', 'C', 'D'])

<Axes: xlabel='A'>

<Figure size 640x480 with 0 Axes>

# Same plot with both axes on a log scale. The redundant plt.figure() is
# dropped because it only produced an empty figure. The trailing semicolon
# suppresses the notebook's repr output.
dfs.plot(x='A', y=['B', 'C', 'D'], loglog=True);

<Figure size 640x480 with 0 Axes>

# Same plot with only the y axis on a log scale. The redundant plt.figure()
# is dropped because it only produced an empty figure.
dfs.plot(x='A', y=['B', 'C', 'D'], logy=True)

<Axes: xlabel='A'>

<Figure size 640x480 with 0 Axes>

plt.figure(figsize=(15, 5))  # overall figure size

# Build the flat (x, y, format, x, y, format, ...) argument list once so that
# the linear and the log-log panel draw exactly the same three series.
curve_args = []
for column, fmt in (('B', 'bo-'), ('C', 'g*-'), ('D', 'rs-')):
    curve_args.extend([dfs['A'], dfs[column], fmt])

plt.subplot(1, 2, 1)  # 1 row, 2 columns, first panel: linear axes
plt.plot(*curve_args)
plt.subplot(1, 2, 2)  # 1 row, 2 columns, second panel: log-log axes
plt.loglog(*curve_args)

[<matplotlib.lines.Line2D at 0x794380b9af20>,
 <matplotlib.lines.Line2D at 0x794380b9aef0>,
 <matplotlib.lines.Line2D at 0x794380b9b070>]

sns.lineplot(x= 'A', y='B',data = dfs,marker='o')

<Axes: xlabel='A', ylabel='B'>

# Scatter of B against A twice, side by side: linear axes on the left,
# log-log axes on the right.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
linear_axes, log_axes = ax
dff.plot.scatter(x='A', y='B', ax=linear_axes)
dff.plot.scatter(x='A', y='B', loglog=True, ax=log_axes)

<Axes: xlabel='A', ylabel='B'>

plt.scatter(dff.A, dff.B)

<matplotlib.collections.PathCollection at 0x794381379b10>

# Minimal example: switch the current axes to log-log, then scatter three points on them.
fig = plt.figure()
ax = fig.gca()
ax.set(xscale='log', yscale='log')
ax.scatter([1, 2, 3], [3, 2, 1])

<matplotlib.collections.PathCollection at 0x79438135cc10>

# Overlay B, C and D against A as log-log scatter plots on a single axes.
# The first call creates the axes (ax=None); the later calls reuse it.
t = None
for column, colour in (('B', 'DarkBlue'), ('C', 'DarkGreen'), ('D', 'Red')):
    t = dff.plot(kind='scatter', x='A', y=column, color=colour,
                 label=f'{column} curve', ax=t, loglog=True)

sns.scatterplot(x='A',y='B', data = dff)

<Axes: xlabel='A', ylabel='B'>

# Seaborn scatter of B against A; scatterplot returns the axes it drew on.
splot = sns.scatterplot(x='A',y='B', data = dff)
# Alternative way of switching both axes to log scale:
#splot.set(xscale="log", yscale="log")
# Called without data, loglog() draws nothing (it returns an empty list) and
# only switches both axes to log scale.
splot.loglog()

[]

gdf

import scipy as sp #library for scientific computations
from scipy import stats #The statistics part of the library

# Test whether mean traded volume differs between gain groups:
# a two-sample t-test with unequal variances for every pair of groups.

# Volume samples per gain label (names are reused by later cells).
sm, lg, med, neg = (
    gain_groups.get_group(label).vol
    for label in ('small_gain', 'large_gain', 'medium_gain', 'negative')
)

for first, second in [(sm, neg), (sm, med), (sm, lg), (neg, med), (neg, lg), (med, lg)]:
    print(stats.ttest_ind(first, second, equal_var=False))

TtestResult(statistic=-0.7237664320493662, pvalue=0.4720843830832102, df=58.686780477477306)
TtestResult(statistic=-0.6532701783697626, pvalue=0.5152447681117652, df=90.15236633446038)
TtestResult(statistic=-1.2743420856982142, pvalue=0.20648370530531482, df=74.86473858994728)
TtestResult(statistic=-0.12034041075217132, pvalue=0.9045425277099464, df=73.34821305978315)
TtestResult(statistic=-0.9054964354181412, pvalue=0.36935442852925515, df=52.30908445486685)
TtestResult(statistic=-0.5972302166465407, pvalue=0.5519534365894707, df=84.35529071251472)

import numpy as np
stats.ks_2samp(np.array(sm), np.array(lg), alternative='two-sided')

KstestResult(statistic=0.2703252032520325, pvalue=0.09473208642418271, statistic_location=26266081, statistic_sign=1)

# Compare the small-gain volumes with a normal distribution that has the
# sample's own mean and standard deviation. Without `args` the reference is
# the standard normal N(0, 1); volumes are in the tens of millions, so the
# test trivially returns statistic=1.0 and p=0 and says nothing about shape.
# NOTE: the parameters are estimated from the same sample, so the reported
# p-value is only approximate (it tends to be too large).
stats.kstest(np.array(sm), 'norm', args=(sm.mean(), sm.std()))

KstestResult(statistic=1.0, pvalue=0.0, statistic_location=10464528, statistic_sign=-1)

sns.histplot(sm,bins=40,kde=True)

<Axes: xlabel='vol', ylabel='Count'>

df

# The crosstab method creates the contingency table for the two attributes.
cdf = pd.crosstab(df['gain'],df['size'])
cdf

stats.chi2_contingency(cdf)

Chi2ContingencyResult(statistic=8.364478767414871, pvalue=0.03905004813922891, dof=3, expected_freq=array([[10.32669323, 25.67330677],
       [14.91633466, 37.08366534],
       [34.99601594, 87.00398406],
       [11.76095618, 29.23904382]]))

import numpy as np

new_df = df.copy()

# Quintile boundaries (20%, 40%, 60%, 80%) of the `vol` column.
quantiles = new_df['vol'].quantile([0.2, 0.4, 0.6, 0.8]).values

# Assign each row to one of five volume categories in a single vectorised step.
# searchsorted(side='right') counts how many boundaries are <= vol, which is
# the bucket index of the original chain
# `vol < q0` / `vol < q1` / `vol < q2` / `vol < q3` / else.
size_labels = np.array(['very small', 'small', 'medium', 'large', 'very large'])
bucket_index = np.searchsorted(quantiles, new_df['vol'].to_numpy(), side='right')
new_df['size'] = size_labels[bucket_index]

new_df.head()

cdf = pd.crosstab(new_df['gain'],new_df['size'])
cdf

stats.chi2_contingency(cdf)

Chi2ContingencyResult(statistic=17.156150315137896, pvalue=0.14381727155690543, dof=12, expected_freq=array([[ 7.17131474,  7.17131474,  7.17131474,  7.31474104,  7.17131474],
       [10.35856574, 10.35856574, 10.35856574, 10.56573705, 10.35856574],
       [24.30278884, 24.30278884, 24.30278884, 24.78884462, 24.30278884],
       [ 8.16733068,  8.16733068,  8.16733068,  8.33067729,  8.16733068]]))

# Collapse the gain labels into two classes: every label other than the
# three positive ones becomes 'negative'.
df_fisher = df.copy()
positive_labels = ['small_gain', 'medium_gain', 'large_gain']
is_positive = df['gain'].isin(positive_labels)
df_fisher['gain'] = is_positive.map({True: 'positive_gain', False: 'negative'})

df_fisher

# 2x2 contingency table: binary gain class against volume size.
contingency_table_fisher = pd.crosstab(df_fisher['gain'], df['size'])
print("Contingency Table:", contingency_table_fisher, sep="\n")

Contingency Table:
size           large  small
gain                       
negative          39     83
positive_gain     33     96

from scipy.stats import fisher_exact

# Fisher's exact test on the 2x2 table; the result unpacks into the odds
# ratio and the p-value.
oddsratio, p_value = fisher_exact(contingency_table_fisher)
print(
    "\nFisher's Exact Test Results:",
    f"Odds Ratio: {oddsratio}",
    f"P-Value: {p_value}",
    sep="\n",
)

Fisher's Exact Test Results:
Odds Ratio: 1.366922234392114
P-Value: 0.26848533307073885

# Standard error of the mean volume for each gain group.
# The first two use the pandas Series method, the last two the equivalent
# scipy function, to show both APIs.
print(sm.sem())
print(neg.sem())
print(stats.sem(med))
print(stats.sem(lg))

3192415.794566366
1500801.0916460739
3272021.3263068898
3115899.1280031265

# 95% confidence interval for the mean volume of the small-gain days.
conf = 0.95
# Student-t critical value. The degrees of freedom must come from the sample
# whose mean is being estimated (`sm`), not from the whole data frame.
t = stats.t.ppf((1 + conf) / 2.0, len(sm) - 1)
margin = sm.sem() * t  # half-width of the interval
low = sm.mean() - margin
high = sm.mean() + margin
print(low,  ",", high)

18475802.95317895 , 31050718.510235682

#sns.barplot(x='gain',y='vol', data = df, ci=95) #for older seaborn versions
sns.barplot(x='gain',y='vol', data = df, errorbar=('ci', 95))

<Axes: xlabel='gain', ylabel='vol'>

# Mean volume per gain label with 95% confidence intervals, points only.
# `join=False` is deprecated and will be removed in seaborn v0.15.0;
# `linestyle='none'` is the replacement the library recommends.
sns.pointplot(x='gain', y='vol', data=df, linestyle='none', errorbar=('ci', 95), capsize=0.1)

<ipython-input-32-9cf449b89e92>:1: UserWarning: 

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(x='gain',y='vol', data = df,join = False, errorbar=('ci', 95), capsize = 0.1)

<Axes: xlabel='gain', ylabel='vol'>

sns.boxplot(x='gain',y='vol', data = df)

<Axes: xlabel='gain', ylabel='vol'>

#Removing outliers
sns.boxplot(x='gain',y='vol', data = df, showfliers = False)

<Axes: xlabel='gain', ylabel='vol'>

sns.violinplot(x='gain',y='vol', data = df)

<Axes: xlabel='gain', ylabel='vol'>

df['all'] = ''
sns.violinplot(x = 'all', y='profit',hue='size', split=True, data = df)

<Axes: xlabel='all', ylabel='profit'>

sns.violinplot(x='gain',y='profit', hue='size', data = df, split=True)

<Axes: xlabel='gain', ylabel='profit'>

df

df = df.reset_index()
#df.date = df.date.apply(lambda d: datetime.strptime(d, "%Y-%m-%d"))
#df.date = df.date.apply(lambda d: datetime.strptime(d, "%Y-%m-%d %H:%M:%S%z").strftime("%Y-%m-%d"))

def get_month(row):
    """Return the ``month`` attribute of the row's ``date`` field."""
    when = row.date
    return when.month

# Month number of every row, taken directly from the datetime column with the
# vectorised `.dt` accessor instead of a Python call per row.
# NOTE(review): this requires `date` to have a datetime dtype, as the
# timestamps in the displayed frames suggest — confirm.
df['month'] = df['date'].dt.month

df

#sns.lineplot(x='month', y = 'vol', data = df, ci=95)
sns.lineplot(x='month', y = 'vol', data = df, errorbar=('ci', 95))

<Axes: xlabel='month', ylabel='vol'>

df['positive_profit'] = (df.profit>0)
sns.lineplot(x='month', y = 'vol', hue='positive_profit', data = df)

<Axes: xlabel='month', ylabel='vol'>

# Display the frame without the 'date' column. The result is not assigned, so
# `df` itself still contains 'date' afterwards.
df.drop('date',axis=1)

from tiingo import TiingoClient
# SECURITY: read the key from the environment instead of embedding it in the
# notebook source.
client = TiingoClient({'api_key': os.environ["TIINGO_API_KEY"]})

# Daily closing prices of GOOGL and AAPL, one column per ticker.
ticker_history = client.get_dataframe(['GOOGL', 'AAPL'],
                                      frequency='daily',
                                      metric_name='close',
                                      startDate='2017-01-01',
                                      endDate='2018-05-31')

ticker_history.head()

stocks = ['META', 'GOOG', 'TSLA', 'MSFT', 'NFLX']
from tiingo import TiingoClient
# SECURITY: read the key from the environment instead of embedding it in the
# notebook source.
client = TiingoClient({'api_key': os.environ["TIINGO_API_KEY"]})

# Daily closing prices for all tickers over the [start, end] range defined earlier.
dfmany = client.get_dataframe(stocks,
                              frequency='daily',
                              metric_name='close',
                              startDate=start,
                              endDate=end)
dfmany.head()

# Closing price of every ticker on shared axes, labelled with the company name.
price_labels = [
    ('META', 'meta'),
    ('GOOG', 'google'),
    ('TSLA', 'tesla'),
    ('MSFT', 'microsoft'),
    ('NFLX', 'netflix'),
]
for ticker, company in price_labels:
    dfmany[ticker].plot(label=company)
_ = plt.legend(loc='best')

rets = dfmany.pct_change(30)
rets.iloc[25:35]

# 30-period returns of every ticker on shared axes, labelled with the company name.
return_labels = [
    ('META', 'meta'),
    ('GOOG', 'google'),
    ('TSLA', 'tesla'),
    ('MSFT', 'microsoft'),
    ('NFLX', 'netflix'),
]
for ticker, company in return_labels:
    rets[ticker].plot(label=company)
_ = plt.legend(loc='best')

plt.scatter(rets.TSLA, rets.GOOG)
plt.xlabel('TESLA 30-day returns')
_ = plt.ylabel('GOOGLE 30-day returns')

# Daily 2018 prices for META and GOOG. Each ticker is downloaded exactly once;
# the earlier version fetched both frames twice and discarded the first result.
start = datetime(2018, 1, 1)
end = datetime(2018, 12, 31)

price_columns = ['open', 'close', 'low', 'high', 'volume']

dfb = client.get_dataframe('META', frequency='daily', startDate=start, endDate=end)
dfb = dfb[price_columns]

dgoog = client.get_dataframe('GOOG', frequency='daily', startDate=start, endDate=end)
dgoog = dgoog[price_columns]


print(dfb.head())
print(dgoog.head())

                             open   close     low    high    volume
date                                                               
2018-01-02 00:00:00+00:00  177.68  181.42  177.55  181.58  17694891
2018-01-03 00:00:00+00:00  181.88  184.67  181.33  184.78  16595495
2018-01-04 00:00:00+00:00  184.90  184.33  184.10  186.21  13554357
2018-01-05 00:00:00+00:00  185.59  186.85  184.93  186.90  13042388
2018-01-08 00:00:00+00:00  187.20  188.28  186.33  188.90  14719216
                              open    close       low     high   volume
date                                                                   
2018-01-02 00:00:00+00:00  1048.34  1065.00  1045.230  1066.94  1223114
2018-01-03 00:00:00+00:00  1064.31  1082.48  1063.210  1086.29  1416093
2018-01-04 00:00:00+00:00  1088.00  1086.40  1084.002  1093.57   990510
2018-01-05 00:00:00+00:00  1094.00  1102.23  1092.000  1104.25  1210974
2018-01-08 00:00:00+00:00  1102.23  1106.94  1101.620  1111.27  1003098

def gainrow(row):
    """Classify a row by its open-to-close price change.

    Returns 'negative' when ``row.close`` is below ``row.open``. Otherwise
    returns 'small_gain' for a change below 1, 'medium_gain' for a change
    below 3, and 'large_gain' in every remaining case.
    """
    if row.close < row.open:
        return 'negative'

    change = row.close - row.open
    # Thresholds are checked in ascending order; the first one that the
    # change stays below decides the label.
    for upper_limit, label in ((1, 'small_gain'), (3, 'medium_gain')):
        if change < upper_limit:
            return label
    return 'large_gain'

# Add the gain label and the open-to-close profit to both price frames.
for frame in (dfb, dgoog):
    frame['gain'] = frame.apply(gainrow, axis=1)
    frame['profit'] = frame.close - frame.open

# Seaborn version: daily profit of one stock against the other.
# scatterplot returns the axes it drew on, which is then used to set the labels.
fig = sns.scatterplot(x=dfb.profit, y=dgoog.profit)
fig.set_xlabel('FB profit')
fig.set_ylabel('GOOG profit')

Text(0, 0.5, 'GOOG profit')

sns.pairplot(rets.iloc[30:])

<seaborn.axisgrid.PairGrid at 0x7f2445c8ba00>

rets.corr()

rets.corr(method='spearman')

_ = sns.heatmap(rets.corr(), annot=True)

# Pearson and Spearman correlation, with p-values, for two ticker pairs.
# The first 30 rows are skipped, as in the original slices.
valid_rets = rets.iloc[30:]
for left, right in (('NFLX', 'TSLA'), ('GOOG', 'META')):
    print(stats.pearsonr(valid_rets[left], valid_rets[right]))
    print(stats.spearmanr(valid_rets[left], valid_rets[right]))

PearsonRResult(statistic=-0.120762891951422, pvalue=0.07318882534649845)
SignificanceResult(statistic=-0.065938830644713, pvalue=0.32918605296193537)
PearsonRResult(statistic=0.5987760976044885, pvalue=6.856639483413373e-23)
SignificanceResult(statistic=0.5409485585956174, pvalue=3.3888933351952313e-18)

# Pearson, then Spearman, correlation between the two daily profit series.
for correlation_test in (stats.pearsonr, stats.spearmanr):
    print(correlation_test(dfb.profit, dgoog.profit))

PearsonRResult(statistic=0.7546239297103, pvalue=1.817184238439732e-47)
SignificanceResult(statistic=0.724266383357056, pvalue=4.366810528478797e-42)

# Risk/return map: mean return against its standard deviation per ticker,
# with every point annotated by its ticker name.
expected_returns = rets.mean()
risk = rets.std()

_ = plt.scatter(expected_returns, risk)
plt.xlabel('Expected returns')
plt.ylabel('Standard Deviation (Risk)')

label_box = dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5)
label_arrow = dict(arrowstyle='->', connectionstyle='arc3,rad=0')
for label, x, y in zip(rets.columns, expected_returns, risk):
    plt.annotate(
        label,
        xy=(x, y),
        xytext=(20, -20),
        textcoords='offset points',
        ha='right',
        va='bottom',
        bbox=label_box,
        arrowprops=label_arrow,
    )

	gain	open	low	high	close	vol
0	large_gain	170.111111	169.610833	175.313056	174.653889	3.044808e+07
1	medium_gain	172.305769	171.410962	175.321346	174.185577	2.774962e+07
2	negative	171.605492	168.137747	172.566230	169.380246	2.731642e+07
3	small_gain	171.218049	169.827317	173.070488	171.699268	2.476326e+07

	A	B	C	D
0	0.631930	0.727462	1.015797	0.668071
1	1.301488	0.553052	0.416444	0.841456
2	2.495414	0.293590	0.160380	0.739603
3	3.449249	0.243811	0.078789	0.554745
4	3.655596	0.258616	0.073452	0.649052
...	...	...	...	...
95	94.612046	0.010506	0.000112	0.000078
96	94.796924	0.010510	0.000111	0.000076
97	95.209703	0.010449	0.000110	0.000073
98	95.933252	0.010419	0.000109	0.000068
99	99.169216	0.010055	0.000102	0.000049

	gain	open	low	high	close	vol
0	large_gain	170.111111	169.610833	175.313056	174.653889	3.044808e+07
1	medium_gain	172.305769	171.410962	175.321346	174.185577	2.774962e+07
2	negative	171.605492	168.137747	172.566230	169.380246	2.731642e+07
3	small_gain	171.218049	169.827317	173.070488	171.699268	2.476326e+07

	GOOGL	AAPL
2017-01-03 00:00:00+00:00	808.01	116.15
2017-01-04 00:00:00+00:00	807.77	116.02
2017-01-05 00:00:00+00:00	813.02	116.61
2017-01-06 00:00:00+00:00	825.21	117.91
2017-01-09 00:00:00+00:00	827.18	118.99

	META	GOOG	TSLA	MSFT	NFLX
2018-02-07 00:00:00+00:00	NaN	NaN	NaN	NaN	NaN
2018-02-08 00:00:00+00:00	NaN	NaN	NaN	NaN	NaN
2018-02-09 00:00:00+00:00	NaN	NaN	NaN	NaN	NaN
2018-02-12 00:00:00+00:00	NaN	NaN	NaN	NaN	NaN
2018-02-13 00:00:00+00:00	NaN	NaN	NaN	NaN	NaN
2018-02-14 00:00:00+00:00	-0.010473	0.004413	0.005553	0.056545	0.322922
2018-02-15 00:00:00+00:00	-0.025505	0.006504	0.053018	0.073075	0.366837
2018-02-16 00:00:00+00:00	-0.037813	0.007732	0.066334	0.056136	0.354472
2018-02-20 00:00:00+00:00	-0.058014	0.000209	0.057458	0.051366	0.326492
2018-02-21 00:00:00+00:00	-0.055078	0.003975	-0.009245	0.036362	0.325348

Plotting - Statistical Tests¶

Simple plots¶

Histograms¶

Plotting columns against each other¶

Grid of plots¶

Plotting using matplotlib¶

Scatter plots¶

Statistical Significance¶

The Student t-test¶

Kolmogorov-Smirnov Test¶

$\chi^2$-test¶

Fisher's exact test¶

Error bars¶

Visualizing distributions¶

Seaborn lineplot¶

Comparing multiple stocks¶

Correlation Coefficients¶

Computing p-values¶

Matplotlib¶

	open	close	low	high	vol
date
2018-01-02 00:00:00+00:00	177.68	181.42	177.55	181.58	17694891
2018-01-03 00:00:00+00:00	181.88	184.67	181.33	184.78	16595495
2018-01-04 00:00:00+00:00	184.90	184.33	184.10	186.21	13554357
2018-01-05 00:00:00+00:00	185.59	186.85	184.93	186.90	13042388
2018-01-08 00:00:00+00:00	187.20	188.28	186.33	188.90	14719216

	META	GOOG	TSLA	MSFT	NFLX
2018-01-02 00:00:00+00:00	181.42	1065.00	320.53	85.95	201.07
2018-01-03 00:00:00+00:00	184.67	1082.48	317.25	86.35	205.05
2018-01-04 00:00:00+00:00	184.33	1086.40	314.62	87.11	205.63
2018-01-05 00:00:00+00:00	186.85	1102.23	316.58	88.19	209.99
2018-01-08 00:00:00+00:00	188.28	1106.94	336.41	88.28	212.05

	META	GOOG	TSLA	MSFT	NFLX
META	1.000000	0.598776	0.226680	0.470696	0.546996
GOOG	0.598776	1.000000	0.210441	0.790085	0.348008
TSLA	0.226680	0.210441	1.000000	-0.041910	-0.120763
MSFT	0.470696	0.790085	-0.041910	1.000000	0.489569
NFLX	0.546996	0.348008	-0.120763	0.489569	1.000000

	META	GOOG	TSLA	MSFT	NFLX
META	1.000000	0.540949	0.271608	0.457852	0.641344
GOOG	0.540949	1.000000	0.288135	0.803731	0.382466
TSLA	0.271608	0.288135	1.000000	0.042190	-0.065939
MSFT	0.457852	0.803731	0.042190	1.000000	0.456912
NFLX	0.641344	0.382466	-0.065939	0.456912	1.000000