Introduction to Pandas and other libraries¶

(Many thanks to Evimaria Terzi and Mark Crovella for their code and examples)

Pandas¶

Pandas is the Python Data Analysis Library.

Pandas is an extremely versatile tool for manipulating datasets, mostly tabular data. You can think of Pandas as an evolution of Excel spreadsheets, with more capabilities for coding, and support for SQL-style operations such as joins and group-by.

It also produces high quality plots with matplotlib, and integrates nicely with other libraries that expect NumPy arrays.

You can find more details at https://pandas.pydata.org

Storing data tables¶

Most data can be viewed as tables or matrices (in the case where all entries are numeric). The rows correspond to objects and the columns correspond to the attributes or features.

There are different ways we can store such data tables in Python

Two-dimensional lists

In [1]:
D = [[0.3, 10, 1000],[0.5,2,509],[0.4, 8, 789]]
print(D)
[[0.3, 10, 1000], [0.5, 2, 509], [0.4, 8, 789]]
In [2]:
D = [[30000, 'Married', 1],[20000,'Single', 0],[45000, 'Married', 0]]
print(D)
[[30000, 'Married', 1], [20000, 'Single', 0], [45000, 'Married', 0]]

Numpy Arrays

NumPy is the Python library for numerical computation and matrix manipulation. It offers much of the functionality of Matlab, and also supports data analysis operations (similar to Pandas). Read more about NumPy here: http://www.numpy.org/

The array is the main data structure of NumPy. It stores multidimensional numeric tables.

We can create NumPy arrays from lists

In [3]:
import numpy as np

#1-dimensional array
x = np.array([2,5,18,14,4])
print ("\n Deterministic 1-dimensional array \n")
print (x)

#2-dimensional array
x = np.array([[2,5,18,14,4], [12,15,1,2,8]])
print ("\n Deterministic 2-dimensional array \n")
print (x)
 Deterministic 1-dimensional array 

[ 2  5 18 14  4]

 Deterministic 2-dimensional array 

[[ 2  5 18 14  4]
 [12 15  1  2  8]]

There are also NumPy operations that create arrays of different types

In [4]:
x = np.random.rand(5,5)
print ("\n Random 5x5 2-dimensional array \n")
print (x)

x = np.ones((4,4))
print ("\n 4x4 array with ones \n")
print (x)

x = np.diag([1,2,3])
print ("\n Diagonal matrix\n")
print(x)
 Random 5x5 2-dimensional array 

[[0.34871799 0.62071892 0.60618484 0.12913798 0.80754808]
 [0.68774236 0.12672021 0.3677389  0.95823523 0.19599754]
 [0.04738102 0.01502908 0.04402727 0.53070598 0.82008422]
 [0.65242762 0.81563413 0.55416071 0.86403159 0.77256431]
 [0.6733942  0.49771376 0.05805456 0.51350696 0.34186178]]

 4x4 array with ones 

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]

 Diagonal matrix

[[1 0 0]
 [0 2 0]
 [0 0 3]]

Why do we need NumPy arrays? Because we can perform linear algebra operations on the numeric arrays

For example:

In [5]:
x = np.random.randint(10,size=(2,3))
print("\n Random 2x3 array with integers")
print(x)

#Matrix transpose
print ("\n Transpose of the matrix \n")
print (x.T)

#multiplication and addition with scalar value
print("\n Matrix 2x+1 \n") 
print(2*x+1)
 Random 2x3 array with integers
[[6 2 3]
 [7 6 3]]

 Transpose of the matrix 

[[6 7]
 [2 6]
 [3 3]]

 Matrix 2x+1 

[[13  5  7]
 [15 13  7]]
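
We can also compute matrix products. A small sketch using the 2x3 array x from above:

In [ ]:
#Matrix multiplication: x is 2x3, so x @ x.T is 2x2 and x.T @ x is 3x3
print(x @ x.T)
print(np.dot(x.T, x)) #np.dot is equivalent to the @ operator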

Transform back to list of lists

In [6]:
lx = [list(y) for y in x]
lx
Out[6]:
[[6, 2, 3], [7, 6, 3]]

Pandas data frames

A data frame is a table in which each row and each column is given a label, very similar to a spreadsheet or a SQL table.

Pandas DataFrames are documented at: http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.html

Pandas dataframes enable different data analysis operations

Creating Data Frames¶

A dataframe has names for the columns and the rows of the table. The column names are stored in the attribute columns, and the row names in the attribute index. When these are not specified, they default to the numbers 0,1,...

There are multiple ways we can create a data frame. Here we list just a few.

In [1]:
import pandas as pd #The pandas library
from pandas import Series, DataFrame #Main pandas data structures
In [2]:
#Creating a data frame from a list of lists

df = pd.DataFrame([[1,2,3],[9,10,12]])
print(df)

# Each list becomes a row
# Names of columns are 0,1,2
# Rows are indexed by position numbers 0,1
   0   1   2
0  1   2   3
1  9  10  12
In [9]:
#Creating a data frame from a numpy array

df = pd.DataFrame(np.array([[1,2,3],[9,10,12]]))
print(df)
   0   1   2
0  1   2   3
1  9  10  12
In [10]:
# Specifying column names
df = pd.DataFrame(np.array([[1,2,3],[9,10,12]]), columns=['A','B','C'])
print(df)
   A   B   C
0  1   2   3
1  9  10  12
In [4]:
#Creating a data frame from a dictionary
# Keys are column names, values are lists with column values

dfe = pd.DataFrame({'A':[1,2,3], 'B':['a','b','c']})
print(dfe)
   A  B
0  1  a
1  2  b
2  3  c
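
We can also specify the row labels at creation time via the index parameter. A small sketch (the labels r1, r2, r3 are arbitrary):

In [ ]:
#Creating a data frame with explicit row labels
df_labeled = pd.DataFrame({'A':[1,2,3], 'B':['a','b','c']}, index=['r1','r2','r3'])
print(df_labeled)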
In [12]:
# Reading from a csv file:
df = pd.read_csv('example.csv')
print(df)

# The first row of the file is used for the column names 
# The property columns gives us the column names
print(df.columns)
print(list(df.columns))
   NUMBER CHAR
0       1    a
1       2    b
2       3    c
Index(['NUMBER', 'CHAR'], dtype='object')
['NUMBER', 'CHAR']
In [13]:
# Reading from a csv file without header:
df = pd.read_csv('no-header.csv',header = None)
print(df)
   0  1
0  1  a
1  2  b
2  3  c
In [14]:
# Reading from an Excel file:
df = pd.read_excel('example.xlsx')
print(df)
   NUMBER CHAR
0       1    a
1       2    b
2       3    c
In [15]:
#Writing to a csv file:
df.to_csv('example2.csv')
for x in open('example2.csv').readlines():
    print(x.strip())
,NUMBER,CHAR
0,1,a
1,2,b
2,3,c
In [16]:
# By default the row index is added as a column; we can remove it by setting index=False
df.to_csv('example2.csv',index = False)
for x in open('example2.csv').readlines():
    print(x.strip())
NUMBER,CHAR
1,a
2,b
3,c

Fetching data

For demonstration purposes, we will use data from Tiingo on stock quotes. We will see two ways of fetching data from Tiingo, one using the Tiingo client and one using the Data Reader library of Pandas.

More information on what types of data you can fetch is at: https://pandas-datareader.readthedocs.io/en/latest/remote_data.html

We will use stock quotes from Tiingo here, and later from IEX (in the Joins section). To make use of these services you need to first create an account and obtain an API key. You then either pass the key to the client directly, or set the corresponding environment variable (e.g. TIINGO_API_KEY or IEX_API_KEY) to the value of the key, as shown below

In [2]:
from datetime import datetime #For handling dates
In [3]:
from tiingo import TiingoClient
client = TiingoClient({'api_key':'614c1590a592cc6696f6082f83b2666cd83882ef'})
start = datetime(2018,1,1)
end = datetime(2018,12,31)
stocks_data = client.get_dataframe('META',frequency='daily',startDate=start,endDate=end)
stocks_data = stocks_data[['open','close','low','high','volume']]
In [ ]:
#Alternative without the client
import os
import pandas_datareader.data as web # For accessing web data

stocks_data = web.get_data_tiingo('META', start, end, api_key=os.getenv('TIINGO_API_KEY'))

stocks_data = stocks_data.reset_index(level='symbol',drop=True)
stocks_data = stocks_data[['open','close','low','high','volume']]
In [4]:
# the method info() outputs basic information for our data frame
stocks_data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2018-01-02 00:00:00+00:00 to 2018-12-31 00:00:00+00:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    251 non-null    float64
 1   close   251 non-null    float64
 2   low     251 non-null    float64
 3   high    251 non-null    float64
 4   volume  251 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 11.8 KB

The number of rows in the DataFrame:

In [5]:
len(stocks_data)
Out[5]:
251
In [6]:
#the method head() outputs the top rows of the data frame
stocks_data.head()
Out[6]:
open close low high volume
date
2018-01-02 00:00:00+00:00 177.68 181.42 177.55 181.58 17694891
2018-01-03 00:00:00+00:00 181.88 184.67 181.33 184.78 16595495
2018-01-04 00:00:00+00:00 184.90 184.33 184.10 186.21 13554357
2018-01-05 00:00:00+00:00 185.59 186.85 184.93 186.90 13042388
2018-01-08 00:00:00+00:00 187.20 188.28 186.33 188.90 14719216
In [7]:
#the method tail() outputs the last rows of the data frame
stocks_data.tail()
Out[7]:
open close low high volume
date
2018-12-24 00:00:00+00:00 123.10 124.06 123.02 129.74 22066002
2018-12-26 00:00:00+00:00 126.00 134.18 125.89 134.24 39723370
2018-12-27 00:00:00+00:00 132.44 134.52 129.67 134.99 31202509
2018-12-28 00:00:00+00:00 135.34 133.20 132.20 135.92 22627569
2018-12-31 00:00:00+00:00 134.45 131.09 129.95 134.64 24625308

Note that date is the index of the rows, not a regular column. The index gives a name to each row. The default index is the numbers 0,...,len(df)-1; here we index the rows by the date

In [8]:
#trying to access the date column will give an error

stocks_data.date
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In [8], line 3
      1 #trying to access the date column will give an error
----> 3 stocks_data.date

File C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:5575, in NDFrame.__getattr__(self, name)
   5568 if (
   5569     name not in self._internal_names_set
   5570     and name not in self._metadata
   5571     and name not in self._accessors
   5572     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   5573 ):
   5574     return self[name]
-> 5575 return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'date'
In [29]:
stocks_data.to_csv('stocks_data.csv')
for x in open('stocks_data.csv').readlines()[0:10]:
    print(x.strip()) 
df = pd.read_csv('stocks_data.csv')
df.head()
date,open,close,low,high,volume
2018-01-02 00:00:00+00:00,177.68,181.42,177.55,181.58,17694891
2018-01-03 00:00:00+00:00,181.88,184.67,181.33,184.78,16595495
2018-01-04 00:00:00+00:00,184.9,184.33,184.1,186.21,13554357
2018-01-05 00:00:00+00:00,185.59,186.85,184.93,186.9,13042388
2018-01-08 00:00:00+00:00,187.2,188.28,186.33,188.9,14719216
2018-01-09 00:00:00+00:00,188.7,187.87,187.1,188.8,12342722
2018-01-10 00:00:00+00:00,186.94,187.84,185.63,187.89,10464528
2018-01-11 00:00:00+00:00,188.4,187.77,187.38,188.4,8855144
2018-01-12 00:00:00+00:00,178.06,179.37,177.4,181.48,76645626
Out[29]:
date open close low high volume
0 2018-01-02 00:00:00+00:00 177.68 181.42 177.55 181.58 17694891
1 2018-01-03 00:00:00+00:00 181.88 184.67 181.33 184.78 16595495
2 2018-01-04 00:00:00+00:00 184.90 184.33 184.10 186.21 13554357
3 2018-01-05 00:00:00+00:00 185.59 186.85 184.93 186.90 13042388
4 2018-01-08 00:00:00+00:00 187.20 188.28 186.33 188.90 14719216
In [7]:
len(df)
Out[7]:
251
In [13]:
#the method head() outputs the top rows of the data frame
df.head()
Out[13]:
date open close low high volume
0 2018-01-02 00:00:00+00:00 177.68 181.42 177.55 181.58 17694891
1 2018-01-03 00:00:00+00:00 181.88 184.67 181.33 184.78 16595495
2 2018-01-04 00:00:00+00:00 184.90 184.33 184.10 186.21 13554357
3 2018-01-05 00:00:00+00:00 185.59 186.85 184.93 186.90 13042388
4 2018-01-08 00:00:00+00:00 187.20 188.28 186.33 188.90 14719216

Note that in the new dataframe, there is now a date column, while the index values are numbers 0,1,...
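
If we wanted to keep the dates as the index when reading the file back, read_csv can do this directly. A sketch using the index_col and parse_dates parameters:

In [ ]:
#Re-reading the file with the date column as the index
pd.read_csv('stocks_data.csv', index_col='date', parse_dates=True).head()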

Working with data columns¶

The columns are the "features" in your data

In [29]:
#an object that refers to the names of the columns
df.columns
Out[29]:
Index(['date', 'open', 'high', 'low', 'close', 'volume'], dtype='object')

We can also assign a list to the columns property in order to change the attribute names.

Alternatively, you can change the name of an attribute using rename:

In [14]:
df = df.rename(columns = {'volume':'V'})
print(list(df.columns))
df.columns = ['date', 'open', 'high', 'low', 'close', 'vol']
df.head()
['date', 'open', 'close', 'low', 'high', 'V']
Out[14]:
date open high low close vol
0 2018-01-02 00:00:00+00:00 177.68 181.42 177.55 181.58 17694891
1 2018-01-03 00:00:00+00:00 181.88 184.67 181.33 184.78 16595495
2 2018-01-04 00:00:00+00:00 184.90 184.33 184.10 186.21 13554357
3 2018-01-05 00:00:00+00:00 185.59 186.85 184.93 186.90 13042388
4 2018-01-08 00:00:00+00:00 187.20 188.28 186.33 188.90 14719216

Selecting a single column from your data.

It is important to keep in mind that selecting a single column returns a Series (a one-dimensional labeled sequence of values), while selecting a list of columns (shown further below) returns a new data frame.

In [31]:
df['open'].head()
Out[31]:
0    177.68
1    181.88
2    184.90
3    185.59
4    187.20
Name: open, dtype: float64

Another way of selecting a single column from your data

In [32]:
df.open.head()
Out[32]:
0    177.68
1    181.88
2    184.90
3    185.59
4    187.20
Name: open, dtype: float64

Selecting multiple columns

In [33]:
df[['open','close']].head()
Out[33]:
open close
0 177.68 181.42
1 181.88 184.67
2 184.90 184.33
3 185.59 186.85
4 187.20 188.28

We can use the values property to obtain the values of one or more attributes. It returns a NumPy array. You can transform it into a list by applying the list() operator.

In [9]:
df.open.values[:10]
Out[9]:
array([177.68, 181.88, 184.9 , 185.59, 187.2 , 188.7 , 186.94, 188.4 ,
       178.06, 181.5 ])
In [12]:
type(df[['open','close']])
Out[12]:
pandas.core.frame.DataFrame
In [35]:
df[['open','close']].values[:10]
Out[35]:
array([[177.68, 181.42],
       [181.88, 184.67],
       [184.9 , 184.33],
       [185.59, 186.85],
       [187.2 , 188.28],
       [188.7 , 187.87],
       [186.94, 187.84],
       [188.4 , 187.77],
       [178.06, 179.37],
       [181.5 , 178.39]])

Data Frame methods¶

A DataFrame object has many useful methods.

In [15]:
df.mean() #produces the mean of the columns/features
C:\Users\tsap\AppData\Local\Temp\ipykernel_20824\3584231175.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df.mean() #produces the mean of the columns/features
Out[15]:
open     1.714729e+02
high     1.715110e+02
low      1.693031e+02
close    1.736133e+02
vol      2.743828e+07
dtype: float64

Note that date does not appear in the list. This is because it stores strings, for which the mean is not defined.

Recent versions of the pandas library raise a warning (as shown above); we should preselect the numeric attributes we want to compute the mean for

In [14]:
df[['open','high','low','close']].mean()
Out[14]:
open     171.454424
high     173.615298
low      169.303110
close    171.510936
dtype: float64
In [38]:
df[['open','high','low','close']].std() #produces the standard deviation of the columns/features
Out[38]:
open     19.683435
high     19.423837
low      20.074382
close    19.977448
dtype: float64
In [39]:
df[['open','high','low','close']].sem() #produces the standard error of the mean of the columns/features
Out[39]:
open     1.242407
high     1.226022
low      1.267084
close    1.260965
dtype: float64

Here is a manual way to compute a confidence interval for the mean. What you need is the sample size and the confidence level.

In [40]:
#confidence interval
import scipy.stats as stats
conf = 0.95
t = stats.t.ppf((1+conf)/2.0, len(df)-1)
low = df[['open','high','low','close']].mean() - t*df[['open','high','low','close']].sem()
high = df[['open','high','low','close']].mean() + t*df[['open','high','low','close']].sem()
pd.DataFrame({'CI lower end':low, 'CI higher end':high})
Out[40]:
CI lower end CI higher end
open 169.007505 173.901344
high 171.200650 176.029945
low 166.807590 171.798629
close 169.027467 173.994406
In [41]:
df[['open','high','low','close']].median() #produces the median of the columns/features
Out[41]:
open     174.89
high     176.98
low      172.83
close    174.70
dtype: float64
In [42]:
df.open.mean()
Out[42]:
171.4544243027888
In [43]:
#95%-confidence interval
(df.open.mean()-t*df.open.sem(), df.open.mean()+t*df.open.sem())
Out[43]:
(169.00750496130627, 173.90134364427132)
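
As a cross-check, scipy can compute the same interval directly. A sketch using stats.t.interval:

In [ ]:
#Same 95% confidence interval for 'open', computed by scipy
stats.t.interval(0.95, len(df)-1, loc=df.open.mean(), scale=df.open.sem())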

Use describe to get summary statistics for the numeric columns of the data

In [44]:
stocks_data.describe()
Out[44]:
open high low close volume
count 251.000000 251.000000 251.000000 251.000000 2.510000e+02
mean 171.454424 173.615298 169.303110 171.510936 2.768798e+07
std 19.683435 19.423837 20.074382 19.977448 1.922117e+07
min 123.100000 129.740000 123.020000 124.060000 9.588587e+06
25% 157.815000 160.745000 155.525000 157.915000 1.782839e+07
50% 174.890000 176.980000 172.830000 174.700000 2.186093e+07
75% 184.890000 186.450000 183.420000 185.270000 3.031384e+07
max 215.715000 218.620000 214.270000 217.500000 1.698037e+08

We can obtain the sum of the column entries using the sum operation

In [45]:
stocks_data.sum()
Out[45]:
open      4.303506e+04
high      4.357744e+04
low       4.249508e+04
close     4.304924e+04
volume    6.949682e+09
dtype: float64

The functions we have seen so far operate on columns. We can apply them to rows as well by specifying the axis of the operation.

axis = 0 refers to the index (the rows), and is the default behavior

axis = 1 refers to the columns

It can be confusing, but the axis is the one along which the operation is performed. Summing over axis 0 collapses the rows and yields one sum per column; summing over axis 1 collapses the columns and yields one sum per row.

In [16]:
df.sum(axis=1)
C:\Users\tsap\AppData\Local\Temp\ipykernel_23208\1459321664.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df.sum(axis=1)
Out[16]:
0      1.815262e+07
1      1.688730e+07
2      1.388164e+07
3      1.357528e+07
4      1.799548e+07
           ...     
246    2.206650e+07
247    3.972389e+07
248    3.120304e+07
249    2.262811e+07
250    2.462584e+07
Length: 251, dtype: float64
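
A tiny sketch to make the axis convention concrete:

In [ ]:
small = pd.DataFrame({'a':[1,2],'b':[10,20]})
print(small.sum(axis=0)) #collapses the rows: one sum per column (a=3, b=30)
print(small.sum(axis=1)) #collapses the columns: one sum per row (11, 22)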

Sorting: You can sort by a specific column, ascending (default) or descending. You can also sort inplace.

In [16]:
#when inplace is False (the default) it returns a new dataframe that is sorted.
#when it is True it does not return anything, just changes the dataframe.
stocks_data.sort_values(by = 'open', ascending =False, inplace=False).head()
Out[16]:
open close low high volume
date
2018-07-25 00:00:00+00:00 215.72 217.50 214.27 218.62 64592585
2018-07-24 00:00:00+00:00 215.11 214.67 212.60 216.20 28468681
2018-07-23 00:00:00+00:00 210.58 210.91 208.80 211.62 16731969
2018-07-18 00:00:00+00:00 209.82 209.36 208.44 210.99 15334907
2018-07-20 00:00:00+00:00 208.85 209.94 208.50 211.50 16241508
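
We can also sort on several columns at once, or sort by the index. A sketch (sort_index orders the rows by their labels, here the dates):

In [ ]:
#Sort by 'open' descending, breaking ties by 'close' ascending
stocks_data.sort_values(by=['open','close'], ascending=[False,True]).head()
#stocks_data.sort_index() would order the rows by date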

Bulk Operations¶

Methods like sum( ) and std( ) work on entire columns.

We can run our own functions across all values in a column (or row) using apply( ).

In [17]:
df.date.head()
Out[17]:
0    2018-01-02 00:00:00+00:00
1    2018-01-03 00:00:00+00:00
2    2018-01-04 00:00:00+00:00
3    2018-01-05 00:00:00+00:00
4    2018-01-08 00:00:00+00:00
Name: date, dtype: object

The values property of the column returns an array with the values of the column. Inspecting the first value reveals that these are strings with a particular format.

In [18]:
first_date = df.date.values[0]
first_date
#returns a string
Out[18]:
'2018-01-02 00:00:00+00:00'

The datetime library handles dates. The method strptime transforms a string into a date, according to a format given as a parameter.

In [25]:
datetime.strptime(first_date, "%Y-%m-%d %H:%M:%S%z")
Out[25]:
datetime.datetime(2018, 1, 2, 0, 0, tzinfo=datetime.timezone.utc)

We will now make use of two operations:

The apply method applies a function, given as input, to all the entries of a data frame. In the case below we apply it to just one column.

A lambda function lets us define an anonymous function that takes some parameters (here, d) and uses them to compute an expression.

Using a lambda with apply, we can apply the function to all the entries of the data frame (in this case, the values of one column).

In [30]:
df.date = df.date.apply(lambda d: datetime.strptime(d, "%Y-%m-%d %H:%M:%S%z").strftime("%Y-%m-%d"))
date_series = df.date # We want to keep the dates
df.date.head()

#Another way to do the same thing, by applying the function to every row (axis = 1)
#df.date = df.apply(lambda row: datetime.strptime(row.date, "%Y-%m-%d %H:%M:%S%z").strftime("%Y-%m-%d"), axis=1)
Out[30]:
0    2018-01-02
1    2018-01-03
2    2018-01-04
3    2018-01-05
4    2018-01-08
Name: date, dtype: object
In [31]:
df.date.head()
Out[31]:
0    2018-01-02
1    2018-01-03
2    2018-01-04
3    2018-01-05
4    2018-01-08
Name: date, dtype: object
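
For date parsing specifically, pandas also provides pd.to_datetime, which converts a whole column in a single vectorized call. A sketch:

In [ ]:
#Parse the (already reformatted) date strings into Timestamps in one call
pd.to_datetime(df.date).head()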

For example, we can obtain the integer part of the open value

In [17]:
df.apply(lambda r: int(r.open), axis=1)
Out[17]:
0      177
1      181
2      184
3      185
4      187
      ... 
246    123
247    126
248    132
249    135
250    134
Length: 251, dtype: int64
In [54]:
dftest = df['open']
dftest.apply(lambda x: int(x))
Out[54]:
0      177
1      181
2      184
3      185
4      187
      ... 
246    123
247    126
248    132
249    135
250    134
Name: open, Length: 251, dtype: int64

Each row in a DataFrame is associated with an index, which is a label that uniquely identifies a row.

The row indices so far have been auto-generated by pandas, and are simply integers starting from 0.

From now on we will use dates instead of integers for indices -- the benefits of this will show later.

Overwriting the index is as easy as assigning to the index property of the DataFrame.

In [21]:
list(df.index)[0:10]
Out[21]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
In [22]:
df.index = df.date
df.head()
Out[22]:
date open high low close volume
date
2018-01-02 2018-01-02 177.68 181.58 177.5500 181.42 18151903
2018-01-03 2018-01-03 181.88 184.78 181.3300 184.67 16886563
2018-01-04 2018-01-04 184.90 186.21 184.0996 184.33 13880896
2018-01-05 2018-01-05 185.59 186.90 184.9300 186.85 13574535
2018-01-08 2018-01-08 187.20 188.90 186.3300 188.28 17994726

Another example, using the simple dataframe dfe we created earlier from a dictionary

In [57]:
dfe
Out[57]:
A B
0 1 a
1 2 b
2 3 c
In [58]:
dfe.index = dfe.B
In [59]:
dfe
Out[59]:
A B
B
a 1 a
b 2 b
c 3 c

Now that we have made an index based on date, we can drop the original date column.

In [23]:
df = df.drop(columns = ['date']) #Equivalent to df = df.drop(['date'], axis=1)
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2018-01-02 to 2018-12-31
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    251 non-null    float64
 1   high    251 non-null    float64
 2   low     251 non-null    float64
 3   close   251 non-null    float64
 4   volume  251 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 11.8 KB
In [61]:
#axis = 0 refers to dropping labels from rows (or you can use index = labels as a parameter). 
# Essentially we are dropping a set of rows
#axis = 1 refers to dropping labels from columns.
dfe.drop(index='b')
Out[61]:
A B
B
a 1 a
c 3 c

Accessing rows of the DataFrame¶

So far we've seen how to access a column of the DataFrame. To access a row we use a different notation.

To access a row by its index value, use the .loc[] indexer.

In [62]:
df.loc[datetime(2018,5,7)]
Out[62]:
open          177.35
high          179.50
low           177.17
close         177.97
vol      18697195.00
Name: 2018-05-07 00:00:00, dtype: float64

To access a row by its sequence number (i.e., like an array index), use .iloc[] ('integer location')

In [63]:
df.iloc[10:20] #dataframe with rows 10 through 19 (the right endpoint is excluded)
Out[63]:
open high low close vol
date
2018-01-17 179.26 179.32 175.8000 177.60 27992376
2018-01-18 178.13 180.98 177.0800 179.80 23304901
2018-01-19 180.85 182.37 180.1702 181.29 26826540
2018-01-22 180.80 185.39 180.4100 185.37 21059464
2018-01-23 186.05 189.55 185.5500 189.35 25678781
2018-01-24 189.89 190.66 186.5200 186.55 24334548
2018-01-25 187.95 188.62 186.6000 187.48 17377740
2018-01-26 187.75 190.00 186.8100 190.00 17759212
2018-01-29 188.75 188.84 185.6301 185.98 20453172
2018-01-30 183.01 188.18 181.8400 187.12 20858556
In [64]:
df.iloc[0:2,[1,3]] #dataframe with rows 0:2, and the second and fourth columns
Out[64]:
high close
date
2018-01-02 181.58 181.42
2018-01-03 184.78 184.67
In [65]:
#select rows and columns
df[['high','close']].iloc[0:2]
Out[65]:
high close
date
2018-01-02 181.58 181.42
2018-01-03 184.78 184.67

To iterate over the rows, use .iterrows()¶

In [66]:
num_positive_days = 0
for idx, row in df.iterrows(): #returns the index name and the row
    if row.close > row.open:
        num_positive_days += 1
        
print("The total number of positive-gain days is {}.".format(num_positive_days))
The total number of positive-gain days is 130.

You can also do it this way:

In [67]:
num_positive_days = 0
for i in range(len(df)):
    row = df.iloc[i]
    if row.close > row.open:
        num_positive_days += 1
        
print("The total number of positive-gain days is {}.".format(num_positive_days))
The total number of positive-gain days is 130.

Or this way:

In [68]:
pos_days = [idx for (idx,row) in df.iterrows() if row.close > row.open]
print("The total number of positive-gain days is "+str(len(pos_days)))
The total number of positive-gain days is 130
In [69]:
pos_days[0:10]
Out[69]:
[Timestamp('2018-01-02 00:00:00'),
 Timestamp('2018-01-03 00:00:00'),
 Timestamp('2018-01-05 00:00:00'),
 Timestamp('2018-01-08 00:00:00'),
 Timestamp('2018-01-10 00:00:00'),
 Timestamp('2018-01-12 00:00:00'),
 Timestamp('2018-01-18 00:00:00'),
 Timestamp('2018-01-19 00:00:00'),
 Timestamp('2018-01-22 00:00:00'),
 Timestamp('2018-01-23 00:00:00')]
In [70]:
df.loc[pos_days]
Out[70]:
open high low close vol
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903
2018-01-03 181.88 184.78 181.3300 184.67 16886563
2018-01-05 185.59 186.90 184.9300 186.85 13574535
2018-01-08 187.20 188.90 186.3300 188.28 17994726
2018-01-10 186.94 187.89 185.6300 187.84 10529894
... ... ... ... ... ...
2018-12-18 141.08 145.93 139.8301 143.66 24709084
2018-12-20 130.70 135.57 130.0000 133.40 40297944
2018-12-24 123.10 129.74 123.0200 124.06 22066002
2018-12-26 126.00 134.24 125.8900 134.18 39723370
2018-12-27 132.44 134.99 129.6700 134.52 31202509

130 rows × 5 columns

In [71]:
#This will iterate over the column names:
for x in df:
    print(x)
open
high
low
close
vol
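
For simple counts like the one above, a vectorized comparison avoids the Python-level loop entirely. A sketch:

In [ ]:
#Comparing two columns yields a boolean Series; summing it counts the True entries
(df.close > df.open).sum()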

Filtering¶

It is very easy to select interesting rows from the data.

All these operations below return a new DataFrame, which itself can be treated the same way as all DataFrames we have seen so far.

We can perform boolean operations on the columns. The result is a boolean Series, with one value per row

In [72]:
tmp_high = df.high > 170
tmp_high.head()
Out[72]:
date
2018-01-02    True
2018-01-03    True
2018-01-04    True
2018-01-05    True
2018-01-08    True
Name: high, dtype: bool

Summing a Boolean array is the same as counting the number of True values.

In [73]:
sum(tmp_high)
Out[73]:
149

We can use the boolean Series to select the rows with a True value. The operation below returns only the rows of df for which tmp_high is True

In [74]:
df[tmp_high].head()
Out[74]:
open high low close vol
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903
2018-01-03 181.88 184.78 181.3300 184.67 16886563
2018-01-04 184.90 186.21 184.0996 184.33 13880896
2018-01-05 185.59 186.90 184.9300 186.85 13574535
2018-01-08 187.20 188.90 186.3300 188.28 17994726

Putting it all together, we have the following commonly-used patterns:

In [75]:
positive_days = df[df.close > df.open]
positive_days.head()
Out[75]:
open high low close vol
date
2018-01-02 177.68 181.58 177.55 181.42 18151903
2018-01-03 181.88 184.78 181.33 184.67 16886563
2018-01-05 185.59 186.90 184.93 186.85 13574535
2018-01-08 187.20 188.90 186.33 188.28 17994726
2018-01-10 186.94 187.89 185.63 187.84 10529894
In [76]:
very_positive_days = df[df.close-df.open > 5]
very_positive_days.head()
Out[76]:
open high low close vol
date
2018-02-06 178.57 185.77 177.7400 185.31 37758505
2018-02-14 173.45 179.81 173.2119 179.52 28929704
2018-04-10 157.93 165.98 157.0100 165.04 58947041
2018-07-17 204.90 210.46 204.8400 209.99 15349892
2018-08-02 170.68 176.79 170.2700 176.37 32399954

We can form more complex boolean expressions. The and is &, the or is |, the not is ~, and you also need the parentheses around each comparison to make it work

In [24]:
df[(df.high<170)&(df.low>80)]
Out[24]:
open high low close volume
date
2018-03-23 165.44 167.10 159.02 159.39 53609706
2018-03-26 160.82 161.10 149.02 160.06 126116634
2018-03-27 156.31 162.85 150.75 152.22 79116995
2018-03-28 151.65 155.88 150.80 153.03 60029170
2018-03-29 155.15 161.42 154.14 159.79 59434293
... ... ... ... ... ...
2018-12-24 123.10 129.74 123.02 124.06 22066002
2018-12-26 126.00 134.24 125.89 134.18 39723370
2018-12-27 132.44 134.99 129.67 134.52 31202509
2018-12-28 135.34 135.92 132.20 133.20 22627569
2018-12-31 134.45 134.64 129.95 131.09 24625308

102 rows × 5 columns

Using the Python keyword and instead of & raises an error:

In [78]:
df[(df.high<170)and(df.low>80)]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15712/807571659.py in <module>
----> 1 df[(df.high<170)and(df.low>80)]

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __nonzero__(self)
   1535     @final
   1536     def __nonzero__(self):
-> 1537         raise ValueError(
   1538             f"The truth value of a {type(self).__name__} is ambiguous. "
   1539             "Use a.empty, a.bool(), a.item(), a.any() or a.all()."

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
In [27]:
#The same filter, applied in two steps
temp_df = df[df.high<170]
temp_df = temp_df[temp_df.low > 80]
temp_df
Out[27]:
open high low close volume
date
2018-03-23 165.44 167.10 159.02 159.39 53609706
2018-03-26 160.82 161.10 149.02 160.06 126116634
2018-03-27 156.31 162.85 150.75 152.22 79116995
2018-03-28 151.65 155.88 150.80 153.03 60029170
2018-03-29 155.15 161.42 154.14 159.79 59434293
... ... ... ... ... ...
2018-12-24 123.10 129.74 123.02 124.06 22066002
2018-12-26 126.00 134.24 125.89 134.18 39723370
2018-12-27 132.44 134.99 129.67 134.52 31202509
2018-12-28 135.34 135.92 132.20 133.20 22627569
2018-12-31 134.45 134.64 129.95 131.09 24625308

102 rows × 5 columns
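
The query method offers an alternative syntax for the same filter, where and/or work as keywords. A sketch:

In [ ]:
#Equivalent filter expressed as a query string
df.query('high < 170 and low > 80').head()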

Creating new columns¶

To create a new column, simply assign values to it. Think of the columns as a dictionary:

In [79]:
df['profit'] = (df.close - df.open)
df.head()
Out[79]:
open high low close vol profit
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903 3.74
2018-01-03 181.88 184.78 181.3300 184.67 16886563 2.79
2018-01-04 184.90 186.21 184.0996 184.33 13880896 -0.57
2018-01-05 185.59 186.90 184.9300 186.85 13574535 1.26
2018-01-08 187.20 188.90 186.3300 188.28 17994726 1.08
In [80]:
df.profit[df.profit>0].describe()
Out[80]:
count    130.000000
mean       2.193566
std        1.783093
min        0.020000
25%        0.720000
50%        1.630000
75%        3.280000
max        8.180000
Name: profit, dtype: float64
In [29]:
for idx, row in df.iterrows():
    if row.close < row.open:
        df.loc[idx,'gain']='negative'
    elif (row.close - row.open) < 1:
        df.loc[idx,'gain']='small_gain'
    elif (row.close - row.open) < 3:
        df.loc[idx,'gain']='medium_gain'
    else:
        df.loc[idx,'gain']='large_gain'
df.head()
Out[29]:
open high low close volume test_column gain
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903 large_gain large_gain
2018-01-03 181.88 184.78 181.3300 184.67 16886563 medium_gain medium_gain
2018-01-04 184.90 186.21 184.0996 184.33 13880896 negative negative
2018-01-05 185.59 186.90 184.9300 186.85 13574535 medium_gain medium_gain
2018-01-08 187.20 188.90 186.3300 188.28 17994726 medium_gain medium_gain

Here is another, more "functional", way to accomplish the same thing.

Define a function that classifies rows, and apply it to each row.

In [28]:
def gainrow(row):
    if row.close < row.open:
        return 'negative'
    elif (row.close - row.open) < 1:
        return 'small_gain'
    elif (row.close - row.open) < 3:
        return 'medium_gain'
    else:
        return 'large_gain'

df['test_column'] = df.apply(gainrow, axis = 1)
#axis = 1 applies the function to each row; axis = 0 would apply it to each column
In [83]:
df.head()
Out[83]:
open high low close vol profit gain test_column
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903 3.74 large_gain large_gain
2018-01-03 181.88 184.78 181.3300 184.67 16886563 2.79 medium_gain medium_gain
2018-01-04 184.90 186.21 184.0996 184.33 13880896 -0.57 negative negative
2018-01-05 185.59 186.90 184.9300 186.85 13574535 1.26 medium_gain medium_gain
2018-01-08 187.20 188.90 186.3300 188.28 17994726 1.08 medium_gain medium_gain

OK, point made, let's get rid of that extraneous test_column:

In [84]:
df = df.drop('test_column', axis = 1)
df.head()
Out[84]:
open high low close vol profit gain
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903 3.74 large_gain
2018-01-03 181.88 184.78 181.3300 184.67 16886563 2.79 medium_gain
2018-01-04 184.90 186.21 184.0996 184.33 13880896 -0.57 negative
2018-01-05 185.59 186.90 184.9300 186.85 13574535 1.26 medium_gain
2018-01-08 187.20 188.90 186.3300 188.28 17994726 1.08 medium_gain
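
If performance matters, here is a vectorized sketch of the same classification using np.select (conditions are evaluated in order and the first match wins; rows matching none get the default):

In [ ]:
conditions = [df.close < df.open,
              df.close - df.open < 1,
              df.close - df.open < 3]
choices = ['negative', 'small_gain', 'medium_gain']
#Wrap the resulting array in a Series aligned with df's index
pd.Series(np.select(conditions, choices, default='large_gain'), index=df.index).head()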

Missing values¶

Data often has missing values. In Pandas these are denoted as NaN values. These may be part of our data (e.g. empty cells in an Excel sheet), or they may appear as a result of a join. There are special methods for handling these values.

In [85]:
mdf = pd.read_csv('example-missing.csv')
mdf
Out[85]:
A B C
0 1.0 a x
1 5.0 b NaN
2 3.0 c y
3 9.0 NaN z
4 NaN a x
In [86]:
mdf.A.mean()
Out[86]:
4.5

Note that mean() ignored the missing value in column A. We can fill in the missing values using the fillna method

In [87]:
mdf.fillna(0)
Out[87]:
A B C
0 1.0 a x
1 5.0 b 0
2 3.0 c y
3 9.0 0 z
4 0.0 a x
In [88]:
mdf.A = mdf.A.fillna(mdf.A.mean())
mdf = mdf.fillna('')
mdf
Out[88]:
A B C
0 1.0 a x
1 5.0 b
2 3.0 c y
3 9.0 z
4 4.5 a x

We can drop the rows with missing values

In [89]:
mdf = pd.read_csv('example-missing.csv')
mdf.dropna()
Out[89]:
A B C
0 1.0 a x
2 3.0 c y

We can find those rows

In [90]:
mdf[mdf.B.isnull()]
Out[90]:
A B C
3 9.0 NaN z
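
We can also count the missing values per column. A sketch:

In [ ]:
#isnull() marks missing entries; summing counts them per column
mdf.isnull().sum()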

Grouping¶

An extremely powerful DataFrame method is groupby().

This is entirely analogous to GROUP BY in SQL.

It will group the rows of a DataFrame by the values in one (or more) columns, and let you iterate through each group.

Here we will look at the average gain among the categories of gains (negative, small, medium and large) we defined above and stored in column gain.

In [91]:
gain_groups = df.groupby('gain')
In [92]:
type(gain_groups)
Out[92]:
pandas.core.groupby.generic.DataFrameGroupBy

Essentially, gain_groups behaves like a dictionary

  • The keys are the unique values found in the gain column, and
  • The values are DataFrames that contain only the rows having the corresponding unique values.
In [93]:
for gain, gain_data in gain_groups:
    print(gain)
    print(gain_data.head())
    print('=============================')
large_gain
              open    high     low   close       vol  profit        gain
date                                                                    
2018-01-02  177.68  181.58  177.55  181.42  18151903    3.74  large_gain
2018-01-22  180.80  185.39  180.41  185.37  21059464    4.57  large_gain
2018-01-23  186.05  189.55  185.55  189.35  25678781    3.30  large_gain
2018-01-30  183.01  188.18  181.84  187.12  20858556    4.11  large_gain
2018-02-01  188.22  195.32  187.89  193.09  54211293    4.87  large_gain
=============================
medium_gain
              open    high     low   close       vol  profit         gain
date                                                                     
2018-01-03  181.88  184.78  181.33  184.67  16886563    2.79  medium_gain
2018-01-05  185.59  186.90  184.93  186.85  13574535    1.26  medium_gain
2018-01-08  187.20  188.90  186.33  188.28  17994726    1.08  medium_gain
2018-01-12  178.06  181.48  177.40  179.37  77551299    1.31  medium_gain
2018-01-18  178.13  180.98  177.08  179.80  23304901    1.67  medium_gain
=============================
negative
              open    high       low   close       vol  profit      gain
date                                                                    
2018-01-04  184.90  186.21  184.0996  184.33  13880896   -0.57  negative
2018-01-09  188.70  188.80  187.1000  187.87  12393057   -0.83  negative
2018-01-11  188.40  188.40  187.3800  187.77   9588587   -0.63  negative
2018-01-16  181.50  181.75  178.0400  178.39  36183842   -3.11  negative
2018-01-17  179.26  179.32  175.8000  177.60  27992376   -1.66  negative
=============================
small_gain
              open    high       low   close       vol  profit        gain
date                                                                      
2018-01-10  186.94  187.89  185.6300  187.84  10529894    0.90  small_gain
2018-01-19  180.85  182.37  180.1702  181.29  26826540    0.44  small_gain
2018-02-20  175.77  177.95  175.1100  176.01  21204921    0.24  small_gain
2018-02-22  178.70  180.21  177.4100  178.99  18464192    0.29  small_gain
2018-02-26  184.58  185.66  183.2228  184.93  17599703    0.35  small_gain
=============================

We can obtain the dataframe that corresponds to a specific group by using the get_group method of the groupby object

In [94]:
sm = gain_groups.get_group('small_gain')
sm.head()
Out[94]:
open high low close vol profit gain
date
2018-01-10 186.94 187.89 185.6300 187.84 10529894 0.90 small_gain
2018-01-19 180.85 182.37 180.1702 181.29 26826540 0.44 small_gain
2018-02-20 175.77 177.95 175.1100 176.01 21204921 0.24 small_gain
2018-02-22 178.70 180.21 177.4100 178.99 18464192 0.29 small_gain
2018-02-26 184.58 185.66 183.2228 184.93 17599703 0.35 small_gain
In [95]:
for gain, gain_data in df.groupby("gain"):
    print('The average closing value for the {} group is {}'.format(gain,
                                                    gain_data.close.mean()))
The average closing value for the large_gain group is 174.99081081081084
The average closing value for the medium_gain group is 174.18557692307695
The average closing value for the negative group is 169.2336363636363
The average closing value for the small_gain group is 171.6991463414634

The operation above can be done with a typical SQL-like group by, where we group by one or more attributes and aggregate the values of (some of) the other attributes.

For example group by "gain" and take the average of the values for open, high, low, close, volume.

You can also use other aggregators such as count, sum, median, max, min.

Pandas returns a new dataframe indexed by the values of the group-by attribute(s), with the other attributes as columns

In [96]:
gdf= df[['open','low','high','close','vol','gain']].groupby('gain').mean()
type(gdf)
Out[96]:
pandas.core.frame.DataFrame
In [97]:
gdf
Out[97]:
open low high close vol
gain
large_gain 170.459459 169.941454 175.660722 174.990811 3.034571e+07
medium_gain 172.305504 171.410923 175.321108 174.185577 2.795407e+07
negative 171.473133 168.024464 172.441342 169.233636 2.771124e+07
small_gain 171.217688 169.827283 173.070561 171.699146 2.488339e+07

We can also apply a custom aggregation function using the agg method and a lambda function. This is also useful for processing string attributes. For example, if we want the aggregation function to be max minus min, we can do the following:

In [ ]:
gdf2 = df[['open','low','high','close','vol','gain']].groupby('gain').agg(lambda x: x.max() - x.min())
gdf2

If you want to remove the (hierarchical) index and have the group-by attribute(s) be part of the table, you can use the reset_index method. This method makes the index one more column in the dataframe and reverts to the default integer index

In [98]:
#This can be used to remove the hierarchical index, if necessary
gdf = gdf.reset_index()
gdf
Out[98]:
gain open low high close vol
0 large_gain 170.459459 169.941454 175.660722 174.990811 3.034571e+07
1 medium_gain 172.305504 171.410923 175.321108 174.185577 2.795407e+07
2 negative 171.473133 168.024464 172.441342 169.233636 2.771124e+07
3 small_gain 171.217688 169.827283 173.070561 171.699146 2.488339e+07
In [99]:
gdf.set_index('gain')
Out[99]:
open low high close vol
gain
large_gain 170.459459 169.941454 175.660722 174.990811 3.034571e+07
medium_gain 172.305504 171.410923 175.321108 174.185577 2.795407e+07
negative 171.473133 168.024464 172.441342 169.233636 2.771124e+07
small_gain 171.217688 169.827283 173.070561 171.699146 2.488339e+07

Another example:

In [100]:
test = pd.DataFrame({'A':[1,2,3,4],'B':['a','b','b','a'],'C':['a','a','b','a']})
test
Out[100]:
A B C
0 1 a a
1 2 b a
2 3 b b
3 4 a a
In [101]:
gtest = test.groupby(['B','C']).mean()
gtest
#note that in this case we get a hierarchical index
Out[101]:
A
B C
a a 2.5
b a 2.0
b 3.0
In [102]:
gtest = gtest.reset_index()
gtest
#the hierarchical index is flattened out
Out[102]:
B C A
0 a a 2.5
1 b a 2.0
2 b b 3.0
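
Alternatively, passing as_index=False to groupby keeps the grouping columns as regular columns from the start, avoiding the hierarchical index altogether. A sketch:

In [ ]:
#Same aggregation, but B and C stay as columns
test.groupby(['B','C'], as_index=False).mean()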

Joins¶

We can join data frames in much the same way as we do joins in SQL

In [103]:
data_source = 'iex'
start = datetime(2018,1,1)
end = datetime(2018,12,31)

dfb = web.DataReader('FB', data_source, start, end)
dgoog = web.DataReader('GOOG', data_source, start, end)

print(dfb.head())
print(dgoog.head())
              open    high       low   close    volume
date                                                  
2018-01-02  177.68  181.58  177.5500  181.42  18151903
2018-01-03  181.88  184.78  181.3300  184.67  16886563
2018-01-04  184.90  186.21  184.0996  184.33  13880896
2018-01-05  185.59  186.90  184.9300  186.85  13574535
2018-01-08  187.20  188.90  186.3300  188.28  17994726
               open       high        low    close      volume
date                                                          
2018-01-02  52.4170  53.347000  52.261500  53.2500  24751280.0
2018-01-03  53.2155  54.314500  53.160500  54.1240  28603400.0
2018-01-04  54.4000  54.678495  54.200085  54.3200  20092100.0
2018-01-05  54.7000  55.212500  54.600000  55.1115  25582460.0
2018-01-08  55.1115  55.563500  55.081000  55.3470  20952060.0

Perform a join on the date (the index value). Note the _x and _y suffixes in the column names, which differentiate columns with the same name coming from the left (_x) and the right (_y) dataframes

In [104]:
common_dates = pd.merge(dfb,dgoog,on='date')
common_dates.head()
Out[104]:
open_x high_x low_x close_x volume_x open_y high_y low_y close_y volume_y
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903 52.4170 53.347000 52.261500 53.2500 24751280.0
2018-01-03 181.88 184.78 181.3300 184.67 16886563 53.2155 54.314500 53.160500 54.1240 28603400.0
2018-01-04 184.90 186.21 184.0996 184.33 13880896 54.4000 54.678495 54.200085 54.3200 20092100.0
2018-01-05 185.59 186.90 184.9300 186.85 13574535 54.7000 55.212500 54.600000 55.1115 25582460.0
2018-01-08 187.20 188.90 186.3300 188.28 17994726 55.1115 55.563500 55.081000 55.3470 20952060.0

We can specify the suffixes for the left and right tables

In [105]:
common_dates = pd.merge(dfb,dgoog,on='date',suffixes=('_fb', '_goog'))
common_dates.head()
Out[105]:
open_fb high_fb low_fb close_fb volume_fb open_goog high_goog low_goog close_goog volume_goog
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903 52.4170 53.347000 52.261500 53.2500 24751280.0
2018-01-03 181.88 184.78 181.3300 184.67 16886563 53.2155 54.314500 53.160500 54.1240 28603400.0
2018-01-04 184.90 186.21 184.0996 184.33 13880896 54.4000 54.678495 54.200085 54.3200 20092100.0
2018-01-05 185.59 186.90 184.9300 186.85 13574535 54.7000 55.212500 54.600000 55.1115 25582460.0
2018-01-08 187.20 188.90 186.3300 188.28 17994726 55.1115 55.563500 55.081000 55.3470 20952060.0

Compute gain and perform join on the date AND gain.

In [106]:
dfb['gain'] = dfb.apply(gainrow, axis = 1)
dgoog['gain'] = dgoog.apply(gainrow, axis = 1)
dfb['profit'] = dfb.close-dfb.open
dgoog['profit'] = dgoog.close-dgoog.open
In [107]:
common_gain_dates = pd.merge(dfb, dgoog, on=['date','gain'],suffixes=('_fb', '_goog'))
common_gain_dates.head()
Out[107]:
open_fb high_fb low_fb close_fb volume_fb gain profit_fb open_goog high_goog low_goog close_goog volume_goog profit_goog
date
2018-01-04 184.90 186.21 184.0996 184.33 13880896 negative -0.57 54.4000 54.678495 54.200085 54.3200 20092100.0 -0.0800
2018-01-09 188.70 188.80 187.1000 187.87 12393057 negative -0.83 55.4700 55.528500 55.061535 55.3130 18050820.0 -0.1570
2018-01-10 186.94 187.89 185.6300 187.84 10529894 small_gain 0.90 54.8550 55.230000 54.805500 55.1305 20855860.0 0.2755
2018-01-11 188.40 188.40 187.3800 187.77 9588587 negative -0.63 55.3150 55.326250 54.979500 55.2760 19565840.0 -0.0390
2018-01-16 181.50 181.75 178.0400 178.39 36183842 negative -3.11 56.6255 56.995500 55.891580 56.0880 31505220.0 -0.5375

More join examples, including left outer join

In [108]:
left = pd.DataFrame({'key': ['foo', 'foo', 'boo'], 'lval': [1, 2,3]})
print(left)
print('\n')
right = pd.DataFrame({'key': ['foo', 'hoo'], 'rval': [4, 5]})
print(right)
print('\n')
dfm = pd.merge(left, right, on='key') #keeps only the common key 'foo'
print(dfm)
   key  lval
0  foo     1
1  foo     2
2  boo     3


   key  rval
0  foo     4
1  hoo     5


   key  lval  rval
0  foo     1     4
1  foo     2     4

Left outer join

In [109]:
dfm = pd.merge(left, right, on='key', how='left') #keeps all the keys from the left and puts NaN for missing values
print(dfm)
print('\n')
dfm = dfm.fillna(0) #fills the NaN values with specified value
dfm
   key  lval  rval
0  foo     1   4.0
1  foo     2   4.0
2  boo     3   NaN


Out[109]:
key lval rval
0 foo 1 4.0
1 foo 2 4.0
2 boo 3 0.0
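
Other join types work the same way: how='outer' keeps the keys from both sides, and how='right' keeps the keys from the right table. A sketch:

In [ ]:
#Full outer join: 'boo' gets NaN for rval, 'hoo' gets NaN for lval
pd.merge(left, right, on='key', how='outer')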

You can also use the join command to perform joins between dataframes.

It works very similarly to the merge command, but it joins on the index and performs a left outer join by default

In [110]:
dfb.join(dgoog,lsuffix='_FB',rsuffix='_GOOG')
Out[110]:
open_FB high_FB low_FB close_FB volume_FB gain_FB profit_FB open_GOOG high_GOOG low_GOOG close_GOOG volume_GOOG gain_GOOG profit_GOOG
date
2018-01-02 177.68 181.58 177.5500 181.42 18151903 large_gain 3.74 52.4170 53.347000 52.261500 53.2500 24751280.0 small_gain 0.8330
2018-01-03 181.88 184.78 181.3300 184.67 16886563 medium_gain 2.79 53.2155 54.314500 53.160500 54.1240 28603400.0 small_gain 0.9085
2018-01-04 184.90 186.21 184.0996 184.33 13880896 negative -0.57 54.4000 54.678495 54.200085 54.3200 20092100.0 negative -0.0800
2018-01-05 185.59 186.90 184.9300 186.85 13574535 medium_gain 1.26 54.7000 55.212500 54.600000 55.1115 25582460.0 small_gain 0.4115
2018-01-08 187.20 188.90 186.3300 188.28 17994726 medium_gain 1.08 55.1115 55.563500 55.081000 55.3470 20952060.0 small_gain 0.2355
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2018-12-24 123.10 129.74 123.0200 124.06 22066002 small_gain 0.96 48.6950 50.177000 48.505500 48.8110 31806560.0 small_gain 0.1160
2018-12-26 126.00 134.24 125.8900 134.18 39723370 large_gain 8.18 49.4505 52.000000 49.150000 51.9730 47465400.0 medium_gain 2.5225
2018-12-27 132.44 134.99 129.6700 134.52 31202509 medium_gain 2.08 50.8575 52.194500 49.850000 52.1940 42195540.0 medium_gain 1.3365
2018-12-28 135.34 135.92 132.2000 133.20 22627569 negative -2.14 52.4810 52.778000 51.655000 51.8540 28275440.0 negative -0.6270
2018-12-31 134.45 134.64 129.9500 131.09 24625308 negative -3.36 52.5480 52.635000 51.179500 51.7805 29874440.0 negative -0.7675

251 rows × 14 columns

In [111]:
left.index=left.key
left = left.drop('key',axis=1)
right.index =right.key
right = right.drop('key',axis=1)
left.join(right)
Out[111]:
lval rval
key
boo 3 NaN
foo 1 4.0
foo 2 4.0