Python Pandas Calculate Rolling Stock Beta Using Rolling Apply To Groupby Object In Vectorized Fashion
Solution 1:
Try pd.rolling_cov() and pd.rolling_var() as follows:
import pandas as pd
import numpy as np
from StringIO import StringIO
df = pd.read_csv(StringIO(''' id period ret_1m mkt_ret_1m
131146  CAN00WG0  199609  -0.1538   0.047104
133530  CAN00WG0  199610  -0.0455  -0.014143
135913  CAN00WG0  199611   0.0000   0.040926
138334  CAN00WG0  199612   0.2952   0.008723
140794  CAN00WG0  199701  -0.0257   0.039916
143274  CAN00WG0  199702  -0.0038  -0.025442
145754  CAN00WG0  199703  -0.2992  -0.049279
148246  CAN00WG0  199704  -0.0919  -0.005948
150774  CAN00WG0  199705   0.0595   0.122322
153318  CAN00WG0  199706  -0.0337   0.045765
160980  CAN00WH0  199709   0.0757   0.079293
163569  CAN00WH0  199710  -0.0741  -0.044000
166159  CAN00WH0  199711   0.1000  -0.014644
168782  CAN00WH0  199712  -0.0909  -0.007072
171399  CAN00WH0  199801  -0.0100   0.001381
174022  CAN00WH0  199802   0.1919   0.081924
176637  CAN00WH0  199803   0.0085   0.050415
179255  CAN00WH0  199804  -0.0168   0.018393
181880  CAN00WH0  199805   0.0427  -0.051279
184516  CAN00WH0  199806  -0.0656  -0.011516
143275  CAN00WO0  199702  -0.1176  -0.025442
145755  CAN00WO0  199703  -0.0074  -0.049279
148247  CAN00WO0  199704  -0.0075  -0.005948
150775  CAN00WO0  199705   0.0451   0.122322'''), sep='\s+')
df['beta'] = pd.rolling_cov(df['ret_1m'], df['mkt_ret_1m'], window=6) / pd.rolling_var(df['mkt_ret_1m'], window=6)
print df
Output:
id period ret_1m mkt_ret_1m beta
131146  CAN00WG0  199609  -0.1538   0.047104        NaN
133530  CAN00WG0  199610  -0.0455  -0.014143        NaN
135913  CAN00WG0  199611   0.0000   0.040926        NaN
138334  CAN00WG0  199612   0.2952   0.008723        NaN
140794  CAN00WG0  199701  -0.0257   0.039916        NaN
143274  CAN00WG0  199702  -0.0038  -0.025442  -1.245908
145754  CAN00WG0  199703  -0.2992  -0.049279   2.574464
148246  CAN00WG0  199704  -0.0919  -0.005948   2.657887
150774  CAN00WG0  199705   0.0595   0.122322   1.371090
153318  CAN00WG0  199706  -0.0337   0.045765   1.494095
160980  CAN00WH0  199709   0.0757   0.079293   1.616520
163569  CAN00WH0  199710  -0.0741  -0.044000   1.630411
166159  CAN00WH0  199711   0.1000  -0.014644   0.651220
168782  CAN00WH0  199712  -0.0909  -0.007072   0.652148
171399  CAN00WH0  199801  -0.0100   0.001381   0.724120
174022  CAN00WH0  199802   0.1919   0.081924   1.542782
176637  CAN00WH0  199803   0.0085   0.050415   1.605407
179255  CAN00WH0  199804  -0.0168   0.018393   1.571015
181880  CAN00WH0  199805   0.0427  -0.051279   1.139972
184516  CAN00WH0  199806  -0.0656  -0.011516   1.101890
143275  CAN00WO0  199702  -0.1176  -0.025442   1.372437
145755  CAN00WO0  199703  -0.0074  -0.049279   0.031939
148247  CAN00WO0  199704  -0.0075  -0.005948  -0.535855
150775  CAN00WO0  199705   0.0451   0.122322   0.341747
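Note that the top-level pd.rolling_cov / pd.rolling_var functions were removed in modern pandas. A minimal sketch of the same calculation with the current Rolling API (assuming pandas >= 1.0):
# Sketch only: same rolling beta with the modern Rolling API (assumes pandas >= 1.0).
df['beta'] = (
    df['ret_1m'].rolling(window=6).cov(df['mkt_ret_1m'])   # rolling cov(stock, market)
    / df['mkt_ret_1m'].rolling(window=6).var()             # rolling var(market)
)
print(df)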
Solution 2:
I guess pd.rolling_apply doesn't help in this case, since it essentially only takes a Series (even if a DataFrame is passed, it processes one column at a time). But you can always write your own rolling_apply that takes a DataFrame.
import pandas as pd
import numpy as np
from StringIO import StringIO
df = pd.read_csv(StringIO(''' id period ret_1m mkt_ret_1m
131146 CAN00WG0 199609 -0.1538 0.047104
133530 CAN00WG0 199610 -0.0455 -0.014143
135913 CAN00WG0 199611 0.0000 0.040926
138334 CAN00WG0 199612 0.2952 0.008723
140794 CAN00WG0 199701 -0.0257 0.039916
143274 CAN00WG0 199702 -0.0038 -0.025442
145754 CAN00WG0 199703 -0.2992 -0.049279
148246 CAN00WG0 199704 -0.0919 -0.005948
150774 CAN00WG0 199705 0.0595 0.122322
153318 CAN00WG0 199706 -0.0337 0.045765
160980 CAN00WH0 199709 0.0757 0.079293
163569 CAN00WH0 199710 -0.0741 -0.044000
166159 CAN00WH0 199711 0.1000 -0.014644
168782 CAN00WH0 199712 -0.0909 -0.007072
171399 CAN00WH0 199801 -0.0100 0.001381
174022 CAN00WH0 199802 0.1919 0.081924
176637 CAN00WH0 199803 0.0085 0.050415
179255 CAN00WH0 199804 -0.0168 0.018393
181880 CAN00WH0 199805 0.0427 -0.051279
184516 CAN00WH0 199806 -0.0656 -0.011516
143275 CAN00WO0 199702 -0.1176 -0.025442
145755 CAN00WO0 199703 -0.0074 -0.049279
148247 CAN00WO0 199704 -0.0075 -0.005948
150775 CAN00WO0 199705 0.0451 0.122322'''), sep='\s+')
def calc_beta(df):
np_array = df.values
s = np_array[:,0] # stock returns are column zero from numpy array
m = np_array[:,1] # market returns are column one from numpy array
covariance = np.cov(s,m) # Calculate covariance between stock and market
beta = covariance[0,1]/covariance[1,1]
return beta
def rolling_apply(df, period, func, min_periods=None):
    if min_periods is None:
        min_periods = period
    result = pd.Series(np.nan, index=df.index)
    for i in range(1, len(df)+1):
        sub_df = df.iloc[max(i-period, 0):i, :]  # I edited here
        if len(sub_df) >= min_periods:
            idx = sub_df.index[-1]
            result[idx] = func(sub_df)
    return result
df['beta'] = np.nan
grp = df.groupby('id')
period = 6  # using 6 so that some values are not NaN, since no group in the sample data has more than 12 rows
for stock, sub_df in grp:
    beta = rolling_apply(sub_df[['ret_1m', 'mkt_ret_1m']], period, calc_beta, min_periods=period)
    beta.name = 'beta'
    df.update(beta)
print df
Output:
id period ret_1m mkt_ret_1m beta
131146  CAN00WG0  199609  -0.1538   0.047104        NaN
133530  CAN00WG0  199610  -0.0455  -0.014143        NaN
135913  CAN00WG0  199611   0.0000   0.040926        NaN
138334  CAN00WG0  199612   0.2952   0.008723        NaN
140794  CAN00WG0  199701  -0.0257   0.039916        NaN
143274  CAN00WG0  199702  -0.0038  -0.025442  -1.245908
145754  CAN00WG0  199703  -0.2992  -0.049279   2.574464
148246  CAN00WG0  199704  -0.0919  -0.005948   2.657887
150774  CAN00WG0  199705   0.0595   0.122322   1.371090
153318  CAN00WG0  199706  -0.0337   0.045765   1.494095
   ...       ...     ...      ...        ...        ...
171399  CAN00WH0  199801  -0.0100   0.001381        NaN
174022  CAN00WH0  199802   0.1919   0.081924   1.542782
176637  CAN00WH0  199803   0.0085   0.050415   1.605407
179255  CAN00WH0  199804  -0.0168   0.018393   1.571015
181880  CAN00WH0  199805   0.0427  -0.051279   1.139972
184516  CAN00WH0  199806  -0.0656  -0.011516   1.101890
143275  CAN00WO0  199702  -0.1176  -0.025442        NaN
145755  CAN00WO0  199703  -0.0074  -0.049279        NaN
148247  CAN00WO0  199704  -0.0075  -0.005948        NaN
150775  CAN00WO0  199705   0.0451   0.122322        NaN
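A more vectorized variant of the same idea, as a sketch assuming pandas >= 1.0: use groupby together with the Rolling cov/var methods so that each stock's window never spills into another id, without a hand-written window loop.
# Sketch only (assumes pandas >= 1.0): per-id rolling beta via groupby + rolling cov/var.
def group_beta(g, window=6):
    cov = g['ret_1m'].rolling(window).cov(g['mkt_ret_1m'])  # rolling cov(stock, market) within the group
    var = g['mkt_ret_1m'].rolling(window).var()             # rolling var(market) within the group
    return cov / var

df['beta'] = df.groupby('id', group_keys=False).apply(group_beta)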
Solution 3:
def rolling_apply(df, period, func, min_periods=None):
    if min_periods is None:
        min_periods = period
    result = pd.Series(np.nan, index=df.index)
    for i in range(1, len(df)):
        sub_df = df.iloc[max(i-period, 0):i, :]  # get a subsample to run
        if len(sub_df) >= min_periods:
            idx = sub_df.index[-1] + 1  # mind the forward-looking bias: the return at time t should not be included in the beta calculated for time t
            result[idx] = func(sub_df)
    return result
I fixed a forward-looking bias in Happy001's code. This is a finance problem, so it pays to be careful.
I also think vlmercado's answer is wrong for this problem. If you simply use pd.rolling_cov and pd.rolling_var without grouping by stock, you make mistakes in finance. First, the second stock CAN00WH0 gets no NaN betas at all, because its first windows use returns that belong to CAN00WG0, which is clearly wrong. Second, consider a stock that was suspended for ten years: those stale observations would still end up in its beta calculation.
I also found that pandas rolling works with timestamp-based windows, which you can see in the code below. I changed Happy001's code; it's not the fastest way, but it is at least 20x faster than the original code.
crsp_daily['date'] = pd.to_datetime(crsp_daily['date'])
crsp_daily = crsp_daily.set_index('date')  # rolling needs a time series index
crsp_daily.index = pd.DatetimeIndex(crsp_daily.index)
calc = crsp_daily[['permno', 'ret', 'mkt_ret']]
grp = calc.groupby('permno')  # rolling beta for each stock
beta = pd.DataFrame()
for stock, sub_df in grp:
    sub2_df = sub_df[['ret', 'mkt_ret']].sort_index()
    beta_m = sub2_df.rolling('1825d', min_periods=150).cov()  # 5-year rolling covariance; note that 'd' (days) and 's' (seconds) are valid offsets here, but 'w'/'m'/'y' are not
    beta_m['beta'] = beta_m['ret'] / beta_m['mkt_ret']
    beta_m = beta_m.xs('mkt_ret', level=1, axis=0)  # keep the cov(., mkt_ret) rows, so beta = cov(ret, mkt) / var(mkt)
    beta = beta.append(pd.merge(sub_df, pd.DataFrame(beta_m['beta'])))  # DataFrame.append is gone in pandas 2.0; see the pd.concat sketch below
beta = beta.reset_index()
beta = beta[['date', 'permno', 'beta']]
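DataFrame.append was removed in pandas 2.0. A minimal sketch of the same loop for newer pandas, collecting the per-stock pieces in a list and concatenating once (which is also usually faster than growing a DataFrame row by row):
# Sketch only: pandas 2.0+ friendly replacement for the append-based loop above.
pieces = []
for stock, sub_df in grp:
    sub2_df = sub_df[['ret', 'mkt_ret']].sort_index()
    beta_m = sub2_df.rolling('1825d', min_periods=150).cov()
    beta_m['beta'] = beta_m['ret'] / beta_m['mkt_ret']
    beta_m = beta_m.xs('mkt_ret', level=1, axis=0)
    pieces.append(pd.merge(sub_df, pd.DataFrame(beta_m['beta'])))
beta = pd.concat(pieces).reset_index()
beta = beta[['date', 'permno', 'beta']]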
Solution 4:
BTW, since nobody has asked about multi-variable rolling regression in Python, I also found a way to solve this problem. The key is to first stack the variables into one column, and then reshape the array back into a matrix inside the function.
Here is the code:
import pandas as pd
import numpy as np
import time
from numba import jit
# numba only supports numpy, not pandas, so a pd.DataFrame cannot be passed in.
@jit(nopython=True, cache=True, fastmath=True)
def coefcalc(df, coefpos, varnum):
    # coefpos: which coefficient you need. For example, I want alpha, so I set coefpos to 5,
    #          since the constant is the last column of X.
    # varnum:  how many variables are in df apart from "stkcd and date"; in this sample it is 7
    #          (return, the five Fama factors, and a constant).
    if np.mod(df.shape[0], varnum) == 0:
        df = df.reshape(df.shape[0] // varnum, varnum)  # reshape the single stacked column back into n columns for the regression
        Y = np.ascontiguousarray(df[:, 0])   # rebuild contiguous numpy arrays for a faster regression
        X = np.ascontiguousarray(df[:, 1:])
        try:
            b = (np.linalg.inv(X.T @ X)) @ X.T @ Y
            result = b[coefpos]
        except:
            result = np.nan
        return result
    else:
        return np.nan
calc2 = pd.read_csv(r'sample.csv')

# A way to get rolling beta/alpha
calc2 = calc2.set_index(['date', 'F_INFO_WINDCODE'])
calc2 = calc2.dropna()  # the regression would drop NaN automatically anyway
calc2 = calc2.stack().reset_index().set_index('date')  # stack the n columns into one column, and make the datetime64 variable (date) the index

localtime = time.asctime(time.localtime(time.time()))
print(localtime)

order_of_variable = 5      # position of the coefficient you want, counting from zero and excluding y (return)
total_number_variable = 7  # return, the five Fama factors, and a constant
required_sample = 30 * total_number_variable  # monthly data: require 30 months per window

# The parallel kwarg may require loops inside the function, so I turn it off.
alphaest = calc2.groupby('F_INFO_WINDCODE').rolling('1095d', min_periods=required_sample)[0].apply(
    lambda x: coefcalc(x, order_of_variable, total_number_variable),
    engine='numba', raw=True,
    engine_kwargs={'nopython': True, 'nogil': True, 'parallel': False})
# As the pandas documentation notes, the numba engine is faster than cython once the number of observations is over 1 million.
# See the numba engine section of https://pandas.pydata.org/pandas-docs/stable/user_guide/window.html

localtime = time.asctime(time.localtime(time.time()))
print(localtime)
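To make the stack/reshape trick easier to follow in isolation, here is a small self-contained sketch on synthetic data; the names, sizes, and the use of plain apply instead of the numba engine are my own choices for illustration, not part of the original code.
# Sketch only: the "stack into one column, reshape inside the function" trick on fake data.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
dates = pd.date_range('2000-01-31', periods=60, freq='M')
wide = pd.DataFrame({'ret': rng.normal(size=60), 'mkt': rng.normal(size=60)}, index=dates)
wide['const'] = 1.0                    # intercept column
varnum = wide.shape[1]                 # 3 columns: ret, mkt, const

# Stack to one long column; each date now repeats varnum times in the index.
long = wide.stack().reset_index(level=1, drop=True)

def beta_from_stacked(a, varnum=varnum):
    # a is a 1-D window of the stacked column; rebuild the (rows, varnum) matrix.
    if a.shape[0] % varnum:
        return np.nan                  # window ends mid-date, so skip it
    m = a.reshape(a.shape[0] // varnum, varnum)
    y, x = m[:, 0], m[:, 1:]
    try:
        b = np.linalg.inv(x.T @ x) @ x.T @ y
        return b[0]                    # slope on the market factor
    except np.linalg.LinAlgError:
        return np.nan

# One finished beta appears on the last stacked row of each date; the other rows are NaN.
betas = long.rolling('365d', min_periods=6 * varnum).apply(beta_from_stacked, raw=True)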
Solution 5:
Good news! In pandas 1.3.0, a new method="table" option was added to rolling, so rolling.apply can see the whole window as a table. Problem solved!
Here is some example code.
def coefcalc(df):
    # df is the whole window as a 2-D array: column 0 is y, the remaining four columns are the regressors.
    # method="table" expects one value back per column, so return NaN for the y column followed by the four slopes.
    Y = np.ascontiguousarray(df[:, 0])   # rebuild contiguous numpy arrays for a faster regression
    X = np.ascontiguousarray(df[:, 1:])
    try:
        b = (np.linalg.inv(X.T @ X)) @ X.T @ Y
        return np.nan, b[0], b[1], b[2], b[3]
    except:
        return np.nan, np.nan, np.nan, np.nan, np.nan
fama5=pd.read_csv('F-F_Research_Data_5_Factors_2x3.csv')
fama5=fama5.iloc[0:695,:].rename(columns={'Mkt-RF':'Mkt_Rf'})
fama5['date']=pd.to_datetime(fama5.date,format='%Y%m',errors='ignore')
for var in ['Mkt_Rf', 'SMB', 'HML', 'RMW', 'CMA', 'RF']:
fama5[var]=pd.to_numeric(fama5[var])/100
fama5[var]=fama5[var].astype('float64')
fama5=fama5.set_index('date')
fama5=fama5.drop('RF',axis=1)
beta=fama5.rolling('100d', method="table", min_periods=0).apply(coefcalc, raw=True, engine="numba")
You can download F-F_Research_Data_5_Factors_2x3.csv from Ken French's Homepage https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_5_Factors_2x3_CSV.zip
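If you just want to see the mechanics of method="table" on something tiny, here is a minimal sketch modelled on the pandas user-guide example; the toy data and column names are invented, and the function returns one value per column of the window.
# Sketch only: toy rolling(..., method="table") call (needs pandas >= 1.3 with numba installed).
import numpy as np
import pandas as pd

toy = pd.DataFrame({'y': [1.0, 2.0, 3.0, 4.0], 'x': [2.0, 3.0, 4.0, 5.0]})

def window_means(a):
    # a is the whole window as a 2-D array; return one value per column
    out = np.ones((1, a.shape[1]))
    out[0, 0] = a[:, 0].mean()
    out[0, 1] = a[:, 1].mean()
    return out

print(toy.rolling(2, method="table", min_periods=0).apply(window_means, raw=True, engine="numba"))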