Skip to content Skip to sidebar Skip to footer

Python Pandas Calculate Rolling Stock Beta Using Rolling Apply To Groupby Object In Vectorized Fashion

I have a large data frame, df, containing 4 columns: id period ret_1m mkt_ret_1m 131146 CAN00WG0 199609 -0.1538 0.047104 133530 CAN00WG

Solution 1:

Try pd.rolling_cov() and pd.rolling_var() as follows:

import pandas as pd
import numpy as np
from io import StringIO  # Python 3: StringIO lives in the io module

# Sample panel: per-stock (id) monthly returns and the matching market return.
df = pd.read_csv(StringIO('''              id  period  ret_1m  mkt_ret_1m
131146  CAN00WG0  199609 -0.1538    0.047104
133530  CAN00WG0  199610 -0.0455   -0.014143
135913  CAN00WG0  199611  0.0000    0.040926
138334  CAN00WG0  199612  0.2952    0.008723
140794  CAN00WG0  199701 -0.0257    0.039916
143274  CAN00WG0  199702 -0.0038   -0.025442
145754  CAN00WG0  199703 -0.2992   -0.049279
148246  CAN00WG0  199704 -0.0919   -0.005948
150774  CAN00WG0  199705  0.0595    0.122322
153318  CAN00WG0  199706 -0.0337    0.045765
160980  CAN00WH0  199709  0.0757    0.079293
163569  CAN00WH0  199710 -0.0741   -0.044000
166159  CAN00WH0  199711  0.1000   -0.014644
168782  CAN00WH0  199712 -0.0909   -0.007072
171399  CAN00WH0  199801 -0.0100    0.001381
174022  CAN00WH0  199802  0.1919    0.081924
176637  CAN00WH0  199803  0.0085    0.050415
179255  CAN00WH0  199804 -0.0168    0.018393
181880  CAN00WH0  199805  0.0427   -0.051279
184516  CAN00WH0  199806 -0.0656   -0.011516
143275  CAN00WO0  199702 -0.1176   -0.025442
145755  CAN00WO0  199703 -0.0074   -0.049279
148247  CAN00WO0  199704 -0.0075   -0.005948
150775  CAN00WO0  199705  0.0451    0.122322'''), sep=r'\s+')

# Rolling beta = rolling Cov(stock, market) / rolling Var(market) over a
# 6-observation window. pd.rolling_cov / pd.rolling_var were removed from
# pandas; the .rolling() accessor is the modern equivalent.
# NOTE(review): this window ignores the 'id' grouping, so windows straddle
# stock boundaries — see the criticism in a later answer on this page.
df['beta'] = (df['ret_1m'].rolling(window=6).cov(df['mkt_ret_1m'])
              / df['mkt_ret_1m'].rolling(window=6).var())

print(df)  # Python 3 print function

Output:

                 id  period  ret_1m  mkt_ret_1m      beta
131146     CAN00WG0  199609 -0.1538    0.047104       NaN
133530     CAN00WG0  199610 -0.0455   -0.014143       NaN
135913     CAN00WG0  199611  0.0000    0.040926       NaN
138334     CAN00WG0  199612  0.2952    0.008723       NaN
140794     CAN00WG0  199701 -0.0257    0.039916       NaN
143274     CAN00WG0  199702 -0.0038   -0.025442 -1.245908
145754     CAN00WG0  199703 -0.2992   -0.049279  2.574464
148246     CAN00WG0  199704 -0.0919   -0.005948  2.657887
150774     CAN00WG0  199705  0.0595    0.122322  1.371090
153318     CAN00WG0  199706 -0.0337    0.045765  1.494095
160980     CAN00WH0  199709  0.0757    0.079293  1.616520
163569     CAN00WH0  199710 -0.0741   -0.044000  1.630411
166159     CAN00WH0  199711  0.1000   -0.014644  0.651220
168782     CAN00WH0  199712 -0.0909   -0.007072  0.652148
171399     CAN00WH0  199801 -0.0100    0.001381  0.724120
174022     CAN00WH0  199802  0.1919    0.081924  1.542782
176637     CAN00WH0  199803  0.0085    0.050415  1.605407
179255     CAN00WH0  199804 -0.0168    0.018393  1.571015
181880     CAN00WH0  199805  0.0427   -0.051279  1.139972
184516     CAN00WH0  199806 -0.0656   -0.011516  1.101890
143275     CAN00WO0  199702 -0.1176   -0.025442  1.372437
145755     CAN00WO0  199703 -0.0074   -0.049279  0.031939
148247     CAN00WO0  199704 -0.0075   -0.005948 -0.535855
150775     CAN00WO0  199705  0.0451    0.122322  0.341747

Solution 2:

I guess pd.rolling_apply doesn't help in this case, since it essentially only takes a Series (even if a DataFrame is passed, it processes one column at a time). But you can always write your own rolling_apply that takes a DataFrame.

import pandas as pd
import numpy as np
from io import StringIO  # Python 3: StringIO lives in the io module

# Sample panel: per-stock (id) monthly returns and the matching market return.
# The leading integer on each data row becomes the DataFrame index because the
# header has one fewer field than the data rows.
df = pd.read_csv(StringIO('''              id  period  ret_1m  mkt_ret_1m
131146  CAN00WG0  199609 -0.1538    0.047104
133530  CAN00WG0  199610 -0.0455   -0.014143
135913  CAN00WG0  199611  0.0000    0.040926
138334  CAN00WG0  199612  0.2952    0.008723
140794  CAN00WG0  199701 -0.0257    0.039916
143274  CAN00WG0  199702 -0.0038   -0.025442
145754  CAN00WG0  199703 -0.2992   -0.049279
148246  CAN00WG0  199704 -0.0919   -0.005948
150774  CAN00WG0  199705  0.0595    0.122322
153318  CAN00WG0  199706 -0.0337    0.045765
160980  CAN00WH0  199709  0.0757    0.079293
163569  CAN00WH0  199710 -0.0741   -0.044000
166159  CAN00WH0  199711  0.1000   -0.014644
168782  CAN00WH0  199712 -0.0909   -0.007072
171399  CAN00WH0  199801 -0.0100    0.001381
174022  CAN00WH0  199802  0.1919    0.081924
176637  CAN00WH0  199803  0.0085    0.050415
179255  CAN00WH0  199804 -0.0168    0.018393
181880  CAN00WH0  199805  0.0427   -0.051279
184516  CAN00WH0  199806 -0.0656   -0.011516
143275  CAN00WO0  199702 -0.1176   -0.025442
145755  CAN00WO0  199703 -0.0074   -0.049279
148247  CAN00WO0  199704 -0.0075   -0.005948
150775  CAN00WO0  199705  0.0451    0.122322'''), sep=r'\s+')



def calc_beta(df):
    """Return the OLS beta of column 0 (stock returns) against column 1
    (market returns): Cov(stock, market) / Var(market).

    Expects a two-column DataFrame; any extra columns are ignored by the
    covariance ratio below.
    """
    np_array = df.values
    s = np_array[:, 0]  # stock returns are column zero of the numpy array
    m = np_array[:, 1]  # market returns are column one of the numpy array

    covariance = np.cov(s, m)  # 2x2 covariance matrix of stock and market
    beta = covariance[0, 1] / covariance[1, 1]
    return beta

def rolling_apply(df, period, func, min_periods=None):
    """Apply `func` to a trailing window of whole DataFrame rows.

    Unlike pd.rolling_apply, `func` receives the full multi-column sub-frame.
    Returns a Series aligned to df.index; positions whose window has fewer
    than `min_periods` rows stay NaN.
    """
    if min_periods is None:
        min_periods = period
    result = pd.Series(np.nan, index=df.index)

    for i in range(1, len(df) + 1):
        # Trailing window ending at (and including) row i-1.
        sub_df = df.iloc[max(i - period, 0):i, :]
        if len(sub_df) >= min_periods:
            idx = sub_df.index[-1]
            result[idx] = func(sub_df)
    return result

# Compute the rolling beta per stock, so windows never straddle two ids.
df['beta'] = np.nan
grp = df.groupby('id')
# Using a 6-observation window so the sample data (at most 10 rows per id)
# produces some non-NaN values.
period = 6
for stock, sub_df in grp:
    beta = rolling_apply(sub_df[['ret_1m', 'mkt_ret_1m']], period, calc_beta,
                         min_periods=period)
    beta.name = 'beta'  # update() aligns on index and this column name
    df.update(beta)
print(df)  # Python 3 print function

Output

                 id  period  ret_1m  mkt_ret_1m      beta
131146     CAN00WG0  199609 -0.1538    0.047104       NaN
133530     CAN00WG0  199610 -0.0455   -0.014143       NaN
135913     CAN00WG0  199611  0.0000    0.040926       NaN
138334     CAN00WG0  199612  0.2952    0.008723       NaN
140794     CAN00WG0  199701 -0.0257    0.039916       NaN
143274     CAN00WG0  199702 -0.0038   -0.025442 -1.245908
145754     CAN00WG0  199703 -0.2992   -0.049279  2.574464
148246     CAN00WG0  199704 -0.0919   -0.005948  2.657887
150774     CAN00WG0  199705  0.0595    0.122322  1.371090
153318     CAN00WG0  199706 -0.0337    0.045765  1.494095
...             ...     ...     ...         ...       ...
171399     CAN00WH0  199801 -0.0100    0.001381       NaN
174022     CAN00WH0  199802  0.1919    0.081924  1.542782
176637     CAN00WH0  199803  0.0085    0.050415  1.605407
179255     CAN00WH0  199804 -0.0168    0.018393  1.571015
181880     CAN00WH0  199805  0.0427   -0.051279  1.139972
184516     CAN00WH0  199806 -0.0656   -0.011516  1.101890
143275     CAN00WO0  199702 -0.1176   -0.025442       NaN
145755     CAN00WO0  199703 -0.0074   -0.049279       NaN
148247     CAN00WO0  199704 -0.0075   -0.005948       NaN
150775     CAN00WO0  199705  0.0451    0.122322       NaN

Solution 3:

def rolling_apply(df, period, func, min_periods=None):
    """Like the rolling_apply above, but writes each window's result one row
    FORWARD, so the return at time t is never included in the beta used at
    time t (avoids forward-looking bias).

    NOTE(review): `sub_df.index[-1] + 1` assumes an integer index whose next
    label exists (or can be appended) — verify against the caller's index.
    """
    if min_periods is None:
        min_periods = period
    result = pd.Series(np.nan, index=df.index)

    for i in range(1, len(df)):
        # Trailing window of up to `period` rows ending at row i-1.
        sub_df = df.iloc[max(i - period, 0):i, :]
        if len(sub_df) >= min_periods:
            # Mind the forward-looking bias: the return at time t should not
            # be included in the beta calculated for time t.
            idx = sub_df.index[-1] + 1
            result[idx] = func(sub_df)
    return result

This fixes a forward-looking bias in Happy001's code. It's a finance problem, so one should be cautious about look-ahead bias.

I find that vlmercado's answer is mistaken. If you simply use pd.rolling_cov and pd.rolling_var you make mistakes in finance. Firstly, notice that the second stock, CAN00WH0, does not get any NaN betas: its first windows reuse the returns of CAN00WG0, which is simply wrong. Secondly, consider a stock that was suspended for ten years — those stale observations would still end up in your beta calculation.

pandas rolling also works with a Timestamp index — you can see how in my answer above if interested. I changed Happy001's code; it is not the fastest way, but it is at least 20x faster than the original code.

# 5-year (1825-day) time-based rolling beta per stock, using the
# datetime-offset form of DataFrame.rolling.
crsp_daily['date'] = pd.to_datetime(crsp_daily['date'])
crsp_daily = crsp_daily.set_index('date')  # time-offset rolling needs a datetime index
crsp_daily.index = pd.DatetimeIndex(crsp_daily.index)
calc = crsp_daily[['permno', 'ret', 'mkt_ret']]
grp = calc.groupby('permno')  # rolling beta for each stock
parts = []
for stock, sub_df in grp:
    sub2_df = sub_df[['ret', 'mkt_ret']].sort_index()
    # 5yr rolling window; note 'd' for day — only 's'/'d' offsets are
    # available here, not 'w'/'m'/'y'.
    beta_m = sub2_df.rolling('1825d', min_periods=150).cov()
    # Cov(ret, mkt_ret) / Var(mkt_ret), read off the 'mkt_ret' slice of the
    # MultiIndexed covariance output.
    beta_m['beta'] = beta_m['ret'] / beta_m['mkt_ret']
    beta_m = beta_m.xs('mkt_ret', level=1, axis=0)
    parts.append(pd.merge(sub_df, pd.DataFrame(beta_m['beta'])))
# DataFrame.append was removed in pandas 2.0; collect the pieces and
# concatenate once (also avoids quadratic re-copying inside the loop).
beta = pd.concat(parts)
beta = beta.reset_index()
beta = beta[['date', 'permno', 'beta']]

Solution 4:

BTW, since nobody asked about multi-variable rolling regression in Python, I also found a way to solve that problem. The key is to first stack the variables into one column, and reshape the data back inside the function.

Here is the Code

import pandas as pd
import numpy as np
import time  # time.asctime is used below for wall-clock logging (timeit alone is not enough)
import timeit
from numba import jit

# numba's nopython mode supports numpy but not pandas, so the input must be a
# raw ndarray (pd.DataFrame is forbidden inside the jitted function).
@jit(nopython=True, cache=True, fastmath=True)
def coefcalc(df, coefpos, varnum):
    """Rolling-OLS helper: un-stack the single-column window back into a
    (obs, varnum) matrix and return one regression coefficient.

    coefpos: which coefficient you need — e.g. 5 to obtain alpha, since the
             constant is the last column of X.
    varnum:  how many stacked variables per observation (excluding stkcd and
             date); here 7: return, five Fama factors, and a constant.
    """
    if np.mod(df.shape[0], varnum) == 0:
        # Reshape the stacked column into n rows of varnum columns for the regression.
        df = df.reshape(df.shape[0] // varnum, varnum)
        Y = np.ascontiguousarray(df[:, 0])   # contiguous arrays make the matmuls faster
        X = np.ascontiguousarray(df[:, 1:])
        try:
            b = (np.linalg.inv(X.T @ X)) @ X.T @ Y
            result = b[coefpos]
        except Exception:
            # Singular X'X (e.g. collinear factors) -> no estimate.
            result = np.nan
        return result
    else:
        # Partial window: the stacked column does not split evenly into rows.
        return np.nan
    
import time  # needed for time.asctime below (the original only imported timeit)

calc2 = pd.read_csv(r'sample.csv')

# A way to compute rolling beta/alpha on stacked data.
calc2 = calc2.set_index(['date', 'F_INFO_WINDCODE'])
calc2 = calc2.dropna()  # the regression would drop NaN automatically anyway
# Stack the n value columns into a single column, and make the datetime64
# 'date' variable the index (required for a time-offset rolling window).
calc2 = calc2.stack().reset_index().set_index('date')

localtime = time.asctime(time.localtime(time.time()))
print(localtime)

order_of_variable = 5      # position of the wanted coefficient; y (return) excluded, zero-based
total_number_variable = 7  # stkcd/date excluded: return, five Fama factors, and a constant
required_sample = 30 * total_number_variable  # monthly data: 30 obs x 7 stacked rows each
# The parallel kwarg may require explicit loops inside the jitted function,
# so it is turned off.
alphaest = (calc2.groupby('F_INFO_WINDCODE')
                 .rolling('1095d', min_periods=required_sample)[0]
                 .apply(lambda x: coefcalc(x, 5, 7), engine='numba', raw=True,
                        engine_kwargs={'nopython': True, 'nogil': True, 'parallel': False}))

# Per the pandas docs, the numba engine beats cpython once observations exceed
# about 1 million — see the "Numba engine" section of
# https://pandas.pydata.org/pandas-docs/stable/user_guide/window.html

localtime = time.asctime(time.localtime(time.time()))
print(localtime)

sample format

Solution 5:

Good news! In pandas 1.3.0 a new method="table" option was added to rolling.apply — everything is solved!

Here is an example code.

def coefcalc(df):
    """Single-window OLS for rolling(..., method="table").

    df is the raw 2-D window array: column 0 is the dependent variable, the
    remaining five columns are the regressors. Returns one value per input
    column, as method="table" requires: NaN for the y column followed by the
    five fitted coefficients, or all NaN when X'X is singular.
    """
    Y = np.ascontiguousarray(df[:, 0])  # contiguous copies make the matmuls faster
    X = np.ascontiguousarray(df[:, 1:])
    try:
        b = (np.linalg.inv(X.T @ X)) @ X.T @ Y
        return np.nan, b[0], b[1], b[2], b[3], b[4]
    except Exception:
        # Singular X'X (too few rows or collinear columns) -> no estimate.
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

fama5 = pd.read_csv('F-F_Research_Data_5_Factors_2x3.csv')
fama5 = fama5.iloc[0:695, :].rename(columns={'Mkt-RF': 'Mkt_Rf'})
fama5['date'] = pd.to_datetime(fama5.date, format='%Y%m', errors='ignore')
# Convert the factor columns from percent to fractional returns.
for var in ['Mkt_Rf', 'SMB', 'HML', 'RMW', 'CMA', 'RF']:
    fama5[var] = pd.to_numeric(fama5[var]) / 100
    fama5[var] = fama5[var].astype('float64')

fama5 = fama5.set_index('date')
fama5 = fama5.drop('RF', axis=1)

# pandas >= 1.3: method="table" passes the whole window (all columns at once)
# to the function, enabling true multivariate rolling regression.
beta = fama5.rolling('100d', method="table", min_periods=0).apply(coefcalc, raw=True, engine="numba")

You can download F-F_Research_Data_5_Factors_2x3.csv from Ken French's Homepage https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_5_Factors_2x3_CSV.zip

Post a Comment for "Python Pandas Calculate Rolling Stock Beta Using Rolling Apply To Groupby Object In Vectorized Fashion"