Skip to content Skip to sidebar Skip to footer

How To Count Concurrent Events In A Dataframe In One Line?

I have a dataset with phone calls. I want to count how many active calls there are for each record. I found this question but I'd like to avoid loops and functions. Each call has a

Solution 1:

You can use:

#convert time and date to datetimedf['date_start'] = pd.to_datetime(df.start + ' ' + df.date)
df['date_end'] = pd.to_datetime(df.end + ' ' + df.date)
#remove columnsdf = df.drop(['start','end','date'], axis=1)

Solution with loop:

active_events= []
for i in df.index:
    active_events.append(len(df[(df["date_start"]<=df.loc[i,"date_start"]) & 
                                (df["date_end"]> df.loc[i,"date_start"])]))
df['activecalls'] = pd.Series(active_events)
print (df)
           date_start            date_end  activecalls
0 2016-08-10 09:17:12 2016-08-10 09:18:20            1
1 2016-08-11 09:15:58 2016-08-11 09:17:42            1
2 2016-08-11 09:16:40 2016-08-11 09:17:49            2
3 2016-08-11 09:17:05 2016-08-11 09:18:03            3
4 2016-08-11 09:18:22 2016-08-11 09:18:30            1

Solution with merge

#cross joindf['tmp']=1df1=pd.merge(df,df.reset_index(),on=['tmp'])df=df.drop('tmp',axis=1)#print (df1)#filtering by conditionsdf1=df1[(df1["date_start_x"]<=df1["date_start_y"])(df1["date_end_x"]>df1["date_start_y"])]print(df1)date_start_xdate_end_xactivecalls_xtmpindex\02016-08-10 09:17:12 2016-08-10 09:18:20              11062016-08-11 09:15:58 2016-08-11 09:17:42              11172016-08-11 09:15:58 2016-08-11 09:17:42              11282016-08-11 09:15:58 2016-08-11 09:17:42              113122016-08-11 09:16:40 2016-08-11 09:17:49              212132016-08-11 09:16:40 2016-08-11 09:17:49              213182016-08-11 09:17:05 2016-08-11 09:18:03              313242016-08-11 09:18:22 2016-08-11 09:18:30              114date_start_ydate_end_yactivecalls_y02016-08-10 09:17:12 2016-08-10 09:18:20              162016-08-11 09:15:58 2016-08-11 09:17:42              172016-08-11 09:16:40 2016-08-11 09:17:49              282016-08-11 09:17:05 2016-08-11 09:18:03              3122016-08-11 09:16:40 2016-08-11 09:17:49              2132016-08-11 09:17:05 2016-08-11 09:18:03              3182016-08-11 09:17:05 2016-08-11 09:18:03              3242016-08-11 09:18:22 2016-08-11 09:18:30              1
#get size - active callsprint(df1.groupby(['index'],sort=False).size())index0111223341dtype:int64df['activecalls']=df1.groupby('index').size()print(df)date_startdate_endactivecalls02016-08-10 09:17:12 2016-08-10 09:18:20            112016-08-11 09:15:58 2016-08-11 09:17:42            122016-08-11 09:16:40 2016-08-11 09:17:49            232016-08-11 09:17:05 2016-08-11 09:18:03            342016-08-11 09:18:22 2016-08-11 09:18:30            1

Timings:

def a(df):
    active_events= []
    for i in df.index:
        active_events.append(len(df[(df["date_start"]<=df.loc[i,"date_start"]) & (df["date_end"]> df.loc[i,"date_start"])]))
    df['activecalls'] = pd.Series(active_events)
    return (df)

def b(df):
    df['tmp'] = 1
    df1 = pd.merge(df,df.reset_index(),on=['tmp'])
    df = df.drop('tmp', axis=1)
    df1 = df1[(df1["date_start_x"]<=df1["date_start_y"])  & (df1["date_end_x"]> df1["date_start_y"])]
    df['activecalls'] = df1.groupby('index').size()
    return (df)

print (a(df))
print (b(df))

In [160]: %timeit (a(df))
100 loops, best of 3: 6.76 ms per loop

In [161]: %timeit (b(df))
The slowest run took 4.42 times longer than the fastest. This could mean that an intermediate result is being cached.
100 loops, best of 3: 4.61 ms per loop

Post a Comment for "How To Count Concurrent Events In A Dataframe In One Line?"