分段取得最小值
输入数据 (input):
import pandas as pd
import numpy as np

# Sample input: values in column 'A'; a True in 'cond' marks the start
# of a new segment (where the running minimum should reset).
A = [17, 18, 21, 15, 18, 19, 22, 16, 30, 50]
cond = [False, True, False, False, False, True, False, False, True, False]
df = pd.DataFrame(dict(A=A, cond=cond))
方法1
def abc(arr):
    """Rolling(2) callback that fills df['fx'] with a segmented cummin.

    For each window the two index labels are (l, c) = (previous, current
    row).  fx[c] is A[c] when cond[c] is True (segment reset), otherwise
    min(A[c], fx[l]) — the running minimum carried forward.

    Writes into the module-global ``df`` as a side effect; returns 0 only
    because rolling().apply() requires a numeric return value.
    """
    c = arr.index[-1]  # current (right edge of the window)
    l = arr.index[0]   # previous (left edge of the window)
    # Bug fix 1: the first row is never the right edge of a rolling(2)
    # window, so its fx was never set; seed it here, otherwise fx[l] is
    # NaN and 'A[c] < fx[l]' is silently False for the second row.
    if 'fx' not in df.columns or pd.isna(df.at[l, 'fx']):
        df.loc[l, 'fx'] = df.at[l, 'A']
    if df.at[c, 'cond']:
        # A True flag starts a new segment: reset the running minimum.
        df.loc[c, 'fx'] = df.at[c, 'A']
    else:
        # Bug fix 2: carry forward fx[l] (the running minimum), not A[l].
        # The original copied A[l], which is wrong whenever the minimum
        # came from earlier in the segment (e.g. A=[5, 3, 4]).
        df.loc[c, 'fx'] = min(df.at[c, 'A'], df.at[l, 'fx'])
    return 0

# Method 1: rolling window + apply, mutating df as a side effect.
# NOTE: rolling().apply() calls abc once per column per window; abc only
# uses the window's index labels, so the repeated calls are idempotent.
df[['A', 'cond']].rolling(2).apply(abc)
#6.34 ms ± 557 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
方法2
# Method 2: fully vectorised.  cumsum over the reset flags labels each
# segment with its own id; cummin within each group is then the
# segmented running minimum.
df['expected'] = df['A'].groupby(df['cond'].cumsum()).cummin()
#1.49 ms ± 88.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
方法3
# Method 3: slice the frame into segments at each reset flag and take
# the cummin of each slice independently.
# First get a list of all the index labels on which to reset the cummin().
my_ix = df[df['cond']].index
# Bug fix: the original loop started at the FIRST True flag, so every
# row before it (row 0 in this data) was silently dropped from the
# concatenated result.  Prepend the frame's first label when needed so
# the leading segment is included.
starts = list(my_ix)
if not starts or starts[0] != df.index[0]:
    starts.insert(0, df.index[0])
return_list = []
# Pair each segment start with the next start; None means "to the end"
# (replaces the original try/except IndexError inside the loop).
for ix_start, ix_end in zip(starts, starts[1:] + [None]):
    # Positional slice [ix_start:ix_end) — the end label is the next
    # segment's first row, so it is correctly excluded.
    return_list.append(df[ix_start:ix_end]['A'].cummin())
pd.concat(return_list)
#737 µs ± 53.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
注意:这里测试的是轻量级数据,之后我又用百万数据测试:
for循环竟然用了将近1分钟,apply直接死机,只有groupby仅用了0.08秒。