分段取得最小值
输入数据 (input):
import pandas as pd
import numpy as np

# Sample input: values in column 'A'; a True in 'cond' marks the start
# of a new segment (where the running minimum should reset).
A = [17, 18, 21, 15, 18, 19, 22, 16, 30, 50]
cond = [False, True, False, False, False, True, False, False, True, False]
df = pd.DataFrame(dict(A=A, cond=cond))
方法1
def abc(arr):
    """Rolling(2) callback that fills df['fx'] with a segmented cummin.

    For each window the two index labels are (l, c) = (previous, current
    row).  fx[c] is A[c] when cond[c] is True (segment reset), otherwise
    min(A[c], fx[l]) — the running minimum carried forward.

    Writes into the module-global ``df`` as a side effect; returns 0 only
    because rolling().apply() requires a numeric return value.
    """
    c = arr.index[-1]  # current (right edge of the window)
    l = arr.index[0]   # previous (left edge of the window)
    # Bug fix 1: the first row is never the right edge of a rolling(2)
    # window, so its fx was never set; seed it here, otherwise fx[l] is
    # NaN and 'A[c] < fx[l]' is silently False for the second row.
    if 'fx' not in df.columns or pd.isna(df.at[l, 'fx']):
        df.loc[l, 'fx'] = df.at[l, 'A']
    if df.at[c, 'cond']:
        # A True flag starts a new segment: reset the running minimum.
        df.loc[c, 'fx'] = df.at[c, 'A']
    else:
        # Bug fix 2: carry forward fx[l] (the running minimum), not A[l].
        # The original copied A[l], which is wrong whenever the minimum
        # came from earlier in the segment (e.g. A=[5, 3, 4]).
        df.loc[c, 'fx'] = min(df.at[c, 'A'], df.at[l, 'fx'])
    return 0

# Method 1: rolling window + apply, mutating df as a side effect.
# NOTE: rolling().apply() calls abc once per column per window; abc only
# uses the window's index labels, so the repeated calls are idempotent.
df[['A', 'cond']].rolling(2).apply(abc)
#6.34 ms ± 557 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
方法2
# Method 2: fully vectorised.  cumsum over the reset flags labels each
# segment with its own id; cummin within each group is then the
# segmented running minimum.
df['expected'] = df['A'].groupby(df['cond'].cumsum()).cummin()
#1.49 ms ± 88.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
方法3
# Method 3: slice the frame into segments at each reset flag and take
# the cummin of each slice independently.
# First get a list of all the index labels on which to reset the cummin().
my_ix = df[df['cond']].index
# Bug fix: the original loop started at the FIRST True flag, so every
# row before it (row 0 in this data) was silently dropped from the
# concatenated result.  Prepend the frame's first label when needed so
# the leading segment is included.
starts = list(my_ix)
if not starts or starts[0] != df.index[0]:
    starts.insert(0, df.index[0])
return_list = []
# Pair each segment start with the next start; None means "to the end"
# (replaces the original try/except IndexError inside the loop).
for ix_start, ix_end in zip(starts, starts[1:] + [None]):
    # Positional slice [ix_start:ix_end) — the end label is the next
    # segment's first row, so it is correctly excluded.
    return_list.append(df[ix_start:ix_end]['A'].cummin())
pd.concat(return_list)
#737 µs ± 53.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
注意:这里测试的是轻量级数据,之后我又用百万数据测试:
for循环竟然用了将近1分钟,apply直接死机,只有groupby仅用了0.08秒。