Pandas按条件分组累加

5

我有以下数据框:

df = pd.DataFrame({"values":[1,5,7,3,0,9,8,8,7,5,8,1,0,0,0,0,2,5],"signal":['L_exit',None,None,'R_entry','R_exit',None,'L_entry','L_exit',None,'R_entry','R_exit','R_entry','R_exit','L_entry','L_exit','L_entry','R_exit',None]})
df

    values  signal
0   1       L_exit
1   5       None
2   7       None
3   3       R_entry
4   0       R_exit
5   9       None
6   8       L_entry
7   8       L_exit
8   7       None
9   5       R_entry
10  8       R_exit
11  1       R_entry
12  0       R_exit
13  0       L_entry
14  0       L_exit
15  0       L_entry
16  2       R_exit
17  5       None

我的目标是添加一个类似于这样的tx列:

   values   signal  num
0   1       L_exit  nan
1   5       None    nan
2   7       None    nan
3   3       R_entry 1.00
4   0       R_exit  1.00
5   9       None    1.00
6   8       L_entry 1.00
7   8       L_exit  1.00
8   7       None    nan
9   5       R_entry 2.00
10  8       R_exit  2.00
11  1       R_entry 2.00
12  0       R_exit  2.00
13  0       L_entry 2.00
14  0       L_exit  2.00
15  0       L_entry nan
16  2       R_exit  nan
17  5       None    nan

业务逻辑:当有 R_entry 信号时,我们将交易分组直到出现 L_exit 信号(如果有另一个 R_entry 信号,则忽略它)。

可视化 enter image description here

我尝试了什么?

g = ( df['signal'].eq('R_entry') |  df_tx['signal'].eq('L_exit') ).cumsum() 
df['tx'] = g.where(df['signal'].eq('R_entry')).groupby(g).ffill() 

问题在于每次出现'R_entry'时它都会增加。
3个回答

2

让我们试试(希望可以自解释):

signals = df['signal']

after_entry = signals.where(signals.eq('R_entry')).ffill().eq('R_entry')
before_exit = signals.where(signals.eq('L_exit')).bfill().eq('L_exit')

valids = after_entry & before_exit


blocks = signals.where(valids).ffill()[::-1].eq('L_exit').cumsum()[::-1]
valid_blocks = (blocks.groupby(blocks).transform('size') > 2)
valid_entries = valid_blocks & (~blocks.duplicated())

df.loc[valid_blocks, 'num'] = valid_entries.cumsum()

输出:

    values   signal  num
0        1   L_exit  NaN
1        5     None  NaN
2        7     None  NaN
3        3  R_entry  1.0
4        0   R_exit  1.0
5        9     None  1.0
6        8  L_entry  1.0
7        8   L_exit  1.0
8        7     None  NaN
9        5  R_entry  2.0
10       8   R_exit  2.0
11       1  R_entry  2.0
12       0   R_exit  2.0
13       0  L_entry  2.0
14       0   L_exit  2.0
15       0  L_entry  NaN
16       2   R_exit  NaN
17       5     None  NaN

2
你可以先创建一个掩码,以获取连续的 R_entries 直到达到 L_exit。
然后通过比较下一个值来获取每个组的第一个 R_entry,并应用 cumsum。
# keep only 'R_entry'/'L_exit' and get groups
mask = df['signal'].where(df['signal'].isin(['R_entry', 'L_exit'])).ffill().eq('R_entry')
# get groups and extend to next value (the L_exit)
df['num'] = (mask.ne(mask.shift())&mask).cumsum().where(mask).ffill(limit=1)

输出:

    values   signal  num
0        1   L_exit  NaN
1        5     None  NaN
2        7     None  NaN
3        3  R_entry  1.0
4        0   R_exit  1.0
5        9     None  1.0
6        8  L_entry  1.0
7        8   L_exit  1.0
8        7     None  NaN
9        5  R_entry  2.0
10       8   R_exit  2.0
11       1  R_entry  2.0
12       0   R_exit  2.0
13       0  L_entry  2.0
14       0   L_exit  2.0
15       0  L_entry  NaN
16       2   R_exit  NaN
17       5     None  NaN

如何运作

以下是中间步骤:

df['isin+ffill']     = df['signal'].where(df['signal'].isin(['R_entry', 'L_exit'])).ffill()
df['mask']           = df['isin+ffill'].eq('R_entry')
df['first_of_group'] = (mask.ne(mask.shift())&mask)
df['cumsum']         = df['first_of_group'].cumsum().where(mask)
df['num']            = df['cumsum'].ffill(limit=1)

    values   signal isin+ffill   mask  first_of_group  cumsum  num
0        1   L_exit     L_exit  False           False     NaN  NaN
1        5     None     L_exit  False           False     NaN  NaN
2        7     None     L_exit  False           False     NaN  NaN
3        3  R_entry    R_entry   True            True     1.0  1.0
4        0   R_exit    R_entry   True           False     1.0  1.0
5        9     None    R_entry   True           False     1.0  1.0
6        8  L_entry    R_entry   True           False     1.0  1.0
7        8   L_exit     L_exit  False           False     NaN  1.0
8        7     None     L_exit  False           False     NaN  NaN
9        5  R_entry    R_entry   True            True     2.0  2.0
10       8   R_exit    R_entry   True           False     2.0  2.0
11       1  R_entry    R_entry   True           False     2.0  2.0
12       0   R_exit    R_entry   True           False     2.0  2.0
13       0  L_entry    R_entry   True           False     2.0  2.0
14       0   L_exit     L_exit  False           False     NaN  2.0
15       0  L_entry     L_exit  False           False     NaN  NaN
16       2   R_exit     L_exit  False           False     NaN  NaN
17       5     None     L_exit  False           False     NaN  NaN

0
def function1(dd:pd.DataFrame):
    dd1=dd.query("signal=='R_entry'").head(1)
    if dd1.pipe(len)>0:
        dd.loc[dd1.index.tolist()[0]:,'num']=dd.name
    return dd

df.assign(col1=df.signal.shift().eq("L_exit").cumsum()).groupby('col1').apply(function1)

        

输出:

     values   signal  col1  num
0        1   L_exit     0  NaN
1        5     None     1  NaN
2        7     None     1  NaN
3        3  R_entry     1  1.0
4        0   R_exit     1  1.0
5        9     None     1  1.0
6        8  L_entry     1  1.0
7        8   L_exit     1  1.0
8        7     None     2  NaN
9        5  R_entry     2  2.0
10       8   R_exit     2  2.0
11       1  R_entry     2  2.0
12       0   R_exit     2  2.0
13       0  L_entry     2  2.0
14       0   L_exit     2  2.0
15       0  L_entry     3  NaN
16       2   R_exit     3  NaN
17       5     None     3  NaN

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接