我们可以使用来自
pyjanitor 的
complete 函数,它提供了一个方便的抽象来生成缺失的行,这种情况下是按
ID
组:
import pandas as pd
import janitor as jn
years = dict(year = lambda year: range(year.min(), year.max() + 1))
df.complete(years, by = 'ID', sort = True).fillna(0, downcast = 'infer')
ID year number
0 A 2017 1
1 A 2018 0
2 A 2019 1
3 B 2017 1
4 B 2018 1
5 C 2016 1
6 C 2017 0
7 C 2018 0
8 C 2019 1
然而,
by
主要是为了方便起见;在某些情况下,做更多的工作可能更有效率,类似于@Alollz的解决方案:
years = dict(year = range(df.year.min(), df.year.max() + 1))
group = df.groupby('ID').year
(df.assign(year_max = group.transform('max'),
year_min = group.transform('min'))
.complete(years, ('ID', 'year_min', 'year_max'))
.loc[lambda df: df.year.between(df.year_min, df.year_max),
df.columns]
.sort_values([*df], ignore_index = True)
.fillna(0, downcast = 'infer')
)
ID year number
0 A 2017 1
1 A 2018 0
2 A 2019 1
3 B 2017 1
4 B 2018 1
5 C 2016 1
6 C 2017 0
7 C 2018 0
8 C 2019 1
使用 @Allolz 的示例数据:
N = 50000
df = pd.DataFrame({'ID': np.repeat(range(N), 2),
'year': np.tile([2010,2018], N),
'number': 1})
def complete_sam(df):
years = dict(year = range(df.year.min(), df.year.max() + 1))
group = df.groupby('ID').year
outcome = (df.assign(year_max = group.transform('max'),
year_min = group.transform('min'))
.complete(years, ('ID', 'year_min', 'year_max'))
.loc[lambda df: df.year.between(df.year_min,
df.year_max),
df.columns]
.sort_values([*df], ignore_index = True)
.fillna(0)
)
return outcome
def SB(df):
idx = df.groupby('ID')['year'].apply(lambda x: pd.Series(np.arange(x.iloc[0], x.iloc[-1]+1))).reset_index()
df = df.set_index(['ID','year']).reindex(pd.MultiIndex.from_arrays([idx['ID'], idx['year']]), fill_value=0).reset_index()
return df
def Alollz(df):
idx = pd.MultiIndex.from_product([np.unique(df['ID']),
np.arange(df['year'].min(), df['year'].max()+1)],
names=['ID', 'year'])
df_b = pd.DataFrame({'number': 0}, index=idx)
df_b.update(df.set_index(['ID', 'year']))
m = (df_b.groupby(level=0)['number'].cummax().eq(1)
& df_b[::-1].groupby(level=0)['number'].cummax().eq(1))
return df_b.loc[m].reset_index()
In [310]: Alollz(df).equals(complete_sam(df))
Out[310]: True
In [311]: %timeit complete_sam(df)
268 ms ± 24.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [312]: %timeit Alollz(df)
1.84 s ± 58.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [316]: SB(df).eq(complete_sam(df)).all().all()
Out[316]: True
In [317]: %timeit SB(df)
6.13 s ± 87.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)