各种方法的概览
def golomazov(df, column, num_groups):
    """Yield ``df`` in chunks that each contain ``num_groups`` whole groups.

    The frame is grouped by ``column``. Per-group frames are collected in the
    order ``df.groupby(column)`` produces them and concatenated as soon as
    ``num_groups`` of them have been gathered, so a group is never split
    across two chunks. The last chunk may contain fewer groups.

    Args:
        df: DataFrame to split.
        column: Grouping key passed to ``df.groupby``.
        num_groups: Number of groups per yielded chunk; must be at least 1.

    Yields:
        A DataFrame built with ``pd.concat`` from up to ``num_groups`` groups.

    Raises:
        ValueError: If ``num_groups`` is less than 1. Because this is a
            generator function, the error surfaces on the first iteration.
    """
    if num_groups < 1:
        # Without this guard the batch-size test below can never match and
        # the whole frame would silently come back as one single chunk.
        raise ValueError(f"num_groups must be >= 1, got {num_groups!r}")

    pending = []
    for _, group in df.groupby(column):
        pending.append(group)
        if len(pending) == num_groups:
            yield pd.concat(pending)
            pending = []
    if pending:
        # Flush the trailing, partially filled batch.
        yield pd.concat(pending)
def arigion(df, column, chunk_size):
    """Yield ``df`` in chunks that each cover ``chunk_size`` consecutive groups.

    Every row is assigned the number of its group as returned by
    ``df.groupby(column).ngroup()``. Each yielded chunk holds the rows whose
    group number falls into one window of ``chunk_size`` consecutive numbers,
    selected with a boolean mask, so rows keep their order from ``df``.

    Args:
        df: DataFrame to split. It is not modified.
        column: Grouping key passed to ``df.groupby``.
        chunk_size: Number of groups per yielded chunk; must be at least 1.

    Yields:
        A DataFrame with the same columns as ``df`` containing the rows of
        up to ``chunk_size`` groups.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. Because this is a
            generator function, the error surfaces on the first iteration.
    """
    if chunk_size < 1:
        # A non-positive window never advances past the maximum group number,
        # which would make the loop below run forever.
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    # Keep the group numbers in a local Series instead of a temporary column
    # on ``df`` so the caller's frame is left untouched.
    group_numbers = df.groupby(column).ngroup()
    max_group_index = group_numbers.max()

    lower_group_index = 0
    # For an empty frame ``max()`` is NaN, the comparison is False and
    # nothing is yielded.
    while lower_group_index <= max_group_index:
        upper_group_index = lower_group_index + chunk_size - 1
        # ``between`` is inclusive on both ends.
        yield df[group_numbers.between(lower_group_index, upper_group_index)]
        lower_group_index = upper_group_index + 1
def rhug123(df, column, n):
    """Split ``df`` into a dict of chunks that each cover ``n`` groups.

    Every row gets the number of its group from
    ``df.groupby(column).ngroup()``; floor-dividing that number by ``n``
    gives the chunk id the row belongs to. The frame is then grouped by that
    chunk id.

    Args:
        df: DataFrame to split.
        column: Grouping key passed to ``df.groupby``.
        n: Number of groups per chunk.

    Returns:
        A dict mapping each chunk id to the DataFrame holding its rows. The
        whole result is built eagerly.
    """
    # Group by the requested column; the key must not be hard-coded, or the
    # ``column`` argument is silently ignored.
    chunk_ids = df.groupby(column).ngroup().floordiv(n)
    return {chunk_id: chunk for chunk_id, chunk in df.groupby(chunk_ids)}
def misantroop(df, column, num_groups):
    """Yield ``df`` split into at most ``num_groups`` chunks of whole groups.

    Note that ``num_groups`` here is the number of *chunks* the group names
    are divided into (``np.array_split`` semantics: sections are as equal in
    size as possible), not the number of groups per chunk.

    Args:
        df: DataFrame to split.
        column: Grouping key passed to ``df.groupby``.
        num_groups: Number of sections to divide the group names into.

    Yields:
        A DataFrame built with ``pd.concat`` from the groups of one section.
        Sections that end up empty (more sections requested than there are
        groups, or an empty ``df``) are skipped.

    Raises:
        ValueError: Propagated from ``np.array_split`` when ``num_groups``
            is not positive.
    """
    symbol_groups = df.groupby(column)
    names = list(symbol_groups.groups)
    # Split positions rather than the names themselves: converting the names
    # to a NumPy array can coerce mixed-type keys or reshape tuple keys, after
    # which ``get_group`` would no longer find them.
    for positions in np.array_split(np.arange(len(names)), num_groups):
        if positions.size == 0:
            # ``pd.concat`` raises on an empty list, so skip empty sections.
            continue
        yield pd.concat([symbol_groups.get_group(names[i]) for i in positions])
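Note: golomazov, arigion and misantroop are generator functions, so the calls timed below only create a generator object and do not run the grouping work (hence the nanosecond timings; the identical asizeof figures likely reflect the same thing). rhug123 returns a fully built dict and is therefore the only one whose work is actually measured.
%timeit golomazov(df, 'Symbol', n)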
157 ns ± 0.647 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
from pympler.asizeof = 414176
%timeit arigion(df, 'Symbol', n)
160 ns ± 0.903 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)
from pympler.asizeof = 414176
%timeit rhug123(df, 'Symbol', n)
5.53 ms ± 28 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
from pympler.asizeof = 57534096
%timeit misantroop(df, 'Symbol', num_groups=n*40)
191 ns ± 2.09 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
from pympler.asizeof = 414176