您可以使用 `scipy.stats.mode` 函数来解决这个问题:
from scipy import stats
# Transpose the frame's values so that every original row becomes a column;
# stats.mode is called without an axis argument here, and SciPy documents
# its default as axis 0.
a = df.values.T
# b bundles two arrays: the most frequent values and their counts.
b = stats.mode(a)
print(b)
ModeResult(mode=array([[0, 1, 1]], dtype=int64), count=array([[3, 3, 4]]))
# Older SciPy returns the mode/count arrays with shape (1, n_rows); newer
# releases drop the reduced axis and return shape (n_rows,).  Indexing with
# [0] would then pick a single scalar and broadcast it over the whole
# column, so flatten instead -- that yields one entry per row either way.
df['frequent'] = b[0].ravel()
df['freq_count'] = b[1].ravel()
print(df)
bit1 bit2 bit2.1 bit4 bit5 frequent freq_count
0 0 0 0 1 1 0 3
1 1 1 1 0 0 1 3
2 1 0 1 1 1 1 4
使用 `Counter.most_common` 方法:
from collections import Counter
def f(x):
    """Return the most common value in ``x`` together with its count.

    Args:
        x: Iterable of hashable values, e.g. one row of a DataFrame.

    Returns:
        A two-element ``pd.Series``: the most common value, then the number
        of times it occurs.

    Raises:
        IndexError: If ``x`` is empty, because there is no most common item.
    """
    value, count = Counter(x).most_common(1)[0]
    return pd.Series([value, count])
# Call f on every row (axis=1); its two returned values per row are stored
# in the 'frequent' and 'freq_count' columns.
df[['frequent','freq_count']] = df.apply(f, axis=1)
另一种解决方案:
def f(x):
    """Return the most frequent value in ``x`` and its count via bincount.

    Args:
        x: Non-empty sequence of non-negative integers (a requirement of
            ``np.bincount``), e.g. one row of a DataFrame.

    Returns:
        A two-element ``pd.Series``: the most frequent value, then the
        number of times it occurs.  ``np.argmax`` returns the first maximum,
        so on a tie the smallest value is reported.
    """
    counts = np.bincount(x)
    mode = np.argmax(counts)
    # The winning count is already available at the argmax position, so a
    # second pass over ``counts`` is not needed.
    return pd.Series([mode, counts[mode]])
# Call f on every row (axis=1); its two returned values per row are stored
# in the 'frequent' and 'freq_count' columns.
df[['frequent','freq_count']] = df.apply(f, axis=1)
替代方案:
from collections import defaultdict
def f(x):
    """Return the most frequent value in ``x`` and its count.

    Occurrences are tallied by hand in a ``defaultdict``.

    Args:
        x: Non-empty iterable of hashable values, e.g. one DataFrame row.

    Returns:
        A two-element ``pd.Series``: the most frequent value, then the
        number of times it occurs.  On a tie, the value seen first wins.

    Raises:
        ValueError: If ``x`` is empty, because there is nothing to pick.
    """
    tally = defaultdict(int)
    for item in x:
        tally[item] += 1
    # Only the single best entry is needed, so take the maximum directly
    # instead of sorting every (value, count) pair.
    return pd.Series(max(tally.items(), key=lambda pair: pair[1]))
# Call f on every row (axis=1); its two returned values per row are stored
# in the 'frequent' and 'freq_count' columns.
df[['frequent','freq_count']] = df.apply(f, axis=1)
计时结果(f1、f2、f3、mod、mod1 的定义见下文):
# Benchmark fixture: seed NumPy's global RNG so the data is reproducible,
# then build a frame of N rows with 20 random 0/1 integers each.
N = 10000
np.random.seed(100)
df = pd.DataFrame(
    np.random.randint(0, 2, size=(N, 20))
)
In [140]: %timeit df.apply(f1, axis=1)
1 loop, best of 3: 1.78 s per loop
In [141]: %timeit df.apply(f2, axis=1)
1 loop, best of 3: 1.66 s per loop
In [142]: %timeit df.apply(f3, axis=1)
1 loop, best of 3: 1.7 s per loop
In [143]: %timeit mod(df)
100 loops, best of 3: 8.37 ms per loop
In [144]: %timeit mod1(df)
100 loops, best of 3: 8.88 ms per loop
from collections import Counter
from collections import defaultdict
from scipy import stats
def f1(x):
    """Return the most common value in ``x`` together with its count.

    Args:
        x: Iterable of hashable values, e.g. one row of a DataFrame.

    Returns:
        A two-element ``pd.Series``: the most common value, then the number
        of times it occurs.

    Raises:
        IndexError: If ``x`` is empty, because there is no most common item.
    """
    value, count = Counter(x).most_common(1)[0]
    return pd.Series([value, count])
def f2(x):
    """Return the most frequent value in ``x`` and its count via bincount.

    Args:
        x: Non-empty sequence of non-negative integers (a requirement of
            ``np.bincount``), e.g. one row of a DataFrame.

    Returns:
        A two-element ``pd.Series``: the most frequent value, then the
        number of times it occurs.  ``np.argmax`` returns the first maximum,
        so on a tie the smallest value is reported.
    """
    counts = np.bincount(x)
    mode = np.argmax(counts)
    # The winning count is already available at the argmax position, so a
    # second pass over ``counts`` is not needed.
    return pd.Series([mode, counts[mode]])
def f3(x):
    """Return the most frequent value in ``x`` and its count.

    Occurrences are tallied by hand in a ``defaultdict``.

    Args:
        x: Non-empty iterable of hashable values, e.g. one DataFrame row.

    Returns:
        A two-element ``pd.Series``: the most frequent value, then the
        number of times it occurs.  On a tie, the value seen first wins.

    Raises:
        ValueError: If ``x`` is empty, because there is nothing to pick.
    """
    tally = defaultdict(int)
    for item in x:
        tally[item] += 1
    # Only the single best entry is needed, so take the maximum directly
    # instead of sorting every (value, count) pair.
    return pd.Series(max(tally.items(), key=lambda pair: pair[1]))
def mod(df):
    """Add each row's most frequent value and its count to ``df``.

    The values are transposed so that reducing along axis 0 with
    ``scipy.stats.mode`` produces one result per original row.

    Args:
        df: DataFrame whose values ``scipy.stats.mode`` can process.

    Returns:
        The same ``df`` object, modified in place: column ``'a'`` holds the
        row-wise mode and column ``'b'`` the number of times it occurs.
        Because ``df.values`` covers every column, calling this again on the
        returned frame includes ``'a'`` and ``'b'`` in the computation.
    """
    a = df.values.T
    b = stats.mode(a, axis=0)
    # Older SciPy keeps the reduced axis, giving (1, n_rows) arrays; newer
    # releases return (n_rows,).  Indexing with [0] would pick one scalar in
    # the latter case, so flatten to get one entry per row on both.
    df['a'] = b[0].ravel()
    df['b'] = b[1].ravel()
    return df
def mod1(df):
    """Add each row's most frequent value and its count to ``df``.

    Calls ``scipy.stats.mode`` along axis 1 of the frame's values, so no
    transpose is needed.

    Args:
        df: DataFrame whose values ``scipy.stats.mode`` can process.

    Returns:
        The same ``df`` object, modified in place: column ``'a'`` holds the
        row-wise mode and column ``'b'`` the number of times it occurs.
        Because ``df.values`` covers every column, calling this again on the
        returned frame includes ``'a'`` and ``'b'`` in the computation.
    """
    a = df.values
    b = stats.mode(a, axis=1)
    # Older SciPy keeps the reduced axis, giving (n_rows, 1) arrays; newer
    # releases return (n_rows,), on which [:, 0] raises IndexError.
    # Flattening yields one entry per row on both.
    df['a'] = b[0].ravel()
    df['b'] = b[1].ravel()
    return df