用Python计算Pearson相关性和显著性

224
我正在寻找一个函数,它以两个列表作为输入,并返回Pearson相关系数和相关性的显著性。
19个回答

5
这是使用NumPy实现的Pearson相关函数。
def corr(data1, data2):
    "data1 & data2 should be NumPy arrays."
    mean1 = data1.mean()
    mean2 = data2.mean()
    std1 = data1.std()
    std2 = data2.std()

    #corr = ((data1-mean1)*(data2-mean2)).mean()/(std1*std2)
    corr = ((data1*data2).mean()-mean1*mean2)/(std1*std2)
    return corr

1
相关性的显著性并不存在。Numpy、Scipy以及几乎所有用于Python的统计库都有皮尔逊相关系数,但问题在于显著性,而你却忽略了它。 - undefined

3
我有一个非常简单易懂的解决方案。对于长度相等的两个数组,可以轻松计算 Pearson 系数,方法如下:
def manual_pearson(a,b):
"""
Accepts two arrays of equal length, and computes correlation coefficient. 
Numerator is the sum of product of (a - a_avg) and (b - b_avg), 
while denominator is the product of a_std and b_std multiplied by 
length of array. 
"""
  a_avg, b_avg = np.average(a), np.average(b)
  a_stdev, b_stdev = np.std(a), np.std(b)
  n = len(a)
  denominator = a_stdev * b_stdev * n
  numerator = np.sum(np.multiply(a-a_avg, b-b_avg))
  p_coef = numerator/denominator
  return p_coef

3
这里是基于稀疏向量的皮尔逊相关性的实现。这里的向量被表示为一个由元组(索引,值)表示的列表。两个稀疏向量的长度可以不同,但在整个向量大小上必须相同。这对于文本挖掘应用非常有用,因为向量的大小通常非常大,大部分特征都是词袋,并且计算通常使用稀疏向量进行。
def get_pearson_corelation(self, first_feature_vector=[], second_feature_vector=[], length_of_featureset=0):
    indexed_feature_dict = {}
    if first_feature_vector == [] or second_feature_vector == [] or length_of_featureset == 0:
        raise ValueError("Empty feature vectors or zero length of featureset in get_pearson_corelation")

    sum_a = sum(value for index, value in first_feature_vector)
    sum_b = sum(value for index, value in second_feature_vector)

    avg_a = float(sum_a) / length_of_featureset
    avg_b = float(sum_b) / length_of_featureset

    mean_sq_error_a = sqrt((sum((value - avg_a) ** 2 for index, value in first_feature_vector)) + ((
        length_of_featureset - len(first_feature_vector)) * ((0 - avg_a) ** 2)))
    mean_sq_error_b = sqrt((sum((value - avg_b) ** 2 for index, value in second_feature_vector)) + ((
        length_of_featureset - len(second_feature_vector)) * ((0 - avg_b) ** 2)))

    covariance_a_b = 0

    # Calculate covariance for the sparse vectors
    for tuple in first_feature_vector:
        if len(tuple) != 2:
            raise ValueError("Invalid feature frequency tuple in featureVector: %s") % (tuple,)
        indexed_feature_dict[tuple[0]] = tuple[1]
    count_of_features = 0
    for tuple in second_feature_vector:
        count_of_features += 1
        if len(tuple) != 2:
            raise ValueError("Invalid feature frequency tuple in featureVector: %s") % (tuple,)
        if tuple[0] in indexed_feature_dict:
            covariance_a_b += ((indexed_feature_dict[tuple[0]] - avg_a) * (tuple[1] - avg_b))
            del (indexed_feature_dict[tuple[0]])
        else:
            covariance_a_b += (0 - avg_a) * (tuple[1] - avg_b)

    for index in indexed_feature_dict:
        count_of_features += 1
        covariance_a_b += (indexed_feature_dict[index] - avg_a) * (0 - avg_b)

    # Adjust covariance with rest of vector with 0 value
    covariance_a_b += (length_of_featureset - count_of_features) * -avg_a * -avg_b

    if mean_sq_error_a == 0 or mean_sq_error_b == 0:
        return -1
    else:
        return float(covariance_a_b) / (mean_sq_error_a * mean_sq_error_b)

单元测试:
def test_get_get_pearson_corelation(self):
    vector_a = [(1, 1), (2, 2), (3, 3)]
    vector_b = [(1, 1), (2, 5), (3, 7)]
    self.assertAlmostEquals(self.sim_calculator.get_pearson_corelation(vector_a, vector_b, 3), 0.981980506062, 3, None, None)

    vector_a = [(1, 1), (2, 2), (3, 3)]
    vector_b = [(1, 1), (2, 5), (3, 7), (4, 14)]
    self.assertAlmostEquals(self.sim_calculator.get_pearson_corelation(vector_a, vector_b, 5), -0.0137089240555, 3, None, None)

2
从Python 3.10开始,皮尔逊相关系数(statistics.correlation)直接在标准库中提供:
from statistics import correlation

# a = [15, 12, 8, 8, 7, 7, 7, 6, 5, 3]
# b = [10, 25, 17, 11, 13, 17, 20, 13, 9, 15]
correlation(a, b)
# 0.1449981545806852

1
你可能会想知道如何解释你的概率,以便在特定方向上寻找相关性(负相关或正相关)。这是我编写的一个函数来帮助解决这个问题。它甚至可能是正确的!
它基于我从http://www.vassarstats.net/rsig.htmlhttp://en.wikipedia.org/wiki/Student%27s_t_distribution中获取的信息,感谢其他在此发布答案的人。
# Given (possibly random) variables, X and Y, and a correlation direction,
# returns:
#  (r, p),
# where r is the Pearson correlation coefficient, and p is the probability
# that there is no correlation in the given direction.
#
# direction:
#  if positive, p is the probability that there is no positive correlation in
#    the population sampled by X and Y
#  if negative, p is the probability that there is no negative correlation
#  if 0, p is the probability that there is no correlation in either direction
def probabilityNotCorrelated(X, Y, direction=0):
    x = len(X)
    if x != len(Y):
        raise ValueError("variables not same len: " + str(x) + ", and " + \
                         str(len(Y)))
    if x < 6:
        raise ValueError("must have at least 6 samples, but have " + str(x))
    (corr, prb_2_tail) = stats.pearsonr(X, Y)

    if not direction:
        return (corr, prb_2_tail)

    prb_1_tail = prb_2_tail / 2
    if corr * direction > 0:
        return (corr, prb_1_tail)

    return (corr, 1 - prb_1_tail)

1

0

计算相关性:

相关性 - 用于衡量两个不同变量之间的相似度

使用皮尔逊相关系数

from scipy.stats import pearsonr

# final_data is the dataframe with n set of columns
pearson_correlation = final_data.corr(method='pearson')
pearson_correlation
# Print correlation of n*n column

使用斯皮尔曼相关性
from scipy.stats import spearmanr

# final_data is the dataframe with n set of columns
spearman_correlation = final_data.corr(method='spearman')
spearman_correlation
# Print correlation of n*n column

使用肯德尔相关性
kendall_correlation=final_data.corr(method='kendall')
kendall_correlation

-1
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules. 
    
    It is assumed that the predictions are not constant.
    
    Returns the average of each sample's Pearson correlation coefficient"""
    
    y2 = y_pred.copy()
    y2 -= y2.mean(axis=0);    y2 /= y2.std(axis=0) 
    y1 = y_true.copy(); 
    y1 -= y1.mean(axis=0);    y1 /= y1.std(axis=0) 
        
    c = (y1*y2).mean().mean()# Correlation for rescaled matrices is just matrix product and average 
        
    return c

-2
def pearson(x,y):
  n=len(x)
  vals=range(n)

  sumx=sum([float(x[i]) for i in vals])
  sumy=sum([float(y[i]) for i in vals])

  sumxSq=sum([x[i]**2.0 for i in vals])
  sumySq=sum([y[i]**2.0 for i in vals])

  pSum=sum([x[i]*y[i] for i in vals])
  # Calculating Pearson correlation
  num=pSum-(sumx*sumy/n)
  den=((sumxSq-pow(sumx,2)/n)*(sumySq-pow(sumy,2)/n))**.5
  if den==0: return 0
  r=num/den
  return r

1
仅提供代码的答案并不是良好的做法。请考虑添加一些文字来解释您的代码如何回答问题。(阅读有关如何在SO上回答问题的帮助页面) - Yannis

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接