Finding semantic similarity between sentences in a document

I gathered some code from this link (where it is nicely colour-coded) and made 4 small changes to fix some errors. I also used some code from 2 previous forum questions.
The code computes the semantic similarity between consecutive sentences throughout a text and then displays all the similarity values it obtains, like so:
'the yellow door.', 'The red hammer' 0.65
'pink fox in the woods.', 'commander fox is blue.' 0.32
Here is the code:
# imports required by the code below (NLTK also needs its 'punkt', 'wordnet'
# and 'brown' data packages, installable via nltk.download)
import math
import sys

import nltk
import numpy as np
from nltk.corpus import brown
from nltk.corpus import wordnet as wn

ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85

brown_freqs = dict()
N = 0

######################### word similarity ##########################

def get_best_synset_pair(word_1, word_2):
    """ 
    Choose the pair with highest path similarity among all pairs. 
    Mimics pattern-seeking behavior of humans.
    """
    max_sim = -1.0
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return None, None
    else:
        max_sim = -1.0
        best_pair = None, None
        for synset_1 in synsets_1:
            for synset_2 in synsets_2:
                sim = wn.path_similarity(synset_1, synset_2)
                # path_similarity may return None when no path exists, so
                # guard against comparing None with a float
                if sim is not None and sim > max_sim:
                    max_sim = sim
                    best_pair = synset_1, synset_2
        return best_pair

def length_dist(synset_1, synset_2):
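    """
    Return a path-length based similarity in [0, 1] between two synsets,
    computed as exp(-ALPHA * shortest_path_distance); word overlap between
    the synsets counts as a distance of 1, identical synsets as 0.
    """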
    
    l_dist = sys.maxsize  # sys.maxint does not exist in Python 3
    if synset_1 is None or synset_2 is None: 
        return 0.0
    if synset_1 == synset_2:
        # if synset_1 and synset_2 are the same synset return 0
        l_dist = 0.0
    else:
        wset_1 = set([str(x.name()) for x in synset_1.lemmas()])        
        wset_2 = set([str(x.name()) for x in synset_2.lemmas()])
        if len(wset_1.intersection(wset_2)) > 0:
            # if synset_1 != synset_2 but there is word overlap, return 1.0
            l_dist = 1.0
        else:
            # just compute the shortest path between the two
            l_dist = synset_1.shortest_path_distance(synset_2)
            if l_dist is None:
                l_dist = 0.0
    # normalize path length to the range [0,1]
    return math.exp(-ALPHA * l_dist)

def hierarchy_dist(synset_1, synset_2):
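    """
    Return a depth-based similarity in [0, 1] between two synsets, computed
    as tanh(BETA * h), where h is taken from the hypernym distances of the
    two synsets (the synset's own depth when they are identical, otherwise
    the largest distance from either synset to a shared hypernym).
    """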
   
    h_dist = sys.maxsize  # sys.maxint does not exist in Python 3
    if synset_1 is None or synset_2 is None: 
        return h_dist
    if synset_1 == synset_2:
        # return the depth of one of synset_1 or synset_2
        h_dist = max([x[1] for x in synset_1.hypernym_distances()])
    else:
        # find the max depth of least common subsumer
        hypernyms_1 = {x[0]:x[1] for x in synset_1.hypernym_distances()}
        hypernyms_2 = {x[0]:x[1] for x in synset_2.hypernym_distances()}
        lcs_candidates = set(hypernyms_1.keys()).intersection(
            set(hypernyms_2.keys()))
        if len(lcs_candidates) > 0:
            lcs_dists = []
            for lcs_candidate in lcs_candidates:
                lcs_d1 = 0
                if lcs_candidate in hypernyms_1:
                    lcs_d1 = hypernyms_1[lcs_candidate]
                lcs_d2 = 0
                if lcs_candidate in hypernyms_2:
                    lcs_d2 = hypernyms_2[lcs_candidate]
                lcs_dists.append(max([lcs_d1, lcs_d2]))
            h_dist = max(lcs_dists)
        else:
            h_dist = 0
    return ((math.exp(BETA * h_dist) - math.exp(-BETA * h_dist)) / 
        (math.exp(BETA * h_dist) + math.exp(-BETA * h_dist)))
    
def word_similarity(word_1, word_2):
    synset_pair = get_best_synset_pair(word_1, word_2)
    return (length_dist(synset_pair[0], synset_pair[1]) * 
        hierarchy_dist(synset_pair[0], synset_pair[1]))

######################### sentence similarity ##########################

def most_similar_word(word, word_set):
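    """
    Find the word in word_set with the highest word_similarity to `word`.
    Returns a (most similar word, similarity) pair.
    """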
    
    max_sim = -1.0
    sim_word = ""
    for ref_word in word_set:
      sim = word_similarity(word, ref_word)
      if sim > max_sim:
          max_sim = sim
          sim_word = ref_word
    return sim_word, max_sim
    
def info_content(lookup_word):
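    """
    Return the information content of lookup_word, estimated from its
    frequency in the Brown corpus as 1 - log(n + 1) / log(N + 1).
    """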
   
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not word in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
    
def semantic_vector(words, joint_words, info_content_norm):
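    """
    Build the semantic vector of `words` over the joint word set: 1.0 where a
    joint word occurs in the sentence, otherwise PHI if its best similarity
    to any sentence word exceeds PHI, optionally weighted by information
    content.
    """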
    
    sent_set = set(words)
    semvec = np.zeros(len(joint_words))
    i = 0
    for joint_word in joint_words:
        if joint_word in sent_set:
            # if word in union exists in the sentence, s(i) = 1 (unnormalized)
            semvec[i] = 1.0
            if info_content_norm:
                semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2)
        else:
            # find the most similar word in the joint set and set the sim value
            sim_word, max_sim = most_similar_word(joint_word, sent_set)
            semvec[i] = PHI if max_sim > PHI else 0.0
            if info_content_norm:
                semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word)
        i = i + 1
    return semvec                
            
def semantic_similarity(sentence_1, sentence_2, info_content_norm):
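    """
    Compute the semantic similarity of two sentences as the cosine similarity
    between their semantic vectors over the joint word set.
    """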
    
    words_1 = nltk.word_tokenize(sentence_1)
    words_2 = nltk.word_tokenize(sentence_2)
    joint_words = set(words_1).union(set(words_2))
    vec_1 = semantic_vector(words_1, joint_words, info_content_norm)
    vec_2 = semantic_vector(words_2, joint_words, info_content_norm)
    return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))

######################### word order similarity ##########################

def word_order_vector(words, joint_words, windex):
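    """
    Build the word-order vector of `words` over the joint word set: for each
    joint word, its index from windex if it occurs in the sentence, else the
    index of its most similar sentence word when that similarity exceeds ETA,
    else 0.
    """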
    
    wovec = np.zeros(len(joint_words))
    i = 0
    wordset = set(words)
    for joint_word in joint_words:
        if joint_word in wordset:
            # word in joint_words found in sentence, just populate the index
            wovec[i] = windex[joint_word]
        else:
            # word not in joint_words, find most similar word and populate
            # word_vector with the thresholded similarity
            sim_word, max_sim = most_similar_word(joint_word, wordset)
            if max_sim > ETA:
                wovec[i] = windex[sim_word]
            else:
                wovec[i] = 0
        i = i + 1
    return wovec

def word_order_similarity(sentence_1, sentence_2):
    """
    Computes the word-order similarity between two sentences as the normalized
    difference of word order between the two sentences.
    """
    words_1 = nltk.word_tokenize(sentence_1)
    words_2 = nltk.word_tokenize(sentence_2)
    joint_words = list(set(words_1).union(set(words_2)))
    windex = {x[1]: x[0] for x in enumerate(joint_words)}
    r1 = word_order_vector(words_1, joint_words, windex)
    r2 = word_order_vector(words_2, joint_words, windex)
    return 1.0 - (np.linalg.norm(r1 - r2) / np.linalg.norm(r1 + r2))

######################### overall similarity ##########################

def similarity(sentence_1, sentence_2, info_content_norm):
    """
    Calculate the semantic similarity between two sentences. The last 
    parameter is True or False depending on whether information content
    normalization is desired or not.
    """
    return DELTA * semantic_similarity(sentence_1, sentence_2, info_content_norm) + \
        (1.0 - DELTA) * word_order_similarity(sentence_1, sentence_2)

Here is the loop part:

with open("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r") as sentence_file:
    # Initialize a list to hold the results
    results = []

    # Loop until we hit the end of the file
    while True:
        # Read two lines
        x = sentence_file.readline()
        y = sentence_file.readline()

        # Check if we've reached the end of the file, if so, we're done
        if not y:
            # Break out of the infinite loop
            break
        else:
            # The .rstrip('\n') removes the newline character from each line
            x = x.rstrip('\n')
            y = y.rstrip('\n')

            # Calculate your similarity value
            similarity_value = similarity(x, y, True)

            # Add the two lines and similarity value to the results list
            results.append([x, y, similarity_value])

# Loop through the pairs in the results list and print them
for pair in results:
    print(pair)

When I run the code on my text file, instead of numbers for the sentence similarity values I get nan, along with this warning:

 Warning (from warnings module):
  File "C:\Users\Lenovo2\Desktop\Semantic Analysis (1).py", line 191
    return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))
RuntimeWarning: invalid value encountered in double_scalars

From a previous forum question I learned that this warning probably means I am dividing by zero, i.e. that there is a zero vector somewhere. I'm stuck now; with my limited Python experience I don't know how to fix the program easily without changing too much.

1 Answer


My guess is that you are passing in an empty string. Are there blank lines in your text? You don't strip the newline before checking for an empty string, so a string that contains only a newline is not caught.

Since you appear to be on Windows, there may also be '\r\n'-style line endings, so your rstrip('\n') may not work as expected.
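
As a minimal sketch of why this produces nan (assuming the functions from your question are defined): an empty sentence tokenizes to no words, so its semantic vector is all zeros and the cosine denominator in semantic_similarity becomes zero.

# hypothetical reproduction: "" tokenizes to [], vec_1 is a zero vector,
# np.linalg.norm(vec_1) == 0, and 0/0 yields nan plus the RuntimeWarning above
print(similarity("", "The red hammer", True))   # nan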

I suggest the following modification (with a debug print added):

# Loop until we hit the end of the file
while True:
    # Read two lines, removing trailing whitespace
    x = sentence_file.readline().rstrip()
    y = sentence_file.readline().rstrip()

    # Check if we've reached the end of the file, if so, we're done
    if not x or not y:
        # Break out of the infinite loop
        break
    else:
        print(x, y)
        # Calculate your similarity value
        similarity_value = similarity(x, y, True)

        # Add the two lines and similarity value to the results list
        results.append([x, y, similarity_value])

Note that the code also seems to have a bug, because you are not comparing the sentences pairwise. That is, if you have the sentences [a, b, c, d], you only compare (a, b) and (c, d), but you really want to compare (a, b), (b, c), (c, d).
You can simplify this by using the itertools library:
from itertools import pairwise

lines = open ("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r")
for a, b in pairwise(lines):
    x = a.rstrip()
    y = b.rstrip()
    # ... rest unchanged
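
Note that itertools.pairwise requires Python 3.10 or newer. On older versions, a minimal equivalent sketch (assuming the same file path, and skipping blank lines as discussed above) would be:

results = []
with open("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r") as sentence_file:
    # strip line endings and drop blank lines so no empty string reaches similarity()
    lines = [line.rstrip() for line in sentence_file if line.strip()]

# compare consecutive sentences: (a, b), (b, c), (c, d), ...
for x, y in zip(lines, lines[1:]):
    results.append([x, y, similarity(x, y, True)])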
