运行VB代码计算相似度时,定义首字母缩写词。

3
我正在excel中使用以下vb代码计算A列和B列之间的相似度,效果很好。
接下来我要做的是定义缩写词,以便不影响计算出的相似度。例如:如果在A列中有“ABC LLC”,而在B列中有“ABC limited liability company”,当前的vb代码将返回这两列不太相似。然而,我想让它们返回100%相似,通过定义“LLC”和“Limited Liability Company”实际上是相同的东西。我应该在代码的哪里进行修改?谢谢!
免责声明 - 是的,我知道有一些插件可以做到这一点。但是我的数据集太大了,无法使用它们。
Public Function Similarity(ByVal String1 As String, _
                           ByVal String2 As String, _
                           Optional ByRef RetMatch As String, _
                           Optional min_match = 1) As Single

'Returns percentile of similarity between 2 strings (ignores case)

'"RetMatch"  returns the characters that match(in order)
'"min_match" specifies minimum number af char's in a row to match


Dim b1() As Byte, b2() As Byte
Dim lngLen1 As Long, lngLen2 As Long
Dim lngResult As Long

  If UCase(String1) = UCase(String2) Then       '..Exactly the same
    Similarity = 1

  Else                                          '..one string is empty
    lngLen1 = Len(String1)
    lngLen2 = Len(String2)
    If (lngLen1 = 0) Or (lngLen2 = 0) Then
      Similarity = 0

    Else                                        '..otherwise find similarity
      b1() = StrConv(UCase(String1), vbFromUnicode)
      b2() = StrConv(UCase(String2), vbFromUnicode)
      lngResult = Similarity_sub(0, lngLen1 - 1, _
                                 0, lngLen2 - 1, _
                                 b1, b2, _
                                 String1, _
                                 RetMatch, _
                                 min_match)
      Erase b1
      Erase b2
      If lngLen1 >= lngLen2 Then
        Similarity = lngResult / lngLen1
      Else
        Similarity = lngResult / lngLen2
      End If
    End If
  End If

End Function

Private Function Similarity_sub(ByVal start1 As Long, ByVal end1 As Long, _
                                ByVal start2 As Long, ByVal end2 As Long, _
                                ByRef b1() As Byte, ByRef b2() As Byte, _
                                ByVal FirstString As String, _
                                ByRef RetMatch As String, _
                                ByVal min_match As Long, _
                                Optional recur_level As Integer = 0) As Long
'* CALLED BY: Similarity *  (RECURSIVE)

Dim lngCurr1 As Long, lngCurr2 As Long
Dim lngMatchAt1 As Long, lngMatchAt2 As Long
Dim i As Long
Dim lngLongestMatch As Long, lngLocalLongestMatch As Long
Dim strRetMatch1 As String, strRetMatch2 As String

  If (start1 > end1) Or (start1 < 0) Or (end1 - start1 + 1 < min_match) _
  Or (start2 > end2) Or (start2 < 0) Or (end2 - start2 + 1 < min_match) Then
    Exit Function     '(exit if start/end is out of string, or length is too short)
  End If

  For lngCurr1 = start1 To end1        '(for each char of first string)
    For lngCurr2 = start2 To end2        '(for each char of second string)
      i = 0
      Do Until b1(lngCurr1 + i) <> b2(lngCurr2 + i)   'as long as chars DO match..
        i = i + 1
        If i > lngLongestMatch Then     '..if longer than previous best, store starts & length
          lngMatchAt1 = lngCurr1
          lngMatchAt2 = lngCurr2
          lngLongestMatch = i
        End If
        If (lngCurr1 + i) > end1 Or (lngCurr2 + i) > end2 Then Exit Do
      Loop
    Next lngCurr2
  Next lngCurr1

  If lngLongestMatch < min_match Then Exit Function 'no matches at all, so no point checking for sub-matches!

  lngLocalLongestMatch = lngLongestMatch                   'call again for BEFORE + AFTER
  RetMatch = ""
                              'Find longest match BEFORE the current position
  lngLongestMatch = lngLongestMatch _
                  + Similarity_sub(start1, lngMatchAt1 - 1, _
                                   start2, lngMatchAt2 - 1, _
                                   b1, b2, _
                                   FirstString, _
                                   strRetMatch1, _
                                   min_match, _
                                   recur_level + 1)
  If strRetMatch1 <> "" Then
    RetMatch = RetMatch & strRetMatch1 & "*"
  Else
    RetMatch = RetMatch & IIf(recur_level = 0 _
                              And lngLocalLongestMatch > 0 _
                              And (lngMatchAt1 > 1 Or lngMatchAt2 > 1) _
                              , "*", "")
  End If

                              'add local longest
  RetMatch = RetMatch & Mid$(FirstString, lngMatchAt1 + 1, lngLocalLongestMatch)

                              'Find longest match AFTER the current position
  lngLongestMatch = lngLongestMatch _
                  + Similarity_sub(lngMatchAt1 + lngLocalLongestMatch, end1, _
                                   lngMatchAt2 + lngLocalLongestMatch, end2, _
                                   b1, b2, _
                                   FirstString, _
                                   strRetMatch2, _
                                   min_match, _
                                   recur_level + 1)

  If strRetMatch2 <> "" Then
    RetMatch = RetMatch & "*" & strRetMatch2
  Else
    RetMatch = RetMatch & IIf(recur_level = 0 _
                              And lngLocalLongestMatch > 0 _
                              And ((lngMatchAt1 + lngLocalLongestMatch < end1) _
                                   Or (lngMatchAt2 + lngLocalLongestMatch < end2)) _
                              , "*", "")
  End If
                             'Return result
  Similarity_sub = lngLongestMatch

End Function

如果您可以创建一个包含缩写及其定义的数组(可能在另一个工作表中),则可以使用检查来检查该值是否引用了表中的索引/匹配。这可以是选择语句的一部分,其中第一个Case是您典型的检查,第二个Case是此索引/匹配检查,而第三个Case将是“不相似”。这只是一个想法。 - Cyril
2个回答

4

我可以为您提供一些整合这些 缩写 的方法,但是这需要您自己负责。请注意,此方法并不保证100%成功,但是你已经处于模糊的世界中。

假设我们有一个 字典

  • 键是长语句
  • 值是缩写

在比较两个字符串之前,我们通过将每个出现的长语句替换为其缩写来最小化它们,然后使用您的方法 相似性(或任何其他方法)进行比较。

' Fills an abbreviation dictionary
Sub InitializeDict(ByRef abbrev As Scripting.Dictionary)
    abbrev("limited liability company") = "LLC"
    abbrev("United Kingdom") = "U.K."
    '... Add all abbreviations into dict

    ' Instead of harcoding, you can better load the key/value
    ' pairs from a dedicated worksheet... 

End Sub

' Minimizes s by putting abbreviations
Sub Abbreviate(ByRef s As String)
    Static abbrev As Scripting.Dictionary ' <-- static, inititlized only once
    If abbrev Is Nothing Then
        Set abbrev = CreateObject("Scripting.Dictionary")
        abbrev.CompareMode = vbTextCompare
        InitializeDict abbrev
    End If

    Dim phrase
    For Each phrase In abbrev.Keys
        s = Replace(s, phrase, abbrev(phrase), vbTextCompare)
    Next
End Sub

' A small amendment to this function: abbreviate strings before comparing
Public Function Similarity(ByVal String1 As String, _
                       ByVal String2 As String, _
                       Optional ByRef RetMatch As String, _
                       Optional min_match = 1) As Single

    Abbreviate String1
    Abbreviate String2
    ' ... Rest of the routine
End Function

1
我想我明白了 - 非常感谢! - jonv
@jonv 欢迎,如果你实现了这个想法(其实是你的想法,我只是提出了一个技术实现),并且显著改进了你的相似性检查器,请及时更新我们。我非常感兴趣 ;) - A.S.H

0

检查字符串是否相似可能更容易。例如

If "ABC limited liability company" Like "ABC L*L*C*" Then

*匹配任意0个或多个字符,因此为True。

Option Compare Text    ' makes string comparisons case insensitive

Function areLike(str1 As String, str2 As String) As Single

    If str1 = str2 Then areLike = 1: Exit Function

    Dim pattern As String, temp As String

    If LenB(str1) < LenB(str2) Then 
        pattern = str1
        temp = str2
    Else
        pattern = str2
        temp = str1
    End If

    pattern = StrConv(pattern, vbUnicode)       ' "ABC LLC" to "A␀B␀C␀ ␀L␀L␀C␀"   
    pattern = Replace(pattern, vbNullChar, "*") ' "A*B*C* *L*L*C*"
    pattern = Replace(pattern, " *", " ")       ' "A*B*C* L*L*C*"

    If temp Like pattern Then areLike = 1: Exit Function

    ' else areLike = some other similarity function

End Function

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接