基于部分匹配合并两个Pandas DataFrame

Question

基于部分匹配合并两个Pandas DataFrame

16

两个DataFrame中的城市名称格式不同。我想执行左连接，并根据两个DataFrame的City字段之间的部分字符串匹配，提取对应的Geo字段。

import pandas as pd

# Left-hand frame: city names in "City, ST" form plus the value to keep.
df1 = pd.DataFrame(
    dict(
        City=["San Francisco, CA", "Oakland, CA"],
        Val=[1, 2],
    )
)

# Right-hand frame: differently formatted city names carrying the geo code.
df2 = pd.DataFrame(
    dict(
        City=["San Francisco-Oakland, CA", "Salinas, CA"],
        Geo=["geo1", "geo2"],
    )
)

连接后期望得到的DataFrame：

 City                   Val   Geo

 San Francisco, CA      1     geo1
 Oakland, CA            2     geo1

- kms

3个回答

9

这应该可以解决问题。它使用Levenshtein距离（编辑距离）进行字符串匹配。

pip install thefuzz[speedup]

import pandas as pd
import numpy as np

from thefuzz import process

def fuzzy_match(
    a: pd.DataFrame, b: pd.DataFrame, col: str, limit: int = 5, thresh: int = 80
):
    """Join the best fuzzy match from ``b`` onto every row of ``a``.

    Each value of ``a[col]`` is scored against all values of ``b[col]`` with
    ``process.extract``; the highest-scoring candidate is kept and the other
    columns of ``b`` for that candidate are appended to the row of ``a``.

    Args:
        a: Left frame. The join aligns on row positions ``0..len(a)-1``, so
            ``a`` is expected to carry a default, unnamed integer index.
        b: Right frame whose ``col`` values are the match candidates.
        col: Name of the string column, present in both frames, to match on.
        limit: Maximum number of candidates requested per row of ``a``.
        thresh: Accepted for backward compatibility; the scores are currently
            not filtered by it.

    Returns:
        The rows of ``a`` with the non-key columns of the best match from
        ``b`` appended. The helper columns (``<col>_b``, ``score``,
        ``index``) are dropped.
    """
    choices = b[col].tolist()

    matches = a[col].apply(lambda x: process.extract(x, choices, limit=limit))
    matches = pd.DataFrame(np.concatenate(matches), columns=["match", "score"])

    # Each row of `a` is assumed to contribute min(limit, len(b)) consecutive
    # candidate rows; label every candidate with the position of its row in
    # `a`. You can drop this index when `limit=1`.
    per_row = min(limit, len(b))
    row_in_a = np.repeat(np.arange(len(a)), per_row)

    # join other columns in b to matches
    to_join = (
        pd.merge(left=b, right=matches, how="right", left_on=col, right_on="match")
        .set_index(row_in_a)
        .drop(columns=["match"])
        .astype({"score": "int16"})
    )
    print("\t the index here represents the row in dataframe a on which to join")
    print(to_join)

    res = pd.merge(
        left=a, right=to_join, left_index=True, right_index=True, suffixes=("", "_b")
    )

    # return only the highest match or you can just set the limit to 1
    # and remove this
    df = res.reset_index()
    df = df.iloc[df.groupby(by="index")["score"].idxmax()].reset_index(drop=True)

    # The overlapping key column from `b` received the "_b" suffix above.
    return df.drop(columns=[f"{col}_b", "score", "index"])

def test(df):
    """Print the expected frame and ``df``, then assert that they are equal.

    Args:
        df: Join result to compare against the hard-coded expected frame.

    Raises:
        AssertionError: If ``df`` does not equal the expected frame.
    """
    want = pd.DataFrame(
        {
            "City": ["San Francisco, CA", "Oakland, CA"],
            "Val": [1, 2],
            "Geo": ["geo1", "geo1"],
        }
    )

    # Show both frames under a centred banner before comparing them.
    for banner, frame in ((f'{"expected":-^70}', want), (f'{"res":-^70}', df)):
        print(banner)
        print(frame)

    assert want.equals(df)


if __name__ == "__main__":
    # Sample frames from the question: same cities, differently formatted names.
    left = pd.DataFrame(
        {
            "City": ["San Francisco, CA", "Oakland, CA"],
            "Val": [1, 2],
        }
    )
    right = pd.DataFrame(
        {
            "City": ["San Francisco-Oakland, CA", "Salinas, CA"],
            "Geo": ["geo1", "geo2"],
        }
    )

    print(f'\n\n{"fuzzy match":-^70}')
    # Run the join and check it against the expected frame.
    test(fuzzy_match(left, right, col="City"))

- Ian Zurutuza

3

使用余弦相似度，借助sklearn的文本特征提取来实现。对于大型数据集，计算余弦相似度可能会很慢，可以考虑使用sparse_dot_topn：pip install sparse_dot_topn

参见：https://www.sun-analytics.nl/posts/2017-07-26-boosting-selection-of-most-similar-entities-in-large-scale-datasets/

pip install scikit-learn

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# https://dev59.com/L3A75IYBdhLWcg3wOWRd#27086669
# as a preprocessor for TfidfVectorizer
def clean_corpus(s: str):
    """Return ``s`` lower-cased, with a fixed set of punctuation turned into spaces."""
    # Each character listed here maps to one space; all others are kept.
    to_space = "\\`*_{}[]()>#+-.!$',"
    table = str.maketrans(to_space, " " * len(to_space))
    return s.translate(table).lower()

# why n-grams?
# this should account for any word misspellings
def fit_vectorizer(corpus: np.ndarray, n: int = 3):
    """Fit a character n-gram TF-IDF vectorizer on ``corpus``.

    Args:
        corpus: Array of strings (the "documents") to fit on.
        n: Character n-gram length, used for both ends of ``ngram_range``.

    Returns:
        A ``(tfidf, vectorizer)`` pair: the result of ``fit_transform(corpus)``
        and the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        analyzer="char_wb", preprocessor=clean_corpus, ngram_range=(n, n)
    )
    tfidf = vectorizer.fit_transform(corpus)

    return tfidf, vectorizer


def cosine_similarity_join(a, b, col_name):
    """Join each row of ``a`` to the most similar row of ``b`` on ``col_name``.

    The strings of both frames are vectorized together with
    ``fit_vectorizer``. Every string of ``a`` is scored against every string
    of ``b`` with cosine similarity, and the best-scoring row of ``b`` is
    appended to the row of ``a``.

    Args:
        a: Left frame containing ``col_name``.
        b: Right frame containing ``col_name``.
        col_name: Name of the string column to compare.

    Returns:
        The rows of ``a`` with the non-key columns of the best match from
        ``b`` appended. The helper columns (``<col_name>_b``, ``score``,
        ``index``) are dropped.

    Note:
        Scores are labelled by row position and joined on the frames'
        indexes, so both frames are expected to carry default ``0..n-1``
        indexes.
    """
    a_len = len(a[col_name])

    # all of the "documents" in a 1D array: strings of `a` first, then `b`
    corpus = np.concatenate([a[col_name].to_numpy(), b[col_name].to_numpy()])

    # only the matrix is needed here; the fitted vectorizer is discarded
    tfidf, _ = fit_vectorizer(corpus, 3)

    # in this matrix each row represents the str in a and the col is the str from b, value is the cosine similarity
    scores = cosine_similarity(tfidf[:a_len], tfidf[a_len:])
    print('in this matrix each row represents the str in a and the col is the str from b')
    print(scores)

    # long format: one score per (row of a, row of b) pair
    res_series = pd.DataFrame(scores).stack().rename("score")
    res_series.index.set_names(['a', 'b'], inplace=True)

    # join scores to b, then keep only the `a` level as the index
    b_scored = pd.merge(left=b, right=res_series, left_index=True, right_on='b').droplevel('b')

    res = pd.merge(left=a, right=b_scored, left_index=True, right_index=True, suffixes=('', '_b'))
    print(res)

    # keep the highest-scoring candidate for each row of `a`
    df = res.reset_index()
    df = df.iloc[df.groupby(by="index")["score"].idxmax()].reset_index(drop=True)

    # The overlapping key column from `b` received the "_b" suffix above.
    return df.drop(columns=[f"{col_name}_b", "score", "index"])

def test(df):
    """Compare ``df`` with the expected join result, printing both frames.

    Args:
        df: Join result to check.

    Raises:
        AssertionError: If ``df`` does not equal the expected frame.
    """
    reference = pd.DataFrame(
        {
            "City": ["San Francisco, CA", "Oakland, CA"],
            "Val": [1, 2],
            "Geo": ["geo1", "geo1"],
        }
    )

    # Expected frame first, then the actual result, each under its banner.
    print(f'{"expected":-^70}', reference, sep="\n")
    print(f'{"res":-^70}', df, sep="\n")

    frames_match = reference.equals(df)
    assert frames_match


if __name__ == "__main__":
    # Sample frames from the question: same cities, differently formatted names.
    left = pd.DataFrame(
        {
            "City": ["San Francisco, CA", "Oakland, CA"],
            "Val": [1, 2],
        }
    )
    right = pd.DataFrame(
        {
            "City": ["San Francisco-Oakland, CA", "Salinas, CA"],
            "Geo": ["geo1", "geo2"],
        }
    )

    print(f'\n\n{"n-gram cosine similarity":-^70}')
    # Run the join and check it against the expected frame.
    test(cosine_similarity_join(left, right, col_name="City"))

- Ian Zurutuza

网页内容由Stack Overflow提供，点击上面的原文链接可以查看英文原文。

- Corralien · Accepted Answer

更新：模糊字符串匹配项目fuzzywuzzy已更名为thefuzz并转移至此处

您可以使用thefuzz包和函数extractOne：

# Python env: pip install thefuzz
# Anaconda env: pip install thefuzz
# -> thefuzz is not yet available on Anaconda (2021-09-18)
# -> you can use the old package: conda install -c conda-forge fuzzywuzzy

from thefuzz import process

def best_city(x):
    """Return the ``df2`` index label of the city that best matches ``x``."""
    # Element [2] of the extractOne result is the index label in df2.
    return process.extractOne(x, df2["City"])[2]  # See note below


# Look up Geo in df2 by the matched labels and assign it positionally to df1.
matched_labels = df1["City"].map(best_city).values
df1['Geo'] = df2.loc[matched_labels, 'Geo'].values

输出：

>>> df1
                City  Val   Geo
0  San Francisco, CA    1  geo1
1        Oakland, CA    2  geo1

注意：extractOne会针对最佳匹配返回一个包含3个元素的元组：df2中的城市名称[0]、匹配得分[1]和索引[2]（即我在上面使用的那个）。