使用余弦相似度。
sklearn文本特征提取
对于大型数据集，计算余弦相似度可能会很慢；此时可以考虑使用 sparse_dot_topn：
pip install sparse_dot_topn
参见:https://www.sun-analytics.nl/posts/2017-07-26-boosting-selection-of-most-similar-entities-in-large-scale-datasets/
pip install scikit-learn
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Characters that clean_corpus blanks out. This is a fixed list, not a
# general "non-word character" class: e.g. '<', '?' and digits pass through.
_CLEAN_CORPUS_CHARS = "\\`*_{}[]()>#+-.!$',"
_CLEAN_CORPUS_TABLE = str.maketrans(
    _CLEAN_CORPUS_CHARS, " " * len(_CLEAN_CORPUS_CHARS)
)


def clean_corpus(s: str) -> str:
    """Return ``s`` lower-cased, with selected punctuation replaced by spaces.

    Each occurrence of one of the characters in ``_CLEAN_CORPUS_CHARS`` is
    replaced by a single space (one-for-one, so runs are not collapsed and
    the string length is preserved); every other character is kept.

    Args:
        s: The raw text to normalise.

    Returns:
        The normalised, lower-cased text.
    """
    # One translate pass replaces the per-character scan-and-replace loop.
    return s.translate(_CLEAN_CORPUS_TABLE).lower()
def fit_vectorizer(corpus: np.ndarray, n: int = 3) -> tuple:
    """Fit a character n-gram TF-IDF model on ``corpus`` and transform it.

    The vectorizer is configured with ``analyzer="char_wb"``, uses
    ``clean_corpus`` as its preprocessor, and extracts n-grams of exactly
    length ``n`` (``ngram_range=(n, n)``).

    Args:
        corpus: The strings to fit on and transform.
        n: The n-gram length, used as both the lower and upper bound of
            ``ngram_range``.

    Returns:
        A ``(tfidf, vectorizer)`` pair: the output of ``fit_transform`` for
        ``corpus`` (one row per input string) and the fitted
        ``TfidfVectorizer`` itself.
    """
    vectorizer = TfidfVectorizer(
        analyzer="char_wb",
        preprocessor=clean_corpus,
        ngram_range=(n, n),
    )
    tfidf = vectorizer.fit_transform(corpus)
    return tfidf, vectorizer
def cosine_similarity_join(a, b, col_name):
    """Join each row of ``a`` to its most similar row of ``b`` on ``col_name``.

    Similarity is the cosine similarity between TF-IDF vectors of character
    3-grams (see ``fit_vectorizer``), fitted on the values of ``col_name``
    from both frames together. For every row of ``a`` the row of ``b`` with
    the highest score is kept.

    Args:
        a: Left DataFrame; must contain ``col_name``.
        b: Right DataFrame; must contain ``col_name``.
        col_name: Name of the text column to match on.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the columns of ``a``
        followed by the columns of ``b`` except ``b``'s ``col_name``. The
        similarity score itself is not returned.

    Notes:
        * The intermediate columns ``score`` and ``index`` are created and
          dropped by name, so inputs that already carry columns with those
          names are not supported.
        * The similarity matrix and the intermediate merge are printed to
          stdout as diagnostics.
    """
    # Similarity-matrix positions are used as join keys below, so both frames
    # need a positional 0..n-1 index; inputs with any other index would
    # otherwise be matched incorrectly (or not at all). This is a no-op for
    # frames that already have the default index.
    a = a.reset_index(drop=True)
    b = b.reset_index(drop=True)

    a_len = len(a[col_name])
    corpus = np.concatenate([a[col_name].to_numpy(), b[col_name].to_numpy()])
    # Fit on the combined corpus so both sides share one vocabulary.
    tfidf, _ = fit_vectorizer(corpus, 3)
    similarity = cosine_similarity(tfidf[:a_len], tfidf[a_len:])
    print('in this matrix each row represents the str in a and the col is the str from b')
    print(similarity)

    # Long format: one score per (row of a, row of b) pair.
    scores = pd.DataFrame(similarity).stack().rename("score")
    scores.index = scores.index.set_names(["a", "b"])

    # Attach b's columns to every pair, then keep only the 'a' position as
    # the index so the result can be index-joined with a.
    b_scored = pd.merge(
        left=b, right=scores, left_index=True, right_on="b"
    ).droplevel("b")
    res = pd.merge(
        left=a,
        right=b_scored,
        left_index=True,
        right_index=True,
        suffixes=("", "_b"),
    )
    print(res)

    # reset_index() exposes a's row position as the "index" column and gives
    # every candidate pair a unique label, which idxmax() returns per group.
    df = res.reset_index()
    best = df.groupby(by="index")["score"].idxmax()
    df = df.loc[best].reset_index(drop=True)
    # b's copy of the key column carries the "_b" suffix from the merge above;
    # derive its name from col_name instead of assuming the column is "City".
    return df.drop(columns=[f"{col_name}_b", "score", "index"])
def test(df):
expected = pd.DataFrame(
{
"City": ["San Francisco, CA", "Oakland, CA"],
"Val": [1, 2],
"Geo": ["geo1", "geo1"],
}
)
print(f'{"expected":-^70}')
print(expected)
print(f'{"res":-^70}')
print(df)
assert expected.equals(df)
if __name__ == "__main__":
    # Demo / smoke test: match two city names against two metro-area names
    # and verify the joined result with test().
    cities = pd.DataFrame(
        {
            "City": ["San Francisco, CA", "Oakland, CA"],
            "Val": [1, 2],
        }
    )
    metro_areas = pd.DataFrame(
        {
            "City": ["San Francisco-Oakland, CA", "Salinas, CA"],
            "Geo": ["geo1", "geo2"],
        }
    )

    print(f'\n\n{"n-gram cosine similarity":-^70}')
    test(cosine_similarity_join(cities, metro_areas, col_name="City"))