How do I do word stemming or lemmatization?


Shouldn't that be cacti? - MSalters
Just to provide a circular reference to the original question posted on Reddit: [How do I programmatically do stemming? (e.g. "eating" to "eat", "cactuses" to "cactus")](http://www.reddit.com/r/programming/comments/8e5d3/how_do_i_programatically_do_stemming_eg_eating_to/) Posted here because the comments contain useful information. - Renaud Bompuis
See https://dev59.com/4WQm5IYBdhLWcg3w0Bq5 - alvas
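
For the concrete example in the question ("eating" to "eat", "cactuses" to "cactus"), a minimal sketch using NLTK's PorterStemmer and WordNetLemmatizer (assuming NLTK and its WordNet data are installed):

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming chops suffixes by rule; lemmatization looks words up in WordNet
print(stemmer.stem("eating"))                   # eat
print(stemmer.stem("cactuses"))                 # cactus
print(lemmatizer.lemmatize("eating", pos="v"))  # eat
print(lemmatizer.lemmatize("cactuses"))         # cactus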
22 Answers

import re
import pymorphy2
from pymorphy2 import MorphAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()


def to_lowercase(data):
    # Normalize case so stop-word lookups match
    data = data.lower()
    return data


def noise_remove(data, remove_numbers=True):
    # Strip URLs
    data = re.sub(r"(\w+:\/\/\S+)", " ", data)

    # Keep only Latin/Cyrillic letters and digits
    data = re.sub(r"([^0-9A-Za-zА-Яа-я])", " ", data)

    if remove_numbers:
        data = re.sub(r"\d+", " ", data)
    return data


def lemmatize(words):
    text = []
    # Keep nouns, full adjectives, infinitives and short participles,
    # storing the normal (dictionary) form of each surviving word
    for word in words:
        morph_word = morph.parse(word)[0]
        if morph_word.tag.POS in ['NOUN', 'ADJF', 'INFN', 'PRTS'] and morph_word.normal_form not in stopwords_ru:
            text.append(morph_word.normal_form)
    return text


def tokenize(text):
    # Drop very short tokens, then lemmatize the rest
    words = [word for word in text.split() if len(word) >= 3]
    lemmatize_words = lemmatize(words)
    return ' '.join(lemmatize_words)
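
A minimal usage sketch of the pipeline above (assuming pymorphy2 and the NLTK data are installed; the sample sentence is only illustrative):

sample = "Коты быстро съели 2 миски вкусного корма! http://example.com"
clean = noise_remove(to_lowercase(sample))
print(tokenize(clean))
# prints something along the lines of: кот миска вкусный корм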

Please add some explanation to your code instead of just posting code. Additional explanation will be more helpful. - user67275

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

df_plots = pd.read_excel("Plot Summary.xlsx", index_col = 0)
df_plots

# Printing first sentence of first row and last sentence of last row
nltk.sent_tokenize(df_plots.loc[1].Plot)[0] + nltk.sent_tokenize(df_plots.loc[len(df_plots)].Plot)[-1]

# Calculating length of all plots by words
df_plots["Length"] = df_plots.Plot.apply(lambda x : len(nltk.word_tokenize(x)))

print("Longest plot is for season", df_plots.Length.idxmax())
print("Shortest plot is for season", df_plots.Length.idxmin())


# What is this show about? (What are the top 3 words used, excluding the
# stop words, in all the seasons combined?)

# Map a Penn Treebank tag to the POS code WordNetLemmatizer expects;
# anything that is not an adjective, verb or adverb falls back to noun
def penn_to_wordnet(tag):
    return {"J": "a", "V": "v", "R": "r"}.get(tag[0], "n")

wnl = WordNetLemmatizer()

word_sample = ["struggled", "died"]
word_list = nltk.pos_tag(word_sample)
[wnl.lemmatize(word, pos = penn_to_wordnet(tag)) for word, tag in word_list]

# Figure out the stop words
stop = stopwords.words('english')

# Tokenize all the plots
df_plots["Tokenized"] = df_plots.Plot.apply(lambda x : nltk.word_tokenize(x.lower()))

# Remove the stop words
df_plots["Filtered"] = df_plots.Tokenized.apply(lambda x : [word for word in x if word not in stop])

# POS-tag and lemmatize each word
df_plots["POS"] = df_plots.Filtered.apply(lambda x : nltk.pos_tag(x))
df_plots["Lemmatized"] = df_plots.POS.apply(lambda x : [wnl.lemmatize(word, pos = penn_to_wordnet(tag)) for word, tag in x])



# Which Season had the highest screenplay of "Jesse" compared to "Walt"?
# Screenplay of Jesse = (Occurrences of "Jesse") / (Occurrences of "Jesse" + Occurrences of "Walt")

df_plots.groupby("Season").Tokenized.sum()

df_plots["Share"] = df_plots.groupby("Season").Tokenized.sum().apply(
    lambda x : float(x.count("jesse") * 100) / float(x.count("jesse") + x.count("walter") + x.count("walt")))

print("The highest times Jesse was mentioned compared to Walter/Walt was in season", df_plots["Share"].idxmax())
# float(df_plots.Tokenized.sum().count('jesse')) * 100 / float(df_plots.Tokenized.sum().count('jesse') + df_plots.Tokenized.sum().count('walt') + df_plots.Tokenized.sum().count('walter'))
