How do I do word stemming or lemmatization?


Shouldn't that be cacti? - MSalters
Just to provide a circular reference to the original question posted on Reddit: [How do I programmatically do stemming? (e.g. "eating" to "eat", "cactuses" to "cactus")](http://www.reddit.com/r/programming/comments/8e5d3/how_do_i_programatically_do_stemming_eg_eating_to/) Posted here because the comments contain useful information. - Renaud Bompuis
See https://dev59.com/4WQm5IYBdhLWcg3w0Bq5 - alvas
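
For the concrete example in the question ("eating" to "eat", "cactuses" to "cactus"), a minimal sketch using NLTK's PorterStemmer and WordNetLemmatizer (assuming NLTK and its WordNet data are installed):

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming chops suffixes by rule; lemmatization looks words up in WordNet
print(stemmer.stem("eating"))                   # eat
print(stemmer.stem("cactuses"))                 # cactus
print(lemmatizer.lemmatize("eating", pos="v"))  # eat
print(lemmatizer.lemmatize("cactuses"))         # cactus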
22 Answers

import re
import pymorphy2
from pymorphy2 import MorphAnalyzer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

stopwords_ru = stopwords.words("russian")
morph = MorphAnalyzer()


def to_lowercase(data):
    # Normalize case so stop-word lookups match
    data = data.lower()
    return data


def noise_remove(data, remove_numbers=True):
    # Strip URLs
    data = re.sub(r"(\w+:\/\/\S+)", " ", data)

    # Keep only Latin/Cyrillic letters and digits
    data = re.sub(r"([^0-9A-Za-zА-Яа-я])", " ", data)

    if remove_numbers:
        data = re.sub(r"\d+", " ", data)
    return data


def lemmatize(words):
    text = []
    # Keep nouns, full adjectives, infinitives and short participles,
    # storing the normal (dictionary) form of each surviving word
    for word in words:
        morph_word = morph.parse(word)[0]
        if morph_word.tag.POS in ['NOUN', 'ADJF', 'INFN', 'PRTS'] and morph_word.normal_form not in stopwords_ru:
            text.append(morph_word.normal_form)
    return text


def tokenize(text):
    # Drop very short tokens, then lemmatize the rest
    words = [word for word in text.split() if len(word) >= 3]
    lemmatize_words = lemmatize(words)
    return ' '.join(lemmatize_words)
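
A minimal usage sketch of the pipeline above (assuming pymorphy2 and the NLTK data are installed; the sample sentence is only illustrative):

sample = "Коты быстро съели 2 миски вкусного корма! http://example.com"
clean = noise_remove(to_lowercase(sample))
print(tokenize(clean))
# prints something along the lines of: кот миска вкусный корм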

Please add some explanation to your code instead of just posting code. Additional explanation will be more helpful. - user67275

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

df_plots = pd.read_excel("Plot Summary.xlsx", index_col = 0)
df_plots

# Printing first sentence of first row and last sentence of last row
nltk.sent_tokenize(df_plots.loc[1].Plot)[0] + nltk.sent_tokenize(df_plots.loc[len(df_plots)].Plot)[-1]

# Calculating length of all plots by words
df_plots["Length"] = df_plots.Plot.apply(lambda x : len(nltk.word_tokenize(x)))

print("Longest plot is for season", df_plots.Length.idxmax())
print("Shortest plot is for season", df_plots.Length.idxmin())


# What is this show about? (What are the top 3 words used, excluding the
# stop words, in all the seasons combined?)

# Map a Penn Treebank tag to the POS code WordNetLemmatizer expects;
# anything that is not an adjective, verb or adverb falls back to noun
def penn_to_wordnet(tag):
    return {"J": "a", "V": "v", "R": "r"}.get(tag[0], "n")

wnl = WordNetLemmatizer()

word_sample = ["struggled", "died"]
word_list = nltk.pos_tag(word_sample)
[wnl.lemmatize(word, pos = penn_to_wordnet(tag)) for word, tag in word_list]

# Figure out the stop words
stop = stopwords.words('english')

# Tokenize all the plots
df_plots["Tokenized"] = df_plots.Plot.apply(lambda x : nltk.word_tokenize(x.lower()))

# Remove the stop words
df_plots["Filtered"] = df_plots.Tokenized.apply(lambda x : [word for word in x if word not in stop])

# POS-tag and lemmatize each word
df_plots["POS"] = df_plots.Filtered.apply(lambda x : nltk.pos_tag(x))
df_plots["Lemmatized"] = df_plots.POS.apply(lambda x : [wnl.lemmatize(word, pos = penn_to_wordnet(tag)) for word, tag in x])



# Which Season had the highest screenplay of "Jesse" compared to "Walt"?
# Screenplay of Jesse = (Occurrences of "Jesse") / (Occurrences of "Jesse" + Occurrences of "Walt")

df_plots.groupby("Season").Tokenized.sum()

df_plots["Share"] = df_plots.groupby("Season").Tokenized.sum().apply(
    lambda x : float(x.count("jesse") * 100) / float(x.count("jesse") + x.count("walter") + x.count("walt")))

print("The highest times Jesse was mentioned compared to Walter/Walt was in season", df_plots["Share"].idxmax())
# float(df_plots.Tokenized.sum().count('jesse')) * 100 / float(df_plots.Tokenized.sum().count('jesse') + df_plots.Tokenized.sum().count('walt') + df_plots.Tokenized.sum().count('walter'))
