我在stackoverflow和其他网站上搜索过,但仍然没有找到解决方案。我的问题是,我正在尝试访问包含“ham”或“spam”电子邮件的两个不同文件夹,以将其加入数据集进行模型训练。我似乎一直遇到权限错误,不确定如何通过Python或Windows资源管理器解决它。我想知道如何用多种方法解决这个问题,以便更好地理解它。
以下是代码:
ham = 'ham'
spam = 'spam'
data = 'emails2'
hamfiles = []
spamfiles = []
'''Searching File Path'''
print('# MESSAGE: Finding for files ----------------------------------------------------------------------------------')
# Walk the dataset tree and bucket every file path by the class implied by
# its directory name.  NOTE: `os.walk` on a directory the process cannot
# read raises PermissionError on Windows -- make sure `emails2` and its
# subfolders are readable by the current user.
for subdir, folders, files in os.walk(data):
    if ham in subdir:  # idiomatic membership test instead of __contains__
        for file in files:
            hamfiles.append(os.path.join(subdir, file))
    elif spam in subdir:
        for file in files:
            spamfiles.append(os.path.join(subdir, file))
    # Bug fix: the original bare `else` labelled every other file -- e.g.
    # files sitting directly under the dataset root, or in unrelated
    # folders -- as spam.  Anything outside a ham/spam folder is skipped.
import glob

X_file = []
y_class = []
eof = [('eof')]


def _collect_pos_words(patterns, label):
    """Read every file matched by *patterns*, tokenize and POS-tag the text,
    and append each word to X_file with *label* appended to y_class
    (the two lists stay parallel, one entry per word)."""
    for pattern in patterns:
        for path in glob.glob(pattern):
            # 'with' guarantees the handle is closed even if tokenizing
            # raises -- the original leaked every handle except the last
            # one (a single stray h.close() after both loops).
            with open(path, encoding='UTF8', errors='replace') as fh:
                buffer = fh.read()
            '''Tokenize'''
            token = nltk.word_tokenize(buffer)
            '''Part Of Speech Tagging'''
            posTag = nltk.pos_tag(token)
            '''Append to Array'''
            for (word, tag) in posTag:
                X_file.append(word)
                y_class.append(label)


# The two original loops were identical apart from the label string, so
# they share one helper now.
_collect_pos_words(hamfiles, 'ham')
_collect_pos_words(spamfiles, 'spam')
print('# MESSAGE: Print X_ham ----------------------------------------------------------------------------------------')
print(X_file)
def create_lexicon(X_file, y_class):
    """Build a flat word lexicon from the first `hm_lines` lines of the two
    text files at paths *X_file* and *y_class*.

    Parameters:
        X_file: path to the first text file.
        y_class: path to the second text file.

    Returns:
        list[str]: every token from both files, in order, duplicates kept.

    NOTE(review): relies on the free names `hm_lines` (line cap) and
    `word_tokenize` being defined elsewhere in this file -- confirm they
    exist before calling.
    """
    lexicon = []
    # Both inputs get identical treatment, so iterate over the pair instead
    # of duplicating the read/tokenize code (as the original did).
    for path in (X_file, y_class):
        with open(path, 'r+') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l)
                lexicon += list(all_words)
    # Bug fix: the original built `lexicon` but never returned it, so every
    # caller received None.
    return lexicon
我了解这可能是一个Windows权限错误,但我以前从未遇到过这种情况。