有点难以得到答案,但我尝试了你的代码,对于 DocBins 无效。下面是我粘贴的导入部分的代码。
import spacy
from spacy.tokens import DocBin
from LanguageIdentifier import predict
import fitz
import glob
import os
from datetime import datetime
import logging
FRdoc_bin = DocBin (store_user_data=True,attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
ENdoc_bin = DocBin (store_user_data=True,attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
DEdoc_bin = DocBin (store_user_data=True,attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
MULTIdoc_bin = DocBin (store_user_data=True,attrs=['ENT_TYPE','LEMMA','LIKE_EMAIL','LIKE_URL','LIKE_NUM','ORTH','POS'])
frNLP = spacy.load('fr_dep_news_trf')
enNLP = spacy.load('en_core_web_trf')
deNLP = spacy.load('de_dep_news_trf')
multiNLP = spacy.load('xx_sent_ud_sm')
ErroredFiles =[]
def processNLP(text):
lang = predict(text)
if 'fr' in lang:
doc = frNLP(text)
FRdoc_bin.add(doc)
return
elif 'de' in lang:
DEdoc_bin.add(deNLP(text))
return
elif 'en' in lang:
ENdoc_bin.add(enNLP(text))
return
else:
MULTIdoc_bin.add(multiNLP(text))
return
def get_text_from_pdf(Path):
text = ''
content = fitz.open(Path)
for page in content:
if page.number == 1:
text = page.get_text()[212:]
else:
text = text + page.get_text()
return text
FolderPath = r'C:\[Redacted]\DataSource\*\*.pdf'
PDFfiles = glob.glob(FolderPath)
counter = 0
for file in PDFfiles:
counter = counter +1
try:
textPDF = get_text_from_pdf(file)
processNLP(textPDF)
except Exception as e:
ErroredFiles.append(file)
logging.error('Error with file '+ file)
logging.error('Error message: '+ str(e))
MULTIdoc_bin.add(multiNLP(textPDF))
if(counter == 10):
break
CreatedModelPath = r'C:\[Redacted]\Results' + datetime.strftime(datetime.now(),"%Y%m%d%H%M%S")
os.mkdir(CreatedModelPath)
FRdoc_bin.to_disk(CreatedModelPath+r'\FRdocBin'+'.nlp')
FRdoc_bin.vocab.to_disk(CreatedModelPath+r'\FRdocBin'+'.voc')
ENdoc_bin.to_disk(CreatedModelPath+r'\ENdocBin'+'.nlp')
DEdoc_bin.to_disk(CreatedModelPath+r'\DEdocBin'+'.nlp')
MULTIdoc_bin.to_disk(CreatedModelPath+'\MULTIdocBin'+'.nlp')
我收到的错误信息:
Traceback (most recent call last):
File "C:\[Redacted]\ProcessingEngine.py", line 117, in <module>
FRdoc_bin.vocab.to_disk(CreatedModelPath+r'\FRdocBin'+'.voc')
AttributeError: 'DocBin' object has no attribute 'vocab'