除了《NLTK Cookbook》和《NLP with Python》之外,我还可以使用哪些资源?
您可以考虑使用 spaCy,在自己的自定义数据上训练命名实体识别(NER)模型。下面是一个来自相关讨论帖(thread)的例子:它在自定义训练集上训练模型,以检测新的实体类型 `ANIMAL`。代码已经过修复和更新,以便更容易阅读。
import random
import spacy
from spacy.training import Example
# Fine-tune the NER component of a pretrained spaCy pipeline so that it
# learns one additional entity label, then run it on a short test sentence.

# Label of the new entity type the NER component is taught to recognise.
LABEL = 'ANIMAL'

# Training examples as (text, annotations) pairs. Each entity is given as
# (start_char, end_char, label); "Do they bite?" is a negative example
# with no entities.
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {'entities': [(0, 6, LABEL)]}),
    ("Do they bite?", {'entities': []}),
    ("horses are too tall and they pretend to care about your feelings",
     {'entities': [(0, 6, LABEL)]}),
    ("horses pretend to care about your feelings",
     {'entities': [(0, 6, LABEL)]}),
    ("they pretend to care about your feelings, those horses",
     {'entities': [(48, 54, LABEL)]}),
    ("horses?", {'entities': [(0, 6, LABEL)]}),
]

# Start from the pretrained English pipeline and register the new label on
# its existing NER component.
nlp = spacy.load('en_core_web_sm')
ner = nlp.get_pipe('ner')
ner.add_label(LABEL)

optimizer = nlp.create_optimizer()

# Only the NER component should be updated, so every other component is
# disabled for the duration of training. `select_pipes` is the spaCy v3
# replacement for the deprecated `disable_pipes`.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    for itn in range(20):
        # Shuffle in place so each iteration sees the examples in a
        # different order.
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            # Build an unannotated Doc and pair it with the gold annotations.
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.35, sgd=optimizer, losses=losses)
        # One accumulated loss dict per training iteration.
        print(losses)

# Run the full pipeline (all components are enabled again here) on unseen
# text and list the entities it finds.
test_text = 'Do you like horses?'
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, " -- ", ent.text)
以下是输出结果:
{'ner': 9.60289144264557}
{'ner': 8.875474230820478}
{'ner': 6.370401408220459}
{'ner': 6.687456469517201}
...
{'ner': 1.3796682589133492e-05}
{'ner': 1.7709562613218738e-05}
Entities in 'Do you like horses?'
ANIMAL -- horses