Concurrent processing with Stanford CoreNLP (3.5.2)

I'm running into concurrency issues while annotating multiple sentences simultaneously. I'm not sure whether I'm doing something wrong or there is a bug in CoreNLP.
My goal is to annotate sentences with the "tokenize, ssplit, pos, lemma, ner, parse, dcoref" pipeline using several threads running in parallel. Each thread allocates its own StanfordCoreNLP instance and uses it for the annotation.
The problem is that at some point an exception is thrown:

java.util.ConcurrentModificationException
 at java.util.ArrayList$Itr.checkForComodification(ArrayList.java:901)
 at java.util.ArrayList$Itr.next(ArrayList.java:851)
 at java.util.Collections$UnmodifiableCollection$1.next(Collections.java:1042)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:463)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
 at edu.stanford.nlp.trees.GrammaticalStructure.<init>(GrammaticalStructure.java:201)
 at edu.stanford.nlp.trees.EnglishGrammaticalStructure.<init>(EnglishGrammaticalStructure.java:89)
 at edu.stanford.nlp.semgraph.SemanticGraphFactory.makeFromTree(SemanticGraphFactory.java:139)
 at edu.stanford.nlp.pipeline.DeterministicCorefAnnotator.annotate(DeterministicCorefAnnotator.java:89)
 at edu.stanford.nlp.pipeline.AnnotationPipeline.annotate(AnnotationPipeline.java:68)
 at edu.stanford.nlp.pipeline.StanfordCoreNLP.annotate(StanfordCoreNLP.java:412)

I attached sample code of an application that reproduces the problem in about 20 seconds on my Core i3 370M laptop (Win 7 64-bit, Java 1.8.0.45 64-bit). The application reads the XML file of a Recognizing Textual Entailment (RTE) corpus and then parses all of its sentences simultaneously using standard Java concurrency classes. The path to the local RTE XML file needs to be given as a command-line argument. In my tests I used the publicly available XML file here: http://www.nist.gov/tac/data/RTE/RTE3-DEV-FINAL.tar.gz

package semante.parser.stanford.server;

import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlAttribute;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class StanfordMultiThreadingTest {

 @XmlRootElement(name = "entailment-corpus")
 @XmlAccessorType (XmlAccessType.FIELD)
 public static class Corpus {
  @XmlElement(name = "pair")
  private List<Pair> pairList = new ArrayList<Pair>();

  public void addPair(Pair p) {pairList.add(p);}
  public List<Pair> getPairList() {return pairList;}
 }

 @XmlRootElement(name="pair")
 public static class Pair {

  @XmlAttribute(name = "id")
  String id;

  @XmlAttribute(name = "entailment")
  String entailment;

  @XmlElement(name = "t")
  String t;

  @XmlElement(name = "h")
  String h;

  private Pair() {}

  public Pair(int id, boolean entailment, String t, String h) {
   this();
   this.id = Integer.toString(id);
   this.entailment = entailment ? "YES" : "NO";
   this.t = t;
   this.h = h;
  }

  public String getId() {return id;}
  public String getEntailment() {return entailment;}
  public String getT() {return t;}
  public String getH() {return h;}
 }
 
 class NullStream extends OutputStream {
  @Override 
  public void write(int b) {}
 };

 private Corpus corpus;
 private Unmarshaller unmarshaller;
 private ExecutorService executor;

 public StanfordMultiThreadingTest() throws Exception {
  javax.xml.bind.JAXBContext jaxbCtx = JAXBContext.newInstance(Pair.class,Corpus.class);
  unmarshaller = jaxbCtx.createUnmarshaller();
  executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
 }

 public void readXML(String fileName) throws Exception {
  System.out.println("Reading XML - Started");
  corpus = (Corpus) unmarshaller.unmarshal(new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8));
  System.out.println("Reading XML - Ended");
 }

 public void parseSentences() throws Exception {
  System.out.println("Parsing - Started");

  // turn pairs into a list of sentences
  List<String> sentences = new ArrayList<String>();
  for (Pair pair : corpus.getPairList()) {
   sentences.add(pair.getT());
   sentences.add(pair.getH());
  }

  // prepare the properties
  final Properties props = new Properties();
  props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");

  // first run is long since models are loaded
  new StanfordCoreNLP(props);

  // to avoid the CoreNLP initialization prints (e.g. "Adding annotation pos")
  final PrintStream nullPrintStream = new PrintStream(new NullStream());
  PrintStream err = System.err;
  System.setErr(nullPrintStream);

  int totalCount = sentences.size();
  AtomicInteger counter = new AtomicInteger(0);

  // use java concurrency to parallelize the parsing
  for (String sentence : sentences) {
   executor.execute(new Runnable() {
    @Override
    public void run() {
     try {
      StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
      Annotation annotation = new Annotation(sentence);
      pipeline.annotate(annotation);
      if (counter.incrementAndGet() % 20 == 0) {
       System.out.println("Done: " + String.format("%.2f", counter.get()*100/(double)totalCount));
      };
     } catch (Exception e) {
      System.setErr(err);
      e.printStackTrace();
      System.setErr(nullPrintStream);
      executor.shutdownNow();
     }
    }
   });
  }
  executor.shutdown();
  
  System.out.println("Waiting for parsing to end.");  
  executor.awaitTermination(10, TimeUnit.MINUTES);

  System.out.println("Parsing - Ended");
 }

 public static void main(String[] args) throws Exception {
  StanfordMultiThreadingTest smtt = new StanfordMultiThreadingTest();
  smtt.readXML(args[0]);
  smtt.parseSentences();
 }

}

While trying to find some background on my problem, I came across answers from Christopher Manning and Gabor Angeli of Stanford indicating that recent versions of Stanford CoreNLP should be thread-safe. However, a recent bug report against CoreNLP version 3.4.1 describes a concurrency problem. As mentioned in the title, I am using version 3.5.2.
It is not clear to me whether the problem I'm facing is due to a bug or due to a wrong way of using the package. I'd appreciate it if somebody more knowledgeable could shed some light on this. I hope the sample code is useful for reproducing the problem. Thanks!
2 Answers


Have you tried using the threads option? You can specify the number of threads for a single StanfordCoreNLP pipeline, and it will then process sentences in parallel.

For example, if you want to process sentences on 8 cores, set the threads option to 8:

Properties props = new Properties();
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
props.put("threads", "8");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

That said, I think your solution should also work, and we will check whether there is some concurrency bug, but using this option might solve your problem in the meantime.


Thanks for the suggestion. I'd like to try it, but I'm not sure how to use the interface. Assuming I set the "threads" property, how should I pass the sentences to be annotated in parallel? By using multiple threads that share the same StanfordCoreNLP instance? Or by passing several sentences at once through a method other than annotate()? Thanks! - Assaf
The argument to the Annotation constructor is actually not a single sentence but an entire document. Store several (or even all) of your sentences in the sentence variable, separated by "\n", and also set the option "ssplit.eolonly" to "true" to prevent the sentence splitter from wrongly splitting actual sentences. After parsing, the annotation object contains a list of sentences, each of which has parse, POS, lemma, etc. annotations. - Sebastian Schuster
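The batch approach Sebastian describes might look like the following sketch. It assumes the standard CoreNLP API of that era (StanfordCoreNLP, Annotation, CoreAnnotations.SentencesAnnotation); the sentence list and thread count are placeholder values, not part of the original question:

```java
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class BatchAnnotationSketch {
 public static void main(String[] args) {
  Properties props = new Properties();
  props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
  props.put("threads", "4");            // let the single pipeline parallelize internally
  props.put("ssplit.eolonly", "true");  // split only at newlines, never inside a sentence

  StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

  // One Annotation holds the whole "document": sentences joined by newlines.
  List<String> sentences = Arrays.asList(
   "The cat sat on the mat.",
   "A dog barked at the cat.");
  Annotation document = new Annotation(String.join("\n", sentences));
  pipeline.annotate(document);

  // After annotation, the document contains one CoreMap per input line,
  // each carrying its own parse, POS, lemma, etc. annotations.
  for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
   System.out.println(sentence.toString());
  }
 }
}
```

This keeps a single StanfordCoreNLP instance (and a single copy of the loaded models) while still exploiting multiple cores, which is the design the answer recommends.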
Thanks, I tried it. However, either there is a problem with the mode of annotating several '\n'-separated sentences, or I am doing something wrong. I was able to parse 100 sentences, but not 1000 or 2000: with an input of 1000 or 2000 sentences, the call to annotate() runs indefinitely. Also, when testing with 100 sentences, there was almost no performance difference between using 1, 2 or 4 threads (my hardware supports 4 threads), and it was slightly slower than calling annotate() with one sentence at a time. - Assaf
@Assaf, did you find a solution on your own? Otherwise I think this would be the accepted answer? - TheRajVJain
@RajVJain - which answer? - Assaf


I used to have the same problem, but using the latest GitHub version (as of today) resolved it. So I believe this is a CoreNLP issue that has been fixed since 3.5.2.

See also CoreNLP on Apache Spark
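If you would rather depend on a published artifact than build from GitHub, the usual Maven coordinates for CoreNLP are shown below; the version number is illustrative (any release after 3.5.2 that includes the fix), and the separate models classifier is required for the pipeline's model files:

```xml
<!-- Illustrative version: pick a release newer than 3.5.2 -->
<dependency>
  <groupId>edu.stanford.nlp</groupId>
  <artifactId>stanford-corenlp</artifactId>
  <version>3.6.0</version>
</dependency>
<dependency>
  <groupId>edu.stanford.nlp</groupId>
  <artifactId>stanford-corenlp</artifactId>
  <version>3.6.0</version>
  <classifier>models</classifier>
</dependency>
```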


Thanks for the update. I'll give it a try when they release a new version. - Assaf
