// Java source code of ChunkParser


package com.wiseowl.WiseOwl.query;

import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.util.Span;

/**
 * Finds flat chunks instead of a tree structure using a simpler model.
 *
 * This class is not thread-safe, but should be lightweight to construct.
 */
public class ChunkParser implements Parser {

  private ChunkerME chunker;
  private POSTaggerME tagger;

  public ChunkParser(ChunkerME chunker, POSTaggerME tagger) {
    this.chunker = chunker;
    this.tagger = tagger;
  }
  public Parse parse(Parse tokens) {
    //<start id="cp.pos"/>
    Parse[] children = tokens.getChildren();//<co id="cp.child"/>
    String[] words = new String[children.length];
    double[] probs = new double[words.length];
    for (int i = 0, il = children.length; i < il; i++) {
      words[i] = children[i].toString();//<co id="cp.words"/>
    }
    String[] tags = tagger.tag(words);//<co id="cp.tag"/>
    tagger.probs(probs);//<co id="cp.probs"/>
    for (int j = 0; j < words.length; j++) {
      Parse word = children[j];
      double prob = probs[j];
      tokens.insert(new Parse(word.getText(), word.getSpan(), tags[j], prob, j));//<co id="cp.augment"/>
      tokens.addProb(Math.log(prob));
     // return tokens;
    }
    /*
    <calloutlist>
        <callout arearefs="cp.child"><para>The <methodname>parse</methodname> is a callback method from an internal OpenNLP API that tokenizes the original text.</para></callout>
        <callout arearefs="cp.words"><para>Get just the words for use with the tagger</para></callout>
        <callout arearefs="cp.tag"><para>Part of speech tag the words</para></callout>
        <callout arearefs="cp.probs"><para></para></callout>
        <callout arearefs="cp.augment"><para>Augment the initial parse with the part of speech information</para></callout>
    </calloutlist>
    */
    //<end id="cp.pos"/>
    String[] chunks = chunker.chunk(words, tags);
    chunker.probs(probs);
    int chunkStart = -1;
    String chunkType = null;
    double logProb=0;
    for (int ci=0,cn=chunks.length;ci<cn;ci++) {
      if (ci > 0 && !chunks[ci].startsWith("I-") && !chunks[ci-1].equals("O")) {
        Span span = new Span(children[chunkStart].getSpan().getStart(),children[ci-1].getSpan().getEnd());
        tokens.insert(new Parse(tokens.getText(), span, chunkType, logProb,children[ci-1]));
        logProb=0;
      }            
      if (chunks[ci].startsWith("B-")) {
        chunkStart = ci;
        chunkType = chunks[ci].substring(2);
      }
      logProb+=Math.log(probs[ci]);
    }
    if (!chunks[chunks.length-1].equals("O")) {
      int ci = chunks.length;
      Span span = new Span(children[chunkStart].getSpan().getStart(),children[ci-1].getSpan().getEnd());
      tokens.insert(new Parse(tokens.getText(), span, chunkType, logProb,children[ci-1]));
    }
    return tokens;
  }

  public Parse[] parse(Parse tokens, int numParses) {
    //TODO: get multiple tag sequences and chunk each.
    return new Parse[] {parse(tokens)};
  }
  
}