opennlp.tools.util.Span Java Examples
The following examples show how to use
opennlp.tools.util.Span.
You can vote up the ones you like or vote down the ones you don't like,
and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example #1
Source File: NamedEntityRecognitionUnitTest.java From tutorials with MIT License | 6 votes |
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
    // Tokenize the sample text with OpenNLP's simple character-class tokenizer.
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");

    // Load the pre-trained English person-name model from the test classpath.
    InputStream modelStream = getClass().getResourceAsStream("/models/en-ner-person.bin");
    TokenNameFinderModel nerModel = new TokenNameFinderModel(modelStream);
    NameFinderME nameFinder = new NameFinderME(nerModel);

    List<Span> personSpans = Arrays.asList(nameFinder.find(tokens));
    assertThat(personSpans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");

    // Rebuild each detected name by concatenating the tokens its span covers.
    List<String> names = new ArrayList<String>();
    for (Span personSpan : personSpans) {
        StringBuilder name = new StringBuilder();
        for (int index = personSpan.getStart(); index < personSpan.getEnd(); index++) {
            name.append(tokens[index]);
        }
        names.add(name.toString());
    }
    assertThat(names).contains("John","Leonard","Penny");
}
Example #2
Source File: SentenceTokenizer.java From wiseowl with MIT License | 6 votes |
/**
 * Emits one detected sentence per call as a single Lucene token.
 *
 * @return false once every sentence has been emitted
 */
@Override
public final boolean incrementToken() throws IOException {
    // Lazily run sentence detection on the first call.
    if (sentences == null) {
        fillSentences();
    }
    // All detected sentences have already been emitted.
    if (tokenOffset >= sentences.length) {
        return false;
    }
    Span current = sentences[tokenOffset++];
    clearAttributes();
    int begin = current.getStart();
    int finish = current.getEnd();
    // The sentence text becomes the term; offsets point back into the input.
    termAtt.copyBuffer(inputSentence, begin, finish - begin);
    posIncrAtt.setPositionIncrement(1);
    offsetAtt.setOffset(begin, finish);
    return true;
}
Example #3
Source File: Chapter1.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
private static void nameFinderExample() { try { String[] sentences = { "Tim was a good neighbor. Perhaps not as good a Bob " + "Haywood, but still pretty good. Of course Mr. Adam " + "took the cake!"}; Tokenizer tokenizer = SimpleTokenizer.INSTANCE; TokenNameFinderModel model = new TokenNameFinderModel(new File( "C:\\OpenNLP Models", "en-ner-person.bin")); NameFinderME finder = new NameFinderME(model); for (String sentence : sentences) { // Split the sentence into tokens String[] tokens = tokenizer.tokenize(sentence); // Find the names in the tokens and return Span objects Span[] nameSpans = finder.find(tokens); // Print the names extracted from the tokens using the Span data System.out.println(Arrays.toString( Span.spansToStrings(nameSpans, tokens))); } } catch (IOException ex) { ex.printStackTrace(); } }
Example #4
Source File: OpenNlpChunkerConceptProvider.java From bioasq with Apache License 2.0 | 6 votes |
/**
 * Finds concept mentions by sliding a window over the chunker's output and
 * matching the configured chunk-type sequence (field {@code type}), creating
 * one Concept per match that spans at least {@code minLength} tokens.
 *
 * NOTE(review): assumes chunkAsSpans returns token-index spans (start/end
 * index into the ordered token list) — confirm against the chunker used.
 */
@Override public List<Concept> getConcepts(JCas jcas) throws AnalysisEngineProcessException {
  List<Token> tokens = TypeUtil.getOrderedTokens(jcas);
  String[] texts = tokens.stream().map(Token::getCoveredText).toArray(String[]::new);
  String[] pos = tokens.stream().map(Token::getPartOfSpeech).toArray(String[]::new);
  // Fill gaps between chunks with explicit "O" spans so that a window of
  // consecutive spans always sees a contiguous type sequence.
  List<Span> spans = insertOutsideSpans(chunker.chunkAsSpans(texts, pos));
  // Slide a window of |type| consecutive spans across the chunk list.
  return IntStream.rangeClosed(0, spans.size() - type.size())
      .mapToObj(i -> spans.subList(i, i + type.size()))
      // Keep only windows whose chunk types exactly equal the configured sequence.
      .filter(spansSublist -> type
          .equals(spansSublist.stream().map(Span::getType).collect(toList())))
      // Map the matching window back to the tokens it covers.
      .map(spansSublist -> tokens.subList(spansSublist.get(0).getStart(),
          spansSublist.get(spansSublist.size() - 1).getEnd()))
      // Discard matches that are too short.
      .filter(toks -> toks.size() >= minLength)
      .map(toks -> TypeFactory.createConceptMention(jcas, getFirstTokenBegin(toks),
          getLastTokenEnd(toks)))
      // Wrap each mention in a Concept tagged with "opennlp:<type-sequence>".
      .map(cmention -> TypeFactory.createConcept(jcas, cmention, TypeFactory
          .createConceptType(jcas, "opennlp:" + String.join("-", type))))
      .collect(toList());
}
Example #5
Source File: IntentTrainer.java From org.openhab.ui.habot with Eclipse Public License 1.0 | 6 votes |
/** * Tries to understand the natural language query into an {@link Intent} * * @param query the natural language query * @return the resulting @{link Intent} */ public Intent interpret(String query) { String[] tokens = this.tokenizer.tokenize(query.toLowerCase()); // remove eventual trailing punctuation tokens[tokens.length - 1] = tokens[tokens.length - 1].replaceAll("\\s*[!?.]+$", ""); double[] outcome = categorizer.categorize(tokens); logger.debug("{}", categorizer.getAllResults(outcome)); Intent intent = new Intent(categorizer.getBestCategory(outcome)); Span[] spans = nameFinder.find(tokens); String[] names = Span.spansToStrings(spans, tokens); for (int i = 0; i < spans.length; i++) { intent.getEntities().put(spans[i].getType(), names[i]); } logger.debug("{}", intent.toString()); return intent; }
Example #6
Source File: OpenNLPTokenizer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Advances to the next term span of the current sentence, copying its text
 * and absolute character offsets into the token attributes.
 *
 * @return false when the sentence's term spans are exhausted (or absent)
 */
@Override
protected boolean incrementWord() {
  // No tokenization yet, or every term of this sentence has been emitted.
  if (termSpans == null || termNum == termSpans.length) {
    return false;
  }
  clearAttributes();
  Span term = termSpans[termNum];
  // Term spans are sentence-relative; shift by sentenceStart to index the buffer.
  termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
  // Offsets are absolute in the original input: stream offset + sentence start + span.
  offsetAtt.setOffset(correctOffset(offset + sentenceStart + term.getStart()),
      correctOffset(offset + sentenceStart + term.getEnd()));
  if (termNum == termSpans.length - 1) {
    flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
  }
  ++termNum;
  return true;
}
Example #7
Source File: OpenNlpService.java From elasticsearch-ingest-opennlp with Apache License 2.0 | 6 votes |
/**
 * Runs the configured name-finder model for {@code field} over {@code content}
 * and returns the tokens together with the detected entity spans.
 *
 * @throws ElasticsearchException when no model is registered for the field
 */
public ExtractedEntities find(String content, String field) {
    try {
        if (!nameFinderModels.containsKey(field)) {
            throw new ElasticsearchException("Could not find field [{}], possible values {}", field,
                    nameFinderModels.keySet());
        }
        TokenNameFinderModel finderModel = nameFinderModels.get(field);
        // Remember the model in the thread local while this extraction runs.
        if (threadLocal.get() == null || !threadLocal.get().equals(finderModel)) {
            threadLocal.set(finderModel);
        }
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
        Span[] entitySpans = new NameFinderME(finderModel).find(tokens);
        return new ExtractedEntities(tokens, entitySpans);
    } finally {
        // Always clear the thread local, even on failure.
        threadLocal.remove();
    }
}
Example #8
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
@Test
public void testPersonNER() throws Exception {
    // Load the person-name model from the test classpath.
    URL modelUrl = Thread.currentThread().getContextClassLoader()
        .getResource("models/en-ner-persons.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel nerModel = new TokenNameFinderModel(modelUrl);
    assertThat(nerModel, is(notNullValue()));
    NameFinderME nameFinder = new NameFinderME(nerModel);

    String[] tokens = SimpleTokenizer.INSTANCE
        .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
    assertThat(tokens.length, is(15));

    // Expect exactly the two person mentions.
    Span[] personSpans = nameFinder.find(tokens);
    assertThat(personSpans.length, is(2));

    String[] personNames = Span.spansToStrings(personSpans, tokens);
    assertThat(personNames.length, is(2));
    assertThat(personNames[0], is("John Smith"));
    assertThat(personNames[1], is("Anne Green"));
}
Example #9
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
@Test
public void testLocationNER() throws Exception {
    // Load the location model from the test classpath.
    URL modelUrl = Thread.currentThread().getContextClassLoader()
        .getResource("models/en-ner-locations.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel nerModel = new TokenNameFinderModel(modelUrl);
    assertThat(nerModel, is(notNullValue()));
    NameFinderME nameFinder = new NameFinderME(nerModel);

    String[] tokens = SimpleTokenizer.INSTANCE
        .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
    assertThat(tokens.length, is(15));

    // Expect exactly the two location mentions.
    Span[] locationSpans = nameFinder.find(tokens);
    assertThat(locationSpans.length, is(2));

    String[] locations = Span.spansToStrings(locationSpans, tokens);
    assertThat(locations.length, is(2));
    assertThat(locations[0], is("New York"));
    assertThat(locations[1], is("London"));
}
Example #10
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
@Test
public void testDateNER() throws Exception {
    // Load the date model from the test classpath.
    URL modelUrl = Thread.currentThread().getContextClassLoader()
        .getResource("models/en-ner-dates.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel nerModel = new TokenNameFinderModel(modelUrl);
    assertThat(nerModel, is(notNullValue()));
    NameFinderME nameFinder = new NameFinderME(nerModel);

    String[] tokens = SimpleTokenizer.INSTANCE
        .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
    assertThat(tokens.length, is(15));

    // Expect the single date mention ("today").
    Span[] dateSpans = nameFinder.find(tokens);
    assertThat(dateSpans.length, is(1));

    String[] locations = Span.spansToStrings(dateSpans, tokens);
    assertThat(locations.length, is(1));
    assertThat(locations[0], is("today"));
}
Example #11
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
@Test
public void testAddressNER() throws Exception {
    // Load the address model from the test classpath.
    URL modelUrl = Thread.currentThread().getContextClassLoader()
        .getResource("models/en-ner-address.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel nerModel = new TokenNameFinderModel(modelUrl);
    assertThat(nerModel, is(notNullValue()));
    NameFinderME nameFinder = new NameFinderME(nerModel);

    String[] tokens = SimpleTokenizer.INSTANCE.tokenize("Send a taxi to 12 Pleasent Street");

    // Expect the single address mention.
    Span[] addressSpans = nameFinder.find(tokens);
    assertThat(addressSpans.length, is(1));

    String[] locations = Span.spansToStrings(addressSpans, tokens);
    assertThat(locations.length, is(1));
    assertThat(locations[0], is("12 Pleasent Street"));
}
Example #12
Source File: OpenNLPParser.java From baleen with Apache License 2.0 | 6 votes |
/** * Parses the sentence. * * @param sentence the sentence * @param tokens the tokens * @return the parses the */ private Parse parseSentence(final Sentence sentence, final Collection<WordToken> tokens) { final String text = sentence.getCoveredText(); final Parse parse = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 1, 0); // Add in the POS int index = 0; for (final WordToken token : tokens) { final Span span = new Span(token.getBegin() - sentence.getBegin(), token.getEnd() - sentence.getBegin()); parse.insert(new Parse(text, span, AbstractBottomUpParser.TOK_NODE, 0, index)); index++; } // Parse the sentence return parser.parse(parse); }
Example #13
Source File: OpenNLPParser.java From baleen with Apache License 2.0 | 6 votes |
/** * Adds the parsed as annotations. * * @param jCas the j cas * @param offset the offset * @param parsed the parsed */ private void addParsedAsAnnotations(final JCas jCas, final int offset, final Parse parsed) { final String type = parsed.getType(); // Ignore non phrase types if (nodeTypes.contains(type)) { // Otherwise add new ParseChunks final Span span = parsed.getSpan(); final PhraseChunk phraseChunk = new PhraseChunk(jCas); phraseChunk.setBegin(offset + span.getStart()); phraseChunk.setEnd(offset + span.getEnd()); phraseChunk.setChunkType(parsed.getType()); addToJCasIndex(phraseChunk); } Arrays.stream(parsed.getChildren()).forEach(p -> addParsedAsAnnotations(jCas, offset, p)); }
Example #14
Source File: Annotate.java From ixa-pipe-pos with Apache License 2.0 | 6 votes |
/**
 * Creates the multiword spans. It gets an initial list of spans (one per
 * token) and creates a multiword span when a multiword is detected.
 *
 * @param tokens
 *          the list of tokens
 * @param wfs
 *          the list of WFs
 * @param tokenSpans
 *          the list of initial token spans; MUTATED in place — matched token
 *          spans are replaced by a single multiword span
 */
private void getMultiWordSpans(final String[] tokens, final List<WF> wfs,
    final List<ixa.kaflib.Span<WF>> tokenSpans) {
  final Span[] multiWordSpans = this.multiWordMatcher
      .multiWordsToSpans(tokens);
  // counter tracks how many positions tokenSpans has shrunk by, so that
  // span indexes (which refer to the ORIGINAL token list) can be re-based.
  int counter = 0;
  for (final Span mwSpan : multiWordSpans) {
    final Integer fromIndex = mwSpan.getStart() - counter;
    final Integer toIndex = mwSpan.getEnd() - counter;
    // add to the counter the length of the span removed
    counter = counter + tokenSpans.subList(fromIndex, toIndex).size() - 1;
    // create multiword targets and Span (indexes into wfs are the original,
    // un-shifted span positions)
    final List<WF> wfTargets = wfs
        .subList(mwSpan.getStart(), mwSpan.getEnd());
    final ixa.kaflib.Span<WF> multiWordSpan = KAFDocument
        .newWFSpan(wfTargets);
    // remove the token Spans to be replaced by the multiword span
    tokenSpans.subList(fromIndex, toIndex).clear();
    // add the new Span containing several WFs (multiWordSpan)
    // the counter is used to allow matching the spans to the
    // tokenSpans list indexes
    tokenSpans.add(fromIndex, multiWordSpan);
  }
}
Example #15
Source File: MultiWordMatcher.java From ixa-pipe-pos with Apache License 2.0 | 6 votes |
/**
 * Get input text and join the multiwords found in the dictionary object.
 * Matched token runs are collapsed into a single "#"-joined token.
 *
 * @param tokens
 *          the input text
 * @return the output text with the joined multiwords
 */
public final String[] getTokensWithMultiWords(final String[] tokens) {
  final Span[] multiWordSpans = multiWordsToSpans(tokens);
  final List<String> tokenList = new ArrayList<String>(Arrays.asList(tokens));
  // counter tracks how many positions tokenList has shrunk by, so span
  // indexes (relative to the ORIGINAL token array) can be re-based.
  int counter = 0;
  for (final Span mwSpan : multiWordSpans) {
    final int fromIndex = mwSpan.getStart() - counter;
    final int toIndex = mwSpan.getEnd() - counter;
    // System.err.println(fromIndex + " " + toIndex);
    // add to the counter the length of the sublist removed
    // to allow the fromIndex and toIndex to match wrt to the tokenList
    // indexes
    counter = counter + tokenList.subList(fromIndex, toIndex).size() - 1;
    // create the multiword joining the sublist
    final String multiWord = Joiner.on("#").join(
        tokenList.subList(fromIndex, toIndex));
    // remove the sublist containing the tokens to be replaced in the span
    tokenList.subList(fromIndex, toIndex).clear();
    // add the multiword containing the tokens in one Span
    tokenList.add(fromIndex, multiWord);
  }
  return tokenList.toArray(new String[tokenList.size()]);
}
Example #16
Source File: MultiWordSample.java From ixa-pipe-pos with Apache License 2.0 | 6 votes |
/**
 * Builds a sample from a sentence and its multiword spans.
 * A null {@code multiwords} array is treated as empty; a null sentence is rejected.
 */
public MultiWordSample(final String id, final String[] sentence, Span[] multiwords) {
    this.id = id;
    if (sentence == null) {
        throw new IllegalArgumentException("sentence must not be null!");
    }
    if (multiwords == null) {
        multiwords = new Span[0];
    }
    // Both lists are defensively copied and exposed as unmodifiable views.
    this.tokens =
        Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(sentence)));
    this.names =
        Collections.unmodifiableList(new ArrayList<Span>(Arrays.asList(multiwords)));
    // TODO: Check that multiword spans are not overlapping, otherwise throw
    // exception
}
Example #17
Source File: DocSplitterOpenNLP15Impl.java From relex with Apache License 2.0 | 6 votes |
/**
 * Get the next sentence out of the buffered text.
 * Return null if there are no complete sentences in the buffer.
 */
public String getNextSentence() {
  // punt if no sentence detector: return the whole buffer as one sentence
  if (detector == null) {
    String rc = buffer;
    buffer = null;
    return rc;
  }
  Span spans[] = detector.sentPosDetect(buffer);
  if (0 == spans.length) return null;
  start = 0;
  // Walk candidate sentence boundaries until foundSentence() accepts one.
  // NOTE(review): foundSentence appears to set trimmedEnd/trimmedSentence as
  // side effects — confirm against its definition elsewhere in this class.
  for (Span span : spans) {
    end = span.getEnd();
    if (foundSentence(buffer)) break;
  }
  if (!foundSentence(buffer)) return null;
  // Consume the accepted sentence from the front of the buffer.
  buffer = buffer.substring(trimmedEnd);
  return trimmedSentence;
}
Example #18
Source File: OpenNLPNounPhraseFilter.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
/**
 * Converts a sequence of chunk tags into "NP" spans: a span opens at a
 * start tag and extends over any following continue tags; end is exclusive.
 */
private Span[] createSpan(String[] tags) {
  // Token index where the current NP began; -1 means "not inside an NP".
  int start=-1;
  List<Span> result =new ArrayList<>();
  for(int i=0; i<tags.length;i++){
    if(tags[i].equalsIgnoreCase(npChunker.getContinueTag())){
      //do nothing: still inside the current NP
    }
    else{
      if(start!=-1){
        // Close the NP that was open up to (but excluding) index i.
        result.add(new Span(start, i, "NP"));
        if(tags[i].equalsIgnoreCase(npChunker.getStartTag()))
          start=i; // a new NP begins immediately after the previous one
        else
          start=-1;
      }else if(tags[i].equalsIgnoreCase(npChunker.getStartTag())){
        start=i;
      }
    }
  }
  // Flush a trailing NP that runs to the end of the tag sequence.
  if(start!=-1){
    result.add(new Span(start, tags.length,"NP"));
  }
  return result.toArray(new Span[0]);
}
Example #19
Source File: CorefParse.java From knowledge-extraction with Apache License 2.0 | 6 votes |
/**
 * Recursively pretty-prints a parse tree, two spaces of indent per level,
 * appending the coreference-chain id from {@code parseMap} where present.
 *
 * @param p the parse node to print
 * @param deep current depth (indent level)
 */
private void print(Parse p, int deep) {
    // Skip token-level nodes.
    if (p.getType().length() > 1 && p.getType().substring(0, 2).equals(Parser.TOK_NODE))
        return;
    char[] spaces = new char[deep * 2];
    Arrays.fill(spaces, ' ');
    Span span = p.getSpan();
    System.out.print(new String(spaces) + p.getType() + " -- "
            + p.getText().substring(span.getStart(), span.getEnd()));
    if (parseMap.containsKey(p)) {
        System.out.print("#" + parseMap.get(p));
    }
    System.out.print("\n");
    for (Parse child : p.getChildren()) {
        // Pass the int directly; `new Integer(...)` is deprecated and the
        // immediate unboxing made it pointless.
        print(child, deep + 1);
    }
}
Example #20
Source File: OpenNLPSentenceDetectionTest.java From java_in_examples with Apache License 2.0 | 6 votes |
/**
 * Demonstrates OpenNLP sentence detection: prints each detected sentence,
 * then the character spans of all sentences.
 */
public static void main(String[] strings) throws Exception {
    String text = "“But I don’t want to go among mad people,” Alice remarked. " +
            "“Oh, you can’t help that,” said the Cat: “we’re all mad here. I’m mad. You’re mad.” " +
            "“How do you know I’m mad?” said Alice. " +
            "“You must be,” said the Cat, “or you wouldn’t have come here.”";
    // try-with-resources closes the model stream even if loading fails.
    try (InputStream modelIn = new FileInputStream(NATURAL_LANGUAGE_PROCESSING_SRC_MAIN_RESOURCES_EN_SENT_BIN)) {
        SentenceModel model = new SentenceModel(modelIn);
        SentenceDetectorME sentenceDetector = new SentenceDetectorME(model);
        // Idiomatic Java array declarations (was C-style `String sentences[]`).
        String[] sentences = sentenceDetector.sentDetect(text);
        Span[] sentenceSpans = sentenceDetector.sentPosDetect(text);
        for (String sentence : sentences) {
            System.out.println(sentence);
        }
        System.out.println(Arrays.deepToString(sentenceSpans));
    }
}
Example #21
Source File: OpenNLP.java From baleen with Apache License 2.0 | 6 votes |
/** Use the OpenNLP Sentence Detector to detect sentences and add them to the JCas index */
private List<Sentence> createBaseSentences(TextBlock block)
        throws AnalysisEngineProcessException {
    List<Sentence> sentences = new ArrayList<>();
    try {
        String text = block.getCoveredText();
        // One Sentence annotation per detected span, indexed as we go.
        for (Span span : sentenceDetector.sentPosDetect(text)) {
            Sentence sentence = block.newAnnotation(Sentence.class, span.getStart(), span.getEnd());
            addToJCasIndex(sentence);
            sentences.add(sentence);
        }
    } catch (Exception e) {
        // Wrap any detector failure in the UIMA processing exception.
        throw new AnalysisEngineProcessException(e);
    }
    return sentences;
}
Example #22
Source File: OpenNLPTokenizer.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
/**
 * Reads the whole input, runs sentence detection (and optional paragraph
 * detection), then tokenizes each sentence into word spans.
 */
void loadAll() throws IOException {
  fillBuffer(); // presumably fills this.fullText from the input — confirm
  String txtStr = new String(fullText);
  detectSentences(txtStr); // populates this.sentences
  if (paragraphOp != null) {
    detectParagraphs(txtStr);
  }
  // One Span[] of word boundaries per detected sentence.
  words = new Span[sentences.length][];
  for (int i = 0; i < sentences.length; i++) {
    splitWords(i);
  }
}
Example #23
Source File: Utils.java From knowledge-extraction with Apache License 2.0 | 5 votes |
/**
 * Recursively pretty-prints a parse tree, two spaces of indent per level.
 *
 * @param p the parse node to print
 * @param deep current depth (indent level)
 */
public static void printParseTree(Parse p, int deep) {
    // Skip token-level nodes.
    if (p.getType().length() > 1 && p.getType().substring(0, 2).equals(Parser.TOK_NODE))
        return;
    char[] spaces = new char[deep * 2];
    Arrays.fill(spaces, ' ');
    Span span = p.getSpan();
    System.out.println(new String(spaces) + p.getType() + " -- "
            + p.getText().substring(span.getStart(), span.getEnd()));
    for (Parse child : p.getChildren()) {
        // Pass the int directly; `new Integer(...)` is deprecated and the
        // immediate unboxing made it pointless.
        printParseTree(child, deep + 1);
    }
}
Example #24
Source File: FileDiscoverer.java From DataDefender with Apache License 2.0 | 5 votes |
/**
 * Runs the model's name finder over {@code handler}, logs each detected span
 * with its probability, and records the file as a match when the average
 * probability clears the configured threshold.
 *
 * @return the accumulated list of file matches
 */
protected List<FileMatchMetaData> getMatchedFiles(final Model model, String handler,
        String file, String recursivedir) {
    final String tokens[] = model.getTokenizer().tokenize(handler);
    final Span nameSpans[] = model.getNameFinder().find(tokens);
    final double[] spanProbs = model.getNameFinder().probs(nameSpans);

    // Collect one probability entry per detected span.
    List<Probability> probabilityList = new ArrayList<>();
    for (int i = 0; i < nameSpans.length; i++) {
        Span nameSpan = nameSpans[i];
        log.info("Span: " + nameSpan.toString());
        log.info("Covered text is: " + tokens[nameSpan.getStart()]);
        log.info("Probability is: " + spanProbs[i]);
        probabilityList.add(new Probability(tokens[nameSpan.getStart()], spanProbs[i]));
    }
    // Reset the finder's document-level adaptive state before the next file.
    model.getNameFinder().clearAdaptiveData();

    final double averageProbability = calculateAverage(probabilityList);
    if (averageProbability >= config.getProbabilityThreshold()) {
        final FileMatchMetaData result = new FileMatchMetaData(recursivedir, file);
        result.setAverageProbability(averageProbability);
        result.setModel(model.getName());
        fileMatches.add(result);
    }
    return fileMatches;
}
Example #25
Source File: LexicalLibOpenNlpImpl.java From SciGraph with Apache License 2.0 | 5 votes |
/**
 * Extracts noun and verb chunks from the text, one sentence at a time.
 * A chunk starts at a start tag and absorbs consecutive continue tags;
 * offsets are made text-absolute by adding the running sentence start.
 *
 * NOTE(review): sentenceStart advances by the last span end + 2, which
 * assumes a fixed two-character gap between sentences — confirm this matches
 * how extractSentences splits the input.
 */
@Override
@Beta
public List<Token<String>> getEntities(String text) {
  int sentenceStart = 0;
  List<Token<String>> retChunks = new LinkedList<>();
  for (String sentence: extractSentences(text)) {
    // Tokenize twice: once for the token text, once for character offsets.
    String[] tokens = tokenizer.tokenize(sentence);
    Span[] spans = tokenizer.tokenizePos(sentence);
    String[] tags = tagger.tag(tokens);
    for (int i = 0; i < tags.length; i++) {
      List<String> chunk = new LinkedList<>();
      int start = i; // first token of the chunk, for the start offset
      if (PhraseChunker.START_NOUN_TAGS.contains(tags[i])) {
        chunk.add(tokens[i]);
        // Absorb all continuation tokens of the noun phrase.
        while (i + 1 < tags.length
            && PhraseChunker.CONTINUE_NOUN_TAGS.contains(tags[i + 1])) {
          chunk.add(tokens[i+1]);
          i++;
        }
        // Join with spaces but keep commas attached to the preceding word.
        retChunks.add(new NounChunk(on(' ').join(chunk).replace(" ,", ","),
            sentenceStart + spans[start].getStart(),
            sentenceStart + spans[i].getEnd()));
      } else if (PhraseChunker.START_VERB_TAGS.contains(tags[i])) {
        chunk.add(tokens[i]);
        // Absorb all continuation tokens of the verb phrase.
        while (i + 1 < tags.length
            && PhraseChunker.CONTINUE_VERB_TAGS.contains(tags[i + 1])) {
          chunk.add(tokens[i+1]);
          i++;
        }
        retChunks.add(new VerbChunk(on(' ').join(chunk).replace(" ,", ","),
            sentenceStart + spans[start].getStart(),
            sentenceStart + spans[i].getEnd()));
      }
    }
    sentenceStart += spans[spans.length - 1].getEnd() + 2;
  }
  return retChunks;
}
Example #26
Source File: OpenNlpChunkerConceptProvider.java From bioasq with Apache License 2.0 | 5 votes |
/**
 * Returns the chunk spans with an explicit "O" (outside) span inserted into
 * every gap between consecutive chunks, so the result covers a contiguous
 * token range.
 *
 * Rewritten as a single linear pass; the previous version used
 * {@code List.indexOf} inside a loop (O(n^2)) to locate the insertion point.
 *
 * @param spans chunker output, in token order
 * @return spans interleaved with gap-filling "O" spans
 */
private static List<Span> insertOutsideSpans(Span[] spans) {
  List<Span> spansWithO = new ArrayList<>(spans.length * 2);
  for (int i = 0; i < spans.length; i++) {
    spansWithO.add(spans[i]);
    // Fill the gap (if any) before the next chunk with an "O" span.
    if (i < spans.length - 1 && spans[i].getEnd() < spans[i + 1].getStart()) {
      spansWithO.add(new Span(spans[i].getEnd(), spans[i + 1].getStart(), "O"));
    }
  }
  return spansWithO;
}
Example #27
Source File: OpenNlpNerRecommender.java From inception with Apache License 2.0 | 5 votes |
/**
 * Extract AnnotatedTokenPairs with info on predicted and gold label for each token of the given
 * sentence.
 */
private List<LabelPair> determineLabelsForASentence(String[] sentence, Span[] predictedNames,
        Span[] goldNames) {
    // Index of the span currently being matched against tokens, for each list.
    int predictedNameIdx = 0;
    int goldNameIdx = 0;
    List<LabelPair> labelPairs = new ArrayList<>();
    // Spans store which tokens are part of it as [begin,end).
    // Tokens are counted 0 to length of sentence.
    // Therefore go through all tokens, determine which span they are part of
    // for predictions and gold ones. Assign label accordingly to the annotated-token.
    for (int i = 0; i < sentence.length; i++) {
        String predictedLabel = NO_NE_TAG;
        if (predictedNameIdx < predictedNames.length) {
            Span predictedName = predictedNames[predictedNameIdx];
            predictedLabel = determineLabel(predictedName, i);
            // NOTE(review): with an exclusive end, `i > getEnd()` advances one
            // token later than `i >= getEnd()` would — confirm determineLabel
            // compensates for this, otherwise it is an off-by-one.
            if (i > predictedName.getEnd()) {
                predictedNameIdx++;
            }
        }
        String goldLabel = NO_NE_TAG;
        if (goldNameIdx < goldNames.length) {
            Span goldName = goldNames[goldNameIdx];
            goldLabel = determineLabel(goldName, i);
            if (i > goldName.getEnd()) {
                goldNameIdx++;
            }
        }
        labelPairs.add(new LabelPair(goldLabel, predictedLabel));
    }
    return labelPairs;
}
Example #28
Source File: LexicalLibOpenNlpImpl.java From SciGraph with Apache License 2.0 | 5 votes |
/**
 * Tags every token of the sentence with its part of speech, returning one
 * PosToken per token with its character offsets in the sentence.
 */
@Override
public List<PosToken> tagPOS(String sentence) {
    // Tokenize once for the token text and once for the character offsets.
    String[] words = tokenizer.tokenize(sentence);
    Span[] wordSpans = tokenizer.tokenizePos(sentence);
    String[] posTags = tagger.tag(words);

    List<PosToken> result = new ArrayList<>();
    for (int i = 0; i < words.length; i++) {
        result.add(new PosToken(words[i], posTags[i],
                wordSpans[i].getStart(), wordSpans[i].getEnd()));
    }
    return result;
}
Example #29
Source File: CorefParse.java From knowledge-extraction with Apache License 2.0 | 5 votes |
/**
 * Prints the parse tree in Penn-Treebank-style bracket notation, appending
 * the coreference-chain id from {@code parseMap} where present.
 * Inter-node text (whitespace/punctuation between child spans) is printed
 * by tracking a running `start` cursor over the parse text.
 */
private void show(Parse p) {
  int start;
  start = p.getSpan().getStart();
  if (!p.getType().equals(Parser.TOK_NODE)) {
    System.out.print("(");
    System.out.print(p.getType());
    if (parseMap.containsKey(p)) {
      System.out.print("#" + parseMap.get(p));
    }
    // System.out.print(p.hashCode()+"-"+parseMap.containsKey(p));
    System.out.print(" ");
  }
  Parse[] children = p.getChildren();
  for (int pi = 0, pn = children.length; pi < pn; pi++) {
    Parse c = children[pi];
    Span s = c.getSpan();
    // Emit any text lying between the cursor and this child's span.
    if (start < s.getStart()) {
      System.out.print(p.getText().substring(start, s.getStart()));
    }
    show(c);
    start = s.getEnd();
  }
  // Emit trailing text after the last child, up to this node's span end.
  System.out.print(p.getText().substring(start, p.getSpan().getEnd()));
  if (!p.getType().equals(Parser.TOK_NODE)) {
    System.out.print(")");
  }
}
Example #30
Source File: NERScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException{ if (!modelsAreInitialized)init(); //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); NameFinderME nameFinder = new NameFinderME(this.nerModel); Parser parser = ParserFactory.create( this.parserModel, 20, // beam size 0.95); // advance percentage //find sentences, tokenize each, parse each, return top parse for each String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ //String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); //StringTokenizer st = new StringTokenizer(tks[i]); //There are several tokenizers available. SimpleTokenizer works best Tokenizer tokenizer = SimpleTokenizer.INSTANCE; for (int si = 0; si < sentences.length; si++) { Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); Span[] names = nameFinder.find(tokens); for (int ni = 0; ni < names.length; ni++) { Span startSpan = tokenSpans[names[ni].getStart()]; int nameStart = startSpan.getStart(); Span endSpan = tokenSpans[names[ni].getEnd() - 1]; int nameEnd = endSpan.getEnd(); String name = sentences[si].substring(nameStart, nameEnd); System.out.println(name); } } String sent= StringUtils.join(tokenizer," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }