opennlp.tools.util.Span Java Examples

The following examples show how to use opennlp.tools.util.Span. They are drawn from open-source projects; the source file, originating project, and license are listed above each example.
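Before the project examples, here is a minimal self-contained sketch of the core Span API (written for this page, not taken from any project below): a Span is a half-open interval [start..end) over token or character indexes, optionally carrying a type label, and the static Span.spansToStrings helper maps token-index spans back onto a token array.

import opennlp.tools.util.Span;

public class SpanBasics {
    public static void main(String[] args) {
        String[] tokens = {"John", "lives", "in", "New", "York"};

        // A Span is a half-open interval [start..end), optionally typed
        Span person = new Span(0, 1, "person");
        Span location = new Span(3, 5, "location");

        System.out.println(person);              // [0..1) person
        System.out.println(location.getStart()); // 3
        System.out.println(location.getEnd());   // 5 (exclusive)
        System.out.println(location.length());   // 2

        // Map token-index spans back to the text they cover
        String[] covered = Span.spansToStrings(new Span[] {person, location}, tokens);
        System.out.println(covered[0]); // John
        System.out.println(covered[1]); // New York
    }
}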
Example #1
Source File: NamedEntityRecognitionUnitTest.java    From tutorials with MIT License
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
    
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
    
    InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
    NameFinderME nameFinderME = new NameFinderME(model);
    List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
    assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
    List<String> names = new ArrayList<String>();
    int k = 0;
    for (Span s : spans) {
        names.add("");
        for (int index = s.getStart(); index < s.getEnd(); index++) {
            names.set(k, names.get(k) + tokens[index]);
        }
        k++;
    }
    assertThat(names).contains("John","Leonard","Penny");
}
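A note on the extraction loop above: it concatenates the tokens of each span with no separator, which works here only because every detected name is a single token. For multi-token names, the built-in helper used in several later examples is simpler and space-separates the tokens; a one-line equivalent, assuming the same spans list and tokens array:

    String[] names = Span.spansToStrings(spans.toArray(new Span[0]), tokens);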
 
Example #2
Source File: SentenceTokenizer.java    From wiseowl with MIT License
@Override
public final boolean incrementToken() throws IOException {
   if (sentences == null) {
     fillSentences();
   }
   
   if (tokenOffset >= sentences.length) {
     return false;
   }
   
   Span sentenceSpan = sentences[tokenOffset];
   clearAttributes();
   int start = sentenceSpan.getStart();
   int end   = sentenceSpan.getEnd();
   termAtt.copyBuffer(inputSentence, start, end - start);
   posIncrAtt.setPositionIncrement(1);
   offsetAtt.setOffset(start, end);
   tokenOffset++;
   
   return true;
}
 
Example #3
Source File: Chapter1.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void nameFinderExample() {
    try {
        String[] sentences = {
            "Tim was a good neighbor. Perhaps not as good a Bob "
            + "Haywood, but still pretty good. Of course Mr. Adam "
            + "took the cake!"};
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
        TokenNameFinderModel model = new TokenNameFinderModel(new File(
                "C:\\OpenNLP Models", "en-ner-person.bin"));
        NameFinderME finder = new NameFinderME(model);

        for (String sentence : sentences) {
            // Split the sentence into tokens
            String[] tokens = tokenizer.tokenize(sentence);

            // Find the names in the tokens and return Span objects
            Span[] nameSpans = finder.find(tokens);

            // Print the names extracted from the tokens using the Span data
            System.out.println(Arrays.toString(
                    Span.spansToStrings(nameSpans, tokens)));
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example #4
Source File: OpenNlpChunkerConceptProvider.java    From bioasq with Apache License 2.0
@Override
public List<Concept> getConcepts(JCas jcas) throws AnalysisEngineProcessException {
  List<Token> tokens = TypeUtil.getOrderedTokens(jcas);
  String[] texts = tokens.stream().map(Token::getCoveredText).toArray(String[]::new);
  String[] pos = tokens.stream().map(Token::getPartOfSpeech).toArray(String[]::new);
  List<Span> spans = insertOutsideSpans(chunker.chunkAsSpans(texts, pos));
  return IntStream.rangeClosed(0, spans.size() - type.size())
          .mapToObj(i -> spans.subList(i, i + type.size()))
          .filter(spansSublist -> type
                  .equals(spansSublist.stream().map(Span::getType).collect(toList())))
          .map(spansSublist -> tokens.subList(spansSublist.get(0).getStart(),
                  spansSublist.get(spansSublist.size() - 1).getEnd()))
          .filter(toks -> toks.size() >= minLength)
          .map(toks -> TypeFactory.createConceptMention(jcas, getFirstTokenBegin(toks),
                  getLastTokenEnd(toks)))
          .map(cmention -> TypeFactory.createConcept(jcas, cmention,
                  TypeFactory.createConceptType(jcas, "opennlp:" + String.join("-", type))))
          .collect(toList());
}
 
Example #5
Source File: IntentTrainer.java    From org.openhab.ui.habot with Eclipse Public License 1.0
/**
 * Tries to interpret the natural language query as an {@link Intent}.
 *
 * @param query the natural language query
 * @return the resulting {@link Intent}
 */
public Intent interpret(String query) {
    String[] tokens = this.tokenizer.tokenize(query.toLowerCase());
    // remove any trailing punctuation
    tokens[tokens.length - 1] = tokens[tokens.length - 1].replaceAll("\\s*[!?.]+$", "");

    double[] outcome = categorizer.categorize(tokens);
    logger.debug("{}", categorizer.getAllResults(outcome));

    Intent intent = new Intent(categorizer.getBestCategory(outcome));

    Span[] spans = nameFinder.find(tokens);
    String[] names = Span.spansToStrings(spans, tokens);
    for (int i = 0; i < spans.length; i++) {
        intent.getEntities().put(spans[i].getType(), names[i]);
    }

    logger.debug("{}", intent.toString());

    return intent;
}
 
Example #6
Source File: OpenNLPTokenizer.java    From lucene-solr with Apache License 2.0
@Override
protected boolean incrementWord() {
  if (termSpans == null || termNum == termSpans.length) {
    return false;
  }
  clearAttributes();
  Span term = termSpans[termNum];
  termAtt.copyBuffer(buffer, sentenceStart + term.getStart(), term.length());
  offsetAtt.setOffset(correctOffset(offset + sentenceStart + term.getStart()),
                      correctOffset(offset + sentenceStart + term.getEnd()));
  if (termNum == termSpans.length - 1) {
    flagsAtt.setFlags(flagsAtt.getFlags() | EOS_FLAG_BIT); // mark the last token in the sentence with EOS_FLAG_BIT
  }
  ++termNum;
  return true;
}
 
Example #7
Source File: OpenNlpService.java    From elasticsearch-ingest-opennlp with Apache License 2.0
public ExtractedEntities find(String content, String field) {
    try {
        if (!nameFinderModels.containsKey(field)) {
            throw new ElasticsearchException("Could not find field [{}], possible values {}", field, nameFinderModels.keySet());
        }
        TokenNameFinderModel finderModel = nameFinderModels.get(field);
        if (threadLocal.get() == null || !threadLocal.get().equals(finderModel)) {
            threadLocal.set(finderModel);
        }

        String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
        Span[] spans = new NameFinderME(finderModel).find(tokens);

        return new ExtractedEntities(tokens, spans);
    } finally {
        threadLocal.remove();
    }
}
 
Example #8
Source File: TestNER.java    From Mutters with Apache License 2.0
@Test
public void testPersonNER() throws Exception
{
  URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-persons.bin");
  assertThat(modelUrl, is(notNullValue()));

  TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
  assertThat(model, is(notNullValue()));

  NameFinderME nameFinder = new NameFinderME(model);
  String[] tokens = SimpleTokenizer.INSTANCE
      .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
  assertThat(tokens.length, is(15));

  Span[] spans = nameFinder.find(tokens);
  assertThat(spans.length, is(2));

  String[] names = Span.spansToStrings(spans, tokens);
  assertThat(names.length, is(2));
  assertThat(names[0], is("John Smith"));
  assertThat(names[1], is("Anne Green"));
}
 
Example #9
Source File: TestNER.java    From Mutters with Apache License 2.0
@Test
public void testLocationNER() throws Exception
{
  URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-locations.bin");
  assertThat(modelUrl, is(notNullValue()));

  TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
  assertThat(model, is(notNullValue()));

  NameFinderME nameFinder = new NameFinderME(model);
  String[] tokens = SimpleTokenizer.INSTANCE
      .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
  assertThat(tokens.length, is(15));

  Span[] spans = nameFinder.find(tokens);
  assertThat(spans.length, is(2));

  String[] locations = Span.spansToStrings(spans, tokens);
  assertThat(locations.length, is(2));
  assertThat(locations[0], is("New York"));
  assertThat(locations[1], is("London"));
}
 
Example #10
Source File: TestNER.java    From Mutters with Apache License 2.0
@Test
public void testDateNER() throws Exception
{
  URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-dates.bin");
  assertThat(modelUrl, is(notNullValue()));

  TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
  assertThat(model, is(notNullValue()));

  NameFinderME nameFinder = new NameFinderME(model);
  String[] tokens = SimpleTokenizer.INSTANCE
      .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
  assertThat(tokens.length, is(15));

  Span[] spans = nameFinder.find(tokens);
  assertThat(spans.length, is(1));

  String[] locations = Span.spansToStrings(spans, tokens);
  assertThat(locations.length, is(1));
  assertThat(locations[0], is("today"));
}
 
Example #11
Source File: TestNER.java    From Mutters with Apache License 2.0
@Test
public void testAddressNER() throws Exception
{
  URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-address.bin");
  assertThat(modelUrl, is(notNullValue()));

  TokenNameFinderModel model = new TokenNameFinderModel(modelUrl);
  assertThat(model, is(notNullValue()));

  NameFinderME nameFinder = new NameFinderME(model);
  String[] tokens = SimpleTokenizer.INSTANCE.tokenize("Send a taxi to 12 Pleasent Street");
  Span[] spans = nameFinder.find(tokens);
  assertThat(spans.length, is(1));

  String[] locations = Span.spansToStrings(spans, tokens);
  assertThat(locations.length, is(1));
  assertThat(locations[0], is("12 Pleasent Street"));
}
 
Example #12
Source File: OpenNLPParser.java    From baleen with Apache License 2.0
/**
 * Parses the sentence.
 *
 * @param sentence the sentence
 * @param tokens the tokens
 * @return the parse of the sentence
 */
private Parse parseSentence(final Sentence sentence, final Collection<WordToken> tokens) {
  final String text = sentence.getCoveredText();

  final Parse parse =
      new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 1, 0);

  // Add in the POS
  int index = 0;
  for (final WordToken token : tokens) {
    final Span span =
        new Span(token.getBegin() - sentence.getBegin(), token.getEnd() - sentence.getBegin());

    parse.insert(new Parse(text, span, AbstractBottomUpParser.TOK_NODE, 0, index));
    index++;
  }

  // Parse the sentence
  return parser.parse(parse);
}
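Note that the spans handed to Parse are sentence-relative (token.getBegin() - sentence.getBegin()), since the Parse is built over the sentence's covered text alone; Example #13 adds the sentence offset back when converting parse spans into document-level annotations.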
 
Example #13
Source File: OpenNLPParser.java    From baleen with Apache License 2.0
/**
 * Adds the parse tree as annotations.
 *
 * @param jCas the JCas
 * @param offset the document offset of the parsed sentence
 * @param parsed the parse to convert
 */
private void addParsedAsAnnotations(final JCas jCas, final int offset, final Parse parsed) {
  final String type = parsed.getType();

  // Ignore non phrase types
  if (nodeTypes.contains(type)) {
    // Otherwise add new ParseChunks
    final Span span = parsed.getSpan();
    final PhraseChunk phraseChunk = new PhraseChunk(jCas);
    phraseChunk.setBegin(offset + span.getStart());
    phraseChunk.setEnd(offset + span.getEnd());
    phraseChunk.setChunkType(parsed.getType());

    addToJCasIndex(phraseChunk);
  }

  Arrays.stream(parsed.getChildren()).forEach(p -> addParsedAsAnnotations(jCas, offset, p));
}
 
Example #14
Source File: Annotate.java    From ixa-pipe-pos with Apache License 2.0
/**
 * Creates the multiword spans. It gets an initial list of spans (one per
 * token) and creates a multiword span when a multiword is detected.
 * 
 * @param tokens
 *          the list of tokens
 * @param wfs
 *          the list of WFs
 * @param tokenSpans
 *          the list of initial token spans
 */
private void getMultiWordSpans(final String[] tokens, final List<WF> wfs,
    final List<ixa.kaflib.Span<WF>> tokenSpans) {
  final Span[] multiWordSpans = this.multiWordMatcher
      .multiWordsToSpans(tokens);
  int counter = 0;
  for (final Span mwSpan : multiWordSpans) {
    final Integer fromIndex = mwSpan.getStart() - counter;
    final Integer toIndex = mwSpan.getEnd() - counter;
    // add to the counter the length of the span removed
    counter = counter + tokenSpans.subList(fromIndex, toIndex).size() - 1;
    // create multiword targets and Span
    final List<WF> wfTargets = wfs
        .subList(mwSpan.getStart(), mwSpan.getEnd());
    final ixa.kaflib.Span<WF> multiWordSpan = KAFDocument
        .newWFSpan(wfTargets);
    // remove the token Spans to be replaced by the multiword span
    tokenSpans.subList(fromIndex, toIndex).clear();
    // add the new Span containing several WFs (multiWordSpan)
    // the counter is used to allow matching the spans to the
    // tokenSpans list indexes
    tokenSpans.add(fromIndex, multiWordSpan);
  }
}
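A worked illustration of the counter logic: collapsing a multiword shortens tokenSpans, so every later span's indexes shift left by (span length - 1). With multiword spans [1..3) and [4..6) over six tokens, the first merge shrinks the list to five entries and sets the counter to 1, so the second span correctly lands at indexes 3..5 of the collapsed list.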
 
Example #15
Source File: MultiWordMatcher.java    From ixa-pipe-pos with Apache License 2.0
/**
 * Get input text and join the multiwords found in the dictionary object.
 * 
 * @param tokens
 *          the input text
 * @return the output text with the joined multiwords
 */
public final String[] getTokensWithMultiWords(final String[] tokens) {
  final Span[] multiWordSpans = multiWordsToSpans(tokens);
  final List<String> tokenList = new ArrayList<String>(Arrays.asList(tokens));
  int counter = 0;
  for (final Span mwSpan : multiWordSpans) {
    final int fromIndex = mwSpan.getStart() - counter;
    final int toIndex = mwSpan.getEnd() - counter;
    // System.err.println(fromIndex + " " + toIndex);
    // add to the counter the length of the sublist removed
    // to allow the fromIndex and toIndex to match wrt to the tokenList
    // indexes
    counter = counter + tokenList.subList(fromIndex, toIndex).size() - 1;
    // create the multiword joining the sublist
    final String multiWord = Joiner.on("#").join(
        tokenList.subList(fromIndex, toIndex));
    // remove the sublist containing the tokens to be replaced in the span
    tokenList.subList(fromIndex, toIndex).clear();
    // add the multiword containing the tokens in one Span
    tokenList.add(fromIndex, multiWord);
  }
  return tokenList.toArray(new String[tokenList.size()]);
}
 
Example #16
Source File: MultiWordSample.java    From ixa-pipe-pos with Apache License 2.0
public MultiWordSample(final String id, final String[] sentence,
    Span[] multiwords) {

  this.id = id;
  if (sentence == null) {
    throw new IllegalArgumentException("sentence must not be null!");
  }
  if (multiwords == null) {
    multiwords = new Span[0];
  }
  this.tokens = Collections.unmodifiableList(new ArrayList<String>(Arrays
      .asList(sentence)));
  this.names = Collections.unmodifiableList(new ArrayList<Span>(Arrays
      .asList(multiwords)));
  // TODO: Check that multiword spans are not overlapping, otherwise throw
  // exception
}
 
Example #17
Source File: DocSplitterOpenNLP15Impl.java    From relex with Apache License 2.0
/**
 * Get the next sentence out of the buffered text.
 * Return null if there are no complete sentences in the buffer.
 */
public String getNextSentence()
{
	// punt if no sentence detector
	if (detector == null)
	{
		String rc = buffer;
		buffer = null;
		return rc;
	}

	Span[] spans = detector.sentPosDetect(buffer);
	if (0 == spans.length) return null;

	start = 0;
	for (Span span : spans)
	{
		end = span.getEnd();
		if (foundSentence(buffer)) break;
	}
	if (!foundSentence(buffer)) return null;

	buffer = buffer.substring(trimmedEnd);
	return trimmedSentence;
}
 
Example #18
Source File: OpenNLPNounPhraseFilter.java    From jate with GNU Lesser General Public License v3.0
private Span[] createSpan(String[] tags) {
    int start=-1;
    List<Span> result =new ArrayList<>();
    for(int i=0; i<tags.length;i++){
        if(tags[i].equalsIgnoreCase(npChunker.getContinueTag())){
            //do nothing
        }
        else{
            if(start!=-1){
                result.add(new Span(start, i, "NP"));
                if(tags[i].equalsIgnoreCase(npChunker.getStartTag()))
                    start=i;
                else
                    start=-1;
            }else if(tags[i].equalsIgnoreCase(npChunker.getStartTag())){
                start=i;
            }
        }
    }
    if(start!=-1){
        result.add(new Span(start, tags.length,"NP"));
    }
    return result.toArray(new Span[0]);
}
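In effect this converts the chunker's start/continue tag sequence into half-open NP spans: a start tag opens a span, the next non-continue tag closes it at the current index (reopening immediately if that tag is itself a start tag), and the check after the loop flushes a span still open at the end of the sentence.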
 
Example #19
Source File: CorefParse.java    From knowledge-extraction with Apache License 2.0
private void print(Parse p, int deep) {
	if (p.getType().length() > 1 && p.getType().substring(0, 2).equals(Parser.TOK_NODE))
		return;

	char[] spaces = new char[deep * 2];
	Arrays.fill(spaces, ' ');
	Span span = p.getSpan();
	System.out.print(new String(spaces) + p.getType() + " -- "
			+ p.getText().substring(span.getStart(), span.getEnd()));
	if (parseMap.containsKey(p)) {
		System.out.print("#" + parseMap.get(p));
	}
	System.out.print("\n");
	for (Parse child : p.getChildren()) {
		print(child, deep + 1);
	}
}
 
Example #20
Source File: OpenNLPSentenceDetectionTest.java    From java_in_examples with Apache License 2.0
public static void main(String[] strings) throws Exception {
    String text = "“But I don’t want to go among mad people,” Alice remarked. " +
            "“Oh, you can’t help that,” said the Cat: “we’re all mad here. I’m mad. You’re mad.” " +
            "“How do you know I’m mad?” said Alice. " +
            "“You must be,” said the Cat, “or you wouldn’t have come here.”";

    try (InputStream modelIn = new FileInputStream(NATURAL_LANGUAGE_PROCESSING_SRC_MAIN_RESOURCES_EN_SENT_BIN)) {
        SentenceModel model = new SentenceModel(modelIn);
        SentenceDetectorME sentenceDetector = new SentenceDetectorME(model);
        String[] sentences = sentenceDetector.sentDetect(text);
        Span[] sentences2 = sentenceDetector.sentPosDetect(text);
        for (String sentence : sentences) {
            System.out.println(sentence);
        }
        System.out.println(Arrays.deepToString(sentences2));
    }
}
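The two detector calls differ only in return type: sentDetect returns the sentence strings themselves, while sentPosDetect returns character-offset Spans into the original text, which is the variant to prefer when downstream annotations need exact offsets (as in Examples #17 and #21).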
 
Example #21
Source File: OpenNLP.java    From baleen with Apache License 2.0
/** Use the OpenNLP Sentence Detector to detect sentences and add them to the JCas index */
private List<Sentence> createBaseSentences(TextBlock block)
    throws AnalysisEngineProcessException {
  List<Sentence> sentences = new ArrayList<>();

  try {
    String text = block.getCoveredText();
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(text);

    for (Span sentSpan : sentenceSpans) {
      Sentence sent = block.newAnnotation(Sentence.class, sentSpan.getStart(), sentSpan.getEnd());

      addToJCasIndex(sent);
      sentences.add(sent);
    }
  } catch (Exception e) {
    throw new AnalysisEngineProcessException(e);
  }

  return sentences;
}
 
Example #22
Source File: OpenNLPTokenizer.java    From jate with GNU Lesser General Public License v3.0
void loadAll() throws IOException {
    fillBuffer();
    String txtStr = new String(fullText);
    detectSentences(txtStr);
    if (paragraphOp != null) {
        detectParagraphs(txtStr);
    }
    words = new Span[sentences.length][];
    for (int i = 0; i < sentences.length; i++) {
        splitWords(i);
    }
}
 
Example #23
Source File: Utils.java    From knowledge-extraction with Apache License 2.0
public static void printParseTree(Parse p, int deep) {
    if (p.getType().length() > 1 && p.getType().substring(0, 2).equals(Parser.TOK_NODE))
        return;

    char[] spaces = new char[deep * 2];
    Arrays.fill(spaces, ' ');
    Span span = p.getSpan();
    System.out.println(new String(spaces) + p.getType() + " -- "
            + p.getText().substring(span.getStart(), span.getEnd()));
    for (Parse child : p.getChildren()) {
        printParseTree(child, deep + 1);
    }
}
 
Example #24
Source File: FileDiscoverer.java    From DataDefender with Apache License 2.0
protected List<FileMatchMetaData> getMatchedFiles(final Model model, String handler, String file, String recursivedir) {

    final String[] tokens    = model.getTokenizer().tokenize(handler);
    final Span[] nameSpans   = model.getNameFinder().find(tokens);
    final double[] spanProbs = model.getNameFinder().probs(nameSpans);
    List<Probability> probabilityList = new ArrayList<>();

    for (int i = 0; i < nameSpans.length; i++) {
        log.info("Span: " + nameSpans[i].toString());
        log.info("Covered text is: " + tokens[nameSpans[i].getStart()]);
        log.info("Probability is: " + spanProbs[i]);
        probabilityList.add(new Probability(tokens[nameSpans[i].getStart()], spanProbs[i]));
    }

    model.getNameFinder().clearAdaptiveData();

    final double averageProbability = calculateAverage(probabilityList);

    if (averageProbability >= config.getProbabilityThreshold()) {
        final FileMatchMetaData result = new FileMatchMetaData(recursivedir, file);

        result.setAverageProbability(averageProbability);
        result.setModel(model.getName());
        fileMatches.add(result);
    }

    return fileMatches;
}
 
Example #25
Source File: LexicalLibOpenNlpImpl.java    From SciGraph with Apache License 2.0
@Override
@Beta
public List<Token<String>> getEntities(String text) {
  int sentenceStart = 0;
  List<Token<String>> retChunks = new LinkedList<>();
  for (String sentence: extractSentences(text)) {
    String[] tokens = tokenizer.tokenize(sentence);
    Span[] spans = tokenizer.tokenizePos(sentence);
    String[] tags = tagger.tag(tokens);

    for (int i = 0; i < tags.length; i++) {
      List<String> chunk = new LinkedList<>();
      int start = i;
      
      if (PhraseChunker.START_NOUN_TAGS.contains(tags[i])) {
        chunk.add(tokens[i]);
        while (i + 1 < tags.length && PhraseChunker.CONTINUE_NOUN_TAGS.contains(tags[i + 1])) {
          chunk.add(tokens[i+1]);
          i++;
        }
        retChunks.add(new NounChunk(on(' ').join(chunk).replace(" ,", ","), 
            sentenceStart + spans[start].getStart(), sentenceStart + spans[i].getEnd()));
      } else if (PhraseChunker.START_VERB_TAGS.contains(tags[i])) {
        chunk.add(tokens[i]);
        while (i + 1 < tags.length && PhraseChunker.CONTINUE_VERB_TAGS.contains(tags[i + 1])) {
          chunk.add(tokens[i+1]);
          i++;
        }
        retChunks.add(new VerbChunk(on(' ').join(chunk).replace(" ,", ","), 
            sentenceStart + spans[start].getStart(), sentenceStart + spans[i].getEnd()));
      }
    }
    sentenceStart += spans[spans.length - 1].getEnd() + 2;
  }
  return retChunks;
}
 
Example #26
Source File: OpenNlpChunkerConceptProvider.java    From bioasq with Apache License 2.0
private static List<Span> insertOutsideSpans(Span[] spans) {
  List<Span> spansWithO = new LinkedList<>(Arrays.asList(spans));
  IntStream.range(0, spans.length - 1).filter(i -> spans[i].getEnd() < spans[i + 1].getStart())
          .forEach(i -> spansWithO.add(spansWithO.indexOf(spans[i + 1]),
                  new Span(spans[i].getEnd(), spans[i + 1].getStart(), "O")));
  return spansWithO;
}
 
Example #27
Source File: OpenNlpNerRecommender.java    From inception with Apache License 2.0
/**
 * Extract AnnotatedTokenPairs with info on predicted and gold label for each token of the given
 * sentence.
 */
private List<LabelPair> determineLabelsForASentence(String[] sentence,
        Span[] predictedNames, Span[] goldNames)
{
    int predictedNameIdx = 0;
    int goldNameIdx = 0;
    
    List<LabelPair> labelPairs = new ArrayList<>();
    // Spans store which tokens are part of it as [begin,end). 
    // Tokens are counted 0 to length of sentence.
    // Therefore go through all tokens, determine which span they are part of 
    // for predictions and gold ones. Assign label accordingly to the annotated-token.
    for (int i = 0; i < sentence.length; i++) {

        String predictedLabel = NO_NE_TAG;
        if (predictedNameIdx < predictedNames.length) {
            Span predictedName = predictedNames[predictedNameIdx];
            predictedLabel = determineLabel(predictedName, i);

            if (i > predictedName.getEnd()) {
                predictedNameIdx++;
            }
        }

        String goldLabel = NO_NE_TAG;
        if (goldNameIdx < goldNames.length) {
            Span goldName = goldNames[goldNameIdx];
            goldLabel = determineLabel(goldName, i);
            if (i > goldName.getEnd()) {
                goldNameIdx++;
            }
        }

        labelPairs.add(new LabelPair(goldLabel, predictedLabel));

    }
    return labelPairs;
}
 
Example #28
Source File: LexicalLibOpenNlpImpl.java    From SciGraph with Apache License 2.0
@Override
public List<PosToken> tagPOS(String sentence) {
  String[] tokens = tokenizer.tokenize(sentence);
  Span[] spans = tokenizer.tokenizePos(sentence);
  String[] tags = tagger.tag(tokens);
  List<PosToken> poss = new ArrayList<>();
  for (int i = 0; i < tokens.length; i++) {
    poss.add(new PosToken(tokens[i], tags[i], spans[i].getStart(), spans[i].getEnd()));
  }
  return poss;
}
 
Example #29
Source File: CorefParse.java    From knowledge-extraction with Apache License 2.0
private void show(Parse p) {
	int start = p.getSpan().getStart();
	if (!p.getType().equals(Parser.TOK_NODE)) {
		System.out.print("(");
		System.out.print(p.getType());
		if (parseMap.containsKey(p)) {
			System.out.print("#" + parseMap.get(p));
		}
		// System.out.print(p.hashCode()+"-"+parseMap.containsKey(p));
		System.out.print(" ");
	}
	Parse[] children = p.getChildren();
	for (int pi = 0, pn = children.length; pi < pn; pi++) {
		Parse c = children[pi];
		Span s = c.getSpan();
		if (start < s.getStart()) {
			System.out.print(p.getText().substring(start, s.getStart()));
		}
		show(c);
		start = s.getEnd();
	}
	System.out.print(p.getText().substring(start, p.getSpan().getEnd()));
	if (!p.getType().equals(Parser.TOK_NODE)) {
		System.out.print(")");
	}
}
 
Example #30
Source File: NERScorer.java    From uncc2014watsonsim with GNU General Public License v2.0
public Parse[] parsePassageText(String p) throws InvalidFormatException{
	if (!modelsAreInitialized)init();
	//initialize 	 
	SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
	NameFinderME nameFinder = new NameFinderME(this.nerModel);
	Parser parser = ParserFactory.create(
			this.parserModel,
			20, // beam size
			0.95); // advance percentage
	//find sentences, tokenize each, parse each, return top parse for each 	 	 
	String[] sentences = sentenceDetector.sentDetect(p);
	Parse[] results = new Parse[sentences.length];
	for (int i=0;i<sentences.length;i++){
		//String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]);
		
		//StringTokenizer st = new StringTokenizer(tks[i]); 
		//There are several tokenizers available. SimpleTokenizer works best
		Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
		for (int si = 0; si < sentences.length; si++) {
	        Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]);
	        String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]);
	        Span[] names = nameFinder.find(tokens);
	        for (int ni = 0; ni < names.length; ni++) {
	            Span startSpan = tokenSpans[names[ni].getStart()];
	            int nameStart = startSpan.getStart();
	            Span endSpan = tokenSpans[names[ni].getEnd() - 1];
	            int nameEnd = endSpan.getEnd();
	            String name = sentences[si].substring(nameStart, nameEnd);
	            System.out.println(name);
	        }
	    }
		// Join this sentence's tokens back into a string for the parser
		// (the original joined the tokenizer object itself, which is a bug)
		String[] sentTokens = tokenizer.tokenize(sentences[i]);
		String sent = StringUtils.join(sentTokens, " ");
		System.out.println("Found sentence " + sent);
		Parse[] sentResults = ParserTool.parseLine(sent,parser, 1);
		results[i]=sentResults[0];
	}
	return results;
}
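Two span granularities are in play above: nameFinder.find returns token-index spans, while tokenizer.tokenizePos returns character-offset spans. Indexing tokenSpans with a name span's start and end - 1 converts token positions to character positions, so the raw name can be cut directly out of the sentence string.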