Java Code Examples for opennlp.tools.util.Span#getStart()
The following examples show how to use
opennlp.tools.util.Span#getStart().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SentenceTokenizer.java From wiseowl with MIT License | 6 votes |
/**
 * Emits one detected sentence per call as a single token. Sentence spans are
 * computed lazily on the first invocation, then replayed one at a time.
 *
 * @return true while a sentence token was produced, false when exhausted
 * @throws IOException if filling the sentence buffer fails
 */
@Override
public final boolean incrementToken() throws IOException {
    // lazy init: detect sentences only once, on first use
    if (sentences == null) {
        fillSentences();
    }
    if (tokenOffset >= sentences.length) {
        return false;
    }
    clearAttributes();
    Span current = sentences[tokenOffset++];
    int begin = current.getStart();
    int finish = current.getEnd();
    // the term text is the sentence slice; offsets mirror the span bounds
    termAtt.copyBuffer(inputSentence, begin, finish - begin);
    posIncrAtt.setPositionIncrement(1);
    offsetAtt.setOffset(begin, finish);
    return true;
}
Example 2
Source File: MultiWordMatcher.java From ixa-pipe-pos with Apache License 2.0 | 6 votes |
/** * Get input text and join the multiwords found in the dictionary object. * * @param tokens * the input text * @return the output text with the joined multiwords */ public final String[] getTokensWithMultiWords(final String[] tokens) { final Span[] multiWordSpans = multiWordsToSpans(tokens); final List<String> tokenList = new ArrayList<String>(Arrays.asList(tokens)); int counter = 0; for (final Span mwSpan : multiWordSpans) { final int fromIndex = mwSpan.getStart() - counter; final int toIndex = mwSpan.getEnd() - counter; // System.err.println(fromIndex + " " + toIndex); // add to the counter the length of the sublist removed // to allow the fromIndex and toIndex to match wrt to the tokenList // indexes counter = counter + tokenList.subList(fromIndex, toIndex).size() - 1; // create the multiword joining the sublist final String multiWord = Joiner.on("#").join( tokenList.subList(fromIndex, toIndex)); // remove the sublist containing the tokens to be replaced in the span tokenList.subList(fromIndex, toIndex).clear(); // add the multiword containing the tokens in one Span tokenList.add(fromIndex, multiWord); } return tokenList.toArray(new String[tokenList.size()]); }
Example 3
Source File: Annotate.java From ixa-pipe-pos with Apache License 2.0 | 6 votes |
/** * Creates the multiword spans. It gets an initial list of spans (one per * token) and creates a multiword span when a multiword is detected. * * @param tokens * the list of tokens * @param wfs * the list of WFs * @param tokenSpans * the list of initial token spans */ private void getMultiWordSpans(final String[] tokens, final List<WF> wfs, final List<ixa.kaflib.Span<WF>> tokenSpans) { final Span[] multiWordSpans = this.multiWordMatcher .multiWordsToSpans(tokens); int counter = 0; for (final Span mwSpan : multiWordSpans) { final Integer fromIndex = mwSpan.getStart() - counter; final Integer toIndex = mwSpan.getEnd() - counter; // add to the counter the length of the span removed counter = counter + tokenSpans.subList(fromIndex, toIndex).size() - 1; // create multiword targets and Span final List<WF> wfTargets = wfs .subList(mwSpan.getStart(), mwSpan.getEnd()); final ixa.kaflib.Span<WF> multiWordSpan = KAFDocument .newWFSpan(wfTargets); // remove the token Spans to be replaced by the multiword span tokenSpans.subList(fromIndex, toIndex).clear(); // add the new Span containing several WFs (multiWordSpan) // the counter is used to allow matching the spans to the // tokenSpans list indexes tokenSpans.add(fromIndex, multiWordSpan); } }
Example 4
Source File: NamedEntityRecognitionUnitTest.java From tutorials with MIT License | 6 votes |
/**
 * Loads the English person NER model, runs it over a tokenized sentence and
 * checks both the raw spans and the reconstructed person names.
 */
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
    InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
    NameFinderME nameFinderME = new NameFinderME(model);
    List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
    assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
    List<String> names = new ArrayList<String>();
    for (Span s : spans) {
        // Build each name with a StringBuilder; the original grew a String via
        // repeated names.get(k)/names.set(k) concatenation (quadratic and
        // needlessly stateful).
        StringBuilder name = new StringBuilder();
        for (int index = s.getStart(); index < s.getEnd(); index++) {
            name.append(tokens[index]);
        }
        names.add(name.toString());
    }
    assertThat(names).contains("John", "Leonard", "Penny");
}
Example 5
Source File: OpenNlpNerRecommender.java From inception with Apache License 2.0 | 5 votes |
/**
 * Check that the token index is part of the given span and return the span's
 * label, or the no-label marker when the token falls outside the span.
 */
private String determineLabel(Span aName, int aTokenIdx) {
    // span start is inclusive, end is exclusive
    boolean insideSpan = aName.getStart() <= aTokenIdx && aTokenIdx < aName.getEnd();
    return insideSpan ? aName.getType() : NO_NE_TAG;
}
Example 6
Source File: OpenNLPTokenAnnotator.java From modernmt with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the string's current text and marks each detected token's
 * character range as a word.
 */
@Override
public void annotate(TokenizedString string) {
    // tokenizePos yields character-offset spans over the raw text;
    // start + (end - start) in the original is simply the span end
    for (Span token : this.tokenizer.tokenizePos(string.toString())) {
        string.setWord(token.getStart(), token.getEnd());
    }
}
Example 7
Source File: CorefParse.java From knowledge-extraction with Apache License 2.0 | 5 votes |
private void show(Parse p) { int start; start = p.getSpan().getStart(); if (!p.getType().equals(Parser.TOK_NODE)) { System.out.print("("); System.out.print(p.getType()); if (parseMap.containsKey(p)) { System.out.print("#" + parseMap.get(p)); } // System.out.print(p.hashCode()+"-"+parseMap.containsKey(p)); System.out.print(" "); } Parse[] children = p.getChildren(); for (int pi = 0, pn = children.length; pi < pn; pi++) { Parse c = children[pi]; Span s = c.getSpan(); if (start < s.getStart()) { System.out.print(p.getText().substring(start, s.getStart())); } show(c); start = s.getEnd(); } System.out.print(p.getText().substring(start, p.getSpan().getEnd())); if (!p.getType().equals(Parser.TOK_NODE)) { System.out.print(")"); } }
Example 8
Source File: NERScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException{ if (!modelsAreInitialized)init(); //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); NameFinderME nameFinder = new NameFinderME(this.nerModel); Parser parser = ParserFactory.create( this.parserModel, 20, // beam size 0.95); // advance percentage //find sentences, tokenize each, parse each, return top parse for each String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ //String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); //StringTokenizer st = new StringTokenizer(tks[i]); //There are several tokenizers available. SimpleTokenizer works best Tokenizer tokenizer = SimpleTokenizer.INSTANCE; for (int si = 0; si < sentences.length; si++) { Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); Span[] names = nameFinder.find(tokens); for (int ni = 0; ni < names.length; ni++) { Span startSpan = tokenSpans[names[ni].getStart()]; int nameStart = startSpan.getStart(); Span endSpan = tokenSpans[names[ni].getEnd() - 1]; int nameEnd = endSpan.getEnd(); String name = sentences[si].substring(nameStart, nameEnd); System.out.println(name); } } String sent= StringUtils.join(tokenizer," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example 9
Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 4 votes |
private static void usingOpenNLPChunker() { try ( InputStream posModelStream = new FileInputStream( getModelDir() + "\\en-pos-maxent.bin"); InputStream chunkerStream = new FileInputStream( getModelDir() + "\\en-chunker.bin");) { POSModel model = new POSModel(posModelStream); POSTaggerME tagger = new POSTaggerME(model); // Used to create sample data for trainer // for (String sentence : sentences) { // String sen[] = tokenizeSentence(sentence); // String tags[] = tagger.tag(sen); // for (int i = 0; i < tags.length; i++) { //// for (String token : sentence) { // System.out.print(sen[i] + "/" + tags[i] + " "); // } // System.out.println(); // } // System.out.println(); String tags[] = tagger.tag(sentence); for (int i = 0; i < tags.length; i++) { // for (String token : sentence) { System.out.print(sentence[i] + "/" + tags[i] + " "); } System.out.println(); // chunker System.out.println("------------Chunker -----------"); ChunkerModel chunkerModel = new ChunkerModel(chunkerStream); ChunkerME chunkerME = new ChunkerME(chunkerModel); String result[] = chunkerME.chunk(sentence, tags); for (int i = 0; i < result.length; i++) { System.out.println("[" + sentence[i] + "] " + result[i]); } System.out.println("------------Chunker Spans -----------"); Span[] spans = chunkerME.chunkAsSpans(sentence, tags); for (Span span : spans) { System.out.print("Type: " + span.getType() + " - " + " Begin: " + span.getStart() + " End:" + span.getEnd() + " Length: " + span.length() + " ["); for (int j = span.getStart(); j < span.getEnd(); j++) { System.out.print(sentence[j] + " "); } System.out.println("]"); } } catch (IOException ex) { ex.printStackTrace(); } }
Example 10
Source File: OpenNlpService.java From elasticsearch-ingest-opennlp with Apache License 2.0 | 4 votes |
static String createAnnotatedText(String content, List<ExtractedEntities> extractedEntities) { // these spans contain the real offset of each word in start/end variables! // the spans of the method argument contain the offset of each token, as mentioned in tokens! Span[] spansWithRealOffsets = SimpleTokenizer.INSTANCE.tokenizePos(content); List<Span> spansList = new ArrayList<>(); extractedEntities.stream() .map(ExtractedEntities::getSpans) .forEach(s -> spansList.addAll(Arrays.asList(s))); Span[] spans = NameFinderME.dropOverlappingSpans(spansList.toArray(new Span[0])); String[] tokens = extractedEntities.get(0).getTokens(); // shortcut if there is no enrichment to be done if (spans.length == 0) { return content; } StringBuilder builder = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { final int idx = i; String token = tokens[i]; final Optional<Span> optionalSpan = Arrays.stream(spans).filter(s -> s.getStart() == idx).findFirst(); if (optionalSpan.isPresent()) { Span span = optionalSpan.get(); int start = span.getStart(); int end = span.getEnd(); String type = span.getType(); String[] spanTokens = new String[end - start]; int spanPosition = 0; for (int tokenPosition = start ; tokenPosition < end; tokenPosition++) { spanTokens[spanPosition++] = tokens[tokenPosition]; } String entityString = Strings.arrayToDelimitedString(spanTokens, " "); builder.append("["); builder.append(entityString); builder.append("]("); builder.append(Strings.capitalize(type)); builder.append("_"); builder.append(entityString); builder.append(")"); i = end - 1; } else { builder.append(token); } // only append a whitespace, if the offsets actually differ if (i < tokens.length - 1) { if (spansWithRealOffsets[i].getEnd() != spansWithRealOffsets[i+1].getStart()) { builder.append(" "); } } } return builder.toString(); }
Example 11
Source File: NETagger.java From OpenEphyra with GNU General Public License v2.0 | 4 votes |
/**
 * Adds named entity information to parses.
 *
 * @param tag named entity type
 * @param names spans of tokens that are named entities
 * @param tokens parses for the tokens
 */
private static void addNames(String tag, List names, Parse[] tokens) {
	for (int i = 0; i < names.size(); i++) {
		// raw List: elements are cast to Span per entry
		Span nameTokenSpan = (Span) names.get(i);
		Parse startToken = tokens[nameTokenSpan.getStart()];
		// NOTE(review): indexes tokens[getEnd()] directly (not getEnd() - 1),
		// which treats the span end as an inclusive token index — confirm this
		// matches whatever produced these spans (modern OpenNLP spans are
		// end-exclusive).
		Parse endToken = tokens[nameTokenSpan.getEnd()];
		Parse commonP = startToken.getCommonParent(endToken);
		if (commonP != null) {
			// character span covering the whole name, from the first token's
			// start to the last token's end
			Span nameSpan = new Span(startToken.getSpan().getStart(),
					endToken.getSpan().getEnd());
			if (nameSpan.equals(commonP.getSpan())) {
				// common parent matches exactly the named entity:
				// annotate it in place
				commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
			} else {
				// common parent includes the named entity
				Parse[] kids = commonP.getChildren();
				boolean crossingKids = false;
				// does the name straddle any child constituent boundary?
				for (int j = 0; j < kids.length; j++)
					if (nameSpan.crosses(kids[j].getSpan()))
						crossingKids = true;
				if (!crossingKids) {
					// named entity does not cross children: insert a new
					// constituent for the name under the common parent
					commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
				} else {
					// NE crosses children; only handled for NP parents
					if (commonP.getType().equals("NP")) {
						Parse[] grandKids = kids[0].getChildren();
						Parse last = grandKids[grandKids.length - 1];
						// if the first child's last grandchild lies inside the
						// name, tag the whole common parent span instead
						if (grandKids.length > 1
								&& nameSpan.contains(last.getSpan()))
							commonP.insert(new Parse(commonP.getText(),
									commonP.getSpan(), tag,1.0));
					}
				}
			}
		}
	}
}
Example 12
Source File: StringUtils.java From ixa-pipe-pos with Apache License 2.0 | 3 votes |
/**
 * It takes a NE span's indexes and the tokens in a sentence and produces the
 * string to which the NE span corresponds. Used to obtain the textual
 * representation of a Named Entity or Name from a {@link Span}.
 *
 * @param reducedSpan
 *          a {@link Span}
 * @param tokens
 *          an array of tokens
 * @return named entity string
 */
public static String getStringFromSpan(final Span reducedSpan,
    final String[] tokens) {
    final StringBuilder text = new StringBuilder();
    int idx = reducedSpan.getStart();
    // append every covered token followed by a space, then strip the
    // trailing separator at the end
    while (idx < reducedSpan.getEnd()) {
        text.append(tokens[idx]).append(" ");
        idx++;
    }
    return text.toString().trim();
}