Java Code Examples for opennlp.tools.util.Span#getStart()

The following examples show how to use opennlp.tools.util.Span#getStart(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to visit the original project or source file. You may also check out the related API usage on the sidebar.
Example 1
Source File: SentenceTokenizer.java    From wiseowl with MIT License 6 votes vote down vote up
@Override
public final boolean incrementToken() throws IOException {
  // Lazily run sentence detection on the first call.
  if (sentences == null) {
    fillSentences();
  }

  // Every detected sentence has already been emitted.
  if (tokenOffset >= sentences.length) {
    return false;
  }

  clearAttributes();
  Span current = sentences[tokenOffset++];
  int begin = current.getStart();
  int finish = current.getEnd();
  // Emit the sentence text as a single token with its character offsets.
  termAtt.copyBuffer(inputSentence, begin, finish - begin);
  posIncrAtt.setPositionIncrement(1);
  offsetAtt.setOffset(begin, finish);

  return true;
}
 
Example 2
Source File: MultiWordMatcher.java    From ixa-pipe-pos with Apache License 2.0 6 votes vote down vote up
/**
 * Get input text and join the multiwords found in the dictionary object.
 *
 * @param tokens
 *          the input text
 * @return the output text with the joined multiwords
 */
public final String[] getTokensWithMultiWords(final String[] tokens) {
  final Span[] multiWordSpans = multiWordsToSpans(tokens);
  final List<String> result = new ArrayList<String>(Arrays.asList(tokens));
  int removed = 0;
  for (final Span span : multiWordSpans) {
    // Shift the span indexes left by the positions already collapsed so
    // they line up with the current (shrinking) token list.
    final int begin = span.getStart() - removed;
    final int end = span.getEnd() - removed;
    // Record how many list positions this collapse removes.
    removed = removed + result.subList(begin, end).size() - 1;
    // Join the covered tokens into one '#'-separated multiword.
    final String multiWord = Joiner.on("#").join(result.subList(begin, end));
    // Replace the covered tokens with the single joined token.
    result.subList(begin, end).clear();
    result.add(begin, multiWord);
  }
  return result.toArray(new String[result.size()]);
}
 
Example 3
Source File: Annotate.java    From ixa-pipe-pos with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the multiword spans. It gets an initial list of spans (one per
 * token) and creates a multiword span when a multiword is detected,
 * replacing the individual token spans it covers. Mutates
 * {@code tokenSpans} in place.
 * 
 * @param tokens
 *          the list of tokens
 * @param wfs
 *          the list of WFs (read only; never mutated here)
 * @param tokenSpans
 *          the list of initial token spans; modified by this method
 */
private void getMultiWordSpans(final String[] tokens, final List<WF> wfs,
    final List<ixa.kaflib.Span<WF>> tokenSpans) {
  final Span[] multiWordSpans = this.multiWordMatcher
      .multiWordsToSpans(tokens);
  int counter = 0;
  for (final Span mwSpan : multiWordSpans) {
    // shift the span indexes left by the positions already collapsed so
    // they match the current (shrinking) tokenSpans list
    final Integer fromIndex = mwSpan.getStart() - counter;
    final Integer toIndex = mwSpan.getEnd() - counter;
    // add to the counter the length of the span removed
    counter = counter + tokenSpans.subList(fromIndex, toIndex).size() - 1;
    // create multiword targets and Span; wfs is never modified, so the
    // original (unshifted) span indexes are correct here
    final List<WF> wfTargets = wfs
        .subList(mwSpan.getStart(), mwSpan.getEnd());
    final ixa.kaflib.Span<WF> multiWordSpan = KAFDocument
        .newWFSpan(wfTargets);
    // remove the token Spans to be replaced by the multiword span
    tokenSpans.subList(fromIndex, toIndex).clear();
    // add the new Span containing several WFs (multiWordSpan)
    // the counter is used to allow matching the spans to the
    // tokenSpans list indexes
    tokenSpans.add(fromIndex, multiWordSpan);
  }
}
 
Example 4
Source File: NamedEntityRecognitionUnitTest.java    From tutorials with MIT License 6 votes vote down vote up
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {

    // Tokenize the sample text with OpenNLP's rule-based tokenizer.
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");

    // Load the pre-trained English person-name model from the classpath.
    InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
    NameFinderME nameFinderME = new NameFinderME(model);
    List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
    // Span toString renders [start..end) token-index ranges plus the type.
    assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");

    // Concatenate the tokens covered by each span into a name string.
    List<String> names = new ArrayList<String>();
    for (Span span : spans) {
        StringBuilder name = new StringBuilder();
        for (int idx = span.getStart(); idx < span.getEnd(); idx++) {
            name.append(tokens[idx]);
        }
        names.add(name.toString());
    }
    assertThat(names).contains("John","Leonard","Penny");
}
 
Example 5
Source File: OpenNlpNerRecommender.java    From inception with Apache License 2.0 5 votes vote down vote up
/**
 * Check whether a token index falls inside the given span and return the
 * span's label, or the no-label tag when the token lies outside the span.
 */
private String determineLabel(Span aName, int aTokenIdx)
{
    // Span start is inclusive, end is exclusive.
    boolean inside = aName.getStart() <= aTokenIdx && aTokenIdx < aName.getEnd();
    return inside ? aName.getType() : NO_NE_TAG;
}
 
Example 6
Source File: OpenNLPTokenAnnotator.java    From modernmt with Apache License 2.0 5 votes vote down vote up
@Override
public void annotate(TokenizedString string) {
    // Tokenize with character offsets and mark each token's range as a word.
    for (Span token : this.tokenizer.tokenizePos(string.toString())) {
        string.setWord(token.getStart(), token.getEnd());
    }
}
 
Example 7
Source File: CorefParse.java    From knowledge-extraction with Apache License 2.0 5 votes vote down vote up
/**
 * Recursively prints the parse tree rooted at {@code p} in Penn
 * Treebank-style bracket notation, appending "#&lt;id&gt;" to any
 * constituent registered in {@code parseMap}.
 *
 * @param p the parse node to print
 */
private void show(Parse p) {
	int start;
	// character offset (within p's text) of the next not-yet-printed text
	start = p.getSpan().getStart();
	if (!p.getType().equals(Parser.TOK_NODE)) {
		System.out.print("(");
		System.out.print(p.getType());
		// label the constituent with its id from parseMap, if registered
		if (parseMap.containsKey(p)) {
			System.out.print("#" + parseMap.get(p));
		}
		// System.out.print(p.hashCode()+"-"+parseMap.containsKey(p));
		System.out.print(" ");
	}
	Parse[] children = p.getChildren();
	for (int pi = 0, pn = children.length; pi < pn; pi++) {
		Parse c = children[pi];
		Span s = c.getSpan();
		// print any raw text lying between the previous child and this one
		if (start < s.getStart()) {
			System.out.print(p.getText().substring(start, s.getStart()));
		}
		show(c);
		start = s.getEnd();
	}
	// print the trailing text after the last child (for a leaf, the token itself)
	System.out.print(p.getText().substring(start, p.getSpan().getEnd()));
	if (!p.getType().equals(Parser.TOK_NODE)) {
		System.out.print(")");
	}
}
 
Example 8
Source File: NERScorer.java    From uncc2014watsonsim with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Splits a passage into sentences, prints the named entities found in each
 * sentence, and returns the top parse for every sentence.
 *
 * @param p the passage text
 * @return one (top) Parse per detected sentence
 * @throws InvalidFormatException if a model is malformed
 */
public Parse[] parsePassageText(String p) throws InvalidFormatException{
	if (!modelsAreInitialized)init();
	//initialize
	SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
	NameFinderME nameFinder = new NameFinderME(this.nerModel);
	Parser parser = ParserFactory.create(
			this.parserModel,
			20, // beam size
			0.95); // advance percentage
	//There are several tokenizers available. SimpleTokenizer works best
	Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
	//find sentences, tokenize each, run NER on each, parse each,
	//and keep the top parse for each
	String[] sentences = sentenceDetector.sentDetect(p);
	Parse[] results = new Parse[sentences.length];
	for (int i = 0; i < sentences.length; i++) {
		// BUG FIX: the original re-ran NER over ALL sentences inside this
		// per-sentence loop (O(n^2) duplicate work); process only sentence i.
		Span[] tokenSpans = tokenizer.tokenizePos(sentences[i]);
		String[] tokens = Span.spansToStrings(tokenSpans, sentences[i]);
		Span[] names = nameFinder.find(tokens);
		for (int ni = 0; ni < names.length; ni++) {
			// name spans hold token indexes; map them back to character
			// offsets in the sentence (Span end is exclusive)
			int nameStart = tokenSpans[names[ni].getStart()].getStart();
			int nameEnd = tokenSpans[names[ni].getEnd() - 1].getEnd();
			System.out.println(sentences[i].substring(nameStart, nameEnd));
		}
		// BUG FIX: the original called StringUtils.join(tokenizer, " "),
		// joining the tokenizer OBJECT rather than the sentence tokens, so
		// the parser received garbage input; join the tokens instead.
		String sent = StringUtils.join(tokens, " ");
		System.out.println("Found sentence " + sent);
		Parse[] sentResults = ParserTool.parseLine(sent, parser, 1);
		results[i] = sentResults[0];
	}
	return results;
}
 
Example 9
Source File: Chapter5.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Demonstrates POS tagging followed by chunking with OpenNLP: loads the
 * POS and chunker models, tags the class-level {@code sentence} token
 * array, prints token/tag pairs, chunk labels per token, and finally the
 * chunk spans with their token ranges.
 */
private static void usingOpenNLPChunker() {
    // NOTE(review): the "\\" separator makes these paths Windows-only;
    // consider File.separator — kept as-is to preserve behavior.
    try (InputStream posModelStream = new FileInputStream(
                 getModelDir() + "\\en-pos-maxent.bin");
         InputStream chunkerStream = new FileInputStream(
                 getModelDir() + "\\en-chunker.bin")) {
        POSModel model = new POSModel(posModelStream);
        POSTaggerME tagger = new POSTaggerME(model);

        // POS-tag the sentence and print token/tag pairs
        String[] tags = tagger.tag(sentence);
        for (int i = 0; i < tags.length; i++) {
            System.out.print(sentence[i] + "/" + tags[i] + " ");
        }
        System.out.println();

        // chunker: label each token with its chunk tag
        System.out.println("------------Chunker -----------");
        ChunkerModel chunkerModel = new ChunkerModel(chunkerStream);
        ChunkerME chunkerME = new ChunkerME(chunkerModel);
        String[] result = chunkerME.chunk(sentence, tags);

        for (int i = 0; i < result.length; i++) {
            System.out.println("[" + sentence[i] + "] " + result[i]);
        }

        // chunk spans: start is inclusive, end exclusive (token indexes)
        System.out.println("------------Chunker Spans -----------");
        Span[] spans = chunkerME.chunkAsSpans(sentence, tags);
        for (Span span : spans) {
            System.out.print("Type: " + span.getType() + " - " + " Begin: "
                    + span.getStart() + " End:" + span.getEnd()
                    + " Length: " + span.length() + "  [");
            for (int j = span.getStart(); j < span.getEnd(); j++) {
                System.out.print(sentence[j] + " ");
            }
            System.out.println("]");
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
 
Example 10
Source File: OpenNlpService.java    From elasticsearch-ingest-opennlp with Apache License 2.0 4 votes vote down vote up
/**
 * Rewrites {@code content} with every extracted entity annotated inline as
 * {@code [entity text](Type_entity text)}, re-spacing tokens according to
 * their original character offsets.
 *
 * NOTE(review): assumes {@code extractedEntities} is non-empty (get(0)
 * below) and that all entries were produced from the same token sequence
 * as SimpleTokenizer yields for {@code content} — confirm at call sites.
 */
static String createAnnotatedText(String content, List<ExtractedEntities> extractedEntities) {
    // these spans contain the real offset of each word in start/end variables!
    // the spans of the method argument contain the offset of each token, as mentioned in tokens!
    Span[] spansWithRealOffsets = SimpleTokenizer.INSTANCE.tokenizePos(content);

    // flatten the per-entity span arrays into one list
    List<Span> spansList = new ArrayList<>();
    extractedEntities.stream()
            .map(ExtractedEntities::getSpans)
            .forEach(s -> spansList.addAll(Arrays.asList(s)));

    // keep only non-overlapping spans so each token belongs to at most one entity
    Span[] spans = NameFinderME.dropOverlappingSpans(spansList.toArray(new Span[0]));
    String[] tokens = extractedEntities.get(0).getTokens();

    // shortcut if there is no enrichment to be done
    if (spans.length == 0) {
        return content;
    }

    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
        final int idx = i;
        String token = tokens[i];

        // does an entity span begin at this token index?
        final Optional<Span> optionalSpan = Arrays.stream(spans).filter(s -> s.getStart() == idx).findFirst();
        if (optionalSpan.isPresent()) {
            Span span = optionalSpan.get();
            int start = span.getStart();
            int end = span.getEnd();
            String type = span.getType();

            // collect the tokens covered by the span (end is exclusive)
            String[] spanTokens = new String[end - start];
            int spanPosition = 0;
            for (int tokenPosition = start ; tokenPosition < end; tokenPosition++) {
                spanTokens[spanPosition++] = tokens[tokenPosition];
            }
            String entityString = Strings.arrayToDelimitedString(spanTokens, " ");

            // emit [entity](Type_entity)
            builder.append("[");
            builder.append(entityString);
            builder.append("](");
            builder.append(Strings.capitalize(type));
            builder.append("_");
            builder.append(entityString);
            builder.append(")");
            // skip past the span; the loop increment lands i on `end`
            i = end - 1;
        } else {
            builder.append(token);
        }

        // only append a whitespace, if the offsets actually differ
        if (i < tokens.length - 1) {
            if (spansWithRealOffsets[i].getEnd() != spansWithRealOffsets[i+1].getStart()) {
                builder.append(" ");
            }
        }
    }

    return builder.toString();
}
 
Example 11
Source File: NETagger.java    From OpenEphyra with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Adds named entity information to parses.
 * 
 * @param tag named entity type
 * @param names spans of tokens that are named entities (raw List of Span)
 * @param tokens parses for the tokens
 */
private static void addNames(String tag, List names, Parse[] tokens) {
	for (int i = 0; i < names.size(); i++) {
		Span nameTokenSpan = (Span) names.get(i);
		Parse startToken = tokens[nameTokenSpan.getStart()];
		// NOTE(review): if Span.getEnd() is exclusive (as in modern OpenNLP;
		// example usage elsewhere indexes tokens[getEnd() - 1]), this reads
		// one token past the entity and can overrun the array for an entity
		// at sentence end — confirm the Span convention of the OpenNLP
		// version in use.
		Parse endToken = tokens[nameTokenSpan.getEnd()];
		Parse commonP = startToken.getCommonParent(endToken);
		
		if (commonP != null) {
			// character span covering the whole named entity
			Span nameSpan = new Span(startToken.getSpan().getStart(),
									 endToken.getSpan().getEnd());
			
			if (nameSpan.equals(commonP.getSpan())) {
				// common parent matches exactly the named entity
				commonP.insert(new Parse(commonP.getText(), nameSpan, tag,
						1.0));
			} else {
				// common parent includes the named entity
				Parse[] kids = commonP.getChildren();
				boolean crossingKids = false;
				
				// check whether the entity straddles any child boundary
				for (int j = 0; j < kids.length; j++)
					if (nameSpan.crosses(kids[j].getSpan()))
						crossingKids = true;
				
				if (!crossingKids) {
					// named entity does not cross children
					commonP.insert(new Parse(commonP.getText(), nameSpan,
							tag, 1.0));
				} else {
					// NE crosses children
					if (commonP.getType().equals("NP")) {
						Parse[] grandKids = kids[0].getChildren();
						
						// tag the whole NP if the entity covers its last grandchild
						Parse last = grandKids[grandKids.length - 1];
						if (grandKids.length > 1 &&
							nameSpan.contains(last.getSpan()))
							commonP.insert(new Parse(commonP.getText(),
									commonP.getSpan(), tag,1.0));
					}
				}
			}
		}
	}
}
 
Example 12
Source File: StringUtils.java    From ixa-pipe-pos with Apache License 2.0 3 votes vote down vote up
/**
 * Produces the string a NE span corresponds to, given the span's token
 * indexes and the tokens of the sentence. Used to recover the textual
 * representation of a Named Entity from a {@link Span}.
 *
 * @param reducedSpan
 *          a {@link Span} over token indexes (end exclusive)
 * @param tokens
 *          an array of tokens
 * @return named entity string (tokens joined by single spaces)
 */
public static String getStringFromSpan(final Span reducedSpan,
    final String[] tokens) {
  final StringBuilder sb = new StringBuilder();
  int tokenIndex = reducedSpan.getStart();
  // append each covered token followed by a space; trim the trailing one
  while (tokenIndex < reducedSpan.getEnd()) {
    sb.append(tokens[tokenIndex]).append(" ");
    tokenIndex++;
  }
  return sb.toString().trim();
}