Java Code Examples for opennlp.tools.util.Span#getStart()
The following examples show how to use
opennlp.tools.util.Span#getStart().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SentenceTokenizer.java From wiseowl with MIT License | 6 votes |
/**
 * Emits one detected sentence per call as a single token. Sentence spans are
 * computed lazily on the first invocation, then replayed one at a time.
 *
 * @return true while a sentence token was produced, false when exhausted
 * @throws IOException if filling the sentence buffer fails
 */
@Override
public final boolean incrementToken() throws IOException {
    // lazy init: detect sentences only once, on first use
    if (sentences == null) {
        fillSentences();
    }
    if (tokenOffset >= sentences.length) {
        return false;
    }
    clearAttributes();
    Span current = sentences[tokenOffset++];
    int begin = current.getStart();
    int finish = current.getEnd();
    // the term text is the sentence slice; offsets mirror the span bounds
    termAtt.copyBuffer(inputSentence, begin, finish - begin);
    posIncrAtt.setPositionIncrement(1);
    offsetAtt.setOffset(begin, finish);
    return true;
}
Example 2
Source File: MultiWordMatcher.java From ixa-pipe-pos with Apache License 2.0 | 6 votes |
/** * Get input text and join the multiwords found in the dictionary object. * * @param tokens * the input text * @return the output text with the joined multiwords */ public final String[] getTokensWithMultiWords(final String[] tokens) { final Span[] multiWordSpans = multiWordsToSpans(tokens); final List<String> tokenList = new ArrayList<String>(Arrays.asList(tokens)); int counter = 0; for (final Span mwSpan : multiWordSpans) { final int fromIndex = mwSpan.getStart() - counter; final int toIndex = mwSpan.getEnd() - counter; // System.err.println(fromIndex + " " + toIndex); // add to the counter the length of the sublist removed // to allow the fromIndex and toIndex to match wrt to the tokenList // indexes counter = counter + tokenList.subList(fromIndex, toIndex).size() - 1; // create the multiword joining the sublist final String multiWord = Joiner.on("#").join( tokenList.subList(fromIndex, toIndex)); // remove the sublist containing the tokens to be replaced in the span tokenList.subList(fromIndex, toIndex).clear(); // add the multiword containing the tokens in one Span tokenList.add(fromIndex, multiWord); } return tokenList.toArray(new String[tokenList.size()]); }
Example 3
Source File: Annotate.java From ixa-pipe-pos with Apache License 2.0 | 6 votes |
/** * Creates the multiword spans. It gets an initial list of spans (one per * token) and creates a multiword span when a multiword is detected. * * @param tokens * the list of tokens * @param wfs * the list of WFs * @param tokenSpans * the list of initial token spans */ private void getMultiWordSpans(final String[] tokens, final List<WF> wfs, final List<ixa.kaflib.Span<WF>> tokenSpans) { final Span[] multiWordSpans = this.multiWordMatcher .multiWordsToSpans(tokens); int counter = 0; for (final Span mwSpan : multiWordSpans) { final Integer fromIndex = mwSpan.getStart() - counter; final Integer toIndex = mwSpan.getEnd() - counter; // add to the counter the length of the span removed counter = counter + tokenSpans.subList(fromIndex, toIndex).size() - 1; // create multiword targets and Span final List<WF> wfTargets = wfs .subList(mwSpan.getStart(), mwSpan.getEnd()); final ixa.kaflib.Span<WF> multiWordSpan = KAFDocument .newWFSpan(wfTargets); // remove the token Spans to be replaced by the multiword span tokenSpans.subList(fromIndex, toIndex).clear(); // add the new Span containing several WFs (multiWordSpan) // the counter is used to allow matching the spans to the // tokenSpans list indexes tokenSpans.add(fromIndex, multiWordSpan); } }
Example 4
Source File: NamedEntityRecognitionUnitTest.java From tutorials with MIT License | 6 votes |
/**
 * Loads the English person NER model, runs it over a tokenized sentence and
 * checks both the raw spans and the reconstructed person names.
 */
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
    InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
    NameFinderME nameFinderME = new NameFinderME(model);
    List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
    assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
    List<String> names = new ArrayList<String>();
    for (Span s : spans) {
        // Build each name with a StringBuilder; the original grew a String via
        // repeated names.get(k)/names.set(k) concatenation (quadratic and
        // needlessly stateful).
        StringBuilder name = new StringBuilder();
        for (int index = s.getStart(); index < s.getEnd(); index++) {
            name.append(tokens[index]);
        }
        names.add(name.toString());
    }
    assertThat(names).contains("John", "Leonard", "Penny");
}
Example 5
Source File: OpenNlpNerRecommender.java From inception with Apache License 2.0 | 5 votes |
/**
 * Check that the token index is part of the given span and return the span's
 * label, or the no-label marker when the token falls outside the span.
 */
private String determineLabel(Span aName, int aTokenIdx) {
    // span start is inclusive, end is exclusive
    boolean insideSpan = aName.getStart() <= aTokenIdx && aTokenIdx < aName.getEnd();
    return insideSpan ? aName.getType() : NO_NE_TAG;
}
Example 6
Source File: OpenNLPTokenAnnotator.java From modernmt with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the string's current text and marks each detected token's
 * character range as a word.
 */
@Override
public void annotate(TokenizedString string) {
    // tokenizePos yields character-offset spans over the raw text;
    // start + (end - start) in the original is simply the span end
    for (Span token : this.tokenizer.tokenizePos(string.toString())) {
        string.setWord(token.getStart(), token.getEnd());
    }
}
Example 7
Source File: CorefParse.java From knowledge-extraction with Apache License 2.0 | 5 votes |
private void show(Parse p) { int start; start = p.getSpan().getStart(); if (!p.getType().equals(Parser.TOK_NODE)) { System.out.print("("); System.out.print(p.getType()); if (parseMap.containsKey(p)) { System.out.print("#" + parseMap.get(p)); } // System.out.print(p.hashCode()+"-"+parseMap.containsKey(p)); System.out.print(" "); } Parse[] children = p.getChildren(); for (int pi = 0, pn = children.length; pi < pn; pi++) { Parse c = children[pi]; Span s = c.getSpan(); if (start < s.getStart()) { System.out.print(p.getText().substring(start, s.getStart())); } show(c); start = s.getEnd(); } System.out.print(p.getText().substring(start, p.getSpan().getEnd())); if (!p.getType().equals(Parser.TOK_NODE)) { System.out.print(")"); } }
Example 8
Source File: NERScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException{ if (!modelsAreInitialized)init(); //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); NameFinderME nameFinder = new NameFinderME(this.nerModel); Parser parser = ParserFactory.create( this.parserModel, 20, // beam size 0.95); // advance percentage //find sentences, tokenize each, parse each, return top parse for each String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ //String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); //StringTokenizer st = new StringTokenizer(tks[i]); //There are several tokenizers available. SimpleTokenizer works best Tokenizer tokenizer = SimpleTokenizer.INSTANCE; for (int si = 0; si < sentences.length; si++) { Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); Span[] names = nameFinder.find(tokens); for (int ni = 0; ni < names.length; ni++) { Span startSpan = tokenSpans[names[ni].getStart()]; int nameStart = startSpan.getStart(); Span endSpan = tokenSpans[names[ni].getEnd() - 1]; int nameEnd = endSpan.getEnd(); String name = sentences[si].substring(nameStart, nameEnd); System.out.println(name); } } String sent= StringUtils.join(tokenizer," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example 9
Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 4 votes |
private static void usingOpenNLPChunker() { try ( InputStream posModelStream = new FileInputStream( getModelDir() + "\\en-pos-maxent.bin"); InputStream chunkerStream = new FileInputStream( getModelDir() + "\\en-chunker.bin");) { POSModel model = new POSModel(posModelStream); POSTaggerME tagger = new POSTaggerME(model); // Used to create sample data for trainer // for (String sentence : sentences) { // String sen[] = tokenizeSentence(sentence); // String tags[] = tagger.tag(sen); // for (int i = 0; i < tags.length; i++) { //// for (String token : sentence) { // System.out.print(sen[i] + "/" + tags[i] + " "); // } // System.out.println(); // } // System.out.println(); String tags[] = tagger.tag(sentence); for (int i = 0; i < tags.length; i++) { // for (String token : sentence) { System.out.print(sentence[i] + "/" + tags[i] + " "); } System.out.println(); // chunker System.out.println("------------Chunker -----------"); ChunkerModel chunkerModel = new ChunkerModel(chunkerStream); ChunkerME chunkerME = new ChunkerME(chunkerModel); String result[] = chunkerME.chunk(sentence, tags); for (int i = 0; i < result.length; i++) { System.out.println("[" + sentence[i] + "] " + result[i]); } System.out.println("------------Chunker Spans -----------"); Span[] spans = chunkerME.chunkAsSpans(sentence, tags); for (Span span : spans) { System.out.print("Type: " + span.getType() + " - " + " Begin: " + span.getStart() + " End:" + span.getEnd() + " Length: " + span.length() + " ["); for (int j = span.getStart(); j < span.getEnd(); j++) { System.out.print(sentence[j] + " "); } System.out.println("]"); } } catch (IOException ex) { ex.printStackTrace(); } }
Example 10
Source File: OpenNlpService.java From elasticsearch-ingest-opennlp with Apache License 2.0 | 4 votes |
static String createAnnotatedText(String content, List<ExtractedEntities> extractedEntities) { // these spans contain the real offset of each word in start/end variables! // the spans of the method argument contain the offset of each token, as mentioned in tokens! Span[] spansWithRealOffsets = SimpleTokenizer.INSTANCE.tokenizePos(content); List<Span> spansList = new ArrayList<>(); extractedEntities.stream() .map(ExtractedEntities::getSpans) .forEach(s -> spansList.addAll(Arrays.asList(s))); Span[] spans = NameFinderME.dropOverlappingSpans(spansList.toArray(new Span[0])); String[] tokens = extractedEntities.get(0).getTokens(); // shortcut if there is no enrichment to be done if (spans.length == 0) { return content; } StringBuilder builder = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { final int idx = i; String token = tokens[i]; final Optional<Span> optionalSpan = Arrays.stream(spans).filter(s -> s.getStart() == idx).findFirst(); if (optionalSpan.isPresent()) { Span span = optionalSpan.get(); int start = span.getStart(); int end = span.getEnd(); String type = span.getType(); String[] spanTokens = new String[end - start]; int spanPosition = 0; for (int tokenPosition = start ; tokenPosition < end; tokenPosition++) { spanTokens[spanPosition++] = tokens[tokenPosition]; } String entityString = Strings.arrayToDelimitedString(spanTokens, " "); builder.append("["); builder.append(entityString); builder.append("]("); builder.append(Strings.capitalize(type)); builder.append("_"); builder.append(entityString); builder.append(")"); i = end - 1; } else { builder.append(token); } // only append a whitespace, if the offsets actually differ if (i < tokens.length - 1) { if (spansWithRealOffsets[i].getEnd() != spansWithRealOffsets[i+1].getStart()) { builder.append(" "); } } } return builder.toString(); }
Example 11
Source File: NETagger.java From OpenEphyra with GNU General Public License v2.0 | 4 votes |
/**
 * Adds named entity information to parses.
 *
 * @param tag named entity type
 * @param names spans of tokens that are named entities
 * @param tokens parses for the tokens
 */
private static void addNames(String tag, List names, Parse[] tokens) {
	for (int i = 0; i < names.size(); i++) {
		// raw List: elements are cast to Span per entry
		Span nameTokenSpan = (Span) names.get(i);
		Parse startToken = tokens[nameTokenSpan.getStart()];
		// NOTE(review): indexes tokens[getEnd()] directly (not getEnd() - 1),
		// which treats the span end as an inclusive token index — confirm this
		// matches whatever produced these spans (modern OpenNLP spans are
		// end-exclusive).
		Parse endToken = tokens[nameTokenSpan.getEnd()];
		Parse commonP = startToken.getCommonParent(endToken);
		if (commonP != null) {
			// character span covering the whole name, from the first token's
			// start to the last token's end
			Span nameSpan = new Span(startToken.getSpan().getStart(),
					endToken.getSpan().getEnd());
			if (nameSpan.equals(commonP.getSpan())) {
				// common parent matches exactly the named entity:
				// annotate it in place
				commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
			} else {
				// common parent includes the named entity
				Parse[] kids = commonP.getChildren();
				boolean crossingKids = false;
				// does the name straddle any child constituent boundary?
				for (int j = 0; j < kids.length; j++)
					if (nameSpan.crosses(kids[j].getSpan()))
						crossingKids = true;
				if (!crossingKids) {
					// named entity does not cross children: insert a new
					// constituent for the name under the common parent
					commonP.insert(new Parse(commonP.getText(), nameSpan, tag, 1.0));
				} else {
					// NE crosses children; only handled for NP parents
					if (commonP.getType().equals("NP")) {
						Parse[] grandKids = kids[0].getChildren();
						Parse last = grandKids[grandKids.length - 1];
						// if the first child's last grandchild lies inside the
						// name, tag the whole common parent span instead
						if (grandKids.length > 1
								&& nameSpan.contains(last.getSpan()))
							commonP.insert(new Parse(commonP.getText(),
									commonP.getSpan(), tag,1.0));
					}
				}
			}
		}
	}
}
Example 12
Source File: StringUtils.java From ixa-pipe-pos with Apache License 2.0 | 3 votes |
/**
 * It takes a NE span's indexes and the tokens in a sentence and produces the
 * string to which the NE span corresponds. Used to obtain the textual
 * representation of a Named Entity or Name from a {@link Span}.
 *
 * @param reducedSpan
 *          a {@link Span}
 * @param tokens
 *          an array of tokens
 * @return named entity string
 */
public static String getStringFromSpan(final Span reducedSpan,
    final String[] tokens) {
    final StringBuilder text = new StringBuilder();
    int idx = reducedSpan.getStart();
    // append every covered token followed by a space, then strip the
    // trailing separator at the end
    while (idx < reducedSpan.getEnd()) {
        text.append(tokens[idx]).append(" ");
        idx++;
    }
    return text.toString().trim();
}