Java Code Examples for Span#getEnd()

The following examples show how to use the `getEnd()` accessor of `Span` (usually together with `getStart()`). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File:    From wiseowl with MIT License 6 votes vote down vote up
/**
 * Emits the sentence at index {@code tokenOffset} as a single token: copies its
 * characters from {@code inputSentence} into {@code termAtt} and records the
 * span boundaries in {@code offsetAtt}.
 *
 * @return {@code false} when {@code tokenOffset} is at or past the end of
 *         {@code sentences}, otherwise {@code true}
 * @throws IOException declared by the signature; no visible statement throws it
 */
// NOTE(review): closing braces (and possibly whole statements) are missing from
// this excerpt, so the nesting suggested by the indentation cannot be trusted.
public final boolean incrementToken() throws IOException {
 // NOTE(review): the body of this null check is not visible -- presumably it
 // populates `sentences`; as written the next line would dereference null.
 if (sentences == null) {
   if (tokenOffset >= sentences.length) {
     return false;
   Span sentenceSpan = sentences[tokenOffset];
   int start = sentenceSpan.getStart();
   int end   = sentenceSpan.getEnd();
   // third argument is passed as a length (end - start), not an end index --
   // presumably Lucene's copyBuffer(buffer, offset, length); confirm termAtt's type
   termAtt.copyBuffer(inputSentence, start, end - start);
   offsetAtt.setOffset(start, end);
   // NOTE(review): tokenOffset is not advanced anywhere in the visible lines.
   return true;
Example 2
Source File:    From relex with Apache License 2.0 6 votes vote down vote up
 * Get the next sentence out of the buffered text.
 * Return null if there are no complete sentences in the buffer.
// Returns the next sentence taken from `buffer`, or null when the detector
// reports no spans or foundSentence() does not succeed.
// NOTE(review): the braces of this method were lost in the excerpt; nesting is
// inferred from indentation only.
public String getNextSentence()
	// punt if no sentence detector
	// (hands back the whole buffer and clears it)
	if (detector == null)
		String rc = buffer;
		buffer = null;
		return rc;

	// spans of the sentences found in the buffer
	Span spans[] = detector.sentPosDetect(buffer);
	if (0 == spans.length) return null;

	// start/end are not declared in this method -- presumably fields read by
	// foundSentence(); verify
	start = 0;
	for (Span span : spans)
		end = span.getEnd();
		if (foundSentence(buffer)) break;
	// NOTE(review): foundSentence() is invoked a second time here, after the loop
	if (!foundSentence(buffer)) return null;

	// trimmedEnd / trimmedSentence are not assigned in this method -- presumably
	// set by foundSentence(); verify
	buffer = buffer.substring(trimmedEnd);
	return trimmedSentence;
Example 3
Source File:    From ixa-pipe-pos with Apache License 2.0 6 votes vote down vote up
 * Get input text and join the multiwords found in the dictionary object.
 * @param tokens
 *          the input text
 * @return the output text with the joined multiwords
public final String[] getTokensWithMultiWords(final String[] tokens) {
  final Span[] multiWordSpans = multiWordsToSpans(tokens);
  final List<String> tokenList = new ArrayList<String>(Arrays.asList(tokens));
  int counter = 0;
  for (final Span mwSpan : multiWordSpans) {
    final int fromIndex = mwSpan.getStart() - counter;
    final int toIndex = mwSpan.getEnd() - counter;
    // System.err.println(fromIndex + " " + toIndex);
    // add to the counter the length of the sublist removed
    // to allow the fromIndex and toIndex to match wrt to the tokenList
    // indexes
    counter = counter + tokenList.subList(fromIndex, toIndex).size() - 1;
    // create the multiword joining the sublist
    final String multiWord = Joiner.on("#").join(
        tokenList.subList(fromIndex, toIndex));
    // remove the sublist containing the tokens to be replaced in the span
    tokenList.subList(fromIndex, toIndex).clear();
    // add the multiword containing the tokens in one Span
    tokenList.add(fromIndex, multiWord);
  return tokenList.toArray(new String[tokenList.size()]);
Example 4
Source File:    From ixa-pipe-pos with Apache License 2.0 6 votes vote down vote up
 * Creates the multiword spans. It gets an initial list of spans (one per
 * token) and creates a multiword span when a multiword is detected.
 * @param tokens
 *          the list of tokens
 * @param wfs
 *          the list of WFs
 * @param tokenSpans
 *          the list of initial token spans
// Replaces, in place inside tokenSpans, the per-token spans covered by each
// detected multiword with one span built from the corresponding WFs.
// NOTE(review): closing braces are missing and two statements below are cut off
// mid-expression in this excerpt.
private void getMultiWordSpans(final String[] tokens, final List<WF> wfs,
    final List<ixa.kaflib.Span<WF>> tokenSpans) {
  // NOTE(review): statement truncated -- the call on multiWordMatcher is not visible
  final Span[] multiWordSpans = this.multiWordMatcher
  // number of tokenSpans positions already removed by earlier merges
  int counter = 0;
  for (final Span mwSpan : multiWordSpans) {
    // indexes into the (shrinking) tokenSpans list; boxed as Integer here
    final Integer fromIndex = mwSpan.getStart() - counter;
    final Integer toIndex = mwSpan.getEnd() - counter;
    // add to the counter the length of the span removed
    counter = counter + tokenSpans.subList(fromIndex, toIndex).size() - 1;
    // create multiword targets and Span
    // (wfs is indexed with the unshifted span positions, unlike tokenSpans)
    final List<WF> wfTargets = wfs
        .subList(mwSpan.getStart(), mwSpan.getEnd());
    // NOTE(review): statement truncated -- the KAFDocument call that builds the
    // span (presumably from wfTargets) is not visible
    final ixa.kaflib.Span<WF> multiWordSpan = KAFDocument
    // remove the token Spans to be replaced by the multiword span
    tokenSpans.subList(fromIndex, toIndex).clear();
    // add the new Span containing several WFs (multiWordSpan)
    // the counter is used to allow matching the spans to the
    // tokenSpans list indexes
    tokenSpans.add(fromIndex, multiWordSpan);
Example 5
Source File:    From tutorials with MIT License 6 votes vote down vote up
/**
 * Tokenizes a fixed English sample, runs a name finder built from
 * {@code /models/en-ner-person.bin} over the tokens, and asserts the string
 * form of the returned spans.
 *
 * @throws Exception declared by the signature
 */
// NOTE(review): closing braces and the tail of this test are missing from the
// excerpt.
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");
    // NOTE(review): this stream is not closed in the visible lines
    InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
    NameFinderME nameFinderME = new NameFinderME(model);
    // span indexes are token positions, as the expected string below shows
    List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
    assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");
    List<String> names = new ArrayList<String>();
    int k = 0;
    for (Span s : spans) {
        // NOTE(review): `names` is still empty when set()/get(k) run in the visible
        // code, and k is never incremented -- presumably a names.add("") and a k++
        // were dropped from the excerpt; verify against the original project.
        for (int index = s.getStart(); index < s.getEnd(); index++) {
            names.set(k, names.get(k) + tokens[index]);
Example 6
Source File:    From inception with Apache License 2.0 5 votes vote down vote up
 * Extract AnnotatedTokenPairs with info on predicted and gold label for each token of the given
 * sentence.
// Builds one LabelPair (gold label, predicted label) per token of the sentence.
// NOTE(review): braces and the bodies of both inner `if (i > ...getEnd())`
// checks are missing from this excerpt.
private List<LabelPair> determineLabelsForASentence(String[] sentence,
        Span[] predictedNames, Span[] goldNames)
    // cursors into predictedNames / goldNames: the span currently being compared
    int predictedNameIdx = 0;
    int goldNameIdx = 0;
    List<LabelPair> labelPairs = new ArrayList<>();
    // Spans store which tokens are part of it as [begin,end). 
    // Tokens are counted 0 to length of sentence.
    // Therefore go through all tokens, determine which span they are part of 
    // for predictions and gold ones. Assign label accordingly to the annotated-token.
    for (int i = 0; i < sentence.length; i++) {

        // default label when the token is in no predicted span
        String predictedLabel = NO_NE_TAG;
        if (predictedNameIdx < predictedNames.length) {
            Span predictedName = predictedNames[predictedNameIdx];
            predictedLabel = determineLabel(predictedName, i);

            // NOTE(review): body not visible -- presumably advances predictedNameIdx;
            // verify, and check the comparison against the exclusive span end
            if (i > predictedName.getEnd()) {

        // default label when the token is in no gold span
        String goldLabel = NO_NE_TAG;
        if (goldNameIdx < goldNames.length) {
            Span goldName = goldNames[goldNameIdx];
            goldLabel = determineLabel(goldName, i);
            // NOTE(review): body not visible -- presumably advances goldNameIdx; verify
            if (i > goldName.getEnd()) {

        labelPairs.add(new LabelPair(goldLabel, predictedLabel));

    return labelPairs;
Example 7
Source File:    From inception with Apache License 2.0 5 votes vote down vote up
 * Check that token index is part of the given span and return the span's label 
 * or no-label (token is outside span). 
private String determineLabel(Span aName, int aTokenIdx)
    String label = NO_NE_TAG;

    if (aName.getStart() <= aTokenIdx && aName.getEnd() > aTokenIdx) {
        label = aName.getType();

    return label;
Example 8
Source File:    From modernmt with Apache License 2.0 5 votes vote down vote up
public void annotate(TokenizedString string) {
    Span[] tokens = this.tokenizer.tokenizePos(string.toString());

    for (Span token : tokens) {
        int start = token.getStart();
        int end = token.getEnd();
        int length = end - start;

        string.setWord(start, start + length);
Example 9
Source File:    From knowledge-extraction with Apache License 2.0 5 votes vote down vote up
/**
 * Prints parse {@code p} to standard output: a {@code #}-prefixed value from
 * {@code parseMap} when the parse is registered there, then the stretches of
 * text that lie between the spans of its children.
 *
 * @param p the parse to print
 */
// NOTE(review): closing braces and several statements are missing from this
// excerpt (no handling of the children themselves is visible, and the final
// `if` has no body).
private void show(Parse p) {
	// position in p.getText() up to which text has already been handled
	int start;
	start = p.getSpan().getStart();
	if (!p.getType().equals(Parser.TOK_NODE)) {
		if (parseMap.containsKey(p)) {
			System.out.print("#" + parseMap.get(p));
		// System.out.print(p.hashCode()+"-"+parseMap.containsKey(p));
		System.out.print(" ");
	Parse[] children = p.getChildren();
	for (int pi = 0, pn = children.length; pi < pn; pi++) {
		Parse c = children[pi];
		Span s = c.getSpan();
		// emit text lying before this child's span
		if (start < s.getStart()) {
			System.out.print(p.getText().substring(start, s.getStart()));
		// continue after the child
		start = s.getEnd();
	// trailing text after the last child
	System.out.print(p.getText().substring(start, p.getSpan().getEnd()));
	if (!p.getType().equals(Parser.TOK_NODE)) {
Example 10
Source File:    From uncc2014watsonsim with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Splits the passage into sentences, runs tokenization and name finding on
 * them, and parses each sentence.
 *
 * @param p the passage text
 * @return an array sized to the number of detected sentences
 * @throws InvalidFormatException declared by the signature
 */
// NOTE(review): closing braces and several statements are missing from this
// excerpt; in the visible lines nothing is ever stored into `results`.
public Parse[] parsePassageText(String p) throws InvalidFormatException{
	// lazy one-time setup; init() is defined outside this excerpt
	if (!modelsAreInitialized)init();
	SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
	NameFinderME nameFinder = new NameFinderME(this.nerModel);
	// NOTE(review): argument list looks truncated -- no model argument is visible
	Parser parser = ParserFactory.create(
			20, // beam size
			0.95); // advance percentage
	//find sentences, tokenize each, parse each, return top parse for each 	 	 
	String[] sentences = sentenceDetector.sentDetect(p);
	Parse[] results = new Parse[sentences.length];
	for (int i=0;i<sentences.length;i++){
		//String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]);
		//StringTokenizer st = new StringTokenizer(tks[i]); 
		//There are several tokenizers available. SimpleTokenizer works best
		Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
		// NOTE(review): this inner loop walks every sentence again for each outer
		// iteration i -- looks quadratic and possibly unintended; verify
		for (int si = 0; si < sentences.length; si++) {
	        Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]);
	        String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]);
	        // name spans index into `tokens`; they are mapped back to character
	        // offsets through tokenSpans below
	        Span[] names = nameFinder.find(tokens);
	        for (int ni = 0; ni < names.length; ni++) {
	            Span startSpan = tokenSpans[names[ni].getStart()];
	            int nameStart = startSpan.getStart();
	            // end index is treated as exclusive here, hence the - 1
	            Span endSpan = tokenSpans[names[ni].getEnd() - 1];
	            int nameEnd = endSpan.getEnd();
	            String name = sentences[si].substring(nameStart, nameEnd);
		// NOTE(review): joins the tokenizer object itself rather than a token
		// array -- looks like a bug or an extraction artifact; verify
		String sent= StringUtils.join(tokenizer," ");
		System.out.println("Found sentence " + sent);
		// the literal 1 is presumably the number of parses requested -- confirm
		Parse[] sentResults = ParserTool.parseLine(sent,parser, 1);
	return results;
Example 11
Source File:    From relex with Apache License 2.0 5 votes vote down vote up
/**
 * Resets {@code lst} and {@code snl}, then walks the sentence spans reported
 * by the detector and records a {@code TextInterval} for them.
 *
 * @param docText the document text; when {@code null} the method returns
 *                right after resetting the two lists
 */
// NOTE(review): braces were lost in this excerpt; nesting is inferred from
// indentation only, and nothing is added to `snl` in the visible lines.
private void _process(String docText)
	lst = new ArrayList<TextInterval>();
	snl = new ArrayList<String>();
	if (docText == null) return;

	// NOTE(review): detection runs on `buffer`, not on the docText parameter
	// that foundSentence() receives below -- confirm this is intended
	Span spans[] = detector.sentPosDetect(buffer);

	// start/end are not declared here -- presumably fields read by
	// foundSentence(); verify
	start = 0;
	end = 0;
	for (Span span : spans)
		int sentenceEnd = span.getEnd();
		int prevstart = start;
		start = end; // from previous loop iteration
		end = sentenceEnd;
		if (!foundSentence(docText))
			// go back to previous start
			start = prevstart;
			end = prevstart;

		// trimmedSentence / trimmedStart / trimmedEnd are not assigned here --
		// presumably set by foundSentence(); verify
		if (DEBUG > 0) System.out.println(start + "," + end + ": " + trimmedSentence);
		lst.add(new TextInterval(trimmedStart, trimmedEnd));
Example 12
Source File:    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 4 votes vote down vote up
/**
 * Demonstrates POS tagging followed by chunking: tags {@code sentence} with a
 * tagger loaded from {@code en-pos-maxent.bin}, chunks it with a chunker
 * loaded from {@code en-chunker.bin}, and prints the tags, the per-token chunk
 * labels and the chunk spans to standard output.
 */
// NOTE(review): closing braces are missing from this excerpt and the catch
// body is not visible. `sentence` is not declared here -- presumably a
// String[] field of the enclosing class; verify.
private static void usingOpenNLPChunker() {
        // both model streams are closed by the try-with-resources statement
        // NOTE(review): "\\" hard-codes a Windows path separator
        try (
                InputStream posModelStream = new FileInputStream(
                        getModelDir() + "\\en-pos-maxent.bin");
                InputStream chunkerStream = new FileInputStream(
                        getModelDir() + "\\en-chunker.bin");) {
                    POSModel model = new POSModel(posModelStream);
                    POSTaggerME tagger = new POSTaggerME(model);
                    // Used to create sample data for trainer
//                    for (String sentence : sentences) {
//                        String sen[] = tokenizeSentence(sentence);
//                        String tags[] = tagger.tag(sen);
//                        for (int i = 0; i < tags.length; i++) {
////                    for (String token : sentence) {
//                            System.out.print(sen[i] + "/" + tags[i] + " ");
//                        }
//                        System.out.println();
//                    }
//                    System.out.println();

                    // tags[i] is printed next to sentence[i] as token/tag
                    String tags[] = tagger.tag(sentence);
                    for (int i = 0; i < tags.length; i++) {
//                    for (String token : sentence) {
                        System.out.print(sentence[i] + "/" + tags[i] + " ");

                    // chunker
                    System.out.println("------------Chunker -----------");
                    ChunkerModel chunkerModel = new ChunkerModel(chunkerStream);
                    ChunkerME chunkerME = new ChunkerME(chunkerModel);
                    // result[i] is printed next to sentence[i]
                    String result[] = chunkerME.chunk(sentence, tags);

                    for (int i = 0; i < result.length; i++) {
                        System.out.println("[" + sentence[i] + "] " + result[i]);

                    System.out.println("------------Chunker Spans -----------");
                    // same chunking, reported as typed spans over token indexes
                    Span[] spans = chunkerME.chunkAsSpans(sentence, tags);
                    for (Span span : spans) {
                        System.out.print("Type: " + span.getType() + " - " + " Begin: "
                                + span.getStart() + " End:" + span.getEnd()
                                + " Length: " + span.length() + "  [");
                        // print the tokens covered by the span, end index excluded
                        for (int j = span.getStart(); j < span.getEnd(); j++) {
                            System.out.print(sentence[j] + " ");
                } catch (IOException ex) {

Example 13
Source File:    From elasticsearch-ingest-opennlp with Apache License 2.0 4 votes vote down vote up
/**
 * Rebuilds {@code content} token by token into a string, giving tokens that
 * start an entity span separate handling.
 *
 * @param content           the original text
 * @param extractedEntities the extracted entities; the tokens of the first
 *                          element are used for the whole text
 * @return {@code content} unchanged when no spans remain after dropping
 *         overlaps, otherwise the rebuilt string
 */
// NOTE(review): closing braces, two stream expressions and the statements that
// append tokens/entities to the builder are missing from this excerpt.
static String createAnnotatedText(String content, List<ExtractedEntities> extractedEntities) {
    // these spans contain the real offset of each word in start/end variables!
    // the spans of the method argument contain the offset of each token, as mentioned in tokens!
    Span[] spansWithRealOffsets = SimpleTokenizer.INSTANCE.tokenizePos(content);

    List<Span> spansList = new ArrayList<>();
            // NOTE(review): the head of this stream pipeline is not visible
            .forEach(s -> spansList.addAll(Arrays.asList(s)));

    Span[] spans = NameFinderME.dropOverlappingSpans(spansList.toArray(new Span[0]));
    // NOTE(review): get(0) throws on an empty extractedEntities list; no guard is
    // visible here
    String[] tokens = extractedEntities.get(0).getTokens();

    // shortcut if there is no enrichment to be done
    if (spans.length == 0) {
        return content;

    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
        // effectively-final copy of i for use inside the lambda below
        final int idx = i;
        String token = tokens[i];

        // NOTE(review): expression truncated -- presumably a stream over `spans`
        // filtered on the start index; verify
        final Optional<Span> optionalSpan = -> s.getStart() == idx).findFirst();
        if (optionalSpan.isPresent()) {
            Span span = optionalSpan.get();
            int start = span.getStart();
            int end = span.getEnd();
            String type = span.getType();

            // collect the tokens covered by the span, end index excluded
            String[] spanTokens = new String[end - start];
            int spanPosition = 0;
            for (int tokenPosition = start ; tokenPosition < end; tokenPosition++) {
                spanTokens[spanPosition++] = tokens[tokenPosition];
            String entityString = Strings.arrayToDelimitedString(spanTokens, " ");

            // skip the remaining tokens of this span; the loop's i++ moves on to `end`
            i = end - 1;
        } else {

        // only append a whitespace, if the offsets actually differ
        if (i < tokens.length - 1) {
            if (spansWithRealOffsets[i].getEnd() != spansWithRealOffsets[i+1].getStart()) {
                builder.append(" ");

    return builder.toString();
Example 14
Source File:    From OpenEphyra with GNU General Public License v2.0 4 votes vote down vote up
 * Adds named entity information to parses.
 * @param tag named entity type
 * @param names spans of tokens that are named entities
 * @param tokens parses for the tokens
// Inserts a Parse node typed `tag` into the tree for each name span, at the
// common parent of the span's first and last token.
// NOTE(review): closing braces are missing and several expressions below are
// cut off mid-statement in this excerpt.
private static void addNames(String tag, List names, Parse[] tokens) {
	for (int i = 0; i < names.size(); i++) {
		// `names` is a raw List, hence the cast
		Span nameTokenSpan = (Span) names.get(i);
		Parse startToken = tokens[nameTokenSpan.getStart()];
		// NOTE(review): uses getEnd() directly as an index, i.e. treats the end as
		// inclusive -- confirm against the Span version this project uses
		Parse endToken = tokens[nameTokenSpan.getEnd()];
		Parse commonP = startToken.getCommonParent(endToken);
		if (commonP != null) {
			// NOTE(review): constructor call truncated -- second argument not visible
			Span nameSpan = new Span(startToken.getSpan().getStart(),
			if (nameSpan.equals(commonP.getSpan())) {
				// common parent matches exactly the named entity
				// NOTE(review): statement truncated after `tag,`
				commonP.insert(new Parse(commonP.getText(), nameSpan, tag,
			} else {
				// common parent includes the named entity
				Parse[] kids = commonP.getChildren();
				boolean crossingKids = false;
				for (int j = 0; j < kids.length; j++)
					if (nameSpan.crosses(kids[j].getSpan()))
						crossingKids = true;
				if (!crossingKids) {
					// named entity does not cross children
					commonP.insert(new Parse(commonP.getText(), nameSpan,
							tag, 1.0));
				} else {
					// NE crosses children
					if (commonP.getType().equals("NP")) {
						Parse[] grandKids = kids[0].getChildren();
						Parse last = grandKids[grandKids.length - 1];
						// NOTE(review): condition truncated after `&&`; in this
						// branch the inserted node covers the parent's whole span
						if (grandKids.length > 1 &&
							commonP.insert(new Parse(commonP.getText(),
									commonP.getSpan(), tag,1.0));
Example 15
Source File:    From ixa-pipe-pos with Apache License 2.0 3 votes vote down vote up
 * It takes a NE span indexes and the tokens in a sentence and produces the
 * string to which the NE span corresponds to. This function is used to get
 * the Named Entity or Name textual representation from a {@link Span}
 * @param reducedSpan
 *          a {@link Span}
 * @param tokens
 *          an array of tokens
 * @return named entity string
public static String getStringFromSpan(final Span reducedSpan,
    final String[] tokens) {
  final StringBuilder sb = new StringBuilder();
  for (int si = reducedSpan.getStart(); si < reducedSpan.getEnd(); si++) {
    sb.append(tokens[si]).append(" ");
  return sb.toString().trim();