Java Code Examples for org.grobid.core.layout.LayoutToken#getText()

The following examples show how to use org.grobid.core.layout.LayoutToken#getText(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SimilarityScorer.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * Turn a sequence of LayoutTokens into the list of normalised words that
 * correspond to word embedding entries.
 *
 * Tokens that are null, blank or found in ProcessText.delimiters are dropped.
 * Each remaining token is unicode-normalised, stripped of punctuation, has its
 * digits replaced by "0", is lower-cased and has tabs removed. Words that end
 * up blank, or that are stopwords for the given language, are dropped as well.
 *
 * @param tokens the tokens to normalise
 * @param lang   language passed to the stopword lookup
 * @return the normalised words, in token order
 */
private List<String> toStringEmbeddings(List<LayoutToken> tokens, String lang) {
	final List<String> words = new ArrayList<>();
	for (final LayoutToken layoutToken : tokens) {
		final String raw = layoutToken.getText();

		// nothing to embed for empty tokens or delimiters
		if (raw == null || raw.trim().isEmpty() || ProcessText.delimiters.indexOf(raw) != -1) {
			continue;
		}

		// unicode normalisation, then drop remaining punctuation, flatten
		// digits to "0", lower case (to be evaluated!) and remove tabs
		final String normalised = UnicodeUtil.normaliseText(raw)
				.replaceAll("\\p{P}", "")
				.replaceAll("\\d", "0")
				.toLowerCase()
				.replace("\t", "");

		if (normalised.trim().isEmpty()) {
			continue;
		}

		// if the stopword lookup fails, keep the word rather than lose it
		boolean keep;
		try {
			keep = !Stopwords.getInstance().isStopword(normalised, lang);
		} catch (Exception e) {
			LOGGER.warn("Problem getting Stopwords instance", e);
			keep = true;
		}
		if (keep) {
			words.add(normalised);
		}
	}
	return words;
}
 
Example 2
Source File: NerdRestProcessFile.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * Tell whether the casing of the given tokens suggests the text should be
 * lower-cased.
 *
 * Returns true when the full text of the tokens is all upper case (as judged
 * by isAllUpperCase), or when every counted token is either a single
 * upper-case character or a word made of an upper-case first character
 * followed only by lower-case characters.
 *
 * Tokens whose text is null or is contained in TextUtilities.fullPunctuations
 * are not counted. When no token is counted at all the method returns true,
 * since the two counters are then both zero.
 *
 * @param layoutTokens the tokens to inspect
 * @return true if the tokens match one of the casing patterns above
 */
private boolean needToLowerCase(List<LayoutToken> layoutTokens) {
    if (isAllUpperCase(LayoutTokensUtil.toText(layoutTokens))) {
        return true;
    }

    int count = 0; // tokens matching the "capitalised" pattern
    int total = 0; // tokens taken into account
    for (LayoutToken token : layoutTokens) {
        final String tokenText = token.getText();

        // A token without text carries no casing information; skipping it
        // also avoids dereferencing null below.
        if (tokenText == null || TextUtilities.fullPunctuations.contains(tokenText)) {
            continue;
        }
        total++;

        if (tokenText.length() == 1) {
            if (TextUtilities.isAllUpperCase(tokenText)) {
                count++;
            }
        } else if (tokenText.length() > 1) {
            if (Character.isUpperCase(tokenText.charAt(0))
                    && TextUtilities.isAllLowerCase(tokenText.substring(1))) {
                count++;
            }
        }
    }
    return count == total;
}
 
Example 3
Source File: NERParserCommon.java    From grobid-ner with Apache License 2.0 5 votes vote down vote up
/**
 * Build the textual feature vectors for a sequence of tokens.
 *
 * Tokens whose text is null, empty, or a single space, tab, line feed,
 * carriage return or non-breaking space are skipped. Every other token
 * produces one line (the printed feature vector followed by "\n"), and the
 * whole output is terminated by an additional "\n".
 *
 * @param tokens           the tokens to convert
 * @param positionsIndexes lexicon position indexes queried for each emitted token
 * @return the concatenated feature vector lines
 */
public static String toFeatureVectorLayout(List<LayoutToken> tokens, LexiconPositionsIndexes positionsIndexes) {
    // local, single-threaded accumulation: no need for a synchronized StringBuffer
    StringBuilder ress = new StringBuilder();
    // position index used for the lexicon lookups; it only advances for
    // tokens that are actually emitted, not for skipped blank tokens
    int posit = 0;

    for (LayoutToken token : tokens) {
        final String text = token.getText();
        if ((text == null) ||
                (text.length() == 0) ||
                text.equals(" ") ||
                text.equals("\t") ||
                text.equals("\n") ||
                text.equals("\r") ||
                text.equals("\u00A0")) {
            continue;
        }

        // check if the token is a known NE
        // do we have a NE at position posit?
        boolean isLocationToken = LexiconPositionsIndexes
                .isTokenInLexicon(positionsIndexes.getLocalLocationPositions(), posit);
        boolean isPersonTitleToken = LexiconPositionsIndexes
                .isTokenInLexicon(positionsIndexes.getLocalPersonTitlePositions(), posit);
        boolean isOrganisationToken = LexiconPositionsIndexes
                .isTokenInLexicon(positionsIndexes.getLocalOrganisationPositions(), posit);
        boolean isOrgFormToken = LexiconPositionsIndexes
                .isTokenInLexicon(positionsIndexes.getLocalOrgFormPositions(), posit);

        ress.append(FeaturesVectorNER
                .addFeaturesNER(text,
                        isLocationToken, isPersonTitleToken, isOrganisationToken, isOrgFormToken)
                .printVector());
        ress.append("\n");
        posit++;
    }
    ress.append("\n");
    return ress.toString();
}
 
Example 4
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 4 votes vote down vote up
/**
 * Detect acronym candidates written as "base form (ACRONYM)" in a token
 * sequence.
 *
 * A token found between "(" and ")" that satisfies
 * isAllUpperCaseOrDigitOrDot is taken as a possible acronym. Once the
 * parenthesis is closed, the tokens preceding the opening parenthesis are
 * scanned backwards and matched against the acronym characters, from the
 * last character to the first: each non-blank, non-delimiter token must start
 * with the current acronym character (case-insensitive), or be a number of
 * more than one digit that the current acronym prefix ends with (e.g. GDF15).
 * When the first acronym character is matched, the pair is recorded.
 *
 * @param tokens the token sequence to scan
 * @return a map from each acronym mention to the mention of its base form,
 *         or null when no acronym candidate has been found
 */
public Map<Mention, Mention> acronymCandidates(List<LayoutToken> tokens) {
    Map<Mention, Mention> acronyms = null;

    // detect possible acronym
    boolean openParenthesis = false;
    int posParenthesis = -1; // index in tokens of the last "(" seen
    int i = 0;               // index of the current token
    LayoutToken acronym = null;
    for (LayoutToken token : tokens) {
        if (token.getText() == null) {
            i++;
            continue;
        }
        if (token.getText().equals("(")) {
            openParenthesis = true;
            posParenthesis = i;
            acronym = null;
        } else if (token.getText().equals(")")) {
            openParenthesis = false;
        } else if (openParenthesis) {
            // only the last token inside the parentheses is retained, and
            // only if it looks like an acronym
            if (isAllUpperCaseOrDigitOrDot(token.getText())) {
                acronym = token;
            } else {
                acronym = null;
            }
        }

        if ((acronym != null) && (!openParenthesis)) {
            // check if this possible acronym matches an immediately preceding term
            int j = posParenthesis;             // walks backwards over the preceding tokens
            int k = acronym.getText().length(); // walks backwards over the acronym characters
            boolean stop = false;
            while ((k > 0) && (!stop)) {
                k--;
                char c = acronym.getText().toLowerCase().charAt(k);
                while ((j > 0) && (!stop)) {
                    j--;
                    if (tokens.get(j) != null) {
                        String tok = tokens.get(j).getText();
                        // tokens without text are tolerated by the outer loop,
                        // so skip them here too instead of dereferencing null
                        if (tok == null || tok.trim().length() == 0 || delimiters.contains(tok))
                            continue;
                        boolean numericMatch = false;
                        if ((tok.length() > 1) && StringUtils.isNumeric(tok)) {
                            // when the token is all digit, it often appears in full as such in the
                            // acronym (e.g. GDF15)
                            String acronymCurrentPrefix = acronym.getText().substring(0, k + 1);
                            if (acronymCurrentPrefix.endsWith(tok)) {
                                // there is a full number match: consume all its digits at once
                                k = k - tok.length() + 1;
                                numericMatch = true;
                            }
                        }

                        if ((tok.toLowerCase().charAt(0) == c) || numericMatch) {
                            if (k == 0) {
                                // the whole acronym has been matched: tokens j..posParenthesis-1
                                // form the base term
                                if (acronyms == null)
                                    acronyms = new HashMap<>();
                                List<LayoutToken> baseTokens = new ArrayList<>();
                                StringBuilder builder = new StringBuilder();
                                for (int l = j; l < posParenthesis; l++) {
                                    // NOTE(review): appends the token object itself, so this relies on
                                    // LayoutToken.toString() - presumably the token text, to be confirmed
                                    builder.append(tokens.get(l));
                                    baseTokens.add(tokens.get(l));
                                }

                                Mention entityAcronym = new Mention();
                                entityAcronym.setRawName(acronym.getText());
                                entityAcronym.setNormalisedName(builder.toString().trim());
                                entityAcronym.setOffsetStart(acronym.getOffset());
                                entityAcronym.setOffsetEnd(acronym.getOffset() + acronym.getText().length());
                                entityAcronym.setType(null);
                                entityAcronym.setIsAcronym(true);
                                entityAcronym.setLayoutTokens(Arrays.asList(acronym));

                                Mention entityBase = new Mention(builder.toString().trim());
                                entityBase.setOffsetStart(tokens.get(j).getOffset());
                                entityBase.setOffsetEnd(tokens.get(j).getOffset() + entityBase.getRawName().length());
                                entityBase.setLayoutTokens(baseTokens);

                                acronyms.put(entityAcronym, entityBase);
                                stop = true;
                            } else
                                // current character matched: move on to the previous acronym character
                                break;
                        } else {
                            // mismatch: the preceding tokens do not spell out this acronym
                            stop = true;
                        }
                    }
                }
            }
            acronym = null;
            posParenthesis = -1;
        }
        i++;
    }

    return acronyms;
}