Java Code Examples for org.grobid.core.layout.LayoutToken#getOffset()

The following examples show how to use org.grobid.core.layout.LayoutToken#getOffset(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: Utilities.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the tokens surrounding an entity, i.e. a context window of up to
 * {@code size} tokens on each side of the entity's anchor token.
 *
 * <p>The anchor is the first token whose character span
 * ({@code getOffset()} to {@code getOffset() + getText().length()}) lies entirely
 * within {@code [start, end]}. The anchor token itself is left out of the result.
 * When no token qualifies, the anchor index equals {@code tokens.size()}, so the
 * window is taken from the tail of the list.
 *
 * @param start  start offset of the entity
 * @param end    end offset of the entity
 * @param tokens token list to search and to extract the window from
 * @param size   number of tokens to keep on each side of the anchor
 * @param lang   language code; not used by this method
 * @return the tokens inside the window, in list order, without the anchor token
 */
public static List<LayoutToken> getWindow(int start, int end, List<LayoutToken> tokens, int size, String lang) {
	// Find the index of the first token fully contained in the entity span.
	int anchor = 0;
	for (LayoutToken candidate : tokens) {
		boolean startsInside = candidate.getOffset() >= start;
		if (startsInside && (candidate.getOffset() + candidate.getText().length()) <= end) {
			break;
		}
		anchor++;
	}

	// Clamp the window to the valid index range of the token list.
	final int first = Math.max(anchor - size, 0);
	final int last = Math.min(anchor + size, tokens.size() - 1);

	List<LayoutToken> window = new ArrayList<>();
	for (int idx = first; idx <= last; idx++) {
		if (idx == anchor) {
			// the anchor token is not part of its own context
			continue;
		}
		window.add(tokens.get(idx));
	}
	return window;
}
 
Example 2
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 4 votes vote down vote up
/**
 * Builds mentions for every occurrence of a known acronym in the query tokens.
 *
 * <p>The acronyms are read from the query context as a map from acronym mention
 * to base mention. When the query carries no tokens, its text is tokenized first.
 * For each token position, the acronyms are tried in map iteration order and the
 * first one whose token sequence matches at that position yields a new mention:
 * raw name from the acronym, normalised name from the base mention, offsets,
 * layout tokens and bounding boxes from the matched sequence.
 *
 * @param nerdQuery the query holding the text/tokens and the acronym context
 * @return the list of created mentions (possibly empty), or {@code null} when the
 *         query, its context or its acronym map is {@code null}, or when the query
 *         has neither tokens nor text
 */
public List<Mention> propagateAcronyms(NerdQuery nerdQuery) {
    if (nerdQuery == null || nerdQuery.getContext() == null) {
        return null;
    }

    final Map<Mention, Mention> acronyms = nerdQuery.getContext().getAcronyms();
    if (acronyms == null) {
        return null;
    }

    final String text = nerdQuery.getText();
    List<LayoutToken> tokens = nerdQuery.getTokens();
    if (CollectionUtils.isEmpty(tokens)) {
        // No tokens supplied: fall back to tokenizing the raw text, if there is any.
        if (StringUtils.isEmpty(text)) {
            LOGGER.error("All possible content to process are empty - process stops.");
            return null;
        }
        final Language language = getLanguage(nerdQuery, text);
        tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text, language);
    }

    final List<Mention> propagated = new ArrayList<>();
    int cursor = 0;
    while (cursor < tokens.size()) {
        final LayoutToken current = tokens.get(cursor);

        // Try each known acronym at the current position; the first match wins.
        for (Map.Entry<Mention, Mention> pair : acronyms.entrySet()) {
            final Mention shortForm = pair.getKey();
            final List<LayoutToken> shortFormTokens = shortForm.getLayoutTokens();

            final List<LayoutToken> hit = getSequenceMatch(tokens, cursor, shortFormTokens);
            if (!isEmpty(hit)) {
                final Mention longForm = pair.getValue();

                // The mention starts at the current token and spans the matched text.
                final int begin = current.getOffset();
                final int finish = begin + LayoutTokensUtil.toText(hit).length();

                final Mention mention = new Mention(shortForm.getRawName());
                mention.setNormalisedName(longForm.getRawName());
                mention.setOffsetStart(begin);
                mention.setOffsetEnd(finish);
                mention.setLayoutTokens(hit);
                mention.setBoundingBoxes(BoundingBoxCalculator.calculate(mention.getLayoutTokens()));
                propagated.add(mention);

                // Jump past the acronym tokens.
                // NOTE(review): together with the increment below, the token directly
                // after the acronym is never tried as a match start - confirm this is
                // intended (e.g. skipping a delimiter) and not an off-by-one.
                cursor += shortFormTokens.size();
                break;
            }
        }
        cursor++;
    }
    return propagated;
}