org.grobid.core.utilities.LayoutTokensUtil Java Examples

The following examples show how to use org.grobid.core.utilities.LayoutTokensUtil. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: NERParsers.java    From grobid-ner with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts named entities from a list of LayoutToken whose language is not known in advance.
 * The language is first identified from the text of the tokens, then the work is handed over
 * to {@code extractNE(tokens, language)} together with the identified language.
 *
 * @param tokens the layout tokens to analyse
 * @return the entities returned by the language-aware overload
 * @throws GrobidResourceException declared by this method; not raised directly by the code here
 */
public List<Entity> extractNE(List<LayoutToken> tokens) throws GrobidResourceException {
    final LanguageUtilities languageIdentifier = LanguageUtilities.getInstance();

    final Language detectedLanguage;
    // language identification runs while holding the monitor of the shared identifier instance
    synchronized (languageIdentifier) {
        final String text = LayoutTokensUtil.toText(tokens);
        detectedLanguage = languageIdentifier.runLanguageId(text, 2000);
    }

    return extractNE(tokens, detectedLanguage);
}
 
Example #2
Source File: TaggingTokenClusteror.java    From science-result-extractor with Apache License 2.0 4 votes vote down vote up
/**
 * Groups the labeled token containers delivered by {@code taggingTokenSynchronizer} into
 * clusters.
 *
 * <p>A new cluster is started for a container when any of the following holds:
 * <ul>
 *   <li>it is the first container of the sequence;</li>
 *   <li>the container is flagged as a beginning;</li>
 *   <li>its tagging label is not the same instance as the current cluster's label;</li>
 *   <li>the bounding box of its first token is more than 600 away from the bounding box of
 *       the first token of the current cluster (unit as defined by
 *       {@code BoundingBox.distanceTo} -- TODO confirm);</li>
 *   <li>the current cluster reads like a table caption ("Table 1:", "TABLE 2.", "Table 3 A")
 *       and its last container is exactly {@code ". \n\n"}, so that caption and table content
 *       end up in separate clusters.</li>
 * </ul>
 *
 * <p>The distance rule and the label rules are evaluated together, so a container that
 * satisfies several of them opens a single cluster and no empty cluster is left in the result.
 *
 * @return the clusters in sequence order, or an empty list when there is no container or the
 *         first one is {@code null}
 */
public List<TaggingTokenCluster> cluster() {
    PeekingIterator<LabeledTokensContainer> it = Iterators.peekingIterator(taggingTokenSynchronizer);
    if (!it.hasNext() || (it.peek() == null)) {
        return Collections.emptyList();
    }

    // compiled once per call instead of once per container
    final java.util.regex.Pattern tableCaption =
            java.util.regex.Pattern.compile(".*?(Table|TABLE) \\d+(:|\\.| [A-Z]).*?");
    final int maxDistanceToClusterStart = 600;

    List<TaggingTokenCluster> result = new ArrayList<>();

    // a boolean is introduced to indicate the start of the sequence in the case the label
    // has no beginning indicator (e.g. I-)
    boolean begin = true;
    // placeholder: it is replaced (and never added to the result) on the first iteration
    TaggingTokenCluster curCluster = new TaggingTokenCluster(it.peek().getTaggingLabel());

    while (it.hasNext()) {
        LabeledTokensContainer cont = it.next();
        BoundingBox contBox = BoundingBox.fromLayoutToken(cont.getLayoutTokens().get(0));

        // distance between this container and the first token of the current cluster
        boolean tooFar = false;
        if (!curCluster.concatTokens().isEmpty()) {
            BoundingBox clusterBox = BoundingBox.fromLayoutToken(curCluster.concatTokens().get(0));
            tooFar = contBox.distanceTo(clusterBox) > maxDistanceToClusterStart;
        }

        // labels are compared by identity, as in the rest of this method
        if (tooFar || begin || cont.isBeginning()
                || cont.getTaggingLabel() != curCluster.getTaggingLabel()) {
            curCluster = new TaggingTokenCluster(cont.getTaggingLabel());
            result.add(curCluster);
        }

        // for tables, separate caption and content
        String tableStr = LayoutTokensUtil.normalizeText(curCluster.concatTokens());
        if (tableCaption.matcher(tableStr).matches()
                && toText(curCluster.getLastContainer().getLayoutTokens()).equalsIgnoreCase(". \n\n")) {
            curCluster = new TaggingTokenCluster(cont.getTaggingLabel());
            result.add(curCluster);
        }

        curCluster.addLabeledTokensContainer(cont);
        begin = false;
    }

    return result;
}
 
Example #3
Source File: StringProcessorTest.java    From entity-fishing with Apache License 2.0 3 votes vote down vote up
/**
 * An all-uppercase title is expected to come back entirely lower-cased.
 */
@Test
public void adjustLetterCase_fullCase_shouldLowerCase() {
    final String input = "THIS IS A TITLE";

    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<LayoutToken> adjusted = StringProcessor.adjustLetterCase(tokens);

    final String actual = LayoutTokensUtil.toText(adjusted);
    final String expected = lowerCase(input);
    assertThat(actual, is(expected));
}
 
Example #4
Source File: StringProcessorTest.java    From entity-fishing with Apache License 2.0 3 votes vote down vote up
/**
 * An all-uppercase title ending with a full stop is expected to come back entirely
 * lower-cased, punctuation included unchanged.
 */
@Test
public void adjustLetterCase_fullCase_2_shouldLowerCase() {
    final String input = "THIS IS A TITLE.";

    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<LayoutToken> adjusted = StringProcessor.adjustLetterCase(tokens);

    final String actual = LayoutTokensUtil.toText(adjusted);
    final String expected = lowerCase(input);
    assertThat(actual, is(expected));
}
 
Example #5
Source File: StringProcessorTest.java    From entity-fishing with Apache License 2.0 3 votes vote down vote up
/**
 * For a title where every word starts with a capital, the expected result keeps the very
 * first character as is and lower-cases everything after it.
 */
@Test
public void adjustLetterCase_initialUpperCase_shouldLowerCase() {
    final String input = "This Is A Title";

    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<LayoutToken> adjusted = StringProcessor.adjustLetterCase(tokens);

    final String actual = LayoutTokensUtil.toText(adjusted);
    // first character untouched, remainder lower-cased
    final String expected = input.charAt(0) + lowerCase(input.substring(1));
    assertThat(actual, is(expected));
}
 
Example #6
Source File: StringProcessorTest.java    From entity-fishing with Apache License 2.0 3 votes vote down vote up
/**
 * Same expectation as the capitalised-words case, with a comma inside the title: the very
 * first character is kept as is and everything after it is lower-cased.
 */
@Test
public void adjustLetterCase_initialUpperCase_2_shouldLowerCase() {
    final String input = "This Is A, Title";

    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);
    final List<LayoutToken> adjusted = StringProcessor.adjustLetterCase(tokens);

    final String actual = LayoutTokensUtil.toText(adjusted);
    // first character untouched, remainder lower-cased
    final String expected = input.charAt(0) + lowerCase(input.substring(1));
    assertThat(actual, is(expected));
}