org.grobid.core.layout.LayoutToken Java Examples

The following examples show how to use org.grobid.core.layout.LayoutToken. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
@Test
public void testAcronymsTokens() {
    String input = "Figure 4. \n" +
            "Canonical Correspondence Analysis (CCA) diagram showing the ordination of anopheline species along the\n" +
            "first two axes and their correlation with environmental variables. The first axis is horizontal, second vertical. Direction\n" +
            "and length of arrows shows the degree of correlation between mosquito larvae and the variables.";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));
    Map<Mention, Mention> acronyms = processText.acronymCandidates(tokens);

    assertNotNull(acronyms);
    // NOTE(review): if no acronym candidate is found, the loop body never runs and
    // the test passes vacuously; assertNotNull alone does not guard against that.
    for (Map.Entry<Mention, Mention> entry : acronyms.entrySet()) {
        Mention base = entry.getValue();
        Mention acronym = entry.getKey();

        // Fixed argument order: JUnit's assertEquals takes the expected value first,
        // otherwise failure messages report expected/actual reversed.
        assertEquals("CCA", input.substring(acronym.getOffsetStart(), acronym.getOffsetEnd()).trim());
        assertEquals("Canonical Correspondence Analysis", base.getRawName());

        assertThat(acronym.getOffsetStart(), is(46));
        assertThat(acronym.getOffsetEnd(), is(49));
    }
}
 
Example #2
Source File: NLPLeaderboardFigParser.java    From science-result-extractor with Apache License 2.0 6 votes vote down vote up
/**
 * The processing here is called from the full text parser in cascade.
 * Start and end position in the higher level tokenization are indicated in
 * the resulting Figure object.
 *
 * @param tokenizationFigure the layout tokens covering the figure area
 * @param featureVector      the feature vector to be labeled by the CRF model
 * @return the extracted Figure, or null when labeling produced no result
 * @throws GrobidException if the CRF labeling fails
 */
public Figure processing(List<LayoutToken> tokenizationFigure, String featureVector) {

    String res;
    try {
        res = label(featureVector);
    } catch (Exception e) {
        // Fixed message: this is the figure parser — the previous message
        // wrongly referenced ReferenceSegmenter, which misleads debugging.
        throw new GrobidException("CRF labeling in figure parsing fails.", e);
    }
    if (res == null) {
        return null;
    }
    return getExtractionResult(tokenizationFigure, res);
}
 
Example #3
Source File: NERParsers.java    From grobid-ner with Apache License 2.0 6 votes vote down vote up
/**
 * Extract all occurrences of named entities from a list of LayoutToken
 * for a given language.
 *
 * @param tokens the token sequence to process
 * @param lang   the language of the tokens; when null, language identification
 *               is delegated to the single-argument overload
 * @return the extracted entities, or null when there is no input
 * @throws GrobidResourceException if the language is not supported by grobid-ner
 */
public List<Entity> extractNE(List<LayoutToken> tokens, Language lang) throws GrobidResourceException {
    if (tokens == null || tokens.isEmpty())
        return null;

    // no language provided: fall back to automatic language identification
    if (lang == null)
        return extractNE(tokens);

    NERParser parser = parsers.get(lang.getLang());
    if (parser == null) {
        throw new GrobidResourceException("The automatically identified language is currently not supported by grobid-ner: " +
            lang.getLang());
    }

    return parser.extractNE(tokens);
}
 
Example #4
Source File: NERParserCommonTest.java    From grobid-ner with Apache License 2.0 6 votes vote down vote up
@Test
public void testresultExtraction_clusteror_simple2() throws Exception {
    final String input = "Austria Hungary fought the enemies with Germany.";
    // pre-labeled CRF output, one tab-separated feature row per token
    final String result = String.join("\n",
            "Austria\taustria\tA\tAu\tAus\tAust\tAustr\ta\tia\tria\ttria\tstria\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t1\t1\t1\t1\tXxxx\tXx\t0\tB-LOCATION",
            "Hungary\thungary\tA\tAu\tAus\tAust\tAustr\ta\tia\tria\ttria\tstria\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t1\t1\t1\t1\tXxxx\tXx\t0\tLOCATION",
            "fought\tfought\tf\tfo\tfou\tfoug\tfough\tt\tht\tght\tught\tought\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO",
            "the\tthe\tt\tth\tthe\tthe\tthe\te\the\tthe\tthe\tthe\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxx\tx\t0\tO",
            "enemies\tenemies\te\ten\tene\tenem\tenemi\ts\tes\ties\tmies\temies\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO",
            "with\twith\tw\twi\twit\twith\twith\th\tth\tith\twith\twith\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO",
            "Germany\tgermany\tG\tGe\tGer\tGerm\tGerma\ty\tny\tany\tmany\trmany\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t0\t0\t0\t0\tXxxx\tXx\t0\tB-LOCATION",
            ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t.\t.\t0\tO");
    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    final List<Entity> entities = target.resultExtraction(GrobidModels.ENTITIES_NER, result, tokens);

    assertThat(entities, hasSize(2));

    // "Austria Hungary" (B-LOCATION followed by LOCATION) must merge into one mention
    final Entity first = entities.get(0);
    assertThat(first.getRawName(), is("Austria Hungary"));
    assertThat(first.getType(), is(LOCATION));
    assertThat(first.getOffsetStart(), is(0));
    assertThat(first.getOffsetEnd(), is(15));
    assertThat(input.substring(first.getOffsetStart(), first.getOffsetEnd()), is("Austria Hungary"));
}
 
Example #5
Source File: NERFrParser.java    From grobid-ner with Apache License 2.0 6 votes vote down vote up
/**
 * Extract all occurrences of named entities from a list of LayoutToken
 * coming from a document with fixed/preserved layout (e.g. PDF).
 * The positions of the recognized entities are given with coordinates
 * anchored in the input document.
 *
 * @param tokens the layout token sequence
 * @return the recognized entities, or null when there is no input
 */
public List<Entity> extractNE(List<LayoutToken> tokens) {
    if (tokens == null)
        return null;

    // index lexicon positions over the token sequence
    LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
    positionsIndexes.computeIndexes(tokens);

    // build the feature vectors and run the sequence labeling model
    String featureVector = NERParserCommon.toFeatureVectorLayout(tokens, positionsIndexes);
    String labeledResult = label(featureVector);

    // align the labeled result back onto the layout tokens (French model)
    return nerParserCommon.resultExtraction(GrobidModels.ENTITIES_NERFR, labeledResult, tokens);
}
 
Example #6
Source File: SimilarityScorer.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Score a candidate entity against the centroid of the context token
 * embeddings for the given language; negative similarities are clamped to 0.
 */
public float getCentroidScore(NerdCandidate candidate, List<LayoutToken> tokens, String lang) {
    // without a Wikidata identifier there is nothing to score against
    if (candidate.getWikidataId() == null)
        return 0.0F;

    CentroidEntityScorer scorer = centroidScorers.get(lang);
    if (scorer == null) {
        LOGGER.warn(lang + " centroid scorer is null!");
        return 0.0F;
    }

    List<String> terms = toStringEmbeddings(tokens, lang);
    float score = scorer.score(candidate.getWikidataId(), terms);
    // clamp negative similarity scores to zero
    return Math.max(0.0F, score);
}
 
Example #7
Source File: NEREnParser.java    From grobid-ner with Apache License 2.0 6 votes vote down vote up
/**
 * Extract all occurrences of named entities from a list of LayoutToken
 * coming from a document with fixed/preserved layout (e.g. PDF).
 * The positions of the recognized entities are given with coordinates
 * anchored in the input document.
 *
 * @param tokens the layout token sequence
 * @return the recognized entities, or null when there is no input
 */
public List<Entity> extractNE(List<LayoutToken> tokens) {
    if (tokens == null)
        return null;

    // index lexicon positions over the token sequence
    LexiconPositionsIndexes positionsIndexes = new LexiconPositionsIndexes(lexicon);
    positionsIndexes.computeIndexes(tokens);

    // build the feature vectors and run the sequence labeling model
    String featureVector = NERParserCommon.toFeatureVectorLayout(tokens, positionsIndexes);
    String labeledResult = label(featureVector);

    // align the labeled result back onto the layout tokens (English model)
    return nerParserCommon.resultExtraction(GrobidModels.ENTITIES_NER, labeledResult, tokens);
}
 
Example #8
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Entry point for a NerdQuery to have its textual content processed.
 * This method generates the list of recognized named entities produced by
 * the mention recognition modules listed in the 'mention' field of the
 * NerdQuery object; each recognition method is applied sequentially in the
 * order given in that list.
 *
 * @param nerdQuery the NERD query to be processed
 * @return the list of identified mentions
 */
public List<Mention> process(NerdQuery nerdQuery) throws NerdException {
    final String text = nerdQuery.getTextOrShortText();
    final List<LayoutToken> tokens = nerdQuery.getTokens();

    // no usable content at all
    if (isBlank(text) && isEmpty(tokens)) {
        LOGGER.warn("No content to process.");
        return new ArrayList<>();
    }

    // raw text takes precedence over pre-tokenized layout content
    return isNotBlank(text) ? processText(nerdQuery) : processTokens(nerdQuery);
}
 
Example #9
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Precondition: the list of LayoutToken in the query object is not empty.
 *
 * @param nerdQuery the NERD query carrying the tokens to process
 * @return the accumulated mentions from all requested recognition methods
 * @throws NerdException if any recognition method fails
 */
private List<Mention> processTokens(NerdQuery nerdQuery) throws NerdException {
    final List<LayoutToken> tokens = nerdQuery.getTokens();
    final Language language = nerdQuery.getLanguage();

    // requested mention recognition methods, applied in order
    final List<ProcessText.MentionMethod> mentionTypes = nerdQuery.getMentions();

    // the whole text is processed at once: sentence segmentation does not
    // apply to layout documents
    final List<Mention> mentions = new ArrayList<>();
    try {
        for (ProcessText.MentionMethod mentionType : mentionTypes) {
            mentions.addAll(getMentions(tokens, language, mentionType));
        }
    } catch (Exception e) {
        throw new NerdException("NERD error when processing text.", e);
    }

    return mentions;
}
 
Example #10
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Process a list of layout tokens and return the NER mentions.
 * Only English and French are supported; any other language yields an
 * empty list. NER failures are logged and swallowed (best-effort).
 *
 * @param tokens   the layout tokens to analyze
 * @param language the language of the tokens
 * @return the NER mentions found (possibly empty, never null)
 */
private List<Mention> extractNER(List<LayoutToken> tokens, Language language) {
    if (isEmpty(tokens)) {
        LOGGER.warn("Trying to extract NE mention from empty content. Returning empty list.");
        return new ArrayList<>();
    }

    // NER models are only available for English and French
    final String lang = language.getLang();
    if (lang == null || !("en".equals(lang) || "fr".equals(lang)))
        return new ArrayList<>();

    final List<Mention> results = new ArrayList<>();
    try {
        for (Entity entityResult : nerParsers.extractNE(tokens, language)) {
            Mention mention = new Mention(entityResult);
            mention.setSource(MentionMethod.ner);
            results.add(mention);
        }
    } catch (Exception e) {
        // deliberately best-effort: log and return what was gathered so far
        LOGGER.error("NER extraction failed", e);
    }

    return results;
}
 
Example #11
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
@Test
public void testNGram_LayoutTokens_oneGram_shouldWork() throws Exception {
    final String input = "this is it.";

    final List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken(input, new Language("en"));

    final List<StringPos> ngrams = processText.ngrams(layoutTokens, 1);
    System.out.println(ngrams);

    // six 1-grams in total, whitespace tokens included
    assertThat(ngrams, hasSize(6));
    assertThat(ngrams.get(0), is(new StringPos("this", 0)));
    assertThat(ngrams.get(1), is(new StringPos(" ", 4)));
    assertThat(ngrams.get(2), is(new StringPos("is", 5)));
    assertThat(ngrams.get(3), is(new StringPos(" ", 7)));
}
 
Example #12
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
@Test
public void testParagraphSegmentation() {
    // build a dummy, very long token sequence containing one single line
    // break (at i == 250) and one double line break (at i == 500)
    List<LayoutToken> tokens = new ArrayList<>();
    for (int i = 0; i < 1000; i++) {
        if (i == 250)
            tokens.add(new LayoutToken("\n"));
        if (i == 500) {
            tokens.add(new LayoutToken("\n"));
            tokens.add(new LayoutToken("\n"));
        }
        tokens.add(new LayoutToken("blabla"));
        tokens.add(new LayoutToken(" "));
    }

    List<List<LayoutToken>> segments = ProcessText.segmentInParagraphs(tokens);
    assertThat(segments, hasSize(5));
}
 
Example #13
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Segment a token sequence into paragraphs.
 * Heuristics: double end of line, otherwise simple end of line (not aligned
 * with the previous line), and as a last resort monolithic blocks are split
 * arbitrarily. Starting from one segment holding the whole text, segments
 * are recursively re-split until none exceeds the maximal paragraph length.
 */
public static List<List<LayoutToken>> segmentInParagraphs(List<LayoutToken> tokens) {
    List<List<LayoutToken>> segments = new ArrayList<>();
    segments.add(tokens);

    // keep splitting until every segment fits within the size limit
    do {
        segments = subSsegmentInParagraphs(segments);
    } while (containsTooLargeSegment(segments));

    return segments;
}
 
Example #14
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
@Test
public void testAcronymsTokensMixedCase() {
    String input = "Cigarette smoke (CS)-induced airway epithelial senescence has been implicated in " +
            "the pathogenesis of chronic obstructive pulmonary disease (COPD).";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));
    Map<Mention, Mention> acronyms = processText.acronymCandidates(tokens);
    assertNotNull(acronyms);

    // two acronyms are expected: CS -> "Cigarette smoke" and COPD -> long form
    for (Map.Entry<Mention, Mention> entry : acronyms.entrySet()) {
        Mention acronym = entry.getKey();
        Mention base = entry.getValue();
        String surface = input.substring(acronym.getOffsetStart(), acronym.getOffsetEnd()).trim();
        if ("CS".equals(surface)) {
            assertEquals(base.getRawName(), "Cigarette smoke");
        } else {
            assertEquals(surface, "COPD");
            assertEquals(base.getRawName(), "chronic obstructive pulmonary disease");
        }
    }
}
 
Example #15
Source File: Utilities.java    From entity-fishing with Apache License 2.0 6 votes vote down vote up
/**
 * Return a context window of tokens around the entity located at character
 * offsets [start, end): up to {@code size} tokens on each side, with the
 * entity's own token excluded.
 *
 * @param start  character offset where the entity starts
 * @param end    character offset where the entity ends
 * @param tokens the full token sequence
 * @param size   maximal number of tokens kept on each side of the entity
 * @param lang   language of the text (currently unused, kept for API compatibility)
 * @return the list of context tokens around the entity (possibly empty)
 */
public static List<LayoutToken> getWindow(int start, int end, List<LayoutToken> tokens, int size, String lang) {
	// first locate the entity in the token list; if no token falls inside
	// [start, end), pos ends up at tokens.size() and the window covers the tail
	int pos = 0;
	for (LayoutToken token : tokens) {
		if ((token.getOffset() >= start) && ((token.getOffset() + token.getText().length()) <= end))
			break;
		pos++;
	}

	// clamp the window to the token list boundaries
	int posStart = Math.max(0, pos - size);
	int posEnd = Math.min(tokens.size() - 1, pos + size);

	List<LayoutToken> subTokens = new ArrayList<>(Math.max(0, posEnd - posStart + 1));
	for (int p = posStart; p <= posEnd; p++) {
		if (p != pos) {
			subTokens.add(tokens.get(p));
		}
	}

	return subTokens;
}
 
Example #16
Source File: NERParsers.java    From grobid-ner with Apache License 2.0 5 votes vote down vote up
/**
 * Extract all occurrences of named entities from a list of LayoutToken of
 * unknown language. A language identifier is run first to determine the
 * language, and the token sequence is processed if the identified language
 * is supported.
 */
public List<Entity> extractNE(List<LayoutToken> tokens) throws GrobidResourceException {
    final LanguageUtilities languageIdentifier = LanguageUtilities.getInstance();
    Language identifiedLang;
    // serialize access to the shared language-identifier singleton
    synchronized (languageIdentifier) {
        identifiedLang = languageIdentifier.runLanguageId(LayoutTokensUtil.toText(tokens), 2000);
    }
    return extractNE(tokens, identifiedLang);
}
 
Example #17
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
@Test
@Ignore("This test is not testing anything")
public void extractMentionsWikipedia() throws Exception {
    final String input = "this is it.";
    final Language language = new Language("en");

    final List<LayoutToken> layoutTokens = GrobidAnalyzer.getInstance()
            .tokenizeWithLayoutToken(input, language);

    // exercise both the token-based and the raw-text overloads
    System.out.println(processText.extractMentionsWikipedia(layoutTokens, language));
    System.out.println(processText.extractMentionsWikipedia(input, language));

}
 
Example #18
Source File: NLPLeaderboardTable.java    From science-result-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Concatenate, for each column-associated table cell, the text of its
 * layout tokens into a single space-separated, trimmed string.
 *
 * @return one string per associated column cell
 */
public List<String> getAssociatedTagsStr_column() {
    // typed list (was a raw ArrayList) and StringBuilder instead of
    // quadratic string concatenation in the inner loop
    List<String> columns = new ArrayList<>();
    for (TableCell cell : associatedTags_column.values()) {
        StringBuilder sb = new StringBuilder();
        for (LayoutToken lt : cell.lt) {
            sb.append(' ').append(lt.t());
        }
        columns.add(sb.toString().trim());
    }
    return columns;
}
 
Example #19
Source File: NerdRestProcessFile.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * A token sequence is considered a title when every non-delimiter token
 * carries the HEADER_TITLE label.
 */
protected boolean isTitle(List<LayoutToken> layoutTokens) {
    int titleCount = 0;
    int tokenCount = 0;
    for (LayoutToken token : layoutTokens) {
        // delimiter tokens do not take part in the decision
        if (TextUtilities.delimiters.contains(token.getText()))
            continue;
        tokenCount++;
        if (token.getLabels().contains(TaggingLabels.HEADER_TITLE))
            titleCount++;
    }

    // NOTE(review): an empty or all-delimiter sequence yields 0 == 0 -> true
    return titleCount == tokenCount;
}
 
Example #20
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * NER processing of a sequence of LayoutTokens. Generates the list of named
 * entity mentions, sorted, de-duplicated, filtered by language-based
 * validity, and with bounding boxes attached.
 *
 * @param tokens the sequence of LayoutToken objects
 * @return the list of identified mentions
 */
public List<Mention> processNER(List<LayoutToken> tokens, Language language) throws NerdException {
    final List<Mention> mentions = extractNER(tokens, language);
    Collections.sort(mentions);

    final List<Mention> selected = new ArrayList<>();
    for (Mention mention : mentions) {
        // attach bounding boxes computed from the mention's layout tokens
        final List<LayoutToken> mentionTokens = mention.getLayoutTokens();
        if (mentionTokens == null)
            LOGGER.warn("processNER: LayoutToken sequence not found for mention: " + mention.getRawName());
        else
            mention.setBoundingBoxes(BoundingBoxCalculator.calculate(mentionTokens));

        // additional language-based validity check, with de-duplication
        if (validEntity(mention, language.getLang()) && !selected.contains(mention)) {
            selected.add(mention);
        }
    }

    return selected;
}
 
Example #21
Source File: NLPLeaderboardTable.java    From science-result-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Concatenate, for each row-associated table cell, the text of its
 * layout tokens into a single space-separated, trimmed string.
 *
 * @return one string per associated row cell
 */
public List<String> getAssociatedTagsStr_row() {
    // typed list (was a raw ArrayList) and StringBuilder instead of
    // quadratic string concatenation in the inner loop
    List<String> rows = new ArrayList<>();
    for (TableCell cell : associatedTags_row.values()) {
        StringBuilder sb = new StringBuilder();
        for (LayoutToken lt : cell.lt) {
            sb.append(' ').append(lt.t());
        }
        rows.add(sb.toString().trim());
    }
    return rows;
}
 
Example #22
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * Processing of a token sequence by extracting all non-trivial ngrams.
 * Generates a list of entity mentions to be instantiated by Wikipedia
 * labels (anchors and titles).
 *
 * @param tokens the sequence of tokens to be parsed
 * @return the list of identified entities, or null when there is no content
 */
public List<Mention> processWikipedia(List<LayoutToken> tokens, Language lang) throws NerdException {
    if (tokens == null || tokens.size() == 0) {
        LOGGER.error("Content to be processed is empty.");
        return null;
    }

    final List<Mention> results = new ArrayList<>();
    try {
        final List<Mention> candidates = extractMentionsWikipedia(tokens, lang);
        Collections.sort(candidates);

        for (Mention candidate : candidates) {
            // attach bounding boxes computed from the candidate's layout tokens
            final List<LayoutToken> candidateTokens = candidate.getLayoutTokens();
            if (candidateTokens == null)
                LOGGER.warn("processWikipedia: LayoutToken sequence not found for mention: " + candidate.rawName);
            else
                candidate.setBoundingBoxes(BoundingBoxCalculator.calculate(candidateTokens));

            // additional language-based validity check, with de-duplication
            if (validEntity(candidate, lang.getLang()) && !results.contains(candidate)) {
                results.add(candidate);
            }
        }
    } catch (Exception e) {
        throw new NerdException("NERD error when processing text.", e);
    }

    return results;
}
 
Example #23
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
@Test
public void testParagraphSegmentationMonolithic() {
    // build a dummy, very long monolithic text with no line breaks at all,
    // forcing the arbitrary-split fallback of the segmenter
    List<LayoutToken> tokens = new ArrayList<>();
    for (int i = 0; i < 1000; i++) {
        tokens.add(new LayoutToken("blabla"));
        tokens.add(new LayoutToken(" "));
    }

    List<List<LayoutToken>> segments = ProcessText.segmentInParagraphs(tokens);
    assertThat(segments, hasSize(4));
}
 
Example #24
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
@Test
public void testGetSequenceMatch_multiTokenAcronym_shouldWork() throws Exception {

    String text = "We are proving that the P.C.T. is working fine. P.C.T. will work just fine.";

    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);

    // build the acronym "P.C.T." as six layout tokens at offsets 24..29
    final String[] pieces = {"P", ".", "C", ".", "T", "."};
    final List<LayoutToken> layoutTokenAcronym = new ArrayList<>();
    for (int i = 0; i < pieces.length; i++) {
        LayoutToken piece = new LayoutToken(pieces[i]);
        piece.setOffset(24 + i);
        layoutTokenAcronym.add(piece);
    }

    final List<LayoutToken> sequenceMatch = processText.getSequenceMatch(tokens, 24, layoutTokenAcronym);
    assertThat(sequenceMatch, hasSize(6));
    assertThat(sequenceMatch.get(0), is(tokens.get(24)));
}
 
Example #25
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
@Test
public void testGetSequenceMatch_singleTokenAcronym_shouldWork() throws Exception {

    String text = "We are proving that the PCT is working fine. PCT will work just fine.";

    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(text);

    // single-token acronym "PCT" at character offset 24
    final LayoutToken pct = new LayoutToken("PCT");
    pct.setOffset(24);

    final List<LayoutToken> sequenceMatch = processText.getSequenceMatch(tokens, 19, Arrays.asList(pct));
    assertThat(sequenceMatch, hasSize(1));
    assertThat(sequenceMatch.get(0), is(tokens.get(19)));
}
 
Example #26
Source File: TaggingTokenCluster.java    From science-result-extractor with Apache License 2.0 5 votes vote down vote up
/**
 * Flatten the layout tokens of all labeled containers, in order,
 * into a single list.
 */
public List<LayoutToken> concatTokens() {
    List<LayoutToken> allTokens = Lists.newArrayList();
    for (LabeledTokensContainer container : labeledTokensContainers) {
        allTokens.addAll(container.getLayoutTokens());
    }
    return allTokens;
}
 
Example #27
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * True as soon as any segment exceeds the maximal allowed paragraph length.
 */
private static boolean containsTooLargeSegment(List<List<LayoutToken>> segments) {
    return segments.stream()
            .anyMatch(segment -> segment.size() > MAXIMAL_PARAGRAPH_LENGTH);
}
 
Example #28
Source File: NERParserCommonTest.java    From grobid-ner with Apache License 2.0 5 votes vote down vote up
@Test
public void testresultExtraction_clusteror_simple() throws Exception {
    final String input = "Austria fought the enemies with Germany.";
    // pre-labeled CRF output, one tab-separated feature row per token
    final String result = String.join("\n",
            "Austria\taustria\tA\tAu\tAus\tAust\tAustr\ta\tia\tria\ttria\tstria\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t1\t1\t1\t1\tXxxx\tXx\t0\tB-UNKNOWN",
            "fought\tfought\tf\tfo\tfou\tfoug\tfough\tt\tht\tght\tught\tought\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO",
            "the\tthe\tt\tth\tthe\tthe\tthe\te\the\tthe\tthe\tthe\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxx\tx\t0\tO",
            "enemies\tenemies\te\ten\tene\tenem\tenemi\ts\tes\ties\tmies\temies\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO",
            "with\twith\tw\twi\twit\twith\twith\th\tth\tith\twith\twith\tNOCAPS\tNODIGIT\t0\t1\t0\t0\t0\t0\t0\t0\t0\t0\t0\txxxx\tx\t0\tO",
            "Germany\tgermany\tG\tGe\tGer\tGerm\tGerma\ty\tny\tany\tmany\trmany\tINITCAP\tNODIGIT\t0\t0\t0\t0\t1\t0\t0\t0\t0\t0\t0\tXxxx\tXx\t0\tB-LOCATION",
            ".\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tALLCAPS\tNODIGIT\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t.\t.\t0\tO");
    final List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input);

    final List<Entity> entities = target.resultExtraction(GrobidModels.ENTITIES_NER, result, tokens);

    assertThat(entities, hasSize(2));

    // "Austria" (B-UNKNOWN) still yields a mention with correct offsets
    final Entity entity0 = entities.get(0);
    assertThat(entity0.getRawName(), is("Austria"));
    assertThat(entity0.getOffsetStart(), is(0));
    assertThat(entity0.getOffsetEnd(), is(7));

    final Entity entity1 = entities.get(1);
    assertThat(entity1.getRawName(), is("Germany"));
    assertThat(entity1.getOffsetStart(), is(32));
    assertThat(entity1.getOffsetEnd(), is(39));
}
 
Example #29
Source File: ProcessTextTest.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
@Test
public void testAcronymsTokensAllLower() {
    String input = "A graphical model or probabilistic graphical model (PGM) is a probabilistic model.";
    List<LayoutToken> tokens = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input, new Language("en", 1.0));

    Map<Mention, Mention> acronyms = processText.acronymCandidates(tokens);
    assertThat(acronyms.entrySet(), hasSize(1));

    // single entry: short form (key) mapped to its extended form (value)
    final Map.Entry<Mention, Mention> entry = acronyms.entrySet().iterator().next();
    final Mention shortAcronym = entry.getKey();
    final Mention extendedAcronym = entry.getValue();

    assertThat(extendedAcronym.getRawName(), is("probabilistic graphical model"));
    assertThat(input.substring(shortAcronym.getOffsetStart(), shortAcronym.getOffsetEnd()), is("PGM"));
}
 
Example #30
Source File: ProcessText.java    From entity-fishing with Apache License 2.0 5 votes vote down vote up
/**
 * Dispatch on the requested mention extraction method; unsupported methods
 * yield an empty list.
 */
private List<Mention> getMentions(List<LayoutToken> tokens, Language language, MentionMethod mentionType) {
    switch (mentionType) {
        case ner:
            return processNER(tokens, language);
        case wikipedia:
            return processWikipedia(tokens, language);
        /*case species:
            return processSpecies(tokens, language);*/
        default:
            return new ArrayList<>();
    }
}